Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| NumberBlocks One Voice Cloner - HiFi-GAN V2 | |
| 集成 HiFi-GAN vocoder 提升推理音质 | |
| 功能: | |
| 1. 上传音频 → RVC 音色转换(使用 HiFi-GAN vocoder) | |
| 2. 随机采样生成 One 的语音 | |
| 3. 音高调节 | |
| 技术栈: | |
| - RVC 模型 (one_voice_rvc_v2.pth, 60.7MB VITS-like) | |
| - HiFi-GAN Universal Vocoder (预训练) | |
| - Gradio UI | |
| """ | |
| import os | |
| import json | |
| import random | |
| import tempfile | |
| import numpy as np | |
| import soundfile as sf | |
| import librosa | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import gradio as gr | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download, HfApi | |
| # ============================================================ | |
| # 模型定义 - VITS-like RVC Model | |
| # ============================================================ | |
class PosteriorEncoder(nn.Module):
    """Encode input features into the mean and log-scale of a latent code.

    The network is a 1x1 input projection, a stack of residual gated
    convolutions, and a 1x1 projection that emits both statistics.

    Args:
        in_channels: Number of channels of the input features.
        hidden_channels: Width of the hidden representation; each returned
            statistic has this many channels.
        kernel_size: Kernel size of the gated convolutions (odd sizes keep
            the time axis length unchanged).
        dilation_rate: Dilation of the gated convolutions.
        n_layers: Number of residual gated-convolution layers.
    """

    def __init__(self, in_channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
        super().__init__()
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = nn.ModuleList()
        for _ in range(n_layers):
            self.enc.append(nn.Sequential(
                # GLU splits the channel axis in half (value * sigmoid(gate)),
                # so the convolution has to emit 2 * hidden_channels for the
                # gated output to be addable to the residual stream.
                # NOTE(review): this changes the conv weight shape; checkpoints
                # saved with hidden_channels-wide convs will be rejected at
                # load time instead of failing later inside forward().
                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
                          padding=(kernel_size - 1) * dilation_rate // 2,
                          dilation=dilation_rate),
                nn.GLU(dim=1),
            ))
        self.proj = nn.Conv1d(hidden_channels, hidden_channels * 2, 1)

    def forward(self, x):
        """Return ``(m, logs)``, the two halves of the projected statistics.

        Args:
            x: Tensor laid out as (batch, in_channels, time).

        Returns:
            Tuple of two tensors, each (batch, hidden_channels, time).
        """
        x = self.pre(x)
        for layer in self.enc:
            x = x + layer(x)
        stats = self.proj(x)
        m, logs = stats.chunk(2, dim=1)
        return m, logs
class ResidualCouplingBlock(nn.Module):
    """Chain of ``n_flows`` coupling layers, each followed by a channel flip.

    Args:
        channels: Channel count of the tensor being transformed.
        hidden_channels: Hidden width handed to every coupling layer.
        kernel_size: Kernel size handed to every coupling layer.
        dilation_rate: Dilation handed to every coupling layer.
        n_flows: Number of (coupling layer, flip) pairs.
        n_layers: Depth handed to every coupling layer.
    """

    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
        super().__init__()
        stages = []
        for _ in range(n_flows):
            stages.append(
                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers)
            )
            stages.append(Flip())
        self.flows = nn.ModuleList(stages)

    def forward(self, x, reverse=False):
        """Run ``x`` through the chain, or through its stages in reverse order.

        In the forward direction every stage returns a pair whose second item
        (the log-determinant) is discarded; only the transformed tensor is
        returned in either direction.
        """
        if reverse:
            # Undo the stages last-to-first.
            for stage in reversed(self.flows):
                x = stage(x, reverse=reverse)
            return x

        for stage in self.flows:
            x, _ = stage(x, reverse=reverse)
        return x
class ResidualCouplingLayer(nn.Module):
    """Affine transform whose shift and log-scale are predicted from the input.

    ``post`` is zero-initialised, so a freshly constructed layer predicts a
    zero shift and zero log-scale and therefore acts as the identity.

    NOTE(review): the shift/scale are computed from the whole input rather
    than from a held-out half of the channels. With ``reverse=True`` they are
    recomputed from the already-transformed tensor, so the reverse pass is an
    exact inverse only while ``post`` stays at zero — confirm this is intended.

    Args:
        channels: Channel count of the tensor being transformed.
        hidden_channels: Width of the internal gated-convolution stack.
        kernel_size: Kernel size of the gated convolutions.
        dilation_rate: Dilation of the gated convolutions.
        n_layers: Number of residual gated-convolution layers.
    """

    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
        super().__init__()
        self.pre = nn.Conv1d(channels, hidden_channels, 1)
        self.enc = nn.ModuleList()
        for _ in range(n_layers):
            self.enc.append(nn.Sequential(
                # GLU halves the channel axis, so the convolution must emit
                # 2 * hidden_channels for `h + layer(h)` to be shape-valid.
                # NOTE(review): this changes the conv weight shape; checkpoints
                # saved with hidden_channels-wide convs will be rejected at
                # load time instead of failing later inside forward().
                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
                          padding=(kernel_size - 1) * dilation_rate // 2,
                          dilation=dilation_rate),
                nn.GLU(dim=1),
            ))
        self.post = nn.Conv1d(hidden_channels, channels * 2, 1)
        # Start as an identity transform.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, reverse=False):
        """Apply the affine transform, or its algebraic reverse.

        Args:
            x: Tensor laid out as (batch, channels, time).
            reverse: When false, return ``(m + x * exp(log_s), logdet)``;
                when true, return only ``(x - m) * exp(-log_s)``.

        Returns:
            ``(y, logdet)`` in the forward direction, where ``logdet`` is the
            sum of the clamped log-scales over every element (batch included);
            just ``y`` in the reverse direction.
        """
        h = self.pre(x)
        for layer in self.enc:
            h = h + layer(h)
        m, logs = self.post(h).chunk(2, dim=1)
        # Clamp so exp() cannot overflow or collapse to zero.
        log_s = torch.clamp(logs, -5.0, 5.0)

        if reverse:
            return (x - m) * torch.exp(-log_s)

        y = m + x * torch.exp(log_s)
        return y, torch.sum(log_s)
class Flip(nn.Module):
    """Reverse the order of dimension 1 of the input."""

    def forward(self, x, reverse=False):
        """Return the flipped tensor.

        In the forward direction the result is paired with a constant ``0``
        in the log-determinant slot; in the reverse direction only the
        flipped tensor is returned.
        """
        flipped = torch.flip(x, [1])
        return flipped if reverse else (flipped, 0)
class Decoder(nn.Module):
    """Map a latent sequence to output features with residual gated convolutions.

    Args:
        hidden_channels: Channel count of the latent input and hidden stream.
        out_channels: Channel count of the produced features.
        kernel_size: Kernel size of the gated convolutions (odd sizes keep
            the time axis length unchanged).
        dilation_rate: Dilation of the gated convolutions.
        n_layers: Number of residual gated-convolution layers.
    """

    def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
        super().__init__()
        self.pre = nn.Conv1d(hidden_channels, hidden_channels, 1)
        self.dec = nn.ModuleList()
        for _ in range(n_layers):
            self.dec.append(nn.Sequential(
                # GLU halves the channel axis, so the convolution must emit
                # 2 * hidden_channels for `x + layer(x)` to be shape-valid.
                # NOTE(review): this changes the conv weight shape; checkpoints
                # saved with hidden_channels-wide convs will be rejected at
                # load time instead of failing later inside forward().
                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
                          padding=(kernel_size - 1) * dilation_rate // 2,
                          dilation=dilation_rate),
                nn.GLU(dim=1),
            ))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x):
        """Decode ``x`` (batch, hidden_channels, time) to (batch, out_channels, time)."""
        x = self.pre(x)
        for layer in self.dec:
            x = x + layer(x)
        return self.proj(x)
class RVCModel(nn.Module):
    """Mel-to-mel conversion model: encoder, flow round trip, decoder.

    Args:
        n_mels: Number of mel channels consumed and produced.
        hidden_channels: Width of the latent representation.
    """

    def __init__(self, n_mels=80, hidden_channels=192):
        super().__init__()
        self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
        self.flow = ResidualCouplingBlock(hidden_channels, hidden_channels)
        self.dec = Decoder(hidden_channels, n_mels)
        self.n_mels = n_mels

    def forward(self, mel):
        """Convert ``mel`` with the sampling noise scaled by zero."""
        return self.infer(mel, noise_scale=0.0)

    def infer(self, mel, noise_scale=0.0):
        """Convert ``mel`` and return the decoded mel.

        Args:
            mel: Input mel tensor fed to the encoder.
            noise_scale: Multiplier applied to the Gaussian noise that is
                added to the encoder mean (scaled by ``exp(logs)``).
        """
        mean, log_scale = self.enc_p(mel)
        noise = torch.randn_like(log_scale)
        latent = mean + noise * torch.exp(log_scale) * noise_scale
        # The latent goes through the flow and straight back through its
        # reverse pass before decoding.
        latent_prior = self.flow(latent)
        latent_back = self.flow(latent_prior, reverse=True)
        return self.dec(latent_back)
| # ============================================================ | |
| # HiFi-GAN Vocoder Definition | |
| # ============================================================ | |
class ResBlock1(nn.Module):
    """Residual block made of one two-convolution branch per dilation rate.

    Each branch is LeakyReLU -> dilated conv -> LeakyReLU -> undilated conv,
    and its output is added back onto the running signal.

    Args:
        channels: Channel count, unchanged by the block.
        kernel_size: Kernel size of every convolution.
        dilation: Dilation rate of the first convolution of each branch.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        plain_padding = (kernel_size - 1) // 2
        branches = []
        for rate in dilation:
            dilated_conv = nn.Conv1d(
                channels, channels, kernel_size,
                dilation=rate, padding=(kernel_size - 1) * rate // 2,
            )
            plain_conv = nn.Conv1d(
                channels, channels, kernel_size,
                dilation=1, padding=plain_padding,
            )
            branches.append(nn.Sequential(
                nn.LeakyReLU(0.1), dilated_conv,
                nn.LeakyReLU(0.1), plain_conv,
            ))
        self.convs = nn.ModuleList(branches)

    def forward(self, x):
        """Add every branch's output onto ``x`` in turn and return the result."""
        for branch in self.convs:
            x = x + branch(x)
        return x
class HiFiGANGenerator(nn.Module):
    """HiFi-GAN style generator: mel frames in, waveform samples out.

    Every upsampling stage halves the channel count, applies a transposed
    convolution, and averages the outputs of one ResBlock1 per entry of
    ``resblock_kernel_sizes``.

    NOTE(review): this class uses plain (non weight-normalised) convolutions
    and Sequential containers, so its parameter names may not line up with
    the keys of an upstream HiFi-GAN checkpoint — verify before relying on
    a non-strict load.

    Args:
        in_channels: Number of mel channels.
        upsample_rates: Stride of each transposed convolution.
        upsample_kernel_sizes: Kernel size of each transposed convolution.
        upsample_initial_channel: Channel count after the input convolution.
        resblock_kernel_sizes: Kernel size of each parallel residual block.
        resblock_dilation_sizes: Dilation tuple of each parallel residual block.
    """

    def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
                 upsample_kernel_sizes=(16, 16, 4, 4),
                 upsample_initial_channel=512,
                 resblock_kernel_sizes=(3, 7, 11),
                 resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
        super().__init__()
        self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
        self.num_upsamples = len(upsample_rates)
        self.num_kernels = len(resblock_kernel_sizes)
        self.ups = nn.ModuleList()
        self.resblocks = nn.ModuleList()

        width = upsample_initial_channel
        for rate, kernel in zip(upsample_rates, upsample_kernel_sizes):
            narrower = width // 2
            self.ups.append(
                nn.ConvTranspose1d(width, narrower, kernel, rate, padding=(kernel - rate) // 2)
            )
            # One residual block per kernel size, stored flat so that stage i
            # owns indices [i * num_kernels, (i + 1) * num_kernels).
            for res_kernel, res_dilation in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(ResBlock1(narrower, res_kernel, res_dilation))
            width = narrower

        self.conv_post = nn.Sequential(
            nn.LeakyReLU(0.1),
            nn.Conv1d(width, 1, 7, padding=3),
            nn.Tanh(),
        )

    def forward(self, x):
        """Upsample ``x`` stage by stage and return the tanh-bounded output."""
        x = self.conv_pre(x)
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, 0.1))
            first = stage * self.num_kernels
            branch_total = sum(
                self.resblocks[first + offset](x) for offset in range(self.num_kernels)
            )
            x = branch_total / self.num_kernels
        return self.conv_post(x)
| # ============================================================ | |
| # Mel-spectrogram utilities | |
| # ============================================================ | |
def mel_spectrogram(y, n_fft=1024, hop_length=256, win_length=1024,
                    n_mels=80, sample_rate=40000, fmin=0, fmax=None):
    """Compute a natural-log magnitude mel spectrogram with torch.

    Args:
        y: Waveform tensor, either 1-D ``(samples,)`` or 2-D
            ``(batch, samples)``.
        n_fft: FFT size.
        hop_length: Hop between successive frames, in samples.
        win_length: Length of the Hann analysis window.
        n_mels: Number of mel bands.
        sample_rate: Sampling rate used to build the mel filterbank.
        fmin: Lowest filterbank frequency in Hz.
        fmax: Highest filterbank frequency in Hz; defaults to
            ``sample_rate // 2``.

    Returns:
        ``(n_mels, frames)`` for 1-D input, ``(batch, n_mels, frames)`` for
        2-D input. Values are ``log(max(mel, 1e-5))``.

    Raises:
        ValueError: If the waveform is not longer than the reflection
            padding, which makes reflection padding impossible.
    """
    if fmax is None:
        fmax = sample_rate // 2

    # Reflection padding is only implemented for inputs with a leading
    # batch/channel axis, so give a bare 1-D waveform a temporary one.
    added_batch_dim = y.dim() == 1
    if added_batch_dim:
        y = y.unsqueeze(0)

    # Pad manually (and use center=False below) so that the number of frames
    # is samples // hop_length.
    pad_length = (win_length - hop_length) // 2
    if y.shape[-1] <= pad_length:
        raise ValueError(
            f"audio too short for mel analysis: got {y.shape[-1]} samples, "
            f"need more than {pad_length}"
        )
    y = torch.nn.functional.pad(y, (pad_length, pad_length), mode='reflect')

    # STFT
    window = torch.hann_window(win_length, dtype=y.dtype, device=y.device)
    stft = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_length,
                      window=window, center=False, return_complex=True)
    # The small epsilon keeps the square root differentiable at zero.
    magnitudes = torch.sqrt(stft.real ** 2 + stft.imag ** 2 + 1e-7)

    # Mel filterbank
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels,
                                    fmin=fmin, fmax=fmax)
    mel_basis_t = torch.tensor(mel_basis, dtype=magnitudes.dtype,
                               device=magnitudes.device)
    mel = torch.matmul(mel_basis_t, magnitudes)

    # Log compression with a floor to avoid log(0)
    mel = torch.log(torch.clamp(mel, min=1e-5))
    return mel.squeeze(0) if added_batch_dim else mel
| # ============================================================ | |
| # Inference Engine | |
| # ============================================================ | |
class VoiceCloner:
    """Inference engine: RVC mel conversion followed by a vocoder.

    Construction downloads and loads the models, so it performs network
    I/O. When the HiFi-GAN vocoder is unavailable, audio is reconstructed
    with Griffin-Lim instead.
    """

    def __init__(self):
        self.device = torch.device('cpu')
        self.rvc_model = None
        self.hifigan = None
        self.sample_rate = 40000
        self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
        self.model_loaded = False
        self.samples = []
        self.load_models()

    @staticmethod
    def _load_weights(module, state_dict, label, allow_partial):
        """Load ``state_dict`` into ``module`` and surface key mismatches.

        A non-strict load silently ignores missing/unexpected keys, which can
        leave layers at their random initialisation. This helper keeps the
        non-strict load but reports what did not line up.

        Args:
            module: The ``nn.Module`` receiving the weights.
            state_dict: Mapping of parameter names to tensors.
            label: Human-readable model name used in messages.
            allow_partial: When true, a partial match only prints a warning;
                when false, any missing key is an error.

        Raises:
            RuntimeError: If no parameter could be loaded at all, or if keys
                are missing and ``allow_partial`` is false.
        """
        result = module.load_state_dict(state_dict, strict=False)
        missing = list(result.missing_keys)
        unexpected = list(result.unexpected_keys)
        if not missing and not unexpected:
            return

        summary = f"{len(missing)} missing, {len(unexpected)} unexpected keys"
        nothing_loaded = len(missing) == len(module.state_dict())
        if nothing_loaded or (missing and not allow_partial):
            raise RuntimeError(f"{label} checkpoint does not match the model ({summary})")
        print(f"⚠️ {label} checkpoint only partially matches the model ({summary})")

    def load_models(self):
        """Load the RVC model, the HiFi-GAN vocoder and the sample file list.

        Each of the three steps is best-effort: a failure is printed and the
        corresponding attribute is left empty. ``model_loaded`` reflects
        only whether the RVC model is available.
        """
        print("Loading RVC model...")
        try:
            model_path = hf_hub_download(
                repo_id=self.dataset_id,
                filename="models/one_voice_rvc_v2.pth",
                repo_type="dataset"
            )
            # NOTE(security): weights_only=False unpickles arbitrary Python
            # objects; this is only acceptable for a trusted checkpoint.
            ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
            # The weights may be nested under 'model' or 'state_dict'.
            if isinstance(ckpt, dict) and 'model' in ckpt:
                state_dict = ckpt['model']
            elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
                state_dict = ckpt['state_dict']
            else:
                state_dict = ckpt
            # Auto-detect hidden channels from the encoder's input projection.
            hidden_ch = 192
            for k, v in state_dict.items():
                if 'enc_p.pre.weight' in k:
                    hidden_ch = v.shape[0]
                    break
            self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
            # There is no fallback for the conversion model, so a partial
            # match is tolerated but reported.
            self._load_weights(self.rvc_model, state_dict, "RVC", allow_partial=True)
            self.rvc_model.eval()
            print(f"✅ RVC model loaded (hidden={hidden_ch})")
        except Exception as e:
            print(f"❌ RVC model load failed: {e}")
            self.rvc_model = None

        print("Loading HiFi-GAN vocoder...")
        try:
            hifigan_path = self._get_hifigan()
            if hifigan_path:
                # NOTE(security): see the remark on weights_only above.
                ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
                if isinstance(ckpt, dict) and 'generator' in ckpt:
                    state_dict = ckpt['generator']
                elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
                    state_dict = {k.replace('generator.', ''): v
                                  for k, v in ckpt['state_dict'].items()
                                  if k.startswith('generator.')}
                else:
                    state_dict = ckpt
                hifigan = HiFiGANGenerator()
                # A vocoder with randomly initialised layers only emits noise,
                # and Griffin-Lim is available, so require a complete match.
                self._load_weights(hifigan, state_dict, "HiFi-GAN", allow_partial=False)
                hifigan.eval()
                self.hifigan = hifigan
                print("✅ HiFi-GAN vocoder loaded")
            else:
                print("⚠️ HiFi-GAN not available, will use Griffin-Lim fallback")
        except Exception as e:
            print(f"⚠️ HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
            self.hifigan = None

        # Load sample list for random generation; pitch/speed-augmented
        # variants (marked _p+, _p-, _s+ in the file name) are skipped.
        try:
            api = HfApi()
            files = api.list_repo_files(self.dataset_id, repo_type="dataset")
            self.samples = [f for f in files if f.startswith('models/top_')
                            and f.endswith('.wav')
                            and '_p+' not in f and '_p-' not in f and '_s+' not in f]
            print(f"✅ Found {len(self.samples)} sample audio files")
        except Exception as e:
            print(f"⚠️ Could not list samples: {e}")
            self.samples = []

        self.model_loaded = self.rvc_model is not None

    def _get_hifigan(self):
        """Return a local path to a HiFi-GAN checkpoint, or ``None``.

        The candidate repositories are tried in order; the reason for each
        failed download is printed rather than silently discarded.
        """
        candidates = (
            ("jik876/hifi-gan", "UNIVERSAL_V1/g_02500000"),
            ("facebook/hifigan-universal-v1", "hifigan.pt"),
        )
        for repo_id, filename in candidates:
            try:
                return hf_hub_download(repo_id=repo_id, filename=filename)
            except Exception as e:
                print(f"⚠️ HiFi-GAN download from {repo_id} failed: {e}")
        return None

    def mel_to_audio_hifigan(self, mel):
        """Convert an unbatched mel spectrogram to a waveform with HiFi-GAN.

        Args:
            mel: Mel tensor without a batch axis; one is added for the call.

        Returns:
            1-D NumPy array of audio samples.
        """
        with torch.no_grad():
            audio = self.hifigan(mel.unsqueeze(0))
        return audio.squeeze(0).squeeze(0).cpu().numpy()

    def mel_to_audio_griffinlim(self, mel, sr=40000, n_fft=1024, hop_length=256, n_iter=32):
        """Fallback: convert a log-magnitude mel spectrogram with Griffin-Lim.

        Args:
            mel: Unbatched natural-log magnitude mel tensor, as produced by
                ``mel_spectrogram``.
            sr: Sampling rate used to build the mel filterbank.
            n_fft: FFT size of the analysis that produced ``mel``.
            hop_length: Hop length of the analysis that produced ``mel``.
            n_iter: Number of Griffin-Lim iterations.

        Returns:
            1-D NumPy array of audio samples.
        """
        # Undo the log compression first: the mel values are log *magnitudes*,
        # so the inversion must receive linear magnitudes with power=1.0.
        mel_np = np.exp(mel.detach().cpu().numpy())
        S = librosa.feature.inverse.mel_to_stft(
            mel_np, sr=sr, n_fft=n_fft, power=1.0
        )
        y = librosa.griffinlim(S, n_iter=n_iter, hop_length=hop_length, win_length=n_fft)
        return y

    def process_audio(self, input_audio, pitch_shift=0):
        """Convert an audio file through the RVC model and a vocoder.

        Args:
            input_audio: Path to the input audio file, or ``None`` when the
                caller supplied no audio.
            pitch_shift: Pitch shift in semitones applied before conversion.

        Returns:
            ``(output_path, status_message)``; ``output_path`` is ``None``
            whenever the conversion could not be performed.
        """
        if not self.model_loaded:
            return None, "❌ 模型未加载"
        if not input_audio:
            # The UI can trigger a conversion before anything was uploaded.
            return None, "❌ 请先上传音频"
        try:
            # Load audio, resampled to the model's sample rate
            y, sr = librosa.load(input_audio, sr=self.sample_rate)
            # Apply pitch shift
            if pitch_shift != 0:
                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)
            # Trim leading/trailing silence
            y, _ = librosa.effects.trim(y, top_db=20)
            # Limit length to 10 seconds
            max_len = 10 * self.sample_rate
            if len(y) > max_len:
                y = y[:max_len]
            # Compute mel spectrogram
            y_tensor = torch.tensor(y, dtype=torch.float32)
            mel = mel_spectrogram(y_tensor, sample_rate=self.sample_rate, n_mels=80)
            # RVC inference (deterministic: no sampling noise)
            with torch.no_grad():
                mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
            mel_out = mel_out.squeeze(0)
            # Vocoder
            if self.hifigan is not None:
                audio_out = self.mel_to_audio_hifigan(mel_out)
                vocoder_name = "HiFi-GAN"
            else:
                audio_out = self.mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
                vocoder_name = "Griffin-Lim"
            # Peak-normalise; the epsilon guards against an all-zero signal
            audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
            # Reserve the output file atomically (mktemp is race-prone); it is
            # kept on disk because the caller receives its path.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                output_path = tmp.name
            sf.write(output_path, audio_out, self.sample_rate)
            return output_path, f"✅ 转换成功 ({vocoder_name}) | 输入: {len(y)/sr:.1f}s → 输出: {len(audio_out)/self.sample_rate:.1f}s"
        except Exception as e:
            return None, f"❌ 转换失败: {str(e)}"

    def generate_random(self):
        """Download a random dataset sample and run it through ``process_audio``.

        Returns:
            ``(output_path, status_message)``; ``output_path`` is ``None`` on
            failure or when no samples are known.
        """
        if not self.samples:
            return None, "❌ 没有可用的样本"
        try:
            sample = random.choice(self.samples)
            sample_path = hf_hub_download(
                repo_id=self.dataset_id,
                filename=sample,
                repo_type="dataset"
            )
            output, msg = self.process_audio(sample_path)
            if output:
                # msg already starts with the success marker.
                return output, f"{msg}\n采样: {Path(sample).name}"
            return output, msg
        except Exception as e:
            return None, f"❌ 生成失败: {str(e)}"
| # ============================================================ | |
| # Gradio UI | |
| # ============================================================ | |
# Module-level start-up: constructing VoiceCloner calls load_models(), so the
# models are fetched and loaded when this module is imported.
print("🚀 Initializing NumberBlocks One Voice Cloner...")
cloner = VoiceCloner()

# Build the Gradio interface. Components are created in display order inside
# nested context managers, so the statement order below defines the layout.
with gr.Blocks(
    title="NumberBlocks One Voice",
    theme=gr.themes.Soft(),
    css="""
    .header { text-align: center; margin-bottom: 1rem; }
    .header h1 { color: #ff6b6b; }
    """
) as demo:
    gr.HTML("""
    <div class="header">
    <h1>🎭 NumberBlocks One Voice Cloner</h1>
    <p>RVC v2 Model (60.7MB) + HiFi-GAN Vocoder</p>
    </div>
    """)

    # Tab 1: convert an uploaded or recorded clip, with optional pitch shift.
    with gr.Tab("🎤 Voice Conversion"):
        gr.Markdown("### 上传音频 → 转换为 One 的声音")
        with gr.Row():
            with gr.Column():
                vc_input = gr.Audio(label="上传音频", type="filepath", sources=["upload", "microphone"])
                vc_pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="音高偏移 (半音)")
                vc_btn = gr.Button("🎙️ 转换", variant="primary", size="lg")
            with gr.Column():
                vc_output = gr.Audio(label="转换结果", type="filepath")
                vc_status = gr.Textbox(label="状态")
        # process_audio returns (output path, status text), matching the two outputs.
        vc_btn.click(
            fn=cloner.process_audio,
            inputs=[vc_input, vc_pitch],
            outputs=[vc_output, vc_status]
        )

    # Tab 2: convert a randomly chosen sample from the dataset repository.
    with gr.Tab("🎲 Random Sample"):
        gr.Markdown("### 随机采样 + RVC 转换")
        with gr.Row():
            rand_btn = gr.Button("🎲 随机生成", variant="primary", size="lg")
        with gr.Row():
            rand_output = gr.Audio(label="生成结果", type="filepath")
            rand_status = gr.Textbox(label="状态")
        rand_btn.click(
            fn=cloner.generate_random,
            inputs=[],
            outputs=[rand_output, rand_status]
        )

    # Tab 3: static description. The two status strings are evaluated once,
    # while the interface is being built, from the state left by load_models().
    with gr.Tab("ℹ️ About"):
        model_status = "✅ 已加载" if cloner.model_loaded else "❌ 未加载"
        hifigan_status = "✅ HiFi-GAN" if cloner.hifigan else "⚠️ Griffin-Lim (fallback)"
        # NOTE(review): blank lines between the Markdown paragraphs below may
        # have been lost when this file was extracted — confirm against the
        # original before relying on the rendered layout.
        gr.Markdown(f"""
        ### NumberBlocks One Voice Cloner V2
        **模型**: RVC v3.0 (VITS-like, 5.3M params, 60.7MB)
        **Vocoder**: {hifigan_status}
        **采样率**: 40kHz
        **模型状态**: {model_status}
        **训练数据**: 100 源文件 → 1,334 chunks, 500 steps
        **Dataset**: [ayf3/numberblocks-one-voice-dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
        **功能**:
        - ✅ 上传音频 → One 音色转换
        - ✅ 随机采样生成
        - ✅ 音高调节 (-12 ~ +12 半音)
        - ✅ HiFi-GAN 高品质 vocoder
        **限制**:
        - CPU 推理,速度较慢
        - 输入建议 < 10 秒
        - 音质取决于输入质量
        """)

# Serve on all interfaces, port 7860, when run as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)