#!/usr/bin/env python3
"""
NumberBlocks One Voice Cloner - RVC Inference with HiFi-GAN Vocoder

Uses the trained RVC v2 model + pretrained HiFi-GAN universal vocoder
for high-quality synthesis.
"""

import json
import os

import gradio as gr
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Sample rate used for all audio handed to / returned from the app.
SAMPLE_RATE = 44100
# Longest input (in seconds) that convert_voice will process.
MAX_INPUT_SECONDS = 30
# Number of mel channels the vocoder's first convolution accepts.
VOCODER_MELS = 80
# Channel count of the latent produced by the posterior and read by the decoder.
LATENT_CHANNELS = 192
# Mel values coming out of the voice model are clipped to +/- this bound.
MEL_CLIP = 4.0
# Peak amplitude of the audio returned to the UI.
PEAK_LEVEL = 0.85
# Negative slope used by every leaky ReLU in the vocoder.
LRELU_SLOPE = 0.1


# ════════════════════════════════════════════════════════════
# HiFi-GAN Generator (exact match to pretrained weights)
# ════════════════════════════════════════════════════════════

def _dilated_convs(channels, kernel_size, dilation_sizes):
    """Build one weight-normed dilated Conv1d per dilation, plus its padding.

    The convolutions are created with ``padding=0``; the returned padding
    values are applied explicitly with ``F.pad`` by the caller.
    """
    convs = nn.ModuleList([
        nn.utils.weight_norm(
            nn.Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=0)
        )
        for d in dilation_sizes
    ])
    paddings = [(kernel_size - 1) * d // 2 for d in dilation_sizes]
    return convs, paddings


class HiFiGANResBlock(nn.Module):
    """Residual block made of pairs of dilated 1-D convolutions.

    NOTE(review): in the reference HiFi-GAN ``ResBlock1`` the second
    convolution of each pair uses dilation 1 (as far as I recall); here both
    use the same dilation. Weight shapes are identical either way, so
    loading succeeds regardless - confirm against the checkpoint's training
    code.
    """

    def __init__(self, channels, kernel_size, dilation_sizes):
        """Create ``len(dilation_sizes)`` conv pairs over ``channels`` channels."""
        super().__init__()
        # Attribute names (convs1 / convs2) determine the state-dict keys and
        # must stay as they are for the pretrained weights to load.
        self.convs1, self.paddings1 = _dilated_convs(
            channels, kernel_size, dilation_sizes
        )
        self.convs2, self.paddings2 = _dilated_convs(
            channels, kernel_size, dilation_sizes
        )

    def forward(self, x):
        """Apply every conv pair in turn, adding a skip connection after each."""
        stages = zip(self.convs1, self.paddings1, self.convs2, self.paddings2)
        for conv_a, pad_a, conv_b, pad_b in stages:
            residual = x
            h = F.leaky_relu(x, LRELU_SLOPE)
            h = conv_a(F.pad(h, (pad_a, pad_a)))
            h = F.leaky_relu(h, LRELU_SLOPE)
            h = conv_b(F.pad(h, (pad_b, pad_b)))
            x = h + residual
        return x


class HiFiGANGenerator(nn.Module):
    """Mel-spectrogram to waveform generator built from a HiFi-GAN config dict.

    The config must provide ``resblock_kernel_sizes``,
    ``resblock_dilation_sizes``, ``upsample_rates``,
    ``upsample_kernel_sizes`` and ``upsample_initial_channel``.
    """

    def __init__(self, config):
        """Build the pre-conv, upsampling stack, residual blocks and post-conv."""
        super().__init__()
        base_ch = config["upsample_initial_channel"]
        kernel_sizes = config["resblock_kernel_sizes"]
        dilation_sizes = config["resblock_dilation_sizes"]

        self.num_kernels = len(kernel_sizes)
        self.conv_pre = nn.utils.weight_norm(
            nn.Conv1d(VOCODER_MELS, base_ch, 7, 1, padding=3)
        )

        # Each upsampling stage halves the channel count.
        self.ups = nn.ModuleList()
        stages = zip(config["upsample_rates"], config["upsample_kernel_sizes"])
        for stage, (rate, kernel) in enumerate(stages):
            ch_in = base_ch // (2 ** stage)
            self.ups.append(
                nn.utils.weight_norm(
                    nn.ConvTranspose1d(
                        ch_in, ch_in // 2, kernel, rate,
                        padding=(kernel - rate) // 2,
                    )
                )
            )

        # num_kernels residual blocks per upsampling stage, stored flat.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = base_ch // (2 ** (stage + 1))
            for kernel, dilations in zip(kernel_sizes, dilation_sizes):
                self.resblocks.append(HiFiGANResBlock(ch, kernel, dilations))

        final_ch = base_ch // (2 ** len(self.ups))
        self.conv_post = nn.utils.weight_norm(
            nn.Conv1d(final_ch, 1, 7, 1, padding=3)
        )

    def forward(self, mel):
        """Upsample ``mel`` to a waveform squashed into [-1, 1] by ``tanh``."""
        x = self.conv_pre(mel)
        for stage, up in enumerate(self.ups):
            x = up(F.leaky_relu(x, LRELU_SLOPE))
            first = stage * self.num_kernels
            # Average the outputs of this stage's parallel residual blocks.
            total = 0
            for offset in range(self.num_kernels):
                total = total + self.resblocks[first + offset](x)
            x = total / self.num_kernels
        # NOTE(review): the reference HiFi-GAN generator uses the default
        # leaky-ReLU slope (0.01) at this point, I believe - confirm whether
        # 0.1 matches how the checkpoint was trained.
        x = F.leaky_relu(x, LRELU_SLOPE)
        return torch.tanh(self.conv_post(x))


# ════════════════════════════════════════════════════════════
# Voice Model (VITS-like from training)
# ════════════════════════════════════════════════════════════

def _conv_bn_relu(ch_in, spec):
    """Return a flat layer list of (weight-normed Conv1d, BatchNorm1d, ReLU).

    ``spec`` is a sequence of ``(out_channels, kernel_size)`` pairs; each
    convolution is padded with ``kernel_size // 2``.
    """
    layers = []
    for ch_out, ks in spec:
        layers.extend([
            nn.utils.weight_norm(nn.Conv1d(ch_in, ch_out, ks, padding=ks // 2)),
            nn.BatchNorm1d(ch_out),
            nn.ReLU(),
        ])
        ch_in = ch_out
    return layers


class _Encoder(nn.Module):
    """Runs a conv stack, then the trailing LayerNorm over the channel axis."""

    def __init__(self, seq):
        super().__init__()
        # Must be called ``seq``: it is part of the state-dict key path.
        self.seq = seq

    def forward(self, x):
        x = self.seq[:-1](x)
        # The last module normalises the final dim, so move channels there
        # for the call and back afterwards.
        return self.seq[-1](x.transpose(1, 2)).transpose(1, 2)


class _Posterior(nn.Module):
    """Projects encoder features to a mean / log-scale pair and samples from it."""

    def __init__(self, hd):
        super().__init__()
        self.conv = nn.utils.weight_norm(
            nn.Conv1d(hd * 2, 2 * LATENT_CHANNELS, 1)
        )

    def forward(self, x):
        stats = self.conv(x)
        m, logs = torch.split(stats, LATENT_CHANNELS, dim=1)
        z = m + torch.randn_like(m) * torch.exp(logs)
        return z, m, logs


class _Flow(nn.Module):
    """Adds a correction, computed from the first half of the channels, to z."""

    def __init__(self, hd):
        super().__init__()
        half = LATENT_CHANNELS // 2
        self.net = nn.Sequential(
            nn.Conv1d(half, hd, 1),
            nn.ReLU(),
            nn.Conv1d(hd, hd, 1),
            nn.ReLU(),
            nn.Conv1d(hd, LATENT_CHANNELS, 1),
        )

    def forward(self, z):
        # Two-way unpacking keeps the implicit check that z has exactly
        # LATENT_CHANNELS channels; the second half is not used.
        z1, _ = torch.split(z, LATENT_CHANNELS // 2, dim=1)
        return z + self.net(z1)


class VoiceModel(nn.Module):
    """Encoder -> posterior -> flow -> decoder network operating on mel frames.

    Sub-module attribute names (``encoder``, ``posterior``, ``flow``,
    ``decoder``) are part of the checkpoint's state-dict keys.
    """

    def __init__(self, n_mels, hd):
        """Build the four sub-networks for ``n_mels`` inputs and hidden size ``hd``."""
        super().__init__()
        self.encoder = self._build_encoder(n_mels, hd)
        self.posterior = self._build_posterior(hd)
        self.flow = self._build_flow(hd)
        self.decoder = self._build_decoder(hd)

    def _build_encoder(self, n_mels, hd):
        """Five conv/BN/ReLU stages followed by a LayerNorm over 2*hd channels."""
        spec = [(hd, 5), (hd, 5), (hd, 5), (hd * 2, 5), (hd * 2, 3)]
        layers = _conv_bn_relu(n_mels, spec)
        layers.append(nn.LayerNorm(hd * 2))
        return _Encoder(nn.Sequential(*layers))

    def _build_posterior(self, hd):
        """1x1 conv head producing the latent mean and log-scale."""
        return _Posterior(hd)

    def _build_flow(self, hd):
        """Small 1x1-conv network applied additively to the latent."""
        return _Flow(hd)

    def _build_decoder(self, hd):
        """Four conv/BN/ReLU stages and a 1x1 conv down to 80 mel channels."""
        spec = [(hd * 2, 5), (hd * 2, 5), (hd, 5), (hd, 3)]
        layers = _conv_bn_relu(LATENT_CHANNELS, spec)
        layers.append(nn.utils.weight_norm(nn.Conv1d(hd, VOCODER_MELS, 1)))
        return nn.Sequential(*layers)

    def forward(self, mel):
        """Map an input mel to ``(mel_out, m, logs)``.

        The module previously had no ``forward`` although ``convert_voice``
        calls the model directly and unpacks three values, which would raise
        at call time. This chains the sub-modules in the order
        encoder -> posterior -> flow -> decoder.

        NOTE(review): the training-time forward pass is not visible in this
        file. The flow -> decoder order mirrors ``generate_sample``; confirm
        the full composition against the training code.
        """
        hidden = self.encoder(mel)
        z, m, logs = self.posterior(hidden)
        mel_out = self.decoder(self.flow(z))
        return mel_out, m, logs


# ════════════════════════════════════════════════════════════
# Model Loading
# ════════════════════════════════════════════════════════════

def download_file(repo_id, filename, repo_type="dataset"):
    """Fetch ``filename`` from a Hugging Face repo and return the local path."""
    # Imported lazily so the module can be imported without huggingface_hub.
    from huggingface_hub import hf_hub_download

    return hf_hub_download(
        repo_id=repo_id, filename=filename, repo_type=repo_type
    )


def load_hifigan():
    """Download and build the HiFi-GAN vocoder.

    Returns:
        ``(vocoder, config)`` - the generator in eval mode and its config dict.
    """
    repo = "ORI-Muchim/HiFi-GAN_44100hz_universal"
    cfg_path = download_file(repo, "config.json", repo_type="model")
    weights_path = download_file(repo, "g_02500000", repo_type="model")

    with open(cfg_path, encoding="utf-8") as f:
        hfg_cfg = json.load(f)

    vocoder = HiFiGANGenerator(hfg_cfg)
    # SECURITY: weights_only=False unpickles arbitrary objects from a
    # downloaded file. Only acceptable while the source repo is trusted;
    # switch to weights_only=True if the checkpoint allows it.
    ckpt = torch.load(weights_path, map_location="cpu", weights_only=False)
    vocoder.load_state_dict(ckpt["generator"])
    vocoder.eval()
    return vocoder, hfg_cfg


def load_voice_model():
    """Download and build the voice model.

    Returns:
        ``(model, config)`` - the model in eval mode and the config dict
        stored in the checkpoint (``n_mels`` and ``hidden_dim`` are read).
    """
    model_file = download_file(
        "ayf3/numberblocks-one-voice-dataset", "models/one_voice_rvc_v2.pth"
    )
    # SECURITY: weights_only=False unpickles arbitrary objects from a
    # downloaded file. Only acceptable while the source repo is trusted.
    ckpt = torch.load(model_file, map_location="cpu", weights_only=False)
    cfg = ckpt['config']
    sd = ckpt['model_state_dict']

    model = VoiceModel(cfg['n_mels'], cfg['hidden_dim'])
    # strict=False tolerates key mismatches; report them instead of silently
    # running with partially initialised weights.
    result = model.load_state_dict(sd, strict=False)
    if result.missing_keys:
        print(f"Warning: voice model missing keys: {result.missing_keys}")
    if result.unexpected_keys:
        print(f"Warning: voice model unexpected keys: {result.unexpected_keys}")
    model.eval()
    return model, cfg


# ════════════════════════════════════════════════════════════
# Audio Processing
# ════════════════════════════════════════════════════════════

def mel_spectrogram(audio, sr, n_mels=80, hop_length=256, win_length=1024,
                    n_fft=1024):
    """Return the dB-scaled mel spectrogram of ``audio`` (referenced to its max).

    NOTE(review): hop_length / n_fft / fmax are fixed here and not read from
    the vocoder config; if the vocoder's total upsampling factor differs from
    hop_length the output duration will not match the input - verify.
    """
    import librosa

    mel = librosa.feature.melspectrogram(
        y=audio.astype(np.float32),
        sr=sr,
        n_mels=n_mels,
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
        fmax=8000,
    )
    return librosa.power_to_db(mel, ref=np.max)


def mel_to_audio_hifigan(vocoder, mel_tensor):
    """Run ``vocoder`` on ``mel_tensor`` without gradients; return a NumPy array."""
    with torch.no_grad():
        audio = vocoder(mel_tensor)
    return audio.squeeze().cpu().numpy()


def _shift_mel_bins(mel, shift):
    """Shift ``mel`` by ``shift`` bins along axis 0 without wrapping around.

    Bins vacated by the shift are filled with the spectrogram's minimum, so
    content pushed past either edge is dropped rather than reappearing at the
    opposite end of the frequency axis.
    """
    if shift == 0:
        return mel
    shifted = np.full_like(mel, mel.min())
    if abs(shift) < mel.shape[0]:
        if shift > 0:
            shifted[shift:] = mel[:-shift]
        else:
            shifted[:shift] = mel[-shift:]
    return shifted


def _normalize_peak(audio):
    """Scale ``audio`` so its absolute peak is PEAK_LEVEL; return float32."""
    if audio.size:
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak * PEAK_LEVEL
    return audio.astype(np.float32)


def _vocode(mel_np):
    """Turn a (channels, frames) mel array into peak-normalised audio."""
    mel_tensor = torch.as_tensor(mel_np, dtype=torch.float32).unsqueeze(0)
    return _normalize_peak(mel_to_audio_hifigan(hifigan, mel_tensor))


# ════════════════════════════════════════════════════════════
# Globals & Init
# ════════════════════════════════════════════════════════════

voice_model = None
voice_config = None
hifigan = None
hifigan_config = None


def init_models():
    """Load the voice model and the vocoder into the module globals once."""
    global voice_model, voice_config, hifigan, hifigan_config
    if voice_model is None:
        print("Loading voice model...")
        voice_model, voice_config = load_voice_model()
        print("Voice model loaded.")
    if hifigan is None:
        print("Loading HiFi-GAN vocoder...")
        hifigan, hifigan_config = load_hifigan()
        print("HiFi-GAN vocoder loaded.")


# ════════════════════════════════════════════════════════════
# Core Functions
# ════════════════════════════════════════════════════════════

def convert_voice(audio_input, transpose=0):
    """Convert uploaded audio with the voice model and vocoder.

    Args:
        audio_input: ``(sample_rate, samples)`` pair as delivered by
            ``gr.Audio``, or ``None`` when nothing was uploaded.
        transpose: number of mel bins to shift the spectrogram by.

    Returns:
        ``((sample_rate, audio), status_message)``; the first element is
        ``None`` when there is no usable input.
    """
    if audio_input is None:
        return None, "❌ 请上传音频文件"

    init_models()
    import librosa

    sr_in, data = audio_input[0], audio_input[1]
    data = np.asarray(data, dtype=np.float32)

    # Mix down BEFORE resampling: multi-channel input is (samples, channels)
    # and librosa.resample works on the last axis, so resampling first would
    # resample across channels instead of across time.
    if data.ndim > 1:
        data = data.mean(axis=1)
    if data.size == 0:
        return None, "❌ 音频为空"

    # Resample to 44100 for HiFi-GAN
    if sr_in != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr_in, target_sr=SAMPLE_RATE)

    # Trim to max 30 seconds
    data = data[:SAMPLE_RATE * MAX_INPUT_SECONDS]

    # Compute mel spectrogram and standardise it
    mel = mel_spectrogram(data, SAMPLE_RATE)
    mel_norm = (mel - mel.mean()) / (mel.std() + 1e-8)

    # NOTE(review): the UI labels this control in semitones, but the value is
    # applied as a shift in mel bins - confirm the intended mapping.
    # int() because a slider may deliver a float.
    mel_norm = _shift_mel_bins(mel_norm, int(transpose))

    # Voice model timbre transfer
    with torch.no_grad():
        mel_tensor = torch.as_tensor(mel_norm, dtype=torch.float32).unsqueeze(0)
        mel_out, _, _ = voice_model(mel_tensor)
    # squeeze(0) drops only the batch axis, so a single-frame result keeps
    # its time axis.
    mel_out_np = np.clip(mel_out.squeeze(0).cpu().numpy(), -MEL_CLIP, MEL_CLIP)

    # HiFi-GAN vocoding
    audio_out = _vocode(mel_out_np)

    return (SAMPLE_RATE, audio_out), \
        f"✅ 转换完成! (HiFi-GAN vocoder)\n输入: {len(data)/44100:.1f}s → 输出: {len(audio_out)/44100:.1f}s"


def generate_sample():
    """Decode a random latent into audio.

    Returns:
        ``((sample_rate, audio), status_message)``.
    """
    init_models()
    n_frames = 400

    with torch.no_grad():
        z = torch.randn(1, LATENT_CHANNELS, n_frames) * 0.5
        z = voice_model.flow(z)
        mel_out = voice_model.decoder(z)
    mel_out_np = np.clip(mel_out.squeeze(0).cpu().numpy(), -MEL_CLIP, MEL_CLIP)

    audio_out = _vocode(mel_out_np)

    return (SAMPLE_RATE, audio_out), \
        f"✅ 生成完成! (HiFi-GAN vocoder)\n时长: {len(audio_out)/44100:.1f}s"


# ════════════════════════════════════════════════════════════
# Gradio UI
# ════════════════════════════════════════════════════════════

ABOUT_MARKDOWN = """
### Model Info
- **Voice Model**: VITS-like (Encoder + Posterior + Flow + Decoder) — 5.3M params
- **Vocoder**: HiFi-GAN Universal (44100Hz) — 928K params
- **Sample Rate**: 44100 Hz
- **Training Data**: 100 source files, 1,334 chunks
- **Training Steps**: 500

### What's New
- ✅ Integrated HiFi-GAN neural vocoder replacing overlap-add
- ✅ Significantly improved audio quality and naturalness
- ✅ Proper mel→audio conversion with learned upsampling

### Links
- [Dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
- [Training Space](https://huggingface.co/spaces/ayf3/rvc-cpu-trainer)
"""

with gr.Blocks(title="🎙️ NumberBlocks One Voice", theme=gr.themes.Soft()) as demo:
    gr.HTML("""

🎙️ NumberBlocks One 语音克隆

RVC v2 Model + HiFi-GAN Vocoder — High Quality Voice Conversion

""")

    with gr.Tab("🔊 Voice Conversion"):
        gr.Markdown("上传音频,将其转换为 One 的声音(使用 HiFi-GAN 神经声码器提升音质)")
        audio_in = gr.Audio(label="输入音频", sources=["upload", "microphone"])
        pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift (semitones)")
        convert_btn = gr.Button("🔄 转换", variant="primary")
        audio_out = gr.Audio(label="输出音频 (HiFi-GAN)")
        status = gr.Textbox(label="状态")
        convert_btn.click(convert_voice, [audio_in, pitch], [audio_out, status])

    with gr.Tab("🎵 Sample Generation"):
        gr.Markdown("生成 One 的随机语音样本(使用 HiFi-GAN 神经声码器)")
        gen_btn = gr.Button("🎵 生成样本", variant="primary")
        gen_out = gr.Audio(label="生成音频 (HiFi-GAN)")
        gen_status = gr.Textbox(label="状态")
        gen_btn.click(generate_sample, outputs=[gen_out, gen_status])

    with gr.Tab("ℹ️ About"):
        gr.Markdown(ABOUT_MARKDOWN)

demo.launch(server_name="0.0.0.0", server_port=7860)