#!/usr/bin/env python3
"""
NumberBlocks One Voice Cloner - V3 (librosa-free)

Uses torchaudio + scipy instead of librosa to avoid numba compatibility issues.
"""

import os
import random
import tempfile
import traceback
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from huggingface_hub import hf_hub_download, HfApi


# ============================================================
# Model definition - VITS-like RVC Model
# ============================================================

class PosteriorEncoder(nn.Module):
    """Conv1d stack that turns an input feature map into ``(m, logs)``.

    ``pre`` projects ``in_channels`` to ``hidden_channels``, ``enc`` is a list
    of residual Conv1d+GLU layers, and ``proj`` emits ``2 * hidden_channels``
    channels that ``forward`` splits in half along the channel axis.
    """

    def __init__(self, in_channels, hidden_channels, kernel_size=5,
                 dilation_rate=1, n_layers=4):
        super().__init__()
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = nn.ModuleList()
        for _ in range(n_layers):
            # NOTE(review): nn.GLU(dim=1) halves the channel count, so the
            # output of this Sequential has hidden_channels // 2 channels and
            # the residual add in forward() looks shape-incompatible. The
            # layer shapes are tied to the checkpoint, so this is left as-is;
            # confirm against the trained weights before changing.
            self.enc.append(nn.Sequential(
                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
                          padding=(kernel_size - 1) * dilation_rate // 2,
                          dilation=dilation_rate),
                nn.GLU(dim=1),
            ))
        self.proj = nn.Conv1d(hidden_channels, hidden_channels * 2, 1)

    def forward(self, x):
        """Return ``(m, logs)``, the two halves of ``proj``'s output (dim 1)."""
        x = self.pre(x)
        for layer in self.enc:
            x = x + layer(x)
        stats = self.proj(x)
        m, logs = stats.chunk(2, dim=1)
        return m, logs


class ResidualCouplingBlock(nn.Module):
    """Sequence of ``n_flows`` (ResidualCouplingLayer, Flip) pairs."""

    def __init__(self, channels, hidden_channels, kernel_size=5,
                 dilation_rate=1, n_flows=4, n_layers=4):
        super().__init__()
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            self.flows.append(ResidualCouplingLayer(
                channels, hidden_channels, kernel_size, dilation_rate, n_layers))
            self.flows.append(Flip())

    def forward(self, x, reverse=False):
        """Apply the flows in order, or in reversed order when ``reverse``.

        In the forward direction each flow returns ``(x, logdet)`` and the
        log-determinant is discarded; in reverse each flow returns ``x`` only.
        """
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, reverse=reverse)
        return x


class ResidualCouplingLayer(nn.Module):
    """Affine transform of ``x`` whose shift/log-scale are predicted from ``x``.

    ``post`` is zero-initialised, so a freshly constructed layer predicts
    ``m = 0`` and ``logs = 0``.
    """

    def __init__(self, channels, hidden_channels, kernel_size=5,
                 dilation_rate=1, n_layers=4):
        super().__init__()
        self.pre = nn.Conv1d(channels, hidden_channels, 1)
        self.enc = nn.ModuleList()
        for _ in range(n_layers):
            # NOTE(review): same Conv1d+GLU pattern as PosteriorEncoder; GLU
            # halves the channel count, which looks incompatible with the
            # residual add in forward() - confirm against the checkpoint.
            self.enc.append(nn.Sequential(
                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
                          padding=(kernel_size - 1) * dilation_rate // 2,
                          dilation=dilation_rate),
                nn.GLU(dim=1),
            ))
        self.post = nn.Conv1d(hidden_channels, channels * 2, 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, reverse=False):
        """Return ``(y, logdet)`` when not ``reverse``, otherwise ``y`` only.

        The predicted log-scale is clamped to [-5, 5] in both directions.
        """
        h = self.pre(x)
        for layer in self.enc:
            h = h + layer(h)
        stats = self.post(h)
        m, logs = stats.chunk(2, dim=1)
        log_s = torch.clamp(logs, -5.0, 5.0)
        if not reverse:
            y = m + x * torch.exp(log_s)
            logdet = torch.sum(log_s)
            return y, logdet
        return (x - m) * torch.exp(-log_s)


class Flip(nn.Module):
    """Reverse the order of dim 1; returns ``(x, 0)`` forward, ``x`` in reverse."""

    def forward(self, x, reverse=False):
        flipped = torch.flip(x, [1])
        if not reverse:
            # 0 stands in for the log-determinant expected by the caller.
            return flipped, 0
        return flipped


class Decoder(nn.Module):
    """Conv1d stack mapping ``hidden_channels`` to ``out_channels``."""

    def __init__(self, hidden_channels, out_channels, kernel_size=5,
                 dilation_rate=1, n_layers=4):
        super().__init__()
        self.pre = nn.Conv1d(hidden_channels, hidden_channels, 1)
        self.dec = nn.ModuleList()
        for _ in range(n_layers):
            # NOTE(review): same Conv1d+GLU pattern as PosteriorEncoder; GLU
            # halves the channel count - confirm against the checkpoint.
            self.dec.append(nn.Sequential(
                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
                          padding=(kernel_size - 1) * dilation_rate // 2,
                          dilation=dilation_rate),
                nn.GLU(dim=1),
            ))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x):
        """Run ``pre``, the residual ``dec`` layers, then ``proj``."""
        x = self.pre(x)
        for layer in self.dec:
            x = x + layer(x)
        return self.proj(x)


class RVCModel(nn.Module):
    """VITS-like RVC v3.0 Model (5.3M params)"""

    def __init__(self, n_mels=80, hidden_channels=192):
        super().__init__()
        self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
        self.flow = ResidualCouplingBlock(hidden_channels, hidden_channels)
        self.dec = Decoder(hidden_channels, n_mels)
        self.n_mels = n_mels

    def forward(self, mel):
        """Encode ``mel``, pass it through the flow both ways, and decode.

        The sampling noise is multiplied by 0.0, so ``z`` equals ``m`` for
        finite ``logs``.
        """
        m, logs = self.enc_p(mel)
        z = m + torch.randn_like(logs) * torch.exp(logs) * 0.0
        z_p = self.flow(z)
        z_back = self.flow(z_p, reverse=True)
        mel_out = self.dec(z_back)
        return mel_out

    def infer(self, mel, noise_scale=0.0):
        """Same pipeline as ``forward`` with the noise scaled by ``noise_scale``."""
        m, logs = self.enc_p(mel)
        z = m + torch.randn_like(logs) * torch.exp(logs) * noise_scale
        z_p = self.flow(z)
        z_back = self.flow(z_p, reverse=True)
        mel_out = self.dec(z_back)
        return mel_out


# ============================================================
# HiFi-GAN Vocoder Definition
# ============================================================

class ResBlock1(nn.Module):
    """Residual block: one (LeakyReLU, dilated conv, LeakyReLU, conv) per dilation."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.convs = nn.ModuleList()
        for d in dilation:
            self.convs.append(nn.Sequential(
                nn.LeakyReLU(0.1),
                nn.Conv1d(channels, channels, kernel_size, dilation=d,
                          padding=(kernel_size - 1) * d // 2),
                nn.LeakyReLU(0.1),
                nn.Conv1d(channels, channels, kernel_size, dilation=1,
                          padding=(kernel_size - 1) // 2),
            ))

    def forward(self, x):
        """Add each conv branch's output to ``x`` in turn."""
        for conv in self.convs:
            x = x + conv(x)
        return x


class HiFiGANGenerator(nn.Module):
    """HiFi-GAN Generator (Universal V1 compatible)"""

    def __init__(self, in_channels=80,
                 upsample_rates=(8, 8, 2, 2),
                 upsample_kernel_sizes=(16, 16, 4, 4),
                 upsample_initial_channel=512,
                 resblock_kernel_sizes=(3, 7, 11),
                 resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
        super().__init__()
        self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
        self.num_upsamples = len(upsample_rates)
        self.num_kernels = len(resblock_kernel_sizes)
        self.ups = nn.ModuleList()
        self.resblocks = nn.ModuleList()
        ch = upsample_initial_channel
        for u, k in zip(upsample_rates, upsample_kernel_sizes):
            # Each upsampling stage halves the channel count.
            ch_new = ch // 2
            self.ups.append(nn.ConvTranspose1d(ch, ch_new, k, u, padding=(k - u) // 2))
            for rk, rd in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(ResBlock1(ch_new, rk, rd))
            ch = ch_new
        self.conv_post = nn.Sequential(
            nn.LeakyReLU(0.1),
            nn.Conv1d(ch, 1, 7, padding=3),
            nn.Tanh(),
        )

    def forward(self, x):
        """Upsample ``x`` stage by stage and return the single-channel output.

        After each transposed conv, the outputs of that stage's
        ``num_kernels`` residual blocks are averaged.
        """
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, 0.1)
            x = self.ups[i](x)
            xs = 0
            for j in range(self.num_kernels):
                xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = self.conv_post(x)
        return x


# ============================================================
# Mel-spectrogram utilities (torchaudio-based, no librosa)
# ============================================================

def make_mel_transform(sample_rate=40000, n_fft=1024, hop_length=256,
                       n_mels=80, f_min=0.0, f_max=None):
    """Create torchaudio mel spectrogram transform

    A falsy ``f_max`` (``None`` or 0) defaults to ``sample_rate // 2``.
    """
    return torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=f_min,
        f_max=f_max if f_max else float(sample_rate // 2),
        power=2.0,
        norm=None,
        mel_scale="htk",
    )


def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
    """Compute log mel spectrogram using torchaudio

    The power mel spectrogram is clamped to a minimum of 1e-5 before the
    natural log so silent frames do not produce ``-inf``.
    """
    mel_transform = make_mel_transform(sample_rate, n_fft, hop_length, n_mels)
    mel = mel_transform(y)
    mel = torch.log(torch.clamp(mel, min=1e-5))
    return mel


def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024,
                            hop_length=256, n_iter=32):
    """Griffin-Lim: mel -> audio using torchaudio

    ``mel`` is a log-mel spectrogram whose first dimension is the number of
    mel bins (it is passed as ``n_mels`` to the inverse transform). Returns a
    NumPy array.
    """
    # Create inverse mel transform
    inverse_mel = torchaudio.transforms.InverseMelScale(
        n_stft=n_fft // 2 + 1,
        n_mels=mel.shape[0],
        sample_rate=sample_rate,
        f_min=0,
        f_max=float(sample_rate // 2),
        mel_scale="htk",
    )
    # Convert from log mel back to power spectrogram
    mel_power = torch.exp(mel)
    spec = inverse_mel(mel_power)
    # Griffin-Lim
    griffin_lim = torchaudio.transforms.GriffinLim(
        n_fft=n_fft,
        hop_length=hop_length,
        n_iter=n_iter,
    )
    audio = griffin_lim(spec)
    # detach() so .numpy() cannot fail on a tensor that is tracking gradients.
    return audio.detach().cpu().numpy()


# ============================================================
# Inference Engine
# ============================================================

class VoiceCloner:
    """Loads the RVC model and vocoder once and converts audio files.

    Construction calls ``load_models()``, which downloads checkpoints from the
    Hugging Face Hub. ``model_loaded`` is True only if the RVC model loaded.
    """

    def __init__(self):
        self.device = torch.device('cpu')
        self.rvc_model = None
        self.hifigan = None
        self.sample_rate = 40000
        self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
        self.model_loaded = False
        self.samples = []
        self.load_models()

    def load_models(self):
        """Load RVC model + HiFi-GAN vocoder

        Each of the three steps (RVC model, vocoder, sample listing) is
        best-effort: a failure is printed and leaves the corresponding
        attribute at ``None`` / ``[]`` instead of raising.
        """
        print("Loading RVC model...")
        try:
            model_path = hf_hub_download(
                repo_id=self.dataset_id,
                filename="models/one_voice_rvc_v2.pth",
                repo_type="dataset"
            )
            # SECURITY: weights_only=False unpickles arbitrary objects from a
            # downloaded file. Only acceptable while the repo is trusted.
            ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
            if isinstance(ckpt, dict) and 'model' in ckpt:
                state_dict = ckpt['model']
            elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
                state_dict = ckpt['state_dict']
            else:
                state_dict = ckpt
            # Infer the hidden width from the checkpoint, defaulting to 192.
            hidden_ch = 192
            for k, v in state_dict.items():
                if 'enc_p.pre.weight' in k:
                    hidden_ch = v.shape[0]
                    break
            self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
            # strict=False ignores key-name mismatches; report them so a
            # checkpoint with a different layout is not silently half-loaded.
            missing, unexpected = self.rvc_model.load_state_dict(state_dict, strict=False)
            if missing or unexpected:
                print(f"RVC checkpoint key mismatch: {len(missing)} missing, "
                      f"{len(unexpected)} unexpected")
            self.rvc_model.eval()
            print(f"RVC model loaded (hidden={hidden_ch})")
        except Exception as e:
            print(f"RVC model load failed: {e}")
            self.rvc_model = None

        print("Loading HiFi-GAN vocoder...")
        try:
            hifigan_path = self._get_hifigan()
            if hifigan_path:
                # SECURITY: see the note on torch.load above.
                ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
                if isinstance(ckpt, dict) and 'generator' in ckpt:
                    state_dict = ckpt['generator']
                elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
                    state_dict = {k.replace('generator.', ''): v
                                  for k, v in ckpt['state_dict'].items()
                                  if k.startswith('generator.')}
                else:
                    state_dict = ckpt
                self.hifigan = HiFiGANGenerator()
                missing, unexpected = self.hifigan.load_state_dict(state_dict, strict=False)
                if missing or unexpected:
                    print(f"HiFi-GAN checkpoint key mismatch: {len(missing)} missing, "
                          f"{len(unexpected)} unexpected")
                self.hifigan.eval()
                print("HiFi-GAN vocoder loaded")
            else:
                print("HiFi-GAN not available, will use Griffin-Lim fallback")
        except Exception as e:
            print(f"HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
            self.hifigan = None

        try:
            api = HfApi()
            files = api.list_repo_files(self.dataset_id, repo_type="dataset")
            # NOTE(review): '_p+', '_p-' and '_s+' variants are excluded but
            # '_s-' is not - confirm whether that asymmetry is intended.
            self.samples = [f for f in files
                            if f.startswith('models/top_') and f.endswith('.wav')
                            and '_p+' not in f and '_p-' not in f and '_s+' not in f]
            print(f"Found {len(self.samples)} sample audio files")
        except Exception as e:
            print(f"Could not list samples: {e}")
            self.samples = []

        self.model_loaded = self.rvc_model is not None

    def _get_hifigan(self):
        """Get HiFi-GAN model

        Tries each candidate Hub location in order and returns the local path
        of the first successful download, or ``None`` if all of them fail.
        """
        candidates = (
            ("jik876/hifi-gan", "UNIVERSAL_V1/g_02500000"),
            ("facebook/hifigan-universal-v1", "hifigan.pt"),
        )
        for repo_id, filename in candidates:
            try:
                return hf_hub_download(repo_id=repo_id, filename=filename)
            except Exception as e:
                # Best-effort: log and fall through to the next candidate.
                print(f"HiFi-GAN download from {repo_id} failed: {e}")
        return None

    def mel_to_audio_hifigan(self, mel):
        """Convert mel spectrogram to audio using HiFi-GAN

        A leading batch dimension is added before the generator call and the
        first two dimensions of its output are squeezed away afterwards.
        """
        with torch.no_grad():
            audio = self.hifigan(mel.unsqueeze(0))
        return audio.squeeze(0).squeeze(0).cpu().numpy()

    @staticmethod
    def _pitch_shift(y, pitch_shift):
        """Resample ``y`` to shift pitch by ``pitch_shift`` semitones.

        This is the simple resample trick: it changes duration along with
        pitch (shorter for positive shifts, longer for negative ones).
        """
        # Shift up by N semitones = speed up by 2^(N/12).
        factor = 2.0 ** (abs(pitch_shift) / 12.0)
        if pitch_shift > 0:
            new_len = int(len(y) / factor)
        else:
            new_len = int(len(y) * factor)
        return F.interpolate(
            y.unsqueeze(0).unsqueeze(0),
            size=new_len,
            mode='linear'
        ).squeeze(0).squeeze(0)

    @staticmethod
    def _trim_silence(y, sr):
        """Trim leading/trailing samples whose smoothed energy is below -20 dB of peak.

        Signals no longer than the 100 ms smoothing window are returned
        unchanged.
        """
        energy = y ** 2
        window_size = int(0.1 * sr)  # 100ms window
        if len(energy) <= window_size:
            return y
        kernel = torch.ones(window_size) / window_size
        smooth_energy = F.conv1d(
            energy.unsqueeze(0).unsqueeze(0),
            kernel.unsqueeze(0).unsqueeze(0),
            padding=window_size // 2
        ).squeeze()
        threshold = smooth_energy.max() * (10 ** (-20 / 10))  # -20dB
        active = torch.where(smooth_energy > threshold)[0]
        if len(active) > 0:
            y = y[active[0]:active[-1] + 1]
        return y

    def process_audio(self, input_audio, pitch_shift=0):
        """Process audio through RVC model + vocoder

        Args:
            input_audio: path of the audio file to convert.
            pitch_shift: semitones to shift before conversion (0 disables).

        Returns:
            ``(output_path, status_message)``; ``output_path`` is ``None`` on
            any failure. Exceptions are caught and reported in the message.
        """
        if not self.model_loaded:
            return None, "Model not loaded"
        if input_audio is None:
            return None, "Please upload an audio file"
        try:
            # Load audio with torchaudio
            y, sr = torchaudio.load(input_audio)

            # Convert to mono
            if y.shape[0] > 1:
                y = y.mean(dim=0)
            else:
                y = y.squeeze(0)
            if y.numel() == 0:
                return None, "Conversion failed: input audio is empty"

            # Resample
            if sr != self.sample_rate:
                resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
                y = resampler(y)
                sr = self.sample_rate

            # Pitch shift using torchaudio (simple resample trick)
            if pitch_shift != 0:
                y = self._pitch_shift(y, pitch_shift)

            # Trim silence (simple energy-based)
            y = self._trim_silence(y, sr)

            # Cap the input at 10 seconds.
            max_len = 10 * self.sample_rate
            if len(y) > max_len:
                y = y[:max_len]

            mel = compute_mel(y, sample_rate=self.sample_rate, n_mels=80)
            with torch.no_grad():
                mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
            mel_out = mel_out.squeeze(0)

            if self.hifigan is not None:
                audio_out = self.mel_to_audio_hifigan(mel_out)
                vocoder_name = "HiFi-GAN"
            else:
                # The keyword is `sample_rate`; the previous `sr=` raised
                # TypeError and broke the fallback path.
                audio_out = mel_to_audio_griffinlim(mel_out, sample_rate=self.sample_rate)
                vocoder_name = "Griffin-Lim"

            # Peak-normalise to 0.95; the epsilon avoids division by zero.
            audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95

            # NamedTemporaryFile creates the file atomically (mktemp is
            # deprecated and race-prone); delete=False keeps it for Gradio.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                output_path = tmp.name
            sf.write(output_path, audio_out, self.sample_rate)
            return output_path, f"Success ({vocoder_name}) | Input: {len(y)/sr:.1f}s -> Output: {len(audio_out)/self.sample_rate:.1f}s"
        except Exception as e:
            traceback.print_exc()
            return None, f"Conversion failed: {str(e)}"

    def generate_random(self):
        """Generate audio from a random sample

        Downloads a randomly chosen file from ``self.samples`` and runs it
        through ``process_audio``. Returns ``(output_path, status_message)``.
        """
        if not self.samples:
            return None, "No samples available"
        try:
            sample = random.choice(self.samples)
            sample_path = hf_hub_download(
                repo_id=self.dataset_id,
                filename=sample,
                repo_type="dataset"
            )
            output, msg = self.process_audio(sample_path)
            if output:
                return output, f"{msg}\nSample: {Path(sample).name}"
            return output, msg
        except Exception as e:
            return None, f"Generation failed: {str(e)}"


# ============================================================
# Gradio UI
# ============================================================

print("Initializing NumberBlocks One Voice Cloner...")
cloner = VoiceCloner()

with gr.Blocks(
    title="NumberBlocks One Voice",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("# NumberBlocks One Voice Cloner")
    gr.Markdown("RVC v2 Model (60.7MB) + HiFi-GAN Vocoder")

    with gr.Tab("Voice Conversion"):
        gr.Markdown("### Upload audio -> Convert to One's voice")
        with gr.Row():
            with gr.Column():
                vc_input = gr.Audio(label="Upload Audio", type="filepath")
                vc_pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1,
                                     label="Pitch Shift (semitones)")
                vc_btn = gr.Button("Convert", variant="primary")
            with gr.Column():
                vc_output = gr.Audio(label="Result", type="filepath")
                vc_status = gr.Textbox(label="Status")
        vc_btn.click(
            fn=cloner.process_audio,
            inputs=[vc_input, vc_pitch],
            outputs=[vc_output, vc_status]
        )

    with gr.Tab("Random Sample"):
        gr.Markdown("### Random sample + RVC conversion")
        rand_btn = gr.Button("Generate Random", variant="primary")
        rand_output = gr.Audio(label="Result", type="filepath")
        rand_status = gr.Textbox(label="Status")
        rand_btn.click(
            fn=cloner.generate_random,
            inputs=[],
            outputs=[rand_output, rand_status]
        )

    with gr.Tab("About"):
        model_status = "Loaded" if cloner.model_loaded else "Not loaded"
        hifigan_status = "HiFi-GAN" if cloner.hifigan else "Griffin-Lim (fallback)"
        gr.Markdown(f"""
        ### NumberBlocks One Voice Cloner V3 (librosa-free)

        - **Model**: RVC v3.0 (VITS-like, 5.3M params, 60.7MB)
        - **Vocoder**: {hifigan_status}
        - **Sample Rate**: 40kHz
        - **Model Status**: {model_status}
        - **Training Data**: 100 source files -> 1,334 chunks, 500 steps
        - **Dataset**: [ayf3/numberblocks-one-voice-dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)

        **Features**:
        - Upload audio -> One voice conversion
        - Random sample generation
        - Pitch adjustment (-12 to +12 semitones)
        - HiFi-GAN high quality vocoder

        **Limitations**:
        - CPU inference (slow)
        - Input recommended < 10 seconds
        """)

demo.launch(server_name="0.0.0.0", server_port=7860)