#!/usr/bin/env python3
"""
NumberBlocks One Voice Cloner - RVC Inference Service
Uses the trained RVC v2 model for voice conversion.
"""
import json
import os
import struct
import subprocess
import sys

import gradio as gr
import numpy as np


# Install RVC on first run
def setup_rvc():
    """Clone the RVC repository into ``/app/RVC`` unless a checkout exists.

    The clone is best-effort: a failing ``git`` exit status is ignored
    (``check=False``), and a missing ``git`` binary or a timeout is reported
    on stderr instead of raised.

    Returns:
        The target directory path, whether or not the clone succeeded.
    """
    rvc_dir = "/app/RVC"
    if not os.path.exists(os.path.join(rvc_dir, ".git")):
        try:
            subprocess.run(
                [
                    "git", "clone", "--depth", "1",
                    "https://github.com/RVC-Project/Retrieval-based-Voice-Conversion.git",
                    rvc_dir,
                ],
                check=False,
                timeout=300,
            )
        except (OSError, subprocess.TimeoutExpired) as err:
            print(f"[setup_rvc] git clone failed: {err}", file=sys.stderr)
    return rvc_dir


def download_model():
    """Download the trained checkpoint from the Hugging Face dataset repo.

    Returns:
        Local filesystem path of the downloaded ``.pth`` file, as returned by
        ``hf_hub_download``.
    """
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(
        repo_id="ayf3/numberblocks-one-voice-dataset",
        filename="models/one_voice_rvc_v2.pth",
        repo_type="dataset",
    )
    return model_path


def load_model():
    """Download the checkpoint, rebuild the network and load its weights.

    The checkpoint is read as a dict with a ``'config'`` entry (which must
    provide ``n_mels`` and ``hidden_dim``) and a ``'model_state_dict'`` entry.

    Returns:
        ``(model, cfg)``: the network in eval mode and the checkpoint config.
    """
    import torch
    import torch.nn as nn

    model_file = download_model()
    # SECURITY NOTE: weights_only=False unpickles arbitrary objects, so the
    # checkpoint source must be trusted. Kept as-is because it is not visible
    # from here whether the checkpoint loads under weights_only=True.
    ckpt = torch.load(model_file, map_location="cpu", weights_only=False)
    cfg = ckpt['config']
    sd = ckpt['model_state_dict']
    n_mels, hd = cfg['n_mels'], cfg['hidden_dim']

    # NOTE: attribute names below (conv1, bn1, net, ...) form the state_dict
    # keys; renaming any of them would stop the weights from loading.

    class Encoder(nn.Module):
        """Five Conv1d+BatchNorm+ReLU stages followed by a channel LayerNorm."""

        def __init__(self):
            super().__init__()
            self.conv1, self.bn1 = nn.Conv1d(n_mels, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv2, self.bn2 = nn.Conv1d(hd, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv3, self.bn3 = nn.Conv1d(hd, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv4, self.bn4 = nn.Conv1d(hd, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv5, self.bn5 = nn.Conv1d(hd * 2, hd * 2, 3, padding=1), nn.BatchNorm1d(hd * 2)
            self.ln = nn.LayerNorm(hd * 2)

        def forward(self, x):
            stages = [
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
                (self.conv4, self.bn4),
                (self.conv5, self.bn5),
            ]
            for conv, bn in stages:
                x = torch.relu(bn(conv(x)))
            # LayerNorm normalizes the last axis, so move channels there and back.
            return self.ln(x.transpose(1, 2)).transpose(1, 2)

    class Posterior(nn.Module):
        """Project encoder features to mean/log-scale and draw a latent sample."""

        def __init__(self):
            super().__init__()
            self.conv = nn.Conv1d(hd * 2, 384, 1)

        def forward(self, x):
            stats = self.conv(x)
            # 384 channels split into 192 for the mean and 192 for the log-scale.
            m, logs = torch.split(stats, 192, dim=1)
            # Sampling stays random in eval mode, so outputs vary between calls.
            z = m + torch.randn_like(m) * torch.exp(logs)
            return z, m, logs

    class Flow(nn.Module):
        """Residual update of the latent computed from its first 96 channels."""

        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(96, hd, 1),
                nn.ReLU(),
                nn.Conv1d(hd, hd, 1),
                nn.ReLU(),
                nn.Conv1d(hd, 192, 1),
            )

        def forward(self, z):
            # Only the first half feeds the network; the two-way unpack also
            # fails loudly if z does not have exactly 192 channels.
            z1, _ = torch.split(z, 96, dim=1)
            return z + self.net(z1)

    class Decoder(nn.Module):
        """Map the 192-channel latent back to a 128-channel mel-like output."""

        def __init__(self):
            super().__init__()
            self.conv1, self.bn1 = nn.Conv1d(192, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv2, self.bn2 = nn.Conv1d(hd * 2, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv3, self.bn3 = nn.Conv1d(hd * 2, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv4, self.bn4 = nn.Conv1d(hd, hd, 3, padding=1), nn.BatchNorm1d(hd)
            # Output width is fixed at 128 here, independent of cfg['n_mels'].
            self.conv5 = nn.Conv1d(hd, 128, 1)

        def forward(self, z):
            stages = [
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
                (self.conv4, self.bn4),
            ]
            for conv, bn in stages:
                z = torch.relu(bn(conv(z)))
            return self.conv5(z)

    class VoiceModel(nn.Module):
        """Encoder -> Posterior -> Flow -> Decoder pipeline."""

        def __init__(self):
            super().__init__()
            self.encoder, self.posterior, self.flow, self.decoder = (
                Encoder(), Posterior(), Flow(), Decoder()
            )

        def forward(self, mel):
            h = self.encoder(mel)
            z, m, logs = self.posterior(h)
            z = self.flow(z)
            return self.decoder(z), m, logs

    model = VoiceModel()
    # strict=False tolerates key mismatches; report them so a partially
    # initialized model does not go unnoticed.
    load_result = model.load_state_dict(sd, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            "[load_model] state_dict mismatch: "
            f"missing={list(load_result.missing_keys)}, "
            f"unexpected={list(load_result.unexpected_keys)}",
            file=sys.stderr,
        )
    model.eval()
    return model, cfg


# Global model
model = None
config = None


def init():
    """Load the model and config into the module globals on first use."""
    global model, config
    if model is None:
        model, config = load_model()


def mel_to_audio_simple(mel_np, sr=40000, hop=256, win=1024):
    """Turn a mel-like array into a crude overlap-added energy envelope.

    This is not a vocoder: each frame contributes a constant proportional to
    the mean exponentiated value of its first 64 rows, added over a window of
    ``win`` samples starting at ``frame_index * hop``.

    Args:
        mel_np: 2-D array indexed as ``[bin, frame]``.
        sr: Accepted for call compatibility; not used by the synthesis.
        hop: Samples between consecutive frame starts.
        win: Samples covered by each frame's contribution.

    Returns:
        1-D float array of ``n_frames * hop`` samples, peak-normalized to 0.5
        (left as zeros when the signal is all zero).
    """
    mel_np = np.asarray(mel_np)
    n_frames = mel_np.shape[1]
    audio = np.zeros(n_frames * hop)
    if audio.size == 0:
        # Nothing to synthesize; np.max below would fail on an empty array.
        return audio

    # Per-frame energy over the first 64 bins; the clip keeps exp() bounded.
    energies = np.mean(np.exp(np.clip(mel_np[:64], -10, 10)), axis=0)
    for i, energy in enumerate(energies):
        start = i * hop
        # Clamp the window so trailing frames still contribute instead of
        # being dropped when the window would run past the buffer end.
        end = min(start + win, len(audio))
        audio[start:end] += energy * 0.01

    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 0.5
    return audio


def convert_voice(audio_input, transpose=0):
    """Convert input audio to One's voice.

    Args:
        audio_input: ``(sample_rate, samples)`` pair, or ``None`` when nothing
            was provided. Samples with more than one dimension are averaged
            over axis 1 to get a mono signal.
        transpose: Number of mel bins to roll the normalized spectrogram by.
            NOTE(review): the UI labels this as semitones, but the code shifts
            mel bins (with wrap-around), which is not a true pitch shift.

    Returns:
        ``((sample_rate, audio), status_message)`` on success, or
        ``(None, status_message)`` when no usable audio was provided.
    """
    # Validate first so an empty request does not trigger a model download.
    if audio_input is None:
        return None, "❌ 请上传音频文件"

    sr_in, data = audio_input[0], audio_input[1]
    data = np.asarray(data)
    if data.ndim > 1:
        data = data.mean(axis=1)
    if data.size == 0:
        return None, "❌ 请上传音频文件"

    init()
    import librosa
    import torch

    sr = config['sample_rate']
    hop = config['hop_length']

    data = data.astype(np.float32)
    # Capture the duration before resampling changes the sample count.
    duration_in = len(data) / sr_in

    # Resample if needed (band-limited, instead of nearest-sample index picking)
    if sr_in != sr:
        data = librosa.resample(data, orig_sr=sr_in, target_sr=sr)

    # Compute mel spectrogram
    mel = librosa.feature.melspectrogram(
        y=data,
        sr=sr,
        n_mels=config['n_mels'],
        hop_length=hop,
        win_length=config['win_length'],
        n_fft=config['n_fft'],
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_norm = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-8)

    # Apply pitch shift if requested; np.roll needs an integer shift and the
    # slider may deliver a float.
    shift = int(transpose)
    if shift != 0:
        mel_norm = np.roll(mel_norm, shift, axis=0)

    # Run through model
    with torch.no_grad():
        mel_tensor = torch.as_tensor(mel_norm, dtype=torch.float32).unsqueeze(0)
        mel_out, _, _ = model(mel_tensor)
        # squeeze(0) drops only the batch axis, so a single-frame result keeps
        # its time dimension.
        mel_out_np = mel_out.squeeze(0).numpy()

    audio_out = mel_to_audio_simple(mel_out_np, sr, hop)
    return (sr, audio_out), f"✅ 转换完成! 输入: {duration_in:.1f}s → 输出: {len(audio_out)/sr:.1f}s"


def generate_sample():
    """Generate a sample of One's voice from a random latent.

    Returns:
        ``((sample_rate, audio), status_message)``.
    """
    init()
    import torch

    n_frames = 400  # ~2.5s at 40 kHz with hop 256
    with torch.no_grad():
        z = torch.randn(1, 192, n_frames) * 0.5
        z = model.flow(z)
        mel_out = model.decoder(z)
        mel_np = mel_out.squeeze(0).numpy()

    # Synthesize with the checkpoint's own rate/hop so the audio length matches
    # the sample rate reported to the UI.
    sr = config['sample_rate']
    audio = mel_to_audio_simple(mel_np, sr, config['hop_length'])
    return (sr, audio), "✅ 生成完成! (随机采样模式)"


# Create Gradio UI
with gr.Blocks(title="🎙️ NumberBlocks One Voice", theme=gr.themes.Soft()) as demo:
    gr.HTML("""

🎙️ NumberBlocks One 语音克隆

RVC v2 Model — Voice Conversion & Generation

""")

    with gr.Tab("🔊 Voice Conversion"):
        gr.Markdown("上传音频,将其转换为 One 的声音")
        audio_in = gr.Audio(label="输入音频", sources=["upload", "microphone"])
        pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift (semitones)")
        convert_btn = gr.Button("🔄 转换", variant="primary")
        audio_out = gr.Audio(label="输出音频")
        status = gr.Textbox(label="状态")
        convert_btn.click(convert_voice, [audio_in, pitch], [audio_out, status])

    with gr.Tab("🎵 Sample Generation"):
        gr.Markdown("生成 One 的随机语音样本")
        gen_btn = gr.Button("🎵 生成样本", variant="primary")
        gen_out = gr.Audio(label="生成音频")
        gen_status = gr.Textbox(label="状态")
        gen_btn.click(generate_sample, outputs=[gen_out, gen_status])

    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### Model Info
        - **Architecture**: VITS-like (Encoder + Posterior + Flow + Decoder)
        - **Parameters**: 5,296,064 (5.3M)
        - **Sample Rate**: 40kHz
        - **Training Data**: 100 source files, 1,334 chunks
        - **Training Steps**: 500
        - **Final Loss**: 0.0009

        ### Links
        - [Dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
        - [Training Space](https://huggingface.co/spaces/ayf3/rvc-cpu-trainer)

        ⚠️ Note: Audio quality is limited without a neural vocoder (HiFi-GAN).
        """)

# NOTE: launched at module level (no __main__ guard) to preserve the file's
# existing behavior of starting the server whenever the module is executed.
demo.launch(server_name="0.0.0.0", server_port=7860)