Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| NumberBlocks One Voice Cloner - RVC Inference Service | |
| Uses the trained RVC v2 model for voice conversion. | |
| """ | |
| import os, json, subprocess, sys | |
| import gradio as gr | |
| import numpy as np | |
| import struct | |
| # Install RVC on first run | |
def setup_rvc(rvc_dir="/app/RVC"):
    """Clone the RVC repository on first run and return its directory.

    The clone is best-effort: a failed, timed-out, or impossible clone (for
    example when ``git`` is not installed) is reported on stderr instead of
    raising, and the target path is returned either way.

    Args:
        rvc_dir: Directory the repository is cloned into.

    Returns:
        ``rvc_dir`` unchanged.
    """
    # A ``.git`` entry marks an existing checkout, so the clone runs only once.
    if os.path.exists(os.path.join(rvc_dir, ".git")):
        return rvc_dir

    try:
        result = subprocess.run(
            [
                "git", "clone", "--depth", "1",
                "https://github.com/RVC-Project/Retrieval-based-Voice-Conversion.git",
                rvc_dir,
            ],
            check=False,
            timeout=300,
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        # check=False does not cover a missing git binary or the timeout.
        print(f"setup_rvc: git clone did not complete: {exc}", file=sys.stderr)
    else:
        if result.returncode != 0:
            print(
                f"setup_rvc: git clone exited with status {result.returncode}",
                file=sys.stderr,
            )
    return rvc_dir
def download_model():
    """Fetch the trained checkpoint from the Hugging Face dataset repo.

    Returns:
        The value returned by ``hf_hub_download`` for the checkpoint file.
    """
    # Imported inside the function, so the hub client is only loaded on call.
    from huggingface_hub import hf_hub_download

    return hf_hub_download(
        repo_type="dataset",
        repo_id="ayf3/numberblocks-one-voice-dataset",
        filename="models/one_voice_rvc_v2.pth",
    )
def load_model():
    """Download the checkpoint, rebuild the network, and load its weights.

    The checkpoint is read for two entries: ``config`` (providing ``n_mels``
    and ``hidden_dim``) and ``model_state_dict``. The network is an
    encoder -> posterior -> flow -> decoder stack operating on tensors laid
    out as (batch, channels, frames).

    Weights are loaded non-strictly so a partially matching checkpoint still
    loads; any missing or unexpected keys are reported on stderr so that a
    mismatched checkpoint does not silently leave layers randomly initialised.

    Returns:
        A ``(model, cfg)`` tuple: the network in eval mode and the
        checkpoint's config mapping.
    """
    import torch
    import torch.nn as nn

    model_file = download_model()
    # SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary
    # objects, which can execute code. This is only acceptable while the
    # dataset repository is trusted; prefer weights_only=True if the
    # checkpoint contains nothing but tensors and plain Python containers.
    ckpt = torch.load(model_file, map_location="cpu", weights_only=False)
    cfg = ckpt['config']
    sd = ckpt['model_state_dict']
    n_mels, hd = cfg['n_mels'], cfg['hidden_dim']

    # NOTE: attribute names below are state-dict keys; do not rename them.

    class Encoder(nn.Module):
        """Five Conv1d+BatchNorm+ReLU stages, then LayerNorm over channels."""

        def __init__(self):
            super().__init__()
            self.conv1, self.bn1 = nn.Conv1d(n_mels, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv2, self.bn2 = nn.Conv1d(hd, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv3, self.bn3 = nn.Conv1d(hd, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv4, self.bn4 = nn.Conv1d(hd, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv5, self.bn5 = nn.Conv1d(hd * 2, hd * 2, 3, padding=1), nn.BatchNorm1d(hd * 2)
            self.ln = nn.LayerNorm(hd * 2)

        def forward(self, x):
            stages = (
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
                (self.conv4, self.bn4),
                (self.conv5, self.bn5),
            )
            for conv, norm in stages:
                x = torch.relu(norm(conv(x)))
            # LayerNorm normalises the last dim, so move channels there and back.
            return self.ln(x.transpose(1, 2)).transpose(1, 2)

    class Posterior(nn.Module):
        """1x1 conv producing 384 channels, split into mean and log-scale."""

        def __init__(self):
            super().__init__()
            self.conv = nn.Conv1d(hd * 2, 384, 1)

        def forward(self, x):
            stats = self.conv(x)
            m, logs = torch.split(stats, 192, dim=1)
            # Noise is drawn on every call (eval mode included), so outputs
            # differ between runs on identical input.
            z = m + torch.randn_like(m) * torch.exp(logs)
            return z, m, logs

    class Flow(nn.Module):
        """Adds a 1x1-conv transform of the first 96 channels to all 192."""

        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(96, hd, 1), nn.ReLU(),
                nn.Conv1d(hd, hd, 1), nn.ReLU(),
                nn.Conv1d(hd, 192, 1),
            )

        def forward(self, z):
            # Unpacking into exactly two halves keeps the 192-channel check.
            z1, _ = torch.split(z, 96, dim=1)
            return z + self.net(z1)

    class Decoder(nn.Module):
        """Four Conv1d+BatchNorm+ReLU stages and a 1x1 conv to 128 channels."""

        def __init__(self):
            super().__init__()
            self.conv1, self.bn1 = nn.Conv1d(192, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv2, self.bn2 = nn.Conv1d(hd * 2, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv3, self.bn3 = nn.Conv1d(hd * 2, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv4, self.bn4 = nn.Conv1d(hd, hd, 3, padding=1), nn.BatchNorm1d(hd)
            # Output width is fixed at 128 channels, independent of n_mels.
            self.conv5 = nn.Conv1d(hd, 128, 1)

        def forward(self, z):
            stages = (
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
                (self.conv4, self.bn4),
            )
            for conv, norm in stages:
                z = torch.relu(norm(conv(z)))
            return self.conv5(z)

    class VoiceModel(nn.Module):
        """Full pipeline: encoder -> posterior -> flow -> decoder."""

        def __init__(self):
            super().__init__()
            self.encoder = Encoder()
            self.posterior = Posterior()
            self.flow = Flow()
            self.decoder = Decoder()

        def forward(self, mel):
            h = self.encoder(mel)
            z, m, logs = self.posterior(h)
            z = self.flow(z)
            return self.decoder(z), m, logs

    net = VoiceModel()
    # strict=False tolerates key mismatches; surface them instead of hiding them.
    load_report = net.load_state_dict(sd, strict=False)
    if load_report.missing_keys or load_report.unexpected_keys:
        print(
            "load_model: checkpoint does not fully match the network "
            f"(missing keys: {list(load_report.missing_keys)}, "
            f"unexpected keys: {list(load_report.unexpected_keys)})",
            file=sys.stderr,
        )
    net.eval()
    return net, cfg
# Module-level model and config, populated on first use by init().
model = None
config = None


def init():
    """Load the model and its config once; later calls do nothing."""
    global model, config
    if model is not None:
        return
    model, config = load_model()
def mel_to_audio_simple(mel_np, sr=40000, hop=256, win=1024):
    """Turn a mel-like array into a crude amplitude-envelope signal.

    This is not a vocoder: for each frame, the mean of ``exp`` over the first
    64 rows (values clipped to [-10, 10]) is added as a constant over that
    frame's window, and the result is peak-normalised to 0.5.

    Args:
        mel_np: 2-D array indexed as (bins, frames).
        sr: Accepted for interface compatibility; not used in the computation.
        hop: Samples between consecutive frame starts.
        win: Samples covered by each frame's window.

    Returns:
        1-D float array of length ``frames * hop``. An empty array is returned
        when there are no frames (or ``hop`` is 0).
    """
    n_frames = mel_np.shape[1]
    audio = np.zeros(n_frames * hop)
    if audio.size == 0:
        # np.max below would raise on an empty buffer.
        return audio

    # Per-frame energy, computed for all frames in one pass.
    energies = np.exp(np.clip(mel_np[:64], -10, 10)).mean(axis=0)

    total = audio.size
    for i, energy in enumerate(energies):
        start = i * hop
        # Truncate windows that overrun the buffer instead of dropping the
        # frame, so tail frames and very short clips still contribute.
        stop = min(start + win, total)
        audio[start:stop] += energy * 0.01

    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 0.5
    return audio
def convert_voice(audio_input, transpose=0):
    """Convert input audio to One's voice.

    Args:
        audio_input: ``(sample_rate, samples)`` pair, or ``None`` when nothing
            was provided. ``samples`` may be 1-D or 2-D; a 2-D array is
            averaged over axis 1 (assumed to be the channel axis).
        transpose: Number of mel bins to roll the normalised spectrogram by.
            NOTE(review): the UI labels this "semitones", but the code shifts
            mel bins (with wrap-around), which is not a semitone shift.

    Returns:
        ``((sample_rate, audio), status_message)`` on success, or
        ``(None, status_message)`` when there is no audio to convert.
    """
    # Reject missing/empty input before paying for the model load.
    if audio_input is None:
        return None, "❌ 请上传音频文件"
    sr_in, data = audio_input[0], audio_input[1]
    data = np.asarray(data)
    if data.size == 0:
        return None, "❌ 请上传音频文件"

    init()
    import torch
    import librosa

    sr = config['sample_rate']
    hop = config['hop_length']

    # Downmix multi-channel audio to mono.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Measure the input duration at its own rate, before any resampling
    # changes the sample count.
    in_seconds = len(data) / sr_in

    # Resample if needed (index-picking; no interpolation or filtering).
    if sr_in != sr:
        ratio = sr / sr_in
        n_samples = max(1, int(len(data) * ratio))
        indices = np.linspace(0, len(data) - 1, n_samples).astype(int)
        data = data[indices]

    # Compute mel spectrogram, in dB relative to its peak, then z-normalise.
    mel = librosa.feature.melspectrogram(
        y=data.astype(np.float32),
        sr=sr,
        n_mels=config['n_mels'],
        hop_length=hop,
        win_length=config['win_length'],
        n_fft=config['n_fft'],
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_norm = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-8)

    # Apply the bin shift if requested. The value may arrive as a float from
    # the slider, and np.roll needs an integer shift.
    shift = int(transpose or 0)
    if shift != 0:
        mel_norm = np.roll(mel_norm, shift, axis=0)

    # Run through model.
    with torch.no_grad():
        mel_tensor = torch.FloatTensor(mel_norm).unsqueeze(0)
        mel_out, _, _ = model(mel_tensor)
        # squeeze(0) drops only the batch axis, so a single-frame result
        # stays 2-D for mel_to_audio_simple.
        mel_out_np = mel_out.squeeze(0).numpy()

    audio_out = mel_to_audio_simple(mel_out_np, sr, hop)
    return (sr, audio_out), f"✅ 转换完成! 输入: {in_seconds:.1f}s → 输出: {len(audio_out)/sr:.1f}s"
def generate_sample():
    """Generate a sample by decoding random latents.

    Draws Gaussian noise (scaled by 0.5) with 192 channels, passes it through
    the model's flow and decoder, and renders the result with
    ``mel_to_audio_simple`` using the checkpoint's sample rate and hop length,
    matching what ``convert_voice`` does.

    Returns:
        ``((sample_rate, audio), status_message)``.
    """
    init()
    import torch

    sr = config['sample_rate']
    hop = config['hop_length']
    n_frames = 400  # ~2.5s at hop 256 / 40 kHz

    with torch.no_grad():
        z = torch.randn(1, 192, n_frames) * 0.5
        z = model.flow(z)
        mel_out = model.decoder(z)
        # Drop only the batch axis so the array stays (bins, frames).
        mel_np = mel_out.squeeze(0).numpy()

    # Use the configured hop so the audio length matches the returned rate.
    audio = mel_to_audio_simple(mel_np, sr, hop)
    return (sr, audio), "✅ 生成完成! (随机采样模式)"
# Static markup for the page header and the About tab.
_HEADER_HTML = """
<div style="text-align:center; margin-bottom:1rem">
<h1 style="color:#ff6b6b">🎙️ NumberBlocks One 语音克隆</h1>
<p>RVC v2 Model — Voice Conversion & Generation</p>
</div>
"""

_ABOUT_MARKDOWN = """
### Model Info
- **Architecture**: VITS-like (Encoder + Posterior + Flow + Decoder)
- **Parameters**: 5,296,064 (5.3M)
- **Sample Rate**: 40kHz
- **Training Data**: 100 source files, 1,334 chunks
- **Training Steps**: 500
- **Final Loss**: 0.0009
### Links
- [Dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
- [Training Space](https://huggingface.co/spaces/ayf3/rvc-cpu-trainer)
⚠️ Note: Audio quality is limited without a neural vocoder (HiFi-GAN).
"""

# Build the Gradio UI: a header plus three tabs (conversion, sampling, about).
with gr.Blocks(title="🎙️ NumberBlocks One Voice", theme=gr.themes.Soft()) as demo:
    gr.HTML(_HEADER_HTML)

    # Tab 1: run uploaded or recorded audio through convert_voice.
    with gr.Tab("🔊 Voice Conversion"):
        gr.Markdown("上传音频,将其转换为 One 的声音")
        audio_in = gr.Audio(label="输入音频", sources=["upload", "microphone"])
        pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift (semitones)")
        convert_btn = gr.Button("🔄 转换", variant="primary")
        audio_out = gr.Audio(label="输出音频")
        status = gr.Textbox(label="状态")
        convert_btn.click(
            fn=convert_voice,
            inputs=[audio_in, pitch],
            outputs=[audio_out, status],
        )

    # Tab 2: decode random latents via generate_sample (takes no inputs).
    with gr.Tab("🎵 Sample Generation"):
        gr.Markdown("生成 One 的随机语音样本")
        gen_btn = gr.Button("🎵 生成样本", variant="primary")
        gen_out = gr.Audio(label="生成音频")
        gen_status = gr.Textbox(label="状态")
        gen_btn.click(fn=generate_sample, outputs=[gen_out, gen_status])

    # Tab 3: static model information.
    with gr.Tab("ℹ️ About"):
        gr.Markdown(_ABOUT_MARKDOWN)

# Listen on all interfaces at port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)