Numberblocks1Voice

Sleeping

App Files Files Community

ayf3 committed on Apr 1

Commit

4428f5f

verified ·

1 Parent(s): ad8ecfd

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +238 -0

app.py ADDED Viewed

	@@ -0,0 +1,238 @@

+#!/usr/bin/env python3
+"""
+NumberBlocks One Voice Cloner - RVC Inference Service
+Uses the trained RVC v2 model for voice conversion.
+"""
+import os, json, subprocess, sys
+import gradio as gr
+import numpy as np
+import struct
+# Best-effort shallow checkout of the upstream RVC sources (skipped when already present)
+def setup_rvc():
+    """Make sure a shallow clone of the RVC repository exists and return its path.
+
+    The clone is best-effort: a failing, timed-out or impossible ``git clone``
+    is reported on stderr instead of raising, so the target directory is
+    always returned and the caller must check its contents before using it.
+
+    Returns:
+        The checkout directory, ``/app/RVC``.
+    """
+    rvc_dir = "/app/RVC"
+    # A ".git" entry is taken as proof that a previous clone already ran.
+    if os.path.exists(os.path.join(rvc_dir, ".git")):
+        return rvc_dir
+    cmd = ["git", "clone", "--depth", "1",
+           "https://github.com/RVC-Project/Retrieval-based-Voice-Conversion.git",
+           rvc_dir]
+    try:
+        result = subprocess.run(cmd, check=False, timeout=300)
+    except (OSError, subprocess.TimeoutExpired) as err:
+        # git missing from PATH, or the clone exceeded the 5 minute budget.
+        print(f"[setup_rvc] git clone did not complete: {err}", file=sys.stderr)
+    else:
+        if result.returncode != 0:
+            print(f"[setup_rvc] git clone exited with status {result.returncode}",
+                  file=sys.stderr)
+    return rvc_dir
+def download_model():
+    """Fetch the trained checkpoint from the Hugging Face dataset repo.
+
+    Returns:
+        Whatever ``hf_hub_download`` returns for the requested file
+        (the original code treats it as the local model path).
+    """
+    from huggingface_hub import hf_hub_download
+    source = {
+        "repo_id": "ayf3/numberblocks-one-voice-dataset",
+        "filename": "models/one_voice_rvc_v2.pth",
+        "repo_type": "dataset",
+    }
+    return hf_hub_download(**source)
+def load_model():
+    """Download the checkpoint, rebuild the network and load its weights.
+
+    The checkpoint must provide ``config`` (with ``n_mels`` and ``hidden_dim``)
+    and ``model_state_dict``.  The attribute names of the modules defined
+    below are the state-dict keys, so they must not be renamed.
+
+    Returns:
+        ``(model, cfg)``: the network switched to eval mode and the config
+        dictionary stored in the checkpoint.
+
+    Raises:
+        KeyError: if the checkpoint lacks one of the expected entries.
+    """
+    import torch
+    import torch.nn as nn
+    model_file = download_model()
+    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
+    # Acceptable only while the checkpoint comes from a repository we trust.
+    ckpt = torch.load(model_file, map_location="cpu", weights_only=False)
+    cfg = ckpt['config']
+    sd = ckpt['model_state_dict']
+    n_mels, hd = cfg['n_mels'], cfg['hidden_dim']
+    latent = 192        # latent channels shared by posterior, flow and decoder
+    half = latent // 2  # channels fed to the flow's conditioning network
+    class Encoder(nn.Module):
+        """Five Conv1d+BatchNorm+ReLU stages followed by a channel LayerNorm."""
+        def __init__(self):
+            super().__init__()
+            self.conv1, self.bn1 = nn.Conv1d(n_mels,hd,5,padding=2), nn.BatchNorm1d(hd)
+            self.conv2, self.bn2 = nn.Conv1d(hd,hd,5,padding=2), nn.BatchNorm1d(hd)
+            self.conv3, self.bn3 = nn.Conv1d(hd,hd,5,padding=2), nn.BatchNorm1d(hd)
+            self.conv4, self.bn4 = nn.Conv1d(hd,hd*2,5,padding=2), nn.BatchNorm1d(hd*2)
+            self.conv5, self.bn5 = nn.Conv1d(hd*2,hd*2,3,padding=1), nn.BatchNorm1d(hd*2)
+            self.ln = nn.LayerNorm(hd*2)
+        def forward(self, x):
+            stages = [(self.conv1,self.bn1),(self.conv2,self.bn2),(self.conv3,self.bn3),
+                      (self.conv4,self.bn4),(self.conv5,self.bn5)]
+            for c,b in stages:
+                x = torch.relu(b(c(x)))
+            # LayerNorm normalises the last axis, so move channels there and back.
+            return self.ln(x.transpose(1,2)).transpose(1,2)
+    class Posterior(nn.Module):
+        """1x1 conv producing mean and log-scale, then a reparameterised sample."""
+        def __init__(self):
+            super().__init__()
+            self.conv = nn.Conv1d(hd*2, latent*2, 1)
+        def forward(self, x):
+            stats = self.conv(x)
+            m, logs = torch.split(stats, latent, dim=1)
+            # Noise is drawn on every call, so outputs differ between runs.
+            z = m + torch.randn_like(m)*torch.exp(logs)
+            return z, m, logs
+    class Flow(nn.Module):
+        """Adds a correction computed from the first half of the latent channels."""
+        def __init__(self):
+            super().__init__()
+            self.net = nn.Sequential(
+                nn.Conv1d(half,hd,1), nn.ReLU(),
+                nn.Conv1d(hd,hd,1), nn.ReLU(),
+                nn.Conv1d(hd,latent,1),
+            )
+        def forward(self, z):
+            # Only the first half conditions the update; the unpacking still
+            # enforces that z has exactly two halves.
+            z1, _ = torch.split(z, half, dim=1)
+            return z + self.net(z1)
+    class Decoder(nn.Module):
+        """Four Conv1d+BatchNorm+ReLU stages and a 1x1 projection to 128 channels."""
+        def __init__(self):
+            super().__init__()
+            self.conv1, self.bn1 = nn.Conv1d(latent,hd*2,5,padding=2), nn.BatchNorm1d(hd*2)
+            self.conv2, self.bn2 = nn.Conv1d(hd*2,hd*2,5,padding=2), nn.BatchNorm1d(hd*2)
+            self.conv3, self.bn3 = nn.Conv1d(hd*2,hd,5,padding=2), nn.BatchNorm1d(hd)
+            self.conv4, self.bn4 = nn.Conv1d(hd,hd,3,padding=1), nn.BatchNorm1d(hd)
+            # NOTE(review): output width is fixed at 128 rather than n_mels,
+            # presumably to match the trained checkpoint - confirm before changing.
+            self.conv5 = nn.Conv1d(hd,128,1)
+        def forward(self, z):
+            stages = [(self.conv1,self.bn1),(self.conv2,self.bn2),
+                      (self.conv3,self.bn3),(self.conv4,self.bn4)]
+            for c,b in stages:
+                z = torch.relu(b(c(z)))
+            return self.conv5(z)
+    class VoiceModel(nn.Module):
+        """Encoder -> posterior sample -> flow -> decoder."""
+        def __init__(self):
+            super().__init__()
+            self.encoder, self.posterior, self.flow, self.decoder = Encoder(), Posterior(), Flow(), Decoder()
+        def forward(self, mel):
+            h = self.encoder(mel)
+            z, m, logs = self.posterior(h)
+            z = self.flow(z)
+            return self.decoder(z), m, logs
+    model = VoiceModel()
+    # strict=False tolerates key mismatches; report them, because a silently
+    # skipped key leaves that layer at its random initialisation.
+    load_result = model.load_state_dict(sd, strict=False)
+    if load_result.missing_keys or load_result.unexpected_keys:
+        print(
+            f"[load_model] checkpoint/model key mismatch - "
+            f"missing: {list(load_result.missing_keys)}, "
+            f"unexpected: {list(load_result.unexpected_keys)}",
+            file=sys.stderr,
+        )
+    model.eval()
+    return model, cfg
+# Module-level cache: both names stay None until init() has loaded the model.
+model = None
+config = None
+def init():
+    """Load the model and its config into the module globals on first use."""
+    global model, config
+    if model is not None:
+        # Already loaded by an earlier call; nothing to do.
+        return
+    loaded = load_model()
+    model, config = loaded
+def mel_to_audio_simple(mel_np, sr=40000, hop=256, win=1024):
+    """Turn a mel-like array into a crude energy-envelope signal.
+
+    This is not a vocoder: every frame adds a constant, proportional to the
+    mean of ``exp`` over its first 64 bins (clipped to [-10, 10]), to the
+    ``win`` samples starting at ``frame * hop``.  The result is scaled so its
+    peak magnitude is 0.5.
+
+    Args:
+        mel_np: array of shape ``(bins, frames)``; a 1-D array is treated as
+            a single frame.
+        sr: sample rate; accepted for interface compatibility, not used.
+        hop: samples between consecutive frame starts.
+        win: samples covered by each frame's contribution.
+
+    Returns:
+        float64 array of ``frames * hop`` samples (empty for zero frames).
+    """
+    mel_np = np.asarray(mel_np)
+    if mel_np.ndim == 1:
+        # A squeezed single-frame spectrogram arrives as (bins,).
+        mel_np = mel_np[:, np.newaxis]
+    n_frames = mel_np.shape[1]
+    audio = np.zeros(n_frames * hop)
+    if n_frames == 0:
+        return audio
+    # Per-frame energy, computed for all frames at once.
+    energies = np.exp(np.clip(mel_np[:64], -10, 10)).mean(axis=0) * 0.01
+    for i, energy in enumerate(energies):
+        start = i * hop
+        # Slicing clamps at the end of the buffer, so the trailing frames
+        # (whose window overruns it) still contribute instead of being skipped.
+        audio[start:start + win] += energy
+    peak = np.max(np.abs(audio))
+    if peak > 0:
+        audio = audio / peak * 0.5
+    return audio
+def convert_voice(audio_input, transpose=0):
+    """Run uploaded audio through the model and return the rendered result.
+
+    Args:
+        audio_input: ``(sample_rate, samples)`` pair, or ``None`` when
+            nothing was provided.  Samples with more than one dimension are
+            averaged over axis 1.
+        transpose: number of mel bins to rotate the spectrogram by (the UI
+            labels it as semitones); coerced to ``int``.
+
+    Returns:
+        ``((sample_rate, audio), status)`` on success, or ``(None, message)``
+        when there is no audio to process.
+    """
+    # Reject empty requests before paying for the model download/load.
+    if audio_input is None:
+        return None, "❌ 请上传音频文件"
+    sr_in, data = audio_input[0], np.asarray(audio_input[1])
+    if data.size == 0:
+        return None, "❌ 请上传音频文件"
+    init()
+    import torch
+    import librosa
+    sr = config['sample_rate']
+    hop = config['hop_length']
+    # Downmix first so resampling works on a single channel.
+    if data.ndim > 1:
+        data = data.mean(axis=1)
+    data = data.astype(np.float32)
+    # Measure the input before resampling: afterwards len(data) is expressed
+    # in target-rate samples and would misreport the duration.
+    n_in = len(data)
+    seconds_in = n_in / sr_in
+    if sr_in != sr:
+        # Linear interpolation onto the target grid (numpy only, no sox/ffmpeg).
+        n_samples = max(1, int(n_in * sr / sr_in))
+        positions = np.linspace(0, n_in - 1, n_samples)
+        data = np.interp(positions, np.arange(n_in), data).astype(np.float32)
+    # Compute mel spectrogram
+    mel = librosa.feature.melspectrogram(
+        y=data,
+        sr=sr,
+        n_mels=config['n_mels'],
+        hop_length=hop,
+        win_length=config['win_length'],
+        n_fft=config['n_fft'],
+    )
+    mel_db = librosa.power_to_db(mel, ref=np.max)
+    mel_norm = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-8)
+    # The slider may deliver a float, while np.roll needs an integer shift.
+    shift = int(transpose)
+    if shift != 0:
+        # NOTE: rotates mel bins (wrapping around), not a true pitch shift.
+        mel_norm = np.roll(mel_norm, shift, axis=0)
+    # Run through model
+    with torch.no_grad():
+        mel_tensor = torch.as_tensor(mel_norm, dtype=torch.float32).unsqueeze(0)
+        mel_out, _, _ = model(mel_tensor)
+    # Drop only the batch axis so a one-frame output keeps its 2-D shape.
+    mel_out_np = mel_out.squeeze(0).numpy()
+    audio_out = mel_to_audio_simple(mel_out_np, sr, hop)
+    return (sr, audio_out), f"✅ 转换完成! 输入: {seconds_in:.1f}s → 输出: {len(audio_out)/sr:.1f}s"
+def generate_sample():
+    """Decode random latent noise into a sample clip.
+
+    Returns:
+        ``((sample_rate, audio), status)`` where ``sample_rate`` comes from
+        the loaded config.
+    """
+    init()
+    import torch
+    sr = config['sample_rate']
+    hop = config['hop_length']
+    n_frames = 400  # 400 * hop samples, i.e. ~2.5s at hop 256 / 40 kHz
+    with torch.no_grad():
+        z = torch.randn(1, 192, n_frames) * 0.5
+        z = model.flow(z)
+        mel_out = model.decoder(z)
+    # Drop only the batch axis; the result stays (channels, frames).
+    mel_np = mel_out.squeeze(0).numpy()
+    # Render with the configured rate and hop, the same way convert_voice
+    # does, so the clip length matches the sample rate returned below.
+    audio = mel_to_audio_simple(mel_np, sr, hop)
+    return (sr, audio), "✅ 生成完成! (随机采样模式)"
+# Gradio UI: three tabs (conversion, random sample generation, about) built at
+# import time and bound to the `demo` name.
+with gr.Blocks(title="🎙️ NumberBlocks One Voice", theme=gr.themes.Soft()) as demo:
+    # Static page header.
+    gr.HTML("""
+    <div style="text-align:center; margin-bottom:1rem">
+        <h1 style="color:#ff6b6b">🎙️ NumberBlocks One 语音克隆</h1>
+        <p>RVC v2 Model — Voice Conversion & Generation</p>
+    </div>
+    """)
+    # Tab 1: audio + slider value in, converted audio + status text out.
+    with gr.Tab("🔊 Voice Conversion"):
+        gr.Markdown("上传音频，将其转换为 One 的声音")
+        audio_in = gr.Audio(label="输入音频", sources=["upload", "microphone"])
+        pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift (semitones)")
+        convert_btn = gr.Button("🔄 转换", variant="primary")
+        audio_out = gr.Audio(label="输出音频")
+        status = gr.Textbox(label="状态")
+        # convert_voice(audio_input, transpose) -> (audio, status message)
+        convert_btn.click(convert_voice, [audio_in, pitch], [audio_out, status])
+    # Tab 2: no inputs; generate_sample() -> (audio, status message).
+    with gr.Tab("🎵 Sample Generation"):
+        gr.Markdown("生成 One 的随机语音样本")
+        gen_btn = gr.Button("🎵 生成样本", variant="primary")
+        gen_out = gr.Audio(label="生成音频")
+        gen_status = gr.Textbox(label="状态")
+        gen_btn.click(generate_sample, outputs=[gen_out, gen_status])
+    # Tab 3: static model description and links.
+    with gr.Tab("ℹ️ About"):
+        gr.Markdown("""
+        ### Model Info
+        - **Architecture**: VITS-like (Encoder + Posterior + Flow + Decoder)
+        - **Parameters**: 5,296,064 (5.3M)
+        - **Sample Rate**: 40kHz
+        - **Training Data**: 100 source files, 1,334 chunks
+        - **Training Steps**: 500
+        - **Final Loss**: 0.0009
+        ### Links
+        - [Dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
+        - [Training Space](https://huggingface.co/spaces/ayf3/rvc-cpu-trainer)
+        ⚠️ Note: Audio quality is limited without a neural vocoder (HiFi-GAN).
+        """)
+# Runs at import time (no __main__ guard): serve on all interfaces, port 7860.
+demo.launch(server_name="0.0.0.0", server_port=7860)