Numberblocks1Voice

Sleeping

App Files Files Community

ayf3 committed on Apr 2

Commit

71ee5ef

verified ·

1 Parent(s): 8a9f5a9

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +500 -283

app.py CHANGED Viewed

@@ -1,332 +1,549 @@
 #!/usr/bin/env python3
 """
-NumberBlocks One Voice Cloner - RVC Inference with HiFi-GAN Vocoder
-Uses the trained RVC v2 model + pretrained HiFi-GAN universal vocoder for high-quality synthesis.
 """
-import os, json
-import gradio as gr
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-# ════════════════════════════════════════════════════════════
-# HiFi-GAN Generator (exact match to pretrained weights)
-# ════════════════════════════════════════════════════════════
-class HiFiGANResBlock(nn.Module):
-    def __init__(self, channels, kernel_size, dilation_sizes):
         super().__init__()
-        # Store padding values for manual padding (original HiFi-GAN doesn't use Conv1d padding)
-        self.paddings1 = []
-        self.convs1 = nn.ModuleList()
-        for d in dilation_sizes:
-            self.convs1.append(nn.utils.weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=0)))
-            self.paddings1.append((kernel_size - 1) * d // 2)
-        self.paddings2 = []
-        self.convs2 = nn.ModuleList()
-        for d in dilation_sizes:
-            self.convs2.append(nn.utils.weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=0)))
-            self.paddings2.append((kernel_size - 1) * d // 2)
     def forward(self, x):
-        for c1, p1, c2, p2 in zip(self.convs1, self.paddings1, self.convs2, self.paddings2):
-            xt = F.leaky_relu(x, 0.1)
-            xt = F.pad(xt, (p1, p1))
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, 0.1)
-            xt = F.pad(xt, (p2, p2))
-            xt = c2(xt)
-            x = xt + x
-        return x
-class HiFiGANGenerator(nn.Module):
-    def __init__(self, config):
         super().__init__()
-        self.num_kernels = len(config["resblock_kernel_sizes"])
-        self.conv_pre = nn.utils.weight_norm(
-            nn.Conv1d(80, config["upsample_initial_channel"], 7, 1, padding=3)
-        )
-        self.ups = nn.ModuleList()
-        for u, k in zip(config["upsample_rates"], config["upsample_kernel_sizes"]):
-            ch_in = config["upsample_initial_channel"] // (2 ** (len(self.ups)))
-            ch_out = ch_in // 2
-            self.ups.append(
-                nn.utils.weight_norm(
-                    nn.ConvTranspose1d(ch_in, ch_out, k, u, padding=(k - u) // 2)
-                )
-            )
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = config["upsample_initial_channel"] // (2 ** (i + 1))
-            for k, d in zip(config["resblock_kernel_sizes"], config["resblock_dilation_sizes"]):
-                self.resblocks.append(HiFiGANResBlock(ch, k, d))
-        ch_out = config["upsample_initial_channel"] // (2 ** len(self.ups))
-        self.conv_post = nn.utils.weight_norm(nn.Conv1d(ch_out, 1, 7, 1, padding=3))
-    def forward(self, mel):
-        x = self.conv_pre(mel)
-        for i, up in enumerate(self.ups):
-            x = F.leaky_relu(x, 0.1)
-            x = up(x)
-            xs = 0
-            for j in range(self.num_kernels):
-                xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x, 0.1)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
         return x
-# ════════════════════════════════════════════════════════════
-# Voice Model (VITS-like from training)
-# ════════════════════════════════════════════════════════════
-class VoiceModel(nn.Module):
-    def __init__(self, n_mels, hd):
         super().__init__()
-        self.encoder = self._build_encoder(n_mels, hd)
-        self.posterior = self._build_posterior(hd)
-        self.flow = self._build_flow(hd)
-        self.decoder = self._build_decoder(hd)
-    def _build_encoder(self, n_mels, hd):
-        layers = []
-        ch_in = n_mels
-        for ch_out, ks in [(hd,5),(hd,5),(hd,5),(hd*2,5),(hd*2,3)]:
-            layers.extend([nn.utils.weight_norm(nn.Conv1d(ch_in, ch_out, ks, padding=ks//2)),
-                           nn.BatchNorm1d(ch_out), nn.ReLU()])
-            ch_in = ch_out
-        layers.append(nn.LayerNorm(hd*2))
-        class Enc(nn.Module):
-            def __init__(self, seq):
-                super().__init__(); self.seq = seq
-            def forward(self, x):
-                x = self.seq[:-1](x)
-                x = self.seq[-1](x.transpose(1,2)).transpose(1,2)
-                return x
-        return Enc(nn.Sequential(*layers))
-    def _build_posterior(self, hd):
-        class Post(nn.Module):
-            def __init__(s):
-                super().__init__(); s.conv = nn.utils.weight_norm(nn.Conv1d(hd*2, 384, 1))
-            def forward(s, x):
-                stats = s.conv(x); m, logs = torch.split(stats, 192, dim=1)
-                z = m + torch.randn_like(m)*torch.exp(logs); return z, m, logs
-        return Post()
-    def _build_flow(self, hd):
-        class Flow(nn.Module):
-            def __init__(s):
-                super().__init__()
-                s.net = nn.Sequential(nn.Conv1d(96,hd,1), nn.ReLU(), nn.Conv1d(hd,hd,1), nn.ReLU(), nn.Conv1d(hd,192,1))
-            def forward(s, z):
-                z1, z2 = torch.split(z, 96, dim=1); return z + s.net(z1)
-        return Flow()
-    def _build_decoder(self, hd):
-        layers = []
-        ch_in = 192
-        for ch_out, ks in [(hd*2,5),(hd*2,5),(hd,5),(hd,3)]:
-            layers.extend([nn.utils.weight_norm(nn.Conv1d(ch_in, ch_out, ks, padding=ks//2)),
-                           nn.BatchNorm1d(ch_out), nn.ReLU()])
-            ch_in = ch_out
-        layers.append(nn.utils.weight_norm(nn.Conv1d(hd, 80, 1)))
-        return nn.Sequential(*layers)
-# ════════════════════════════════════════════════════════════
-# Model Loading
-# ════════════════════════════════════════════════════════════
-def download_file(repo_id, filename, repo_type="dataset"):
-    from huggingface_hub import hf_hub_download
-    return hf_hub_download(repo_id=repo_id, filename=filename, repo_type=repo_type)
-def load_hifigan():
-    cfg_path = download_file("ORI-Muchim/HiFi-GAN_44100hz_universal", "config.json", repo_type="model")
-    weights_path = download_file("ORI-Muchim/HiFi-GAN_44100hz_universal", "g_02500000", repo_type="model")
-    with open(cfg_path) as f:
-        hfg_cfg = json.load(f)
-    vocoder = HiFiGANGenerator(hfg_cfg)
-    ckpt = torch.load(weights_path, map_location="cpu", weights_only=False)
-    vocoder.load_state_dict(ckpt["generator"])
-    vocoder.eval()
-    return vocoder, hfg_cfg
-def load_voice_model():
-    model_file = download_file("ayf3/numberblocks-one-voice-dataset", "models/one_voice_rvc_v2.pth")
-    ckpt = torch.load(model_file, map_location="cpu", weights_only=False)
-    cfg = ckpt['config']
-    sd = ckpt['model_state_dict']
-    model = VoiceModel(cfg['n_mels'], cfg['hidden_dim'])
-    model.load_state_dict(sd, strict=False)
-    model.eval()
-    return model, cfg
-# ═════════════════════════════════════════════════════���══════
-# Audio Processing
-# ════════════════════════════════════════════════════════════
-def mel_spectrogram(audio, sr, n_mels=80, hop_length=256, win_length=1024, n_fft=1024):
-    import librosa
-    mel = librosa.feature.melspectrogram(
-        y=audio.astype(np.float32), sr=sr, n_mels=n_mels,
-        hop_length=hop_length, win_length=win_length, n_fft=n_fft, fmax=8000
-    )
-    mel_db = librosa.power_to_db(mel, ref=np.max)
-    return mel_db
-def mel_to_audio_hifigan(vocoder, mel_tensor):
-    with torch.no_grad():
-        audio = vocoder(mel_tensor)
-    return audio.squeeze().cpu().numpy()
-# ════════════════════════════════════════════════════════════
-# Globals & Init
-# ════════════════════════════════════════════════════════════
-voice_model = None
-voice_config = None
-hifigan = None
-hifigan_config = None
-def init_models():
-    global voice_model, voice_config, hifigan, hifigan_config
-    if voice_model is None:
-        print("Loading voice model...")
-        voice_model, voice_config = load_voice_model()
-        print("Voice model loaded.")
-    if hifigan is None:
-        print("Loading HiFi-GAN vocoder...")
-        hifigan, hifigan_config = load_hifigan()
-        print("HiFi-GAN vocoder loaded.")
-# ════════════════════════════════════════════════════════════
-# Core Functions
-# ════════════════════════════════════════════════════════════
-def convert_voice(audio_input, transpose=0):
-    init_models()
-    import librosa
-    if audio_input is None:
-        return None, "❌ 请上传音频文件"
-    sr_in, data = audio_input[0], audio_input[1]
-    # Resample to 44100 for HiFi-GAN
-    if sr_in != 44100:
-        data = librosa.resample(data.astype(np.float32), orig_sr=sr_in, target_sr=44100)
-    if len(data.shape) > 1:
-        data = data.mean(axis=1)
-    # Trim to max 30 seconds
-    max_samples = 44100 * 30
-    if len(data) > max_samples:
-        data = data[:max_samples]
-    # Compute mel spectrogram
-    mel = mel_spectrogram(data, 44100)
-    mel_norm = (mel - mel.mean()) / (mel.std() + 1e-8)
-    if transpose != 0:
-        mel_norm = np.roll(mel_norm, transpose, axis=0)
-    # Voice model timbre transfer
-    with torch.no_grad():
-        mel_tensor = torch.FloatTensor(mel_norm).unsqueeze(0)
-        mel_out, _, _ = voice_model(mel_tensor)
-    mel_out_np = mel_out.squeeze().cpu().numpy()
-    mel_out_np = np.clip(mel_out_np, -4.0, 4.0)
-    # HiFi-GAN vocoding
-    with torch.no_grad():
-        audio_out = mel_to_audio_hifigan(hifigan, torch.FloatTensor(mel_out_np).unsqueeze(0))
-    mx = np.max(np.abs(audio_out))
-    if mx > 0:
-        audio_out = audio_out / mx * 0.85
-    return (44100, audio_out.astype(np.float32)), \
-        f"✅ 转换完成! (HiFi-GAN vocoder)\n输入: {len(data)/44100:.1f}s → 输出: {len(audio_out)/44100:.1f}s"
-def generate_sample():
-    init_models()
-    n_frames = 400
-    with torch.no_grad():
-        z = torch.randn(1, 192, n_frames) * 0.5
-        z = voice_model.flow(z)
-        mel_out = voice_model.decoder(z)
-    mel_out_np = np.clip(mel_out.squeeze().cpu().numpy(), -4.0, 4.0)
-    with torch.no_grad():
-        audio_out = mel_to_audio_hifigan(hifigan, torch.FloatTensor(mel_out_np).unsqueeze(0))
-    mx = np.max(np.abs(audio_out))
-    if mx > 0:
-        audio_out = audio_out / mx * 0.85
-    return (44100, audio_out.astype(np.float32)), \
-        f"✅ 生成完成! (HiFi-GAN vocoder)\n时长: {len(audio_out)/44100:.1f}s"
-# ════════════════════════════════════════════════════════════
 # Gradio UI
-# ════════════════════════════════════════════════════════════
-with gr.Blocks(title="🎙️ NumberBlocks One Voice", theme=gr.themes.Soft()) as demo:
     gr.HTML("""
-    <div style="text-align:center; margin-bottom:1rem">
-        <h1 style="color:#ff6b6b">🎙️ NumberBlocks One 语音克隆</h1>
-        <p>RVC v2 Model + HiFi-GAN Vocoder — High Quality Voice Conversion</p>
     </div>
     """)
-    with gr.Tab("🔊 Voice Conversion"):
-        gr.Markdown("上传音频，将其转换为 One 的声音（使用 HiFi-GAN 神经声码器提升音质）")
-        audio_in = gr.Audio(label="输入音频", sources=["upload", "microphone"])
-        pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift (semitones)")
-        convert_btn = gr.Button("🔄 转换", variant="primary")
-        audio_out = gr.Audio(label="输出音频 (HiFi-GAN)")
-        status = gr.Textbox(label="状态")
-        convert_btn.click(convert_voice, [audio_in, pitch], [audio_out, status])
-    with gr.Tab("🎵 Sample Generation"):
-        gr.Markdown("生成 One 的随机语音样本（使用 HiFi-GAN 神经声码器）")
-        gen_btn = gr.Button("🎵 生成样本", variant="primary")
-        gen_out = gr.Audio(label="生成音频 (HiFi-GAN)")
-        gen_status = gr.Textbox(label="状态")
-        gen_btn.click(generate_sample, outputs=[gen_out, gen_status])
     with gr.Tab("ℹ️ About"):
-        gr.Markdown("""
-        ### Model Info
-        - **Voice Model**: VITS-like (Encoder + Posterior + Flow + Decoder) — 5.3M params
-        - **Vocoder**: HiFi-GAN Universal (44100Hz) — 928K params
-        - **Sample Rate**: 44100 Hz
-        - **Training Data**: 100 source files, 1,334 chunks
-        - **Training Steps**: 500
-        ### What's New
-        - ✅ Integrated HiFi-GAN neural vocoder replacing overlap-add
-        - ✅ Significantly improved audio quality and naturalness
-        - ✅ Proper mel→audio conversion with learned upsampling
-        ### Links
-        - [Dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
-        - [Training Space](https://huggingface.co/spaces/ayf3/rvc-cpu-trainer)
         """)
-demo.launch(server_name="0.0.0.0", server_port=7860)

 #!/usr/bin/env python3
 """
+NumberBlocks One Voice Cloner - HiFi-GAN V2
+集成 HiFi-GAN vocoder 提升推理音质
+功能：
+1. 上传音频 → RVC 音色转换（使用 HiFi-GAN vocoder）
+2. 随机采样生成 One 的语音
+3. 音高调节
+技术栈：
+- RVC 模型 (one_voice_rvc_v2.pth, 60.7MB VITS-like)
+- HiFi-GAN Universal Vocoder (预训练)
+- Gradio UI
 """
+import os
+import json
+import random
+import tempfile
 import numpy as np
+import soundfile as sf
+import librosa
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import gradio as gr
+from pathlib import Path
+from huggingface_hub import hf_hub_download, HfApi
+# ============================================================
+# 模型定义 - VITS-like RVC Model
+# ============================================================
+class PosteriorEncoder(nn.Module):
+    """Encode an input feature sequence into two statistics tensors `(m, logs)`.
+
+    A 1x1 convolution (`pre`) projects `in_channels` to `hidden_channels`,
+    `n_layers` residual Conv1d+GLU layers refine the result, and a final 1x1
+    convolution (`proj`) emits `2 * hidden_channels` channels that are split
+    in half along dim 1.
+
+    NOTE(review): each layer's Conv1d outputs `hidden_channels` channels and
+    `nn.GLU(dim=1)` halves dim 1, so `layer(x)` carries
+    `hidden_channels // 2` channels while `x` carries `hidden_channels`.
+    The residual add in `forward` can only broadcast when that half equals 1;
+    confirm the intended conv width against the training code / checkpoint.
+    """
+    def __init__(self, in_channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
+        # Attribute names (`pre`, `enc`, `proj`) become state_dict key prefixes.
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = nn.ModuleList()
+        for _ in range(n_layers):
+            self.enc.append(nn.Sequential(
+                # "Same" padding for an odd kernel: output length equals input length.
+                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
+                          padding=(kernel_size - 1) * dilation_rate // 2,
+                          dilation=dilation_rate),
+                nn.GLU(dim=1),
+            ))
+        self.proj = nn.Conv1d(hidden_channels, hidden_channels * 2, 1)
     def forward(self, x):
+        # assumes x is (batch, in_channels, time) -- TODO confirm; chunk(dim=1) below splits channels
+        x = self.pre(x)
+        for layer in self.enc:
+            x = x + layer(x)
+        stats = self.proj(x)
+        # First half of the channels is `m`, second half is `logs`.
+        m, logs = stats.chunk(2, dim=1)
+        return m, logs
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
         super().__init__()
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
+            self.flows.append(Flip())
+    def forward(self, x, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, reverse=reverse)
         return x
+class ResidualCouplingLayer(nn.Module):
+    """Affine transform whose shift `m` and log-scale `logs` are predicted from the input.
+
+    Forward mode computes `m + x * exp(log_s)` and returns it with a logdet;
+    reverse mode computes `(x - m) * exp(-log_s)` and returns only the tensor.
+
+    NOTE(review): each layer's Conv1d outputs `hidden_channels` channels and
+    `nn.GLU(dim=1)` halves dim 1, so `h + layer(h)` adds tensors with
+    `hidden_channels` and `hidden_channels // 2` channels; confirm the
+    intended conv width against the training code / checkpoint.
+
+    NOTE(review): `m` and `logs` are computed from the whole tensor passed
+    in, and in reverse mode they are recomputed from the reverse input, so
+    reverse presumably undoes forward exactly only while the statistics do
+    not depend on the input (e.g. with the zero-initialised `post`) -- verify.
+    """
+    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
+        self.pre = nn.Conv1d(channels, hidden_channels, 1)
+        self.enc = nn.ModuleList()
+        for _ in range(n_layers):
+            self.enc.append(nn.Sequential(
+                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
+                          padding=(kernel_size - 1) * dilation_rate // 2,
+                          dilation=dilation_rate),
+                nn.GLU(dim=1),
+            ))
+        self.post = nn.Conv1d(hidden_channels, channels * 2, 1)
+        # Zero weights and bias make `post` output all zeros at initialisation,
+        # so m == 0 and logs == 0 and the layer starts as the identity map.
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+    def forward(self, x, reverse=False):
+        h = self.pre(x)
+        for layer in self.enc:
+            h = h + layer(h)
+        stats = self.post(h)
+        # First half of the channels is the shift, second half the log-scale.
+        m, logs = stats.chunk(2, dim=1)
+        if not reverse:
+            # Clamp the log-scale to [-5, 5] before exponentiating.
+            log_s = torch.clamp(logs, -5.0, 5.0)
+            y = m + x * torch.exp(log_s)
+            # Summed over every element of log_s (all dims), giving one scalar.
+            logdet = torch.sum(log_s)
+            return y, logdet
+        else:
+            log_s = torch.clamp(logs, -5.0, 5.0)
+            y = (x - m) * torch.exp(-log_s)
+            return y
+class Flip(nn.Module):
+    """Parameter-free flow step that reverses the element order along dim 1."""
+    def forward(self, x, reverse=False):
+        # Forward mode returns a (tensor, 0) pair so callers can unpack it the
+        # same way as ResidualCouplingLayer's (tensor, logdet); reverse mode
+        # returns the bare tensor. The flip itself is identical in both modes.
+        if not reverse:
+            return torch.flip(x, [1]), 0
+        else:
+            return torch.flip(x, [1])
+class Decoder(nn.Module):
+    """Project a `hidden_channels` sequence to `out_channels` output channels.
+
+    Structure: 1x1 convolution (`pre`), `n_layers` residual Conv1d+GLU layers
+    (`dec`), then a 1x1 convolution (`proj`).
+
+    NOTE(review): each layer's Conv1d outputs `hidden_channels` channels and
+    `nn.GLU(dim=1)` halves dim 1, so `x + layer(x)` adds tensors with
+    `hidden_channels` and `hidden_channels // 2` channels; confirm the
+    intended conv width against the training code / checkpoint.
+    """
+    def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
+        super().__init__()
+        # Attribute names (`pre`, `dec`, `proj`) become state_dict key prefixes.
+        self.pre = nn.Conv1d(hidden_channels, hidden_channels, 1)
+        self.dec = nn.ModuleList()
+        for _ in range(n_layers):
+            self.dec.append(nn.Sequential(
+                # "Same" padding for an odd kernel: output length equals input length.
+                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
+                          padding=(kernel_size - 1) * dilation_rate // 2,
+                          dilation=dilation_rate),
+                nn.GLU(dim=1),
+            ))
+        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+    def forward(self, x):
+        # assumes x is (batch, hidden_channels, time) -- TODO confirm
+        x = self.pre(x)
+        for layer in self.dec:
+            x = x + layer(x)
+        return self.proj(x)
+class RVCModel(nn.Module):
+    """VITS-like RVC v3.0 Model (5.3M params)"""
+    def __init__(self, n_mels=80, hidden_channels=192):
+        super().__init__()
+        self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
+        self.flow = ResidualCouplingBlock(hidden_channels, hidden_channels)
+        self.dec = Decoder(hidden_channels, n_mels)
+        self.n_mels = n_mels
+    def forward(self, mel):
+        m, logs = self.enc_p(mel)
+        z = m + torch.randn_like(logs) * torch.exp(logs) * 0.0
+        z_p = self.flow(z)
+        z_back = self.flow(z_p, reverse=True)
+        mel_out = self.dec(z_back)
+        return mel_out
+    def infer(self, mel, noise_scale=0.0):
+        m, logs = self.enc_p(mel)
+        z = m + torch.randn_like(logs) * torch.exp(logs) * noise_scale
+        z_p = self.flow(z)
+        z_back = self.flow(z_p, reverse=True)
+        mel_out = self.dec(z_back)
+        return mel_out
+# ============================================================
+# HiFi-GAN Vocoder Definition
+# ============================================================
+class ResBlock1(nn.Module):
+    """Stack of residual branches, one per entry in `dilation`.
+
+    Each branch is LeakyReLU(0.1) -> dilated Conv1d -> LeakyReLU(0.1) ->
+    undilated Conv1d, all with length-preserving padding, and its output is
+    added back onto its input.
+
+    NOTE(review): parameters live under `convs.<i>.1` and `convs.<i>.3`
+    without weight normalisation, whereas the removed implementation in this
+    diff used weight-normalised `convs1` / `convs2` lists; a checkpoint saved
+    for that layout would presumably not match these keys -- verify.
+    """
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super().__init__()
+        self.convs = nn.ModuleList()
+        for d in dilation:
+            self.convs.append(nn.Sequential(
+                nn.LeakyReLU(0.1),
+                # Padding (k - 1) * d // 2 keeps the sequence length unchanged.
+                nn.Conv1d(channels, channels, kernel_size, dilation=d,
+                          padding=(kernel_size - 1) * d // 2),
+                nn.LeakyReLU(0.1),
+                nn.Conv1d(channels, channels, kernel_size, dilation=1,
+                          padding=(kernel_size - 1) // 2),
+            ))
+    def forward(self, x):
+        # Branches are applied sequentially, each as a residual update.
+        for conv in self.convs:
+            x = x + conv(x)
+        return x
+class HiFiGANGenerator(nn.Module):
+    """HiFi-GAN Generator (Universal V1 compatible)"""
+    def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
+                 upsample_kernel_sizes=(16, 16, 4, 4),
+                 upsample_initial_channel=512,
+                 resblock_kernel_sizes=(3, 7, 11),
+                 resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
+        super().__init__()
+        self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
+        self.num_upsamples = len(upsample_rates)
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.ups = nn.ModuleList()
+        self.resblocks = nn.ModuleList()
+        ch = upsample_initial_channel
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            ch_new = ch // 2
+            self.ups.append(nn.ConvTranspose1d(ch, ch_new, k, u, padding=(k - u) // 2))
+            for _, (rk, rd) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(ResBlock1(ch_new, rk, rd))
+            ch = ch_new
+        self.conv_post = nn.Sequential(
+            nn.LeakyReLU(0.1),
+            nn.Conv1d(ch, 1, 7, padding=3),
+            nn.Tanh(),
+        )
+    def forward(self, x):
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, 0.1)
+            x = self.ups[i](x)
+            xs = 0
+            for j in range(self.num_kernels):
+                xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = self.conv_post(x)
+        return x
+# ============================================================
+# Mel-spectrogram utilities
+# ============================================================
+def mel_spectrogram(y, n_fft=1024, hop_length=256, win_length=1024,
+                    n_mels=80, sample_rate=40000, fmin=0, fmax=None):
+    """Compute mel spectrogram"""
+    if fmax is None:
+        fmax = sample_rate // 2
+    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels,
+                                     fmin=fmin, fmax=fmax)
+    window = torch.hann_window(win_length)
+    # Pad signal
+    pad_length = (win_length - hop_length) // 2
+    y = torch.nn.functional.pad(y, (pad_length, pad_length), mode='reflect')
+    # STFT
+    stft = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_length,
+                       window=window, center=False, return_complex=True)
+    magnitudes = torch.sqrt(stft.real ** 2 + stft.imag ** 2 + 1e-7)
+    # Mel filterbank
+    mel_basis_t = torch.tensor(mel_basis, dtype=magnitudes.dtype)
+    mel = torch.matmul(mel_basis_t, magnitudes)
+    # Log
+    mel = torch.log(torch.clamp(mel, min=1e-5))
+    return mel
+# ============================================================
+# Inference Engine
+# ============================================================
+class VoiceCloner:
+    def __init__(self):
+        self.device = torch.device('cpu')
+        self.rvc_model = None
+        self.hifigan = None
+        self.sample_rate = 40000
+        self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
+        self.model_loaded = False
+        self.samples = []
+        self.load_models()
+    def load_models(self):
+        """Load RVC model + HiFi-GAN vocoder"""
+        print("Loading RVC model...")
+        try:
+            model_path = hf_hub_download(
+                repo_id=self.dataset_id,
+                filename="models/one_voice_rvc_v2.pth",
+                repo_type="dataset"
+            )
+            ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
+            # Determine model config
+            if isinstance(ckpt, dict) and 'model' in ckpt:
+                state_dict = ckpt['model']
+            elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
+                state_dict = ckpt['state_dict']
+            else:
+                state_dict = ckpt
+            # Auto-detect hidden channels from state_dict
+            hidden_ch = 192
+            for k, v in state_dict.items():
+                if 'enc_p.pre.weight' in k:
+                    hidden_ch = v.shape[0]
+                    break
+            self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
+            self.rvc_model.load_state_dict(state_dict, strict=False)
+            self.rvc_model.eval()
+            print(f"✅ RVC model loaded (hidden={hidden_ch})")
+        except Exception as e:
+            print(f"❌ RVC model load failed: {e}")
+            self.rvc_model = None
+        print("Loading HiFi-GAN vocoder...")
+        try:
+            # Try loading from local or download
+            hifigan_path = self._get_hifigan()
+            if hifigan_path:
+                ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
+                if isinstance(ckpt, dict) and 'generator' in ckpt:
+                    state_dict = ckpt['generator']
+                elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
+                    state_dict = {k.replace('generator.', ''): v
+                                  for k, v in ckpt['state_dict'].items()
+                                  if k.startswith('generator.')}
+                else:
+                    state_dict = ckpt
+                self.hifigan = HiFiGANGenerator()
+                self.hifigan.load_state_dict(state_dict, strict=False)
+                self.hifigan.eval()
+                print("✅ HiFi-GAN vocoder loaded")
+            else:
+                print("⚠️ HiFi-GAN not available, will use Griffin-Lim fallback")
+        except Exception as e:
+            print(f"⚠️ HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
+            self.hifigan = None
+        # Load sample list for random generation
+        try:
+            api = HfApi()
+            files = api.list_repo_files(self.dataset_id, repo_type="dataset")
+            self.samples = [f for f in files if f.startswith('models/top_')
+                           and f.endswith('.wav')
+                           and '_p+' not in f and '_p-' not in f and '_s+' not in f]
+            print(f"✅ Found {len(self.samples)} sample audio files")
+        except Exception as e:
+            print(f"⚠️ Could not list samples: {e}")
+            self.samples = []
+        self.model_loaded = self.rvc_model is not None
+    def _get_hifigan(self):
+        """Get HiFi-GAN model - download if needed"""
+        # Try downloading from jik876/hifi-gan
+        try:
+            path = hf_hub_download(
+                repo_id="jik876/hifi-gan",
+                filename="UNIVERSAL_V1/g_02500000",
+            )
+            return path
+        except:
+            pass
+        # Try alternative location
+        try:
+            path = hf_hub_download(
+                repo_id="facebook/hifigan-universal-v1",
+                filename="hifigan.pt",
+            )
+            return path
+        except:
+            pass
+        return None
+    def mel_to_audio_hifigan(self, mel):
+        """Convert mel spectrogram to audio using HiFi-GAN"""
+        with torch.no_grad():
+            audio = self.hifigan(mel.unsqueeze(0))
+        return audio.squeeze(0).squeeze(0).cpu().numpy()
+    def mel_to_audio_griffinlim(self, mel, sr=40000, n_fft=1024, hop_length=256, n_iter=32):
+        """Fallback: Convert mel to audio using Griffin-Lim"""
+        mel_np = mel.cpu().numpy()
+        S = librosa.feature.inverse.mel_to_stft(
+            mel_np, sr=sr, n_fft=n_fft, power=2.0
+        )
+        y = librosa.griffinlim(S, n_iter=n_iter, hop_length=hop_length, win_length=n_fft)
+        return y
+    def process_audio(self, input_audio, pitch_shift=0):
+        """
+        Process audio through RVC model + HiFi-GAN vocoder
+        Args:
+            input_audio: path to input audio file
+            pitch_shift: semitone shift
+        Returns:
+            output audio path, status message
+        """
+        if not self.model_loaded:
+            return None, "❌ 模型未加载"
+        try:
+            # Load audio
+            y, sr = librosa.load(input_audio, sr=self.sample_rate)
+            # Apply pitch shift
+            if pitch_shift != 0:
+                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)
+            # Trim silence
+            y, _ = librosa.effects.trim(y, top_db=20)
+            # Limit length
+            max_len = 10 * self.sample_rate  # 10 seconds max
+            if len(y) > max_len:
+                y = y[:max_len]
+            # Compute mel spectrogram
+            y_tensor = torch.tensor(y, dtype=torch.float32)
+            mel = mel_spectrogram(y_tensor, sample_rate=self.sample_rate, n_mels=80)
+            # RVC inference
+            with torch.no_grad():
+                mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
+                mel_out = mel_out.squeeze(0)
+            # Vocoder
+            if self.hifigan is not None:
+                audio_out = self.mel_to_audio_hifigan(mel_out)
+                vocoder_name = "HiFi-GAN"
+            else:
+                audio_out = self.mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
+                vocoder_name = "Griffin-Lim"
+            # Normalize
+            audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
+            # Save
+            output_path = tempfile.mktemp(suffix='.wav')
+            sf.write(output_path, audio_out, self.sample_rate)
+            return output_path, f"✅ 转换成功 ({vocoder_name}) | 输入: {len(y)/sr:.1f}s → 输出: {len(audio_out)/self.sample_rate:.1f}s"
+        except Exception as e:
+            return None, f"❌ 转换失败: {str(e)}"
+    def generate_random(self):
+        """Generate audio from a random sample"""
+        if not self.samples:
+            return None, "❌ 没有可用的样本"
+        try:
+            sample = random.choice(self.samples)
+            sample_path = hf_hub_download(
+                repo_id=self.dataset_id,
+                filename=sample,
+                repo_type="dataset"
+            )
+            output, msg = self.process_audio(sample_path)
+            if output:
+                return output, f"✅ {msg}\n采样: {Path(sample).name}"
+            return output, msg
+        except Exception as e:
+            return None, f"❌ 生成失败: {str(e)}"
+# ============================================================
 # Gradio UI
+# ============================================================
+# Module-level start-up: build the inference engine, then declare the UI.
+print("🚀 Initializing NumberBlocks One Voice Cloner...")
+# VoiceCloner.__init__ calls load_models(), so checkpoints are downloaded and
+# loaded here, at import time, before the interface is defined.
+cloner = VoiceCloner()
+with gr.Blocks(
+    title="NumberBlocks One Voice",
+    theme=gr.themes.Soft(),
+    css="""
+    .header { text-align: center; margin-bottom: 1rem; }
+    .header h1 { color: #ff6b6b; }
+    """
+) as demo:
     gr.HTML("""
+    <div class="header">
+        <h1>🎭 NumberBlocks One Voice Cloner</h1>
+        <p>RVC v2 Model (60.7MB) + HiFi-GAN Vocoder</p>
     </div>
     """)
+    # Tab 1: convert an uploaded or recorded file. Both Audio components use
+    # type="filepath", matching process_audio's path-in / path-out contract.
+    with gr.Tab("🎤 Voice Conversion"):
+        gr.Markdown("### 上传音频 → 转换为 One 的声音")
+        with gr.Row():
+            with gr.Column():
+                vc_input = gr.Audio(label="上传音频", type="filepath", sources=["upload", "microphone"])
+                vc_pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="音高偏移 (半音)")
+                vc_btn = gr.Button("🎙️ 转换", variant="primary", size="lg")
+            with gr.Column():
+                vc_output = gr.Audio(label="转换结果", type="filepath")
+                vc_status = gr.Textbox(label="状态")
+        # process_audio returns (output_path, status), mapped onto the two outputs.
+        vc_btn.click(
+            fn=cloner.process_audio,
+            inputs=[vc_input, vc_pitch],
+            outputs=[vc_output, vc_status]
+        )
+    # Tab 2: convert a randomly chosen sample file from the dataset repository.
+    with gr.Tab("🎲 Random Sample"):
+        gr.Markdown("### 随机采样 + RVC 转换")
+        with gr.Row():
+            rand_btn = gr.Button("🎲 随机生成", variant="primary", size="lg")
+        with gr.Row():
+            rand_output = gr.Audio(label="生成结果", type="filepath")
+            rand_status = gr.Textbox(label="状态")
+        rand_btn.click(
+            fn=cloner.generate_random,
+            inputs=[],
+            outputs=[rand_output, rand_status]
+        )
     with gr.Tab("ℹ️ About"):
+        # These status strings are evaluated once, while the UI is being
+        # built, so they reflect the load result at start-up.
+        model_status = "✅ 已加载" if cloner.model_loaded else "❌ 未加载"
+        hifigan_status = "✅ HiFi-GAN" if cloner.hifigan else "⚠️ Griffin-Lim (fallback)"
+        gr.Markdown(f"""
+        ### NumberBlocks One Voice Cloner V2
+        **模型**: RVC v3.0 (VITS-like, 5.3M params, 60.7MB)
+        **Vocoder**: {hifigan_status}
+        **采样率**: 40kHz
+        **模型状态**: {model_status}
+        **训练数据**: 100 源文件 → 1,334 chunks, 500 steps
+        **Dataset**: [ayf3/numberblocks-one-voice-dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
+        **功能**:
+        - ✅ 上传音频 → One 音色转换
+        - ✅ 随机采样生成
+        - ✅ 音高调节 (-12 ~ +12 半音)
+        - ✅ HiFi-GAN 高品质 vocoder
+        **限制**:
+        - CPU 推理，速度较慢
+        - 输入建议 < 10 秒
+        - 音质取决于输入质量
         """)
+# Bind to all interfaces on port 7860 when run as a script.
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)