ayf3 committed on
Commit
71ee5ef
·
verified ·
1 Parent(s): 8a9f5a9

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +500 -283
app.py CHANGED
@@ -1,332 +1,549 @@
1
  #!/usr/bin/env python3
2
  """
3
- NumberBlocks One Voice Cloner - RVC Inference with HiFi-GAN Vocoder
4
- Uses the trained RVC v2 model + pretrained HiFi-GAN universal vocoder for high-quality synthesis.
 
 
 
 
 
 
 
 
 
 
5
  """
6
- import os, json
7
- import gradio as gr
 
 
 
8
  import numpy as np
 
 
9
  import torch
10
  import torch.nn as nn
11
  import torch.nn.functional as F
 
 
 
12
 
13
- # ════════════════════════════════════════════════════════════
14
- # HiFi-GAN Generator (exact match to pretrained weights)
15
- # ════════════════════════════════════════════════════════════
16
 
17
- class HiFiGANResBlock(nn.Module):
18
- def __init__(self, channels, kernel_size, dilation_sizes):
19
  super().__init__()
20
- # Store padding values for manual padding (original HiFi-GAN doesn't use Conv1d padding)
21
- self.paddings1 = []
22
- self.convs1 = nn.ModuleList()
23
- for d in dilation_sizes:
24
- self.convs1.append(nn.utils.weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=0)))
25
- self.paddings1.append((kernel_size - 1) * d // 2)
26
- self.paddings2 = []
27
- self.convs2 = nn.ModuleList()
28
- for d in dilation_sizes:
29
- self.convs2.append(nn.utils.weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=0)))
30
- self.paddings2.append((kernel_size - 1) * d // 2)
31
 
32
  def forward(self, x):
33
- for c1, p1, c2, p2 in zip(self.convs1, self.paddings1, self.convs2, self.paddings2):
34
- xt = F.leaky_relu(x, 0.1)
35
- xt = F.pad(xt, (p1, p1))
36
- xt = c1(xt)
37
- xt = F.leaky_relu(xt, 0.1)
38
- xt = F.pad(xt, (p2, p2))
39
- xt = c2(xt)
40
- x = xt + x
41
- return x
42
 
43
 
44
- class HiFiGANGenerator(nn.Module):
45
- def __init__(self, config):
46
  super().__init__()
47
- self.num_kernels = len(config["resblock_kernel_sizes"])
48
- self.conv_pre = nn.utils.weight_norm(
49
- nn.Conv1d(80, config["upsample_initial_channel"], 7, 1, padding=3)
50
- )
51
- self.ups = nn.ModuleList()
52
- for u, k in zip(config["upsample_rates"], config["upsample_kernel_sizes"]):
53
- ch_in = config["upsample_initial_channel"] // (2 ** (len(self.ups)))
54
- ch_out = ch_in // 2
55
- self.ups.append(
56
- nn.utils.weight_norm(
57
- nn.ConvTranspose1d(ch_in, ch_out, k, u, padding=(k - u) // 2)
58
- )
59
- )
60
- self.resblocks = nn.ModuleList()
61
- for i in range(len(self.ups)):
62
- ch = config["upsample_initial_channel"] // (2 ** (i + 1))
63
- for k, d in zip(config["resblock_kernel_sizes"], config["resblock_dilation_sizes"]):
64
- self.resblocks.append(HiFiGANResBlock(ch, k, d))
65
- ch_out = config["upsample_initial_channel"] // (2 ** len(self.ups))
66
- self.conv_post = nn.utils.weight_norm(nn.Conv1d(ch_out, 1, 7, 1, padding=3))
67
-
68
- def forward(self, mel):
69
- x = self.conv_pre(mel)
70
- for i, up in enumerate(self.ups):
71
- x = F.leaky_relu(x, 0.1)
72
- x = up(x)
73
- xs = 0
74
- for j in range(self.num_kernels):
75
- xs += self.resblocks[i * self.num_kernels + j](x)
76
- x = xs / self.num_kernels
77
- x = F.leaky_relu(x, 0.1)
78
- x = self.conv_post(x)
79
- x = torch.tanh(x)
80
  return x
81
 
82
 
83
- # ════════════════════════════════════════════════════════════
84
- # Voice Model (VITS-like from training)
85
- # ════════════════════════════════════════════════════════════
86
-
87
- class VoiceModel(nn.Module):
88
- def __init__(self, n_mels, hd):
89
  super().__init__()
90
- self.encoder = self._build_encoder(n_mels, hd)
91
- self.posterior = self._build_posterior(hd)
92
- self.flow = self._build_flow(hd)
93
- self.decoder = self._build_decoder(hd)
94
-
95
- def _build_encoder(self, n_mels, hd):
96
- layers = []
97
- ch_in = n_mels
98
- for ch_out, ks in [(hd,5),(hd,5),(hd,5),(hd*2,5),(hd*2,3)]:
99
- layers.extend([nn.utils.weight_norm(nn.Conv1d(ch_in, ch_out, ks, padding=ks//2)),
100
- nn.BatchNorm1d(ch_out), nn.ReLU()])
101
- ch_in = ch_out
102
- layers.append(nn.LayerNorm(hd*2))
103
- class Enc(nn.Module):
104
- def __init__(self, seq):
105
- super().__init__(); self.seq = seq
106
- def forward(self, x):
107
- x = self.seq[:-1](x)
108
- x = self.seq[-1](x.transpose(1,2)).transpose(1,2)
109
- return x
110
- return Enc(nn.Sequential(*layers))
111
-
112
- def _build_posterior(self, hd):
113
- class Post(nn.Module):
114
- def __init__(s):
115
- super().__init__(); s.conv = nn.utils.weight_norm(nn.Conv1d(hd*2, 384, 1))
116
- def forward(s, x):
117
- stats = s.conv(x); m, logs = torch.split(stats, 192, dim=1)
118
- z = m + torch.randn_like(m)*torch.exp(logs); return z, m, logs
119
- return Post()
120
-
121
- def _build_flow(self, hd):
122
- class Flow(nn.Module):
123
- def __init__(s):
124
- super().__init__()
125
- s.net = nn.Sequential(nn.Conv1d(96,hd,1), nn.ReLU(), nn.Conv1d(hd,hd,1), nn.ReLU(), nn.Conv1d(hd,192,1))
126
- def forward(s, z):
127
- z1, z2 = torch.split(z, 96, dim=1); return z + s.net(z1)
128
- return Flow()
129
-
130
- def _build_decoder(self, hd):
131
- layers = []
132
- ch_in = 192
133
- for ch_out, ks in [(hd*2,5),(hd*2,5),(hd,5),(hd,3)]:
134
- layers.extend([nn.utils.weight_norm(nn.Conv1d(ch_in, ch_out, ks, padding=ks//2)),
135
- nn.BatchNorm1d(ch_out), nn.ReLU()])
136
- ch_in = ch_out
137
- layers.append(nn.utils.weight_norm(nn.Conv1d(hd, 80, 1)))
138
- return nn.Sequential(*layers)
139
-
140
-
141
- # ════════════════════════════════════════════════════════════
142
- # Model Loading
143
- # ════════════════════════════════════════════════════════════
144
-
145
- def download_file(repo_id, filename, repo_type="dataset"):
146
- from huggingface_hub import hf_hub_download
147
- return hf_hub_download(repo_id=repo_id, filename=filename, repo_type=repo_type)
148
-
149
- def load_hifigan():
150
- cfg_path = download_file("ORI-Muchim/HiFi-GAN_44100hz_universal", "config.json", repo_type="model")
151
- weights_path = download_file("ORI-Muchim/HiFi-GAN_44100hz_universal", "g_02500000", repo_type="model")
152
- with open(cfg_path) as f:
153
- hfg_cfg = json.load(f)
154
- vocoder = HiFiGANGenerator(hfg_cfg)
155
- ckpt = torch.load(weights_path, map_location="cpu", weights_only=False)
156
- vocoder.load_state_dict(ckpt["generator"])
157
- vocoder.eval()
158
- return vocoder, hfg_cfg
159
-
160
- def load_voice_model():
161
- model_file = download_file("ayf3/numberblocks-one-voice-dataset", "models/one_voice_rvc_v2.pth")
162
- ckpt = torch.load(model_file, map_location="cpu", weights_only=False)
163
- cfg = ckpt['config']
164
- sd = ckpt['model_state_dict']
165
- model = VoiceModel(cfg['n_mels'], cfg['hidden_dim'])
166
- model.load_state_dict(sd, strict=False)
167
- model.eval()
168
- return model, cfg
169
-
170
-
171
- # ════════════════════════════════════════════════════════════
172
- # Audio Processing
173
- # ════════════════════════════════════════════════════════════
174
-
175
- def mel_spectrogram(audio, sr, n_mels=80, hop_length=256, win_length=1024, n_fft=1024):
176
- import librosa
177
- mel = librosa.feature.melspectrogram(
178
- y=audio.astype(np.float32), sr=sr, n_mels=n_mels,
179
- hop_length=hop_length, win_length=win_length, n_fft=n_fft, fmax=8000
180
- )
181
- mel_db = librosa.power_to_db(mel, ref=np.max)
182
- return mel_db
183
-
184
- def mel_to_audio_hifigan(vocoder, mel_tensor):
185
- with torch.no_grad():
186
- audio = vocoder(mel_tensor)
187
- return audio.squeeze().cpu().numpy()
188
-
189
-
190
- # ════════════════════════════════════════════════════════════
191
- # Globals & Init
192
- # ════════════════════════════════════════════════════════════
193
-
194
- voice_model = None
195
- voice_config = None
196
- hifigan = None
197
- hifigan_config = None
198
-
199
- def init_models():
200
- global voice_model, voice_config, hifigan, hifigan_config
201
- if voice_model is None:
202
- print("Loading voice model...")
203
- voice_model, voice_config = load_voice_model()
204
- print("Voice model loaded.")
205
- if hifigan is None:
206
- print("Loading HiFi-GAN vocoder...")
207
- hifigan, hifigan_config = load_hifigan()
208
- print("HiFi-GAN vocoder loaded.")
209
-
210
-
211
- # ════════════════════════════════════════════════════════════
212
- # Core Functions
213
- # ════════════════════════════════════════════════════════════
214
-
215
- def convert_voice(audio_input, transpose=0):
216
- init_models()
217
- import librosa
218
 
219
- if audio_input is None:
220
- return None, "❌ 请上传音频文件"
 
 
 
221
 
222
- sr_in, data = audio_input[0], audio_input[1]
223
 
224
- # Resample to 44100 for HiFi-GAN
225
- if sr_in != 44100:
226
- data = librosa.resample(data.astype(np.float32), orig_sr=sr_in, target_sr=44100)
 
 
 
 
 
227
 
228
- if len(data.shape) > 1:
229
- data = data.mean(axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
- # Trim to max 30 seconds
232
- max_samples = 44100 * 30
233
- if len(data) > max_samples:
234
- data = data[:max_samples]
235
 
236
- # Compute mel spectrogram
237
- mel = mel_spectrogram(data, 44100)
238
- mel_norm = (mel - mel.mean()) / (mel.std() + 1e-8)
239
 
240
- if transpose != 0:
241
- mel_norm = np.roll(mel_norm, transpose, axis=0)
 
 
 
 
 
 
 
242
 
243
- # Voice model timbre transfer
244
- with torch.no_grad():
245
- mel_tensor = torch.FloatTensor(mel_norm).unsqueeze(0)
246
- mel_out, _, _ = voice_model(mel_tensor)
247
 
248
- mel_out_np = mel_out.squeeze().cpu().numpy()
249
- mel_out_np = np.clip(mel_out_np, -4.0, 4.0)
250
 
251
- # HiFi-GAN vocoding
252
- with torch.no_grad():
253
- audio_out = mel_to_audio_hifigan(hifigan, torch.FloatTensor(mel_out_np).unsqueeze(0))
 
 
 
 
 
 
 
 
 
 
254
 
255
- mx = np.max(np.abs(audio_out))
256
- if mx > 0:
257
- audio_out = audio_out / mx * 0.85
 
 
 
 
 
 
 
 
258
 
259
- return (44100, audio_out.astype(np.float32)), \
260
- f"✅ 转换完成! (HiFi-GAN vocoder)\n输入: {len(data)/44100:.1f}s → 输出: {len(audio_out)/44100:.1f}s"
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
- def generate_sample():
264
- init_models()
265
 
266
- n_frames = 400
267
- with torch.no_grad():
268
- z = torch.randn(1, 192, n_frames) * 0.5
269
- z = voice_model.flow(z)
270
- mel_out = voice_model.decoder(z)
 
 
271
 
272
- mel_out_np = np.clip(mel_out.squeeze().cpu().numpy(), -4.0, 4.0)
 
 
 
 
 
273
 
274
- with torch.no_grad():
275
- audio_out = mel_to_audio_hifigan(hifigan, torch.FloatTensor(mel_out_np).unsqueeze(0))
 
 
276
 
277
- mx = np.max(np.abs(audio_out))
278
- if mx > 0:
279
- audio_out = audio_out / mx * 0.85
280
 
281
- return (44100, audio_out.astype(np.float32)), \
282
- f"✅ 生成完成! (HiFi-GAN vocoder)\n时长: {len(audio_out)/44100:.1f}s"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
 
285
- # ════════════════════════════════════════════════════════════
286
  # Gradio UI
287
- # ════════════════════════════════════════════════════════════
288
-
289
- with gr.Blocks(title="🎙️ NumberBlocks One Voice", theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
 
 
 
 
290
  gr.HTML("""
291
- <div style="text-align:center; margin-bottom:1rem">
292
- <h1 style="color:#ff6b6b">🎙️ NumberBlocks One 语音克隆</h1>
293
- <p>RVC v2 Model + HiFi-GAN Vocoder — High Quality Voice Conversion</p>
294
  </div>
295
  """)
296
 
297
- with gr.Tab("🔊 Voice Conversion"):
298
- gr.Markdown("上传音频,将其转换为 One 的声音(使用 HiFi-GAN 神经声码器提升音质)")
299
- audio_in = gr.Audio(label="输入音频", sources=["upload", "microphone"])
300
- pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift (semitones)")
301
- convert_btn = gr.Button("🔄 转换", variant="primary")
302
- audio_out = gr.Audio(label="输出 (HiFi-GAN)")
303
- status = gr.Textbox(label="状态")
304
- convert_btn.click(convert_voice, [audio_in, pitch], [audio_out, status])
305
-
306
- with gr.Tab("🎵 Sample Generation"):
307
- gr.Markdown("生成 One 的随机语音样本(使用 HiFi-GAN 神经声码器)")
308
- gen_btn = gr.Button("🎵 生成样本", variant="primary")
309
- gen_out = gr.Audio(label="生成音频 (HiFi-GAN)")
310
- gen_status = gr.Textbox(label="状态")
311
- gen_btn.click(generate_sample, outputs=[gen_out, gen_status])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
  with gr.Tab("ℹ️ About"):
314
- gr.Markdown("""
315
- ### Model Info
316
- - **Voice Model**: VITS-like (Encoder + Posterior + Flow + Decoder) — 5.3M params
317
- - **Vocoder**: HiFi-GAN Universal (44100Hz) — 928K params
318
- - **Sample Rate**: 44100 Hz
319
- - **Training Data**: 100 source files, 1,334 chunks
320
- - **Training Steps**: 500
321
-
322
- ### What's New
323
- - Integrated HiFi-GAN neural vocoder replacing overlap-add
324
- - ✅ Significantly improved audio quality and naturalness
325
- - ✅ Proper mel→audio conversion with learned upsampling
326
-
327
- ### Links
328
- - [Dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
329
- - [Training Space](https://huggingface.co/spaces/ayf3/rvc-cpu-trainer)
 
 
 
 
 
 
330
  """)
331
 
332
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ NumberBlocks One Voice Cloner - HiFi-GAN V2
4
+ 集成 HiFi-GAN vocoder 提升推理音质
5
+
6
+ 功能:
7
+ 1. 上传音频 → RVC 音色转换(使用 HiFi-GAN vocoder)
8
+ 2. 随机采样生成 One 的语音
9
+ 3. 音高调节
10
+
11
+ 技术栈:
12
+ - RVC 模型 (one_voice_rvc_v2.pth, 60.7MB VITS-like)
13
+ - HiFi-GAN Universal Vocoder (预训练)
14
+ - Gradio UI
15
  """
16
+
17
+ import os
18
+ import json
19
+ import random
20
+ import tempfile
21
  import numpy as np
22
+ import soundfile as sf
23
+ import librosa
24
  import torch
25
  import torch.nn as nn
26
  import torch.nn.functional as F
27
+ import gradio as gr
28
+ from pathlib import Path
29
+ from huggingface_hub import hf_hub_download, HfApi
30
 
31
+ # ============================================================
32
+ # 模型定义 - VITS-like RVC Model
33
+ # ============================================================
34
 
35
class PosteriorEncoder(nn.Module):
    """Encode an input sequence into two latent statistics tensors.

    The input is projected to ``hidden_channels``, passed through a stack of
    gated residual convolutions, and projected to ``2 * hidden_channels``,
    which is split in half along the channel axis.

    Args:
        in_channels: number of channels of the input (dim 1).
        hidden_channels: width of the residual stack and of each output half.
        kernel_size: kernel size of the gated convolutions.
        dilation_rate: dilation of the gated convolutions.
        n_layers: number of gated residual layers.
    """

    def __init__(self, in_channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
        super().__init__()
        padding = (kernel_size - 1) * dilation_rate // 2
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        # nn.GLU halves the channel dimension, so every gated convolution has
        # to emit 2 * hidden_channels; otherwise the residual sum in forward()
        # adds tensors with different channel counts and raises.
        self.enc = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
                          padding=padding, dilation=dilation_rate),
                nn.GLU(dim=1),
            )
            for _ in range(n_layers)
        ])
        self.proj = nn.Conv1d(hidden_channels, hidden_channels * 2, 1)

    def forward(self, x):
        """Return ``(m, logs)``: the two channel-wise halves of the projection.

        Args:
            x: tensor whose dim 1 has ``in_channels`` entries.

        Returns:
            Two tensors, each with ``hidden_channels`` channels on dim 1.
        """
        x = self.pre(x)
        for layer in self.enc:
            x = x + layer(x)
        stats = self.proj(x)
        m, logs = stats.chunk(2, dim=1)
        return m, logs
 
 
 
56
 
57
 
58
class ResidualCouplingBlock(nn.Module):
    """Chain of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.

    Args:
        channels: channel count of the tensor flowing through the block.
        hidden_channels: internal width handed to each coupling layer.
        kernel_size: kernel size handed to each coupling layer.
        dilation_rate: dilation handed to each coupling layer.
        n_flows: number of (coupling layer, flip) pairs.
        n_layers: layer count handed to each coupling layer.
    """

    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
        super().__init__()
        stages = []
        for _ in range(n_flows):
            stages.append(
                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers)
            )
            stages.append(Flip())
        self.flows = nn.ModuleList(stages)

    def forward(self, x, reverse=False):
        """Run the stages in order, or in reversed order when ``reverse`` is set.

        In the forward direction each stage returns ``(tensor, logdet)`` and
        the log-determinant is discarded; in reverse each stage returns only
        the tensor.
        """
        if reverse:
            for stage in reversed(self.flows):
                x = stage(x, reverse=reverse)
            return x

        for stage in self.flows:
            x, _ = stage(x, reverse=reverse)
        return x
74
 
75
 
76
class ResidualCouplingLayer(nn.Module):
    """Affine transform whose shift and log-scale are predicted by a conv stack.

    The final projection is zero-initialised, so a freshly constructed layer
    is the identity in both directions.

    NOTE(review): the statistics are computed from whatever tensor is passed
    in, in both directions, so ``reverse=True`` undoes the forward pass exactly
    only while the predicted statistics do not depend on the input (e.g. at
    zero initialisation). Confirm this matches how the checkpoint was trained.

    Args:
        channels: channel count of the input/output tensor.
        hidden_channels: width of the internal residual stack.
        kernel_size: kernel size of the gated convolutions.
        dilation_rate: dilation of the gated convolutions.
        n_layers: number of gated residual layers.
    """

    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
        super().__init__()
        padding = (kernel_size - 1) * dilation_rate // 2
        self.pre = nn.Conv1d(channels, hidden_channels, 1)
        # nn.GLU halves the channel dimension, so the gated convolutions emit
        # 2 * hidden_channels to keep the residual sum shape-compatible.
        self.enc = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
                          padding=padding, dilation=dilation_rate),
                nn.GLU(dim=1),
            )
            for _ in range(n_layers)
        ])
        self.post = nn.Conv1d(hidden_channels, channels * 2, 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, reverse=False):
        """Apply the affine transform, or its reverse form.

        Returns:
            ``(y, logdet)`` when ``reverse`` is false, where ``logdet`` is the
            sum of the clamped log-scales over all elements; just ``y`` when
            ``reverse`` is true.
        """
        h = self.pre(x)
        for layer in self.enc:
            h = h + layer(h)
        m, logs = self.post(h).chunk(2, dim=1)
        # Clamp once for both directions to keep exp() in a safe range.
        log_s = torch.clamp(logs, -5.0, 5.0)

        if reverse:
            return (x - m) * torch.exp(-log_s)

        y = m + x * torch.exp(log_s)
        logdet = torch.sum(log_s)
        return y, logdet
107
+
108
+
109
class Flip(nn.Module):
    """Reverse the order of dim 1 (the channel axis) of the input."""

    def forward(self, x, reverse=False):
        """Return the flipped tensor, paired with a zero log-det when not reversing."""
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        return flipped, 0
115
+
116
+
117
class Decoder(nn.Module):
    """Map a latent sequence to ``out_channels`` channels per frame.

    Args:
        hidden_channels: channel count of the latent input and of the
            internal residual stack.
        out_channels: channel count of the output.
        kernel_size: kernel size of the gated convolutions.
        dilation_rate: dilation of the gated convolutions.
        n_layers: number of gated residual layers.
    """

    def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
        super().__init__()
        padding = (kernel_size - 1) * dilation_rate // 2
        self.pre = nn.Conv1d(hidden_channels, hidden_channels, 1)
        # nn.GLU halves the channel dimension, so the gated convolutions emit
        # 2 * hidden_channels; with hidden_channels outputs the residual sum
        # in forward() would mix mismatched channel counts and raise.
        self.dec = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
                          padding=padding, dilation=dilation_rate),
                nn.GLU(dim=1),
            )
            for _ in range(n_layers)
        ])
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x):
        """Return the projection of the residual stack applied to ``x``."""
        x = self.pre(x)
        for layer in self.dec:
            x = x + layer(x)
        return self.proj(x)
136
 
 
137
 
138
class RVCModel(nn.Module):
    """VITS-like mel-to-mel model: posterior encoder, flow, decoder.

    Args:
        n_mels: number of mel channels of the input and output.
        hidden_channels: latent width shared by the three sub-modules.
    """

    def __init__(self, n_mels=80, hidden_channels=192):
        super().__init__()
        self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
        self.flow = ResidualCouplingBlock(hidden_channels, hidden_channels)
        self.dec = Decoder(hidden_channels, n_mels)
        self.n_mels = n_mels

    def forward(self, mel):
        """Deterministic pass; identical to ``infer(mel, noise_scale=0.0)``."""
        return self.infer(mel, noise_scale=0.0)

    def infer(self, mel, noise_scale=0.0):
        """Encode ``mel``, run the flow forward then in reverse, and decode.

        Args:
            mel: input passed to the posterior encoder.
            noise_scale: multiplier on the Gaussian noise added to the latent
                mean; ``0`` disables sampling.

        Returns:
            The decoder output for the round-tripped latent.
        """
        m, logs = self.enc_p(mel)
        if noise_scale != 0:
            z = m + torch.randn_like(logs) * torch.exp(logs) * noise_scale
        else:
            # Skip the noise term entirely: exp(logs) can overflow to inf and
            # inf * 0.0 is NaN, which would poison the whole output.
            z = m
        z_p = self.flow(z)
        z_back = self.flow(z_p, reverse=True)
        return self.dec(z_back)
162
+
163
+
164
+ # ============================================================
165
+ # HiFi-GAN Vocoder Definition
166
+ # ============================================================
167
+
168
class ResBlock1(nn.Module):
    """HiFi-GAN residual block: per dilation, a dilated conv then a plain conv.

    The convolutions are weight-normalised and stored as ``convs1`` (dilated)
    and ``convs2`` (dilation 1) so that their state-dict keys
    (``convs1.N.weight_g`` / ``weight_v`` ...) line up with HiFi-GAN generator
    checkpoints instead of being silently skipped by ``strict=False`` loading.

    Args:
        channels: channel count of input and output.
        kernel_size: kernel size of every convolution in the block.
        dilation: dilations of the first convolution of each residual branch.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.convs1 = nn.ModuleList([
            nn.utils.weight_norm(
                nn.Conv1d(channels, channels, kernel_size, dilation=d,
                          padding=(kernel_size - 1) * d // 2)
            )
            for d in dilation
        ])
        self.convs2 = nn.ModuleList([
            nn.utils.weight_norm(
                nn.Conv1d(channels, channels, kernel_size, dilation=1,
                          padding=(kernel_size - 1) // 2)
            )
            for _ in dilation
        ])

    def forward(self, x):
        """Add each branch ``conv2(lrelu(conv1(lrelu(x))))`` onto ``x`` in turn."""
        for conv1, conv2 in zip(self.convs1, self.convs2):
            h = conv1(F.leaky_relu(x, 0.1))
            h = conv2(F.leaky_relu(h, 0.1))
            x = x + h
        return x


class HiFiGANGenerator(nn.Module):
    """HiFi-GAN generator: mel input -> single-channel waveform in [-1, 1].

    Every convolution is weight-normalised and registered directly as
    ``conv_pre``, ``ups.N``, ``resblocks.N`` and ``conv_post`` so the
    parameter names match generator checkpoints. The final LeakyReLU and tanh
    live in ``forward`` rather than inside ``conv_post`` for the same reason.

    Args:
        in_channels: number of mel channels.
        upsample_rates: stride of each transposed convolution.
        upsample_kernel_sizes: kernel size of each transposed convolution.
        upsample_initial_channel: channels after ``conv_pre``; halved by each
            upsampling stage.
        resblock_kernel_sizes: kernel size of each parallel residual block.
        resblock_dilation_sizes: dilations of each parallel residual block.
    """

    def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
                 upsample_kernel_sizes=(16, 16, 4, 4),
                 upsample_initial_channel=512,
                 resblock_kernel_sizes=(3, 7, 11),
                 resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
        super().__init__()
        self.num_upsamples = len(upsample_rates)
        self.num_kernels = len(resblock_kernel_sizes)

        self.conv_pre = nn.utils.weight_norm(
            nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
        )

        self.ups = nn.ModuleList()
        self.resblocks = nn.ModuleList()

        ch = upsample_initial_channel
        for u, k in zip(upsample_rates, upsample_kernel_sizes):
            ch_new = ch // 2
            self.ups.append(nn.utils.weight_norm(
                nn.ConvTranspose1d(ch, ch_new, k, u, padding=(k - u) // 2)
            ))
            for rk, rd in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(ResBlock1(ch_new, rk, rd))
            ch = ch_new

        self.conv_post = nn.utils.weight_norm(nn.Conv1d(ch, 1, 7, padding=3))

    def forward(self, x):
        """Upsample ``x`` stage by stage and return the tanh-bounded waveform.

        After each upsampling stage the outputs of that stage's
        ``num_kernels`` residual blocks are averaged.
        """
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, 0.1)
            x = self.ups[i](x)
            xs = 0
            for j in range(self.num_kernels):
                xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x, 0.1)
        x = self.conv_post(x)
        return torch.tanh(x)
229
 
 
 
230
 
231
+ # ============================================================
232
+ # Mel-spectrogram utilities
233
+ # ============================================================
234
+
235
def mel_spectrogram(y, n_fft=1024, hop_length=256, win_length=1024,
                    n_mels=80, sample_rate=40000, fmin=0, fmax=None):
    """Compute a log-magnitude mel spectrogram with torch.

    Args:
        y: waveform tensor, either 1-D ``(samples,)`` or 2-D
            ``(batch, samples)``.
        n_fft: FFT size.
        hop_length: hop between frames, in samples.
        win_length: Hann window length, in samples.
        n_mels: number of mel bands.
        sample_rate: sample rate used to build the mel filterbank.
        fmin: lowest filterbank frequency.
        fmax: highest filterbank frequency; defaults to ``sample_rate // 2``.

    Returns:
        Natural log of the mel-filtered STFT magnitudes (floored at 1e-5),
        shaped ``(n_mels, frames)`` for 1-D input and
        ``(batch, n_mels, frames)`` for 2-D input.
    """
    if fmax is None:
        fmax = sample_rate // 2
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels,
                                    fmin=fmin, fmax=fmax)

    # Work on a (batch, samples) view; remember whether to drop the batch axis.
    unbatched = y.dim() == 1
    if unbatched:
        y = y.unsqueeze(0)

    # Reflect padding is only defined for inputs with a channel axis, so add
    # one for the pad call and remove it again afterwards.
    pad_length = (win_length - hop_length) // 2
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad_length, pad_length),
                                mode='reflect').squeeze(1)

    # STFT, with the window on the same device/dtype as the signal.
    window = torch.hann_window(win_length, device=y.device, dtype=y.dtype)
    stft = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_length,
                      window=window, center=False, return_complex=True)
    magnitudes = torch.sqrt(stft.real ** 2 + stft.imag ** 2 + 1e-7)

    # Mel filterbank
    mel_basis_t = torch.as_tensor(mel_basis, dtype=magnitudes.dtype,
                                  device=magnitudes.device)
    mel = torch.matmul(mel_basis_t, magnitudes)

    # Log compression with a floor to avoid log(0).
    mel = torch.log(torch.clamp(mel, min=1e-5))
    return mel.squeeze(0) if unbatched else mel
260
+
261
+
262
+ # ============================================================
263
+ # Inference Engine
264
+ # ============================================================
265
+
266
class VoiceCloner:
    """Inference engine: RVC mel-to-mel model followed by a vocoder.

    Constructing an instance downloads and loads both models and lists the
    sample files, so it performs network I/O. Load failures are printed and
    leave the corresponding attribute as ``None`` / empty instead of raising.
    """

    # Checkpoint keys that may hold the RVC weights, in lookup order.
    _RVC_STATE_KEYS = ('model', 'state_dict', 'model_state_dict')

    # (repo_id, filename) candidates tried in order for the vocoder weights.
    _HIFIGAN_SOURCES = (
        ("jik876/hifi-gan", "UNIVERSAL_V1/g_02500000"),
        ("facebook/hifigan-universal-v1", "hifigan.pt"),
    )

    def __init__(self):
        self.device = torch.device('cpu')
        self.rvc_model = None
        self.hifigan = None
        self.sample_rate = 40000
        self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
        self.model_loaded = False
        self.samples = []
        self.load_models()

    @staticmethod
    def _pick_rvc_state_dict(ckpt):
        """Return the weight mapping inside an RVC checkpoint.

        Looks for the first of ``_RVC_STATE_KEYS`` present in ``ckpt``; a
        checkpoint without any of them is assumed to be the mapping itself.
        """
        if isinstance(ckpt, dict):
            for key in VoiceCloner._RVC_STATE_KEYS:
                if key in ckpt:
                    return ckpt[key]
        return ckpt

    @staticmethod
    def _pick_vocoder_state_dict(ckpt):
        """Return the generator weight mapping inside a HiFi-GAN checkpoint."""
        if isinstance(ckpt, dict) and 'generator' in ckpt:
            return ckpt['generator']
        if isinstance(ckpt, dict) and 'state_dict' in ckpt:
            return {k.replace('generator.', ''): v
                    for k, v in ckpt['state_dict'].items()
                    if k.startswith('generator.')}
        return ckpt

    @staticmethod
    def _report_key_mismatch(name, result):
        """Print a warning when a non-strict load skipped checkpoint keys."""
        missing = len(result.missing_keys)
        unexpected = len(result.unexpected_keys)
        if missing or unexpected:
            print(f"⚠️ {name}: {missing} missing / {unexpected} unexpected checkpoint keys")

    def load_models(self):
        """Load RVC model + HiFi-GAN vocoder, then the list of sample files."""
        self._load_rvc()
        self._load_vocoder()
        self._load_sample_list()
        self.model_loaded = self.rvc_model is not None

    def _load_rvc(self):
        """Download and load the RVC checkpoint; sets ``rvc_model`` or ``None``."""
        print("Loading RVC model...")
        try:
            model_path = hf_hub_download(
                repo_id=self.dataset_id,
                filename="models/one_voice_rvc_v2.pth",
                repo_type="dataset"
            )

            # NOTE: weights_only=False unpickles arbitrary objects; only safe
            # because the checkpoint comes from a repository we control.
            ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
            state_dict = self._pick_rvc_state_dict(ckpt)

            # Auto-detect hidden channels from state_dict
            hidden_ch = 192
            for k, v in state_dict.items():
                if 'enc_p.pre.weight' in k:
                    hidden_ch = v.shape[0]
                    break

            self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
            result = self.rvc_model.load_state_dict(state_dict, strict=False)
            # strict=False hides name mismatches; surface them in the log.
            self._report_key_mismatch("RVC model", result)
            self.rvc_model.eval()
            print(f"✅ RVC model loaded (hidden={hidden_ch})")

        except Exception as e:
            print(f"❌ RVC model load failed: {e}")
            self.rvc_model = None

    def _load_vocoder(self):
        """Download and load HiFi-GAN; leaves ``hifigan`` as ``None`` on failure."""
        print("Loading HiFi-GAN vocoder...")
        try:
            hifigan_path = self._get_hifigan()
            if hifigan_path:
                ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
                state_dict = self._pick_vocoder_state_dict(ckpt)

                self.hifigan = HiFiGANGenerator()
                result = self.hifigan.load_state_dict(state_dict, strict=False)
                self._report_key_mismatch("HiFi-GAN", result)
                self.hifigan.eval()
                print("✅ HiFi-GAN vocoder loaded")
            else:
                print("⚠️ HiFi-GAN not available, will use Griffin-Lim fallback")
        except Exception as e:
            print(f"⚠️ HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
            self.hifigan = None

    def _load_sample_list(self):
        """List the un-augmented ``models/top_*.wav`` files for random generation."""
        try:
            api = HfApi()
            files = api.list_repo_files(self.dataset_id, repo_type="dataset")
            self.samples = [f for f in files if f.startswith('models/top_')
                            and f.endswith('.wav')
                            and '_p+' not in f and '_p-' not in f and '_s+' not in f]
            print(f"✅ Found {len(self.samples)} sample audio files")
        except Exception as e:
            print(f"⚠️ Could not list samples: {e}")
            self.samples = []

    def _get_hifigan(self):
        """Get HiFi-GAN model - download if needed.

        Returns:
            Local path of the first candidate that downloads, or ``None``.
        """
        for repo_id, filename in self._HIFIGAN_SOURCES:
            try:
                return hf_hub_download(repo_id=repo_id, filename=filename)
            except Exception as e:
                # Best effort: report and fall through to the next candidate.
                print(f"⚠️ HiFi-GAN download failed ({repo_id}/{filename}): {e}")
        return None

    def mel_to_audio_hifigan(self, mel):
        """Convert an unbatched mel spectrogram to audio using HiFi-GAN."""
        with torch.no_grad():
            audio = self.hifigan(mel.unsqueeze(0))
        return audio.squeeze(0).squeeze(0).cpu().numpy()

    def mel_to_audio_griffinlim(self, mel, sr=40000, n_fft=1024, hop_length=256, n_iter=32):
        """Fallback: convert a log-magnitude mel to audio using Griffin-Lim.

        ``mel`` is expected in the domain produced by ``mel_spectrogram``
        (natural log of mel-filtered STFT magnitudes), so it is exponentiated
        and inverted as a magnitude (``power=1.0``) spectrogram.
        """
        mel_np = np.exp(mel.detach().cpu().numpy())
        S = librosa.feature.inverse.mel_to_stft(
            mel_np, sr=sr, n_fft=n_fft, power=1.0
        )
        y = librosa.griffinlim(S, n_iter=n_iter, hop_length=hop_length, win_length=n_fft)
        return y

    def process_audio(self, input_audio, pitch_shift=0):
        """
        Process audio through RVC model + HiFi-GAN vocoder

        Args:
            input_audio: path to input audio file
            pitch_shift: semitone shift

        Returns:
            output audio path, status message
        """
        if not self.model_loaded:
            return None, "❌ 模型未加载"
        if not input_audio:
            return None, "❌ 请上传音频文件"

        try:
            # Load audio
            y, sr = librosa.load(input_audio, sr=self.sample_rate)

            # Trim silence
            y, _ = librosa.effects.trim(y, top_db=20)

            # Limit length
            max_len = 10 * self.sample_rate  # 10 seconds max
            if len(y) > max_len:
                y = y[:max_len]

            if len(y) == 0:
                return None, "❌ 转换失败: 音频为空或全为静音"

            # Pitch-shift after truncation so at most 10 s are processed.
            if pitch_shift != 0:
                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)

            # Compute mel spectrogram
            y_tensor = torch.tensor(y, dtype=torch.float32)
            mel = mel_spectrogram(y_tensor, sample_rate=self.sample_rate, n_mels=80)

            # RVC inference
            with torch.no_grad():
                mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
                mel_out = mel_out.squeeze(0)

            # Vocoder
            if self.hifigan is not None:
                audio_out = self.mel_to_audio_hifigan(mel_out)
                vocoder_name = "HiFi-GAN"
            else:
                audio_out = self.mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
                vocoder_name = "Griffin-Lim"

            # Normalize
            audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95

            # Save. NamedTemporaryFile creates the file atomically, unlike
            # the race-prone tempfile.mktemp.
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                output_path = tmp.name
            sf.write(output_path, audio_out, self.sample_rate)

            return output_path, f"✅ 转换成功 ({vocoder_name}) | 输入: {len(y)/sr:.1f}s → 输出: {len(audio_out)/self.sample_rate:.1f}s"

        except Exception as e:
            return None, f"❌ 转换失败: {str(e)}"

    def generate_random(self):
        """Convert a randomly chosen dataset sample; returns (path, status)."""
        if not self.samples:
            return None, "❌ 没有可用的样本"

        try:
            sample = random.choice(self.samples)
            sample_path = hf_hub_download(
                repo_id=self.dataset_id,
                filename=sample,
                repo_type="dataset"
            )
            output, msg = self.process_audio(sample_path)
            if output:
                # msg already starts with the success mark; do not repeat it.
                return output, f"{msg}\n采样: {Path(sample).name}"
            return output, msg
        except Exception as e:
            return None, f"❌ 生成失败: {str(e)}"
468
 
469
 
470
+ # ============================================================
471
  # Gradio UI
472
+ # ============================================================
473
+
474
# Build the inference engine first: constructing it downloads and loads the
# models, so this happens once at import time.
print("🚀 Initializing NumberBlocks One Voice Cloner...")
cloner = VoiceCloner()

# Status strings shown on the About tab; they only read the loaded engine.
model_status = "✅ 已加载" if cloner.model_loaded else "❌ 未加载"
hifigan_status = "✅ HiFi-GAN" if cloner.hifigan else "⚠️ Griffin-Lim (fallback)"

with gr.Blocks(
    title="NumberBlocks One Voice",
    theme=gr.themes.Soft(),
    css="""
    .header { text-align: center; margin-bottom: 1rem; }
    .header h1 { color: #ff6b6b; }
    """
) as demo:
    # Page banner.
    gr.HTML("""
    <div class="header">
        <h1>🎭 NumberBlocks One Voice Cloner</h1>
        <p>RVC v2 Model (60.7MB) + HiFi-GAN Vocoder</p>
    </div>
    """)

    # Tab 1: convert an uploaded or recorded clip.
    with gr.Tab("🎤 Voice Conversion"):
        gr.Markdown("### 上传音频转换为 One 的声音")
        with gr.Row():
            with gr.Column():
                vc_input = gr.Audio(label="上传音频", type="filepath", sources=["upload", "microphone"])
                vc_pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="音高偏移 (半音)")
                vc_btn = gr.Button("🎙️ 转换", variant="primary", size="lg")
            with gr.Column():
                vc_output = gr.Audio(label="转换结果", type="filepath")
                vc_status = gr.Textbox(label="状态")

        vc_btn.click(cloner.process_audio, [vc_input, vc_pitch], [vc_output, vc_status])

    # Tab 2: convert a randomly chosen dataset sample.
    with gr.Tab("🎲 Random Sample"):
        gr.Markdown("### 随机采样 + RVC 转换")
        with gr.Row():
            rand_btn = gr.Button("🎲 随机生成", variant="primary", size="lg")
        with gr.Row():
            rand_output = gr.Audio(label="生成结果", type="filepath")
            rand_status = gr.Textbox(label="状态")

        rand_btn.click(cloner.generate_random, [], [rand_output, rand_status])

    # Tab 3: static information plus the live load status.
    with gr.Tab("ℹ️ About"):
        gr.Markdown(f"""
        ### NumberBlocks One Voice Cloner V2

        **模型**: RVC v3.0 (VITS-like, 5.3M params, 60.7MB)
        **Vocoder**: {hifigan_status}
        **采样率**: 40kHz
        **模型状态**: {model_status}
        **训练数据**: 100 源文件 1,334 chunks, 500 steps
        **Dataset**: [ayf3/numberblocks-one-voice-dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)

        **功能**:
        - ✅ 上传音频 → One 音色转换
        - ✅ 随机采样生成
        - 音高调节 (-12 ~ +12 半音)
        - ✅ HiFi-GAN 高品质 vocoder

        **限制**:
        - CPU 推理,速度较慢
        - 输入建议 < 10 秒
        - 音质取决于输入质量
        """)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)