ayf3 committed on
Commit
8b5510c
·
verified ·
1 Parent(s): fa1a0b0

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +177 -463
app.py CHANGED
@@ -1,483 +1,197 @@
1
  #!/usr/bin/env python3
2
  """
3
- NumberBlocks One Voice Cloner - V7 Architecture Fix
4
- CRITICAL FIX: Model classes now match the actual checkpoint architecture.
5
- - n_mels=128 (was 80), hidden=256 (was 192), enc_out=512, z_channels=192
6
- - Encoder: 5 Conv+BN+LayerNorm (not PosteriorEncoder)
7
- - Flow: single AffineCouplingFlow (not ResidualCouplingBlock)
8
- - Decoder: 5 Conv+BN (not generic Decoder)
9
  """
10
 
11
  import os
12
- import random
13
  import tempfile
14
- try:
15
- import numpy as np
16
- except ImportError:
17
- # Fallback: use torch operations instead
18
- np = None
19
- print("[WARN] numpy not available, using torch fallback")
20
  import soundfile as sf
21
- import torch
22
- import torch.nn as nn
23
- import torch.nn.functional as F
24
- import torchaudio
25
  from pathlib import Path
26
- from huggingface_hub import hf_hub_download, HfApi
27
- import gradio as gr
28
-
29
- print("=== NumberBlocks One Voice Cloner V7 (Architecture Fix) ===")
30
-
31
- # ============================================================
32
- # CORRECT Model Architecture
33
- # ============================================================
34
-
35
- class Encoder(nn.Module):
36
- def __init__(self, in_channels=128, hidden=256, out_channels=512):
37
- super().__init__()
38
- self.conv1 = nn.Conv1d(in_channels, hidden, 5, padding=2)
39
- self.bn1 = nn.BatchNorm1d(hidden)
40
- self.conv2 = nn.Conv1d(hidden, hidden, 5, padding=2)
41
- self.bn2 = nn.BatchNorm1d(hidden)
42
- self.conv3 = nn.Conv1d(hidden, hidden, 5, padding=2)
43
- self.bn3 = nn.BatchNorm1d(hidden)
44
- self.conv4 = nn.Conv1d(hidden, out_channels, 5, padding=2)
45
- self.bn4 = nn.BatchNorm1d(out_channels)
46
- self.conv5 = nn.Conv1d(out_channels, out_channels, 3, padding=1)
47
- self.bn5 = nn.BatchNorm1d(out_channels)
48
- self.ln = nn.LayerNorm(out_channels)
49
-
50
- def forward(self, x):
51
- x = F.relu(self.bn1(self.conv1(x)))
52
- x = F.relu(self.bn2(self.conv2(x)))
53
- x = F.relu(self.bn3(self.conv3(x)))
54
- x = F.relu(self.bn4(self.conv4(x)))
55
- x = F.relu(self.bn5(self.conv5(x)))
56
- x = x.permute(0, 2, 1)
57
- x = self.ln(x)
58
- x = x.permute(0, 2, 1)
59
- return x
60
-
61
-
62
- class Posterior(nn.Module):
63
- def __init__(self, in_channels=512, z_channels=192):
64
- super().__init__()
65
- self.conv = nn.Conv1d(in_channels, z_channels * 2, 1)
66
-
67
- def forward(self, x):
68
- h = self.conv(x)
69
- mu, logvar = h.chunk(2, dim=1)
70
- return mu, logvar
71
-
72
-
73
- class AffineCouplingFlow(nn.Module):
74
- def __init__(self, z_channels=192, hidden=256):
75
- super().__init__()
76
- self.net = nn.Sequential(
77
- nn.Conv1d(z_channels // 2, hidden, 1),
78
- nn.ReLU(),
79
- nn.Conv1d(hidden, hidden, 1),
80
- nn.ReLU(),
81
- nn.Conv1d(hidden, z_channels, 1),
82
- )
83
-
84
- def forward(self, z, reverse=False):
85
- z1, z2 = z.chunk(2, dim=1)
86
- sb = self.net(z1)
87
- s, b = sb.chunk(2, dim=1)
88
- s = torch.clamp(s, -5.0, 5.0)
89
- if not reverse:
90
- z2_new = z2 * torch.exp(s) + b
91
- z_out = torch.cat([z1, z2_new], dim=1)
92
- logdet = torch.sum(s)
93
- return z_out, logdet
94
- else:
95
- z2_new = (z2 - b) * torch.exp(-s)
96
- z_out = torch.cat([z1, z2_new], dim=1)
97
- return z_out
98
-
99
-
100
- class Decoder(nn.Module):
101
- def __init__(self, in_channels=192, out_channels=128):
102
- super().__init__()
103
- self.conv1 = nn.Conv1d(in_channels, 512, 5, padding=2)
104
- self.bn1 = nn.BatchNorm1d(512)
105
- self.conv2 = nn.Conv1d(512, 512, 5, padding=2)
106
- self.bn2 = nn.BatchNorm1d(512)
107
- self.conv3 = nn.Conv1d(512, 256, 5, padding=2)
108
- self.bn3 = nn.BatchNorm1d(256)
109
- self.conv4 = nn.Conv1d(256, 256, 3, padding=1)
110
- self.bn4 = nn.BatchNorm1d(256)
111
- self.conv5 = nn.Conv1d(256, out_channels, 1)
112
-
113
- def forward(self, x):
114
- x = F.relu(self.bn1(self.conv1(x)))
115
- x = F.relu(self.bn2(self.conv2(x)))
116
- x = F.relu(self.bn3(self.conv3(x)))
117
- x = F.relu(self.bn4(self.conv4(x)))
118
- x = self.conv5(x)
119
- return x
120
-
121
-
122
- class RVCModel(nn.Module):
123
- def __init__(self, n_mels=128, hidden=256, enc_out=512, z_channels=192):
124
- super().__init__()
125
- self.n_mels = n_mels
126
- self.encoder = Encoder(n_mels, hidden, enc_out)
127
- self.posterior = Posterior(enc_out, z_channels)
128
- self.flow = AffineCouplingFlow(z_channels, hidden)
129
- self.decoder = Decoder(z_channels, n_mels)
130
-
131
- def forward(self, mel):
132
- h = self.encoder(mel)
133
- mu, logvar = self.posterior(h)
134
- z = mu + torch.randn_like(logvar) * torch.exp(logvar) * 0.0
135
- z_p, _ = self.flow(z)
136
- z_back = self.flow(z_p, reverse=True)
137
- mel_out = self.decoder(z_back)
138
- return mel_out
139
-
140
- def infer(self, mel, noise_scale=0.0):
141
- h = self.encoder(mel)
142
- mu, logvar = self.posterior(h)
143
- z = mu + torch.randn_like(logvar) * torch.exp(logvar) * noise_scale
144
- z_p, _ = self.flow(z)
145
- z_back = self.flow(z_p, reverse=True)
146
- mel_out = self.decoder(z_back)
147
- return mel_out
148
-
149
-
150
- # ============================================================
151
- # HiFi-GAN Vocoder
152
- # ============================================================
153
-
154
- class ResBlock1(nn.Module):
155
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
156
- super().__init__()
157
- self.convs = nn.ModuleList()
158
- for d in dilation:
159
- self.convs.append(nn.Sequential(
160
- nn.LeakyReLU(0.1),
161
- nn.Conv1d(channels, channels, kernel_size, dilation=d,
162
- padding=(kernel_size - 1) * d // 2),
163
- nn.LeakyReLU(0.1),
164
- nn.Conv1d(channels, channels, kernel_size, dilation=1,
165
- padding=(kernel_size - 1) // 2),
166
- ))
167
-
168
- def forward(self, x):
169
- for conv in self.convs:
170
- x = x + conv(x)
171
- return x
172
-
173
 
174
- class HiFiGANGenerator(nn.Module):
175
- def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
176
- upsample_kernel_sizes=(16, 16, 4, 4),
177
- upsample_initial_channel=512,
178
- resblock_kernel_sizes=(3, 7, 11),
179
- resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
180
- super().__init__()
181
- self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
182
- self.num_upsamples = len(upsample_rates)
183
- self.num_kernels = len(resblock_kernel_sizes)
184
- self.ups = nn.ModuleList()
185
- self.resblocks = nn.ModuleList()
186
- ch = upsample_initial_channel
187
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
188
- ch_new = ch // 2
189
- self.ups.append(nn.ConvTranspose1d(ch, ch_new, k, u, padding=(k - u) // 2))
190
- for _, (rk, rd) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
191
- self.resblocks.append(ResBlock1(ch_new, rk, rd))
192
- ch = ch_new
193
- self.conv_post = nn.Sequential(
194
- nn.LeakyReLU(0.1),
195
- nn.Conv1d(ch, 1, 7, padding=3),
196
- nn.Tanh(),
197
- )
198
-
199
- def forward(self, x):
200
- x = self.conv_pre(x)
201
- for i in range(self.num_upsamples):
202
- x = F.leaky_relu(x, 0.1)
203
- x = self.ups[i](x)
204
- xs = 0
205
- for j in range(self.num_kernels):
206
- xs += self.resblocks[i * self.num_kernels + j](x)
207
- x = xs / self.num_kernels
208
- x = self.conv_post(x)
209
- return x
210
-
211
-
212
- # ============================================================
213
- # Mel utilities
214
- # ============================================================
215
-
216
- SAMPLE_RATE = 40000
217
- N_MELS = 128 # MATCHES MODEL
218
-
219
- def compute_mel(y, sr=SAMPLE_RATE):
220
- mel_transform = torchaudio.transforms.MelSpectrogram(
221
- sample_rate=sr, n_fft=1024, hop_length=256,
222
- n_mels=N_MELS, f_min=0.0, f_max=float(sr // 2),
223
- power=2.0, norm=None, mel_scale="htk",
224
- )
225
- mel = mel_transform(y)
226
- mel = torch.log(torch.clamp(mel, min=1e-5))
227
- return mel
228
-
229
-
230
- def _get_mel_fb_pinv(sr=SAMPLE_RATE, n_mels=N_MELS):
231
- """Compute pseudo-inverse of mel filterbank (cached)."""
232
- _melscale_fn = getattr(torchaudio.functional, 'melscale_filterbanks', None) or \
233
- getattr(torchaudio.functional, 'melscale_fbanks', None)
234
- if _melscale_fn is None:
235
- # Fallback: create a MelSpectrogram and extract its filterbank
236
- m = torchaudio.transforms.MelSpectrogram(
237
- sample_rate=sr, n_fft=1024, hop_length=256,
238
- n_mels=n_mels, f_min=0, f_max=float(sr // 2),
239
- norm=None, mel_scale="htk",
240
- )
241
- fb = m.fb if hasattr(m, 'fb') else m.mel_scale.fb
242
- else:
243
- fb = _melscale_fn(
244
- n_freqs=513, f_min=0, f_max=float(sr // 2),
245
- n_mels=n_mels, sample_rate=sr, norm=None, mel_scale="htk",
246
- )
247
- return torch.linalg.pinv(fb) # (513, n_mels)
248
-
249
-
250
- _FB_PINV_CACHE = {}
251
 
252
- def mel_to_audio_griffinlim(mel, sr=SAMPLE_RATE, n_iter=60):
253
- key = (sr, mel.shape[0])
254
- if key not in _FB_PINV_CACHE:
255
- _FB_PINV_CACHE[key] = _get_mel_fb_pinv(sr=sr, n_mels=mel.shape[0])
256
- fb_pinv = _FB_PINV_CACHE[key]
257
-
258
- mel_power = torch.exp(mel)
259
- spec = fb_pinv @ mel_power
260
- spec = torch.clamp(spec, min=0)
261
-
262
- gl = torchaudio.transforms.GriffinLim(n_fft=1024, hop_length=256, n_iter=n_iter)
263
- audio = gl(spec)
264
- return audio.detach().cpu().numpy() if np is not None else audio.detach().cpu().tolist()
265
-
266
-
267
- # ============================================================
268
- # Inference Engine
269
- # ============================================================
270
-
271
- class VoiceCloner:
272
- def __init__(self):
273
- self.device = torch.device("cpu")
274
- self.rvc_model = None
275
- self.hifigan = None
276
- self._hifigan_loaded = False
277
- self.model_loaded = False
278
- self.samples = None
279
- self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
280
- self._load_rvc()
281
-
282
- def _load_rvc(self):
283
- print("[STARTUP] Loading RVC model (V7 correct architecture)...")
284
  try:
285
- model_path = hf_hub_download(
286
- repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
287
- )
288
- ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
289
- sd = ckpt["model_state_dict"]
290
-
291
- model = RVCModel(n_mels=128, hidden=256, enc_out=512, z_channels=192)
292
- result = model.load_state_dict(sd, strict=True)
293
- print(f"[STARTUP] strict=True: missing={result.missing_keys}, unexpected={result.unexpected_keys}")
294
- model.eval()
295
- self.rvc_model = model
296
- self.model_loaded = True
297
- print(f"[STARTUP] RVC model loaded OK (5,296,064 params, strict=True)")
298
  except Exception as e:
299
- print(f"[STARTUP] RVC model load FAILED: {e}")
300
- import traceback
301
  traceback.print_exc()
302
-
303
- def _ensure_hifigan(self):
304
- if self._hifigan_loaded:
305
- return
306
- self._hifigan_loaded = True
307
- print("[LAZY] Loading HiFi-GAN vocoder...")
308
- try:
309
- hifigan_path = hf_hub_download(
310
- repo_id="csdc-atl/hifigan-universal_v1", filename="g_02500000"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  )
312
- ckpt = torch.load(hifigan_path, map_location="cpu", weights_only=False)
313
- state_dict = ckpt.get("generator", ckpt.get("state_dict", ckpt))
314
- if any(k.startswith("generator.") for k in state_dict):
315
- state_dict = {k.replace("generator.", ""): v for k, v in state_dict.items() if k.startswith("generator.")}
316
- self.hifigan = HiFiGANGenerator()
317
- self.hifigan.load_state_dict(state_dict, strict=False)
318
- self.hifigan.eval()
319
- print("[LAZY] HiFi-GAN loaded OK (Griffin-Lim fallback for mel conversion)")
320
- except Exception as e:
321
- print(f"[LAZY] HiFi-GAN FAILED: {e}")
322
- self.hifigan = None
323
-
324
- def _ensure_samples(self):
325
- if self.samples is not None:
326
- return
327
- self.samples = []
328
- try:
329
- api = HfApi()
330
- files = api.list_repo_files(self.dataset_id, repo_type="dataset")
331
- # Look for cleaned audio files as samples
332
- self.samples = [f for f in files if f.startswith("audio/") and f.endswith("_cleaned.wav")]
333
- if not self.samples:
334
- self.samples = [f for f in files if f.startswith("audio/") and f.endswith(".wav") and not f.endswith("_cleaned.wav")][:10]
335
- print(f"[LAZY] Found {len(self.samples)} samples")
336
- except Exception as e:
337
- print(f"[LAZY] Could not list samples: {e}")
338
-
339
- def _mel_to_audio(self, mel_out):
340
- """Convert mel spectrogram back to audio.
341
- RVC model outputs 128-bin mel @ 40kHz.
342
- HiFi-GAN expects 80-bin mel @ 22.05kHz.
343
- Pipeline: Griffin-Lim(128bin@40k) → audio → resample(22.05k) → mel(80bin) → HiFi-GAN → audio
344
- """
345
- if self.hifigan is not None:
346
- try:
347
- # Step 1: Griffin-Lim to get rough audio at 40kHz
348
- audio_gl = mel_to_audio_griffinlim(mel_out, sr=SAMPLE_RATE)
349
- audio_tensor = torch.as_tensor(audio_gl, dtype=torch.float32) if isinstance(audio_gl, torch.Tensor) else torch.from_numpy(audio_gl).float() if np is not None else torch.tensor(audio_gl, dtype=torch.float32)
350
-
351
- # Step 2: Resample 40kHz → 22.05kHz
352
- resampler = torchaudio.transforms.Resample(SAMPLE_RATE, 22050)
353
- audio_22k = resampler(audio_tensor)
354
-
355
- # Step 3: Compute 80-bin mel @ 22.05kHz for HiFi-GAN
356
- mel_80 = torchaudio.transforms.MelSpectrogram(
357
- sample_rate=22050, n_fft=1024, hop_length=256,
358
- n_mels=80, f_min=0.0, f_max=8000.0,
359
- power=2.0, norm=None, mel_scale="htk",
360
- )(audio_22k)
361
- mel_80 = torch.log(torch.clamp(mel_80, min=1e-5))
362
-
363
- # Step 4: HiFi-GAN
364
- with torch.no_grad():
365
- audio_out = self.hifigan(mel_80.unsqueeze(0))
366
- audio_out = audio_out.squeeze(0).squeeze(0).detach().cpu().numpy() if np is not None else audio_out.squeeze(0).squeeze(0).detach().cpu().tolist()
367
- return audio_out, 22050, "HiFi-GAN+GL"
368
- except Exception as e:
369
- print(f"HiFi-GAN pipeline failed, falling back to Griffin-Lim: {e}")
370
 
371
- # Fallback: Griffin-Lim only
372
- audio_out = mel_to_audio_griffinlim(mel_out, sr=SAMPLE_RATE)
373
- return audio_out, SAMPLE_RATE, "Griffin-Lim"
374
-
375
- def process_audio(self, input_audio, pitch_shift=0):
376
- if not self.model_loaded:
377
- return None, "Model not loaded. Check logs."
378
- if input_audio is None:
379
- return None, "Please upload an audio file."
380
-
381
- self._ensure_hifigan()
382
-
383
- try:
384
- audio_data, sr = sf.read(input_audio, dtype="float32")
385
- if audio_data.ndim > 1:
386
- audio_data = audio_data.mean(axis=1)
387
- y = torch.from_numpy(audio_data) if np is not None else torch.tensor(audio_data, dtype=torch.float32)
388
- if sr != SAMPLE_RATE:
389
- y = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(y)
390
- sr = SAMPLE_RATE
391
-
392
- if pitch_shift != 0:
393
- factor = 2.0 ** (abs(pitch_shift) / 12.0)
394
- new_len = int(len(y) / factor) if pitch_shift > 0 else int(len(y) * factor)
395
- y = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=new_len, mode="linear").squeeze(0).squeeze(0)
396
-
397
- # Trim silence
398
- energy = y ** 2
399
- window_size = int(0.1 * sr)
400
- if len(energy) > window_size:
401
- kernel = torch.ones(window_size) / window_size
402
- smooth_energy = F.conv1d(
403
- energy.unsqueeze(0).unsqueeze(0), kernel.unsqueeze(0).unsqueeze(0), padding=window_size // 2
404
- ).squeeze()
405
- threshold = smooth_energy.max() * (10 ** (-20 / 10))
406
- active = torch.where(smooth_energy > threshold)[0]
407
- if len(active) > 0:
408
- y = y[active[0]:active[-1] + 1]
409
-
410
- max_len = 10 * SAMPLE_RATE
411
- if len(y) > max_len:
412
- y = y[:max_len]
413
-
414
- mel = compute_mel(y, sr=SAMPLE_RATE)
415
-
416
- with torch.no_grad():
417
- mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
418
- mel_out = mel_out.squeeze(0)
419
-
420
- audio_out, out_sr, vocoder_name = self._mel_to_audio(mel_out)
421
- audio_out = audio_out / (torch.max(torch.abs(torch.tensor(audio_out) if not isinstance(audio_out, torch.Tensor) else audio_out)) + 1e-7).item() * 0.95
422
- output_path = tempfile.mktemp(suffix=".wav")
423
- sf.write(output_path, audio_out, out_sr)
424
- return output_path, f"✅ {vocoder_name} | {len(y)/SAMPLE_RATE:.1f}s → {len(audio_out)/out_sr:.1f}s | Model: strict=True, 128-mel"
425
- except Exception as e:
426
- import traceback
427
- traceback.print_exc()
428
- return None, f"❌ Error: {str(e)}"
429
-
430
- def generate_random(self):
431
- self._ensure_samples()
432
- if not self.samples:
433
- return None, "No samples available"
434
- try:
435
- sample = random.choice(self.samples)
436
- sample_path = hf_hub_download(repo_id=self.dataset_id, filename=sample, repo_type="dataset")
437
- output, msg = self.process_audio(sample_path)
438
- if output:
439
- return output, f"{msg}\nSample: {Path(sample).name}"
440
- return output, msg
441
- except Exception as e:
442
- return None, f"❌ Error: {str(e)}"
443
-
444
-
445
- # ============================================================
446
- # Gradio UI
447
- # ============================================================
448
-
449
- print("[STARTUP] Creating VoiceCloner (V7 correct architecture)...")
450
- cloner = VoiceCloner()
451
- print(f"[STARTUP] Ready. model_loaded={cloner.model_loaded}")
452
-
453
- demo = gr.Blocks(title="NumberBlocks One Voice Cloner V7")
454
-
455
- with demo:
456
- gr.Markdown("# 🎤 NumberBlocks One Voice Cloner V7")
457
- gr.Markdown("RVC v2 Model (60.7MB, strict=True, 128-mel) + HiFi-GAN Vocoder | Upload audio → convert to One's voice")
458
 
459
- with gr.Tab("Voice Conversion"):
460
- with gr.Row():
461
- input_audio = gr.Audio(label="Upload Audio", type="filepath")
462
- output_audio = gr.Audio(label="Result", type="filepath")
463
- pitch_slider = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)")
464
- convert_btn = gr.Button("🎤 Convert Voice", variant="primary")
465
- status_text = gr.Textbox(label="Status")
466
- convert_btn.click(
467
- fn=cloner.process_audio,
468
- inputs=[input_audio, pitch_slider],
469
- outputs=[output_audio, status_text],
470
  )
471
-
472
- with gr.Tab("Random Sample"):
473
- rand_audio = gr.Audio(label="Result", type="filepath")
474
- rand_status = gr.Textbox(label="Status")
475
- rand_btn = gr.Button("🎲 Generate Random", variant="primary")
476
- rand_btn.click(
477
- fn=cloner.generate_random,
478
- inputs=[],
479
- outputs=[rand_audio, rand_status],
480
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
 
482
  if __name__ == "__main__":
483
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ NumberBlocks One Voice Cloning Space - VoxCPM V3
4
+ 使用 VoxCPM 2 模型进行音色克隆推理
 
 
 
 
5
  """
6
 
7
  import os
8
+ import gradio as gr
9
  import tempfile
 
 
 
 
 
 
10
  import soundfile as sf
11
+ import traceback
 
 
 
12
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
# Hugging Face access token read from the environment: HF_TOKEN wins when it
# is set, otherwise HUGGINGFACE_TOKEN is used; None when neither is defined.
HF_TOKEN = os.getenv("HF_TOKEN", os.getenv("HUGGINGFACE_TOKEN"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
def load_model():
    """Load the VoxCPM model.

    Returns:
        A ``(model, device, error)`` tuple. On success ``model`` is the object
        returned by ``VoxCPM.from_pretrained``, ``device`` is ``"cuda"`` or
        ``"cpu"`` and ``error`` is ``None``. On any failure (including a
        missing ``voxcpm``/``torch`` package) the tuple is
        ``(None, "cpu", <exception text>)`` and the traceback is printed.

    Note:
        ``device`` is only detected, logged and returned here; it is not
        passed to ``from_pretrained``.
    """
    try:
        # Imported lazily so that a missing dependency is reported through
        # the returned error string instead of failing at module import.
        from voxcpm import VoxCPM
        import torch

        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        print(f"Loading VoxCPM model on {device}...")

        # V3: optimize=False avoids compatibility problems.
        pretrained_options = {"load_denoiser": False, "optimize": False}
        model = VoxCPM.from_pretrained("openbmb/VoxCPM2", **pretrained_options)
        print("Model loaded successfully!")
    except Exception as exc:
        print(f"Error loading model: {exc}")
        traceback.print_exc()
        return None, "cpu", str(exc)
    return model, device, None
34
+
35
import threading  # needed at module scope for the state lock below

# Global model state, shared by the preload thread and the request handlers.
MODEL_STATE = {
    "model": None,     # loaded model object, or None while not loaded
    "device": "cpu",   # device string reported by load_model()
    "error": None,     # text of the last load error, or None
    "loading": False,  # True while some thread is inside load_model()
}

# Makes the "not loaded and nobody is loading" check atomic with setting the
# "loading" flag, so two threads can never both start a load.
_MODEL_STATE_LOCK = threading.Lock()


def ensure_model():
    """Make sure the model is loaded, loading it on first use.

    If the model is not loaded and no other thread is currently loading it,
    this call performs the load via ``load_model()`` and records the result in
    ``MODEL_STATE``. If another thread is already loading, the call does NOT
    wait: it returns the current state immediately (``model`` is then
    ``None``), which callers report as "still loading".

    Returns:
        ``(model, device, error)`` as currently stored in ``MODEL_STATE``.
    """
    # Decide under the lock who does the load; the slow load itself runs
    # outside the lock so concurrent callers are never blocked.
    with _MODEL_STATE_LOCK:
        should_load = (
            MODEL_STATE["model"] is None and not MODEL_STATE["loading"]
        )
        if should_load:
            MODEL_STATE["loading"] = True

    if should_load:
        try:
            model, device, error = load_model()
            MODEL_STATE["model"] = model
            MODEL_STATE["device"] = device
            MODEL_STATE["error"] = error
        except Exception as e:
            MODEL_STATE["error"] = str(e)
            traceback.print_exc()
        finally:
            # Always clear the flag so a failed load can be retried later.
            MODEL_STATE["loading"] = False
    return MODEL_STATE["model"], MODEL_STATE["device"], MODEL_STATE["error"]
58
+
59
def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
    """Synthesize ``text`` using ``reference_audio`` as the voice reference.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: Path of the reference audio file (read with
            ``sf.read``).
        cfg_value: Passed to ``model.generate`` as ``cfg_value`` after
            conversion to ``float``.
        steps: Passed to ``model.generate`` as ``inference_timesteps`` after
            conversion to ``int``.

    Returns:
        ``(output_path, message)`` on success, where ``output_path`` is a
        newly created WAV file, or ``(None, message)`` when the input is
        invalid, the model is unavailable, or generation fails. Exceptions
        are caught, printed, and reported through ``message``.
    """
    import time

    if not text or not reference_audio:
        return None, "❌ 请输入文本和参考音频"

    if not text.strip():
        return None, "❌ 文本不能为空"

    ref_path = None  # temporary mono copy of the reference; removed in finally
    try:
        model, _device, error = ensure_model()
        if error:
            return None, f"❌ 模型加载失败: {error}"
        if model is None:
            return None, "❌ 模型正在加载中,请稍候..."

        # Read the reference audio.
        ref_audio, sr = sf.read(reference_audio)

        # Multi-channel input: keep only the first channel.
        if len(ref_audio.shape) > 1:
            ref_audio = ref_audio[:, 0]

        # Reserve a unique temp path, close the handle, then write the
        # (possibly down-mixed) reference there for the model to read.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            ref_path = tmp.name
        sf.write(ref_path, ref_audio, sr)

        print(f"Generating with text: {text[:50]}...")
        print(f"Reference audio: {len(ref_audio)/sr:.2f}s at {sr}Hz")

        # Generate the audio.
        t0 = time.time()
        wav = model.generate(
            text=text,
            reference_wav_path=ref_path,
            cfg_value=float(cfg_value),
            inference_timesteps=int(steps),
        )
        elapsed = time.time() - t0

        # Save the output to a per-request file. A single fixed path would be
        # overwritten by concurrent requests. The file is intentionally left
        # on disk because its path is returned to the caller.
        sample_rate = model.tts_model.sample_rate
        with tempfile.NamedTemporaryFile(
            prefix="voxcpm_output_", suffix=".wav", delete=False
        ) as out:
            output_path = out.name
        sf.write(output_path, wav, sample_rate)

        duration = len(wav) / sample_rate
        msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s"
        print(msg)

        return output_path, msg

    except Exception as e:
        error_msg = f"❌ 生成失败: {str(e)}"
        print(f"Error: {e}")
        traceback.print_exc()
        return None, error_msg
    finally:
        # Remove the temporary reference file on every exit path, including
        # failures, so failed generations do not leak files.
        if ref_path is not None:
            try:
                os.unlink(ref_path)
            except OSError:
                # Best-effort cleanup: a missing/locked temp file must not
                # mask the real result or error.
                pass
119
+
120
# Preset texts: maps a preset name to the sample sentence it stands for.
PRESET_TEXTS = {
    "问候": "Hello! I am One! I am the first Numberblock, and I love being number one!",
    "计数": "One, two, three, four, five! Counting is so much fun! I can count all the way to ten!",
    "情感": "Sometimes I feel a little lonely being just one, but then I remember that one is the start of everything!",
}
126
+
127
+ # 创建 Gradio 界面
128
+ with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
129
+ gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V3)")
130
+ gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")
131
+
132
+ with gr.Row():
133
+ with gr.Column():
134
+ text_input = gr.Textbox(
135
+ label="输入文本",
136
+ placeholder="输入要合成的文本...",
137
+ lines=3,
138
+ value=PRESET_TEXTS["问候"]
139
  )
140
+
141
+ with gr.Row():
142
+ for name, txt in PRESET_TEXTS.items():
143
+ gr.Button(name).click(lambda t=txt: t, inputs=None, outputs=text_input)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ with gr.Column():
146
+ ref_audio_input = gr.Audio(
147
+ label="参考音频 (One 的声音)",
148
+ type="filepath"
149
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ with gr.Row():
152
+ cfg_slider = gr.Slider(
153
+ minimum=0.5,
154
+ maximum=5.0,
155
+ value=2.0,
156
+ step=0.1,
157
+ label="CFG Value (越高越像参考音色)"
 
 
 
 
158
  )
159
+ steps_slider = gr.Slider(
160
+ minimum=5,
161
+ maximum=50,
162
+ value=10,
163
+ step=1,
164
+ label="推理步数 (越高质量越好但越慢)"
 
 
 
165
  )
166
+
167
+ generate_btn = gr.Button("🎙️ 生成音频", variant="primary")
168
+
169
+ with gr.Row():
170
+ output_audio = gr.Audio(label="生成结果")
171
+ status_msg = gr.Markdown(value="⏸️ 等待生成...")
172
+
173
+ generate_btn.click(
174
+ fn=generate_audio,
175
+ inputs=[text_input, ref_audio_input, cfg_slider, steps_slider],
176
+ outputs=[output_audio, status_msg]
177
+ )
178
+
179
+ gr.Markdown("---")
180
+ gr.Markdown("### 说明")
181
+ gr.Markdown("""
182
+ - **参考音频**: 上传 One 的声音片段(建议 5-15 秒清晰语音)
183
+ - **CFG Value**: 控制音色相似度,默认 2.0,越高越像参考音色
184
+ - **推理步数**: 默认 10,越高质量越好但生成越慢
185
+ - **模型**: VoxCPM 2 (openbmb/VoxCPM2)
186
+ """)
187
 
188
if __name__ == "__main__":
    import threading

    def preload():
        """Load the model by calling ``ensure_model`` once."""
        print("Preloading VoxCPM model...")
        ensure_model()

    # Preload the model at startup in a daemon thread, then start the UI
    # without waiting for the load to finish.
    preload_thread = threading.Thread(target=preload, daemon=True)
    preload_thread.start()

    demo.launch()