ayf3 committed on
Commit
6620877
·
verified ·
1 Parent(s): 5749c4e

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +75 -58
app.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
- NumberBlocks One Voice Cloner - V5 Fixed
4
- Pinned gradio version to avoid jinja2 schema bugs
5
  """
6
 
7
  import os
@@ -17,8 +17,10 @@ from pathlib import Path
17
  from huggingface_hub import hf_hub_download, HfApi
18
  import gradio as gr
19
 
 
 
20
  # ============================================================
21
- # 模型定义 - VITS-like RVC Model
22
  # ============================================================
23
 
24
  class PosteriorEncoder(nn.Module):
@@ -125,7 +127,6 @@ class Decoder(nn.Module):
125
 
126
 
127
  class RVCModel(nn.Module):
128
- """VITS-like RVC v3.0 Model"""
129
  def __init__(self, n_mels=80, hidden_channels=192):
130
  super().__init__()
131
  self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
@@ -240,22 +241,24 @@ def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024, hop_length=256,
240
 
241
 
242
  # ============================================================
243
- # Inference Engine
244
  # ============================================================
245
 
246
  class VoiceCloner:
247
  def __init__(self):
248
  self.device = torch.device('cpu')
249
  self.rvc_model = None
250
- self.hifigan = None
 
251
  self.sample_rate = 40000
252
  self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
253
  self.model_loaded = False
254
- self.samples = []
255
- self.load_models()
256
 
257
- def load_models(self):
258
- print("Loading RVC model...")
 
259
  try:
260
  model_path = hf_hub_download(
261
  repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
@@ -270,13 +273,21 @@ class VoiceCloner:
270
  self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
271
  self.rvc_model.load_state_dict(state_dict, strict=False)
272
  self.rvc_model.eval()
273
- print(f"RVC model loaded (hidden={hidden_ch})")
 
274
  except Exception as e:
275
- print(f"RVC model load failed: {e}")
276
-
277
- print("Loading HiFi-GAN vocoder...")
 
 
 
 
 
278
  try:
279
- hifigan_path = hf_hub_download(repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000")
 
 
280
  ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
281
  state_dict = ckpt.get('generator', ckpt.get('state_dict', ckpt))
282
  if any(k.startswith('generator.') for k in state_dict):
@@ -284,28 +295,35 @@ class VoiceCloner:
284
  self.hifigan = HiFiGANGenerator()
285
  self.hifigan.load_state_dict(state_dict, strict=False)
286
  self.hifigan.eval()
287
- print("HiFi-GAN vocoder loaded")
288
  except Exception as e:
289
- print(f"HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
290
  self.hifigan = None
291
 
 
 
 
 
 
292
  try:
293
  api = HfApi()
294
  files = api.list_repo_files(self.dataset_id, repo_type="dataset")
295
  self.samples = [f for f in files if f.startswith('models/top_')
296
  and f.endswith('.wav')
297
  and '_p+' not in f and '_p-' not in f and '_s+' not in f]
298
- print(f"Found {len(self.samples)} sample audio files")
299
  except Exception as e:
300
- print(f"Could not list samples: {e}")
301
-
302
- self.model_loaded = self.rvc_model is not None
303
 
304
  def process_audio(self, input_audio, pitch_shift=0):
305
  if not self.model_loaded:
306
- return None, "Model not loaded"
307
  if input_audio is None:
308
- return None, "Please upload an audio file"
 
 
 
 
309
  try:
310
  y, sr = torchaudio.load(input_audio)
311
  if y.shape[0] > 1:
@@ -356,13 +374,14 @@ class VoiceCloner:
356
  audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
357
  output_path = tempfile.mktemp(suffix='.wav')
358
  sf.write(output_path, audio_out, self.sample_rate)
359
- return output_path, f"Success ({vocoder_name}) | {len(y)/sr:.1f}s -> {len(audio_out)/self.sample_rate:.1f}s"
360
  except Exception as e:
361
  import traceback
362
  traceback.print_exc()
363
- return None, f"Error: {str(e)}"
364
 
365
  def generate_random(self):
 
366
  if not self.samples:
367
  return None, "No samples available"
368
  try:
@@ -373,47 +392,45 @@ class VoiceCloner:
373
  return output, f"{msg}\nSample: {Path(sample).name}"
374
  return output, msg
375
  except Exception as e:
376
- return None, f"Error: {str(e)}"
377
 
378
 
379
  # ============================================================
380
  # Gradio UI
381
  # ============================================================
382
 
383
- print("Initializing NumberBlocks One Voice Cloner...")
384
  cloner = VoiceCloner()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
 
386
- vc_interface = gr.Interface(
387
- fn=cloner.process_audio,
388
- inputs=[
389
- gr.Audio(label="Upload Audio", type="filepath"),
390
- gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)"),
391
- ],
392
- outputs=[
393
- gr.Audio(label="Result", type="filepath"),
394
- gr.Textbox(label="Status"),
395
- ],
396
- title="NumberBlocks One Voice Cloner",
397
- description="RVC v2 Model (60.7MB) + HiFi-GAN Vocoder | Upload audio to convert to One's voice",
398
- allow_flagging="never",
399
- )
400
-
401
- rand_interface = gr.Interface(
402
- fn=cloner.generate_random,
403
- inputs=[],
404
- outputs=[
405
- gr.Audio(label="Result", type="filepath"),
406
- gr.Textbox(label="Status"),
407
- ],
408
- title="Random Sample Generation",
409
- description="Generate from random dataset sample + RVC conversion",
410
- allow_flagging="never",
411
- )
412
-
413
- demo = gr.TabbedInterface(
414
- [vc_interface, rand_interface],
415
- ["Voice Conversion", "Random Sample"],
416
- )
417
 
418
  if __name__ == "__main__":
419
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  #!/usr/bin/env python3
2
  """
3
+ NumberBlocks One Voice Cloner - V6 Robust
4
+ Fixes: user creation in Dockerfile, lazy-load HiFi-GAN, startup-timeout protection
5
  """
6
 
7
  import os
 
17
  from huggingface_hub import hf_hub_download, HfApi
18
  import gradio as gr
19
 
20
+ print("=== NumberBlocks One Voice Cloner V6 ===")
21
+
22
  # ============================================================
23
+ # Model Definitions
24
  # ============================================================
25
 
26
  class PosteriorEncoder(nn.Module):
 
127
 
128
 
129
  class RVCModel(nn.Module):
 
130
  def __init__(self, n_mels=80, hidden_channels=192):
131
  super().__init__()
132
  self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
 
241
 
242
 
243
  # ============================================================
244
+ # Inference Engine - with lazy loading
245
  # ============================================================
246
 
247
  class VoiceCloner:
248
  def __init__(self):
249
  self.device = torch.device('cpu')
250
  self.rvc_model = None
251
+ self.hifigan = None # lazy loaded
252
+ self._hifigan_loaded = False
253
  self.sample_rate = 40000
254
  self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
255
  self.model_loaded = False
256
+ self.samples = None # lazy loaded
257
+ self._load_rvc_only()
258
 
259
+ def _load_rvc_only(self):
260
+ """Load only the RVC model at startup (fast)"""
261
+ print("[STARTUP] Loading RVC model...")
262
  try:
263
  model_path = hf_hub_download(
264
  repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
 
273
  self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
274
  self.rvc_model.load_state_dict(state_dict, strict=False)
275
  self.rvc_model.eval()
276
+ self.model_loaded = True
277
+ print(f"[STARTUP] RVC model loaded OK (hidden={hidden_ch})")
278
  except Exception as e:
279
+ print(f"[STARTUP] RVC model load FAILED: {e}")
280
+
281
+ def _ensure_hifigan(self):
282
+ """Lazy-load HiFi-GAN on first inference request"""
283
+ if self._hifigan_loaded:
284
+ return
285
+ self._hifigan_loaded = True
286
+ print("[LAZY] Loading HiFi-GAN vocoder...")
287
  try:
288
+ hifigan_path = hf_hub_download(
289
+ repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000"
290
+ )
291
  ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
292
  state_dict = ckpt.get('generator', ckpt.get('state_dict', ckpt))
293
  if any(k.startswith('generator.') for k in state_dict):
 
295
  self.hifigan = HiFiGANGenerator()
296
  self.hifigan.load_state_dict(state_dict, strict=False)
297
  self.hifigan.eval()
298
+ print("[LAZY] HiFi-GAN loaded OK")
299
  except Exception as e:
300
+ print(f"[LAZY] HiFi-GAN FAILED (Griffin-Lim fallback): {e}")
301
  self.hifigan = None
302
 
303
+ def _ensure_samples(self):
304
+ """Lazy-load sample list"""
305
+ if self.samples is not None:
306
+ return
307
+ self.samples = []
308
  try:
309
  api = HfApi()
310
  files = api.list_repo_files(self.dataset_id, repo_type="dataset")
311
  self.samples = [f for f in files if f.startswith('models/top_')
312
  and f.endswith('.wav')
313
  and '_p+' not in f and '_p-' not in f and '_s+' not in f]
314
+ print(f"[LAZY] Found {len(self.samples)} samples")
315
  except Exception as e:
316
+ print(f"[LAZY] Could not list samples: {e}")
 
 
317
 
318
  def process_audio(self, input_audio, pitch_shift=0):
319
  if not self.model_loaded:
320
+ return None, "Model not loaded. Check logs."
321
  if input_audio is None:
322
+ return None, "Please upload an audio file."
323
+
324
+ # Lazy load vocoder on first real request
325
+ self._ensure_hifigan()
326
+
327
  try:
328
  y, sr = torchaudio.load(input_audio)
329
  if y.shape[0] > 1:
 
374
  audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
375
  output_path = tempfile.mktemp(suffix='.wav')
376
  sf.write(output_path, audio_out, self.sample_rate)
377
+ return output_path, f" {vocoder_name} | {len(y)/sr:.1f}s {len(audio_out)/self.sample_rate:.1f}s"
378
  except Exception as e:
379
  import traceback
380
  traceback.print_exc()
381
+ return None, f"Error: {str(e)}"
382
 
383
  def generate_random(self):
384
+ self._ensure_samples()
385
  if not self.samples:
386
  return None, "No samples available"
387
  try:
 
392
  return output, f"{msg}\nSample: {Path(sample).name}"
393
  return output, msg
394
  except Exception as e:
395
+ return None, f"Error: {str(e)}"
396
 
397
 
398
  # ============================================================
399
  # Gradio UI
400
  # ============================================================
401
 
402
+ print("[STARTUP] Creating VoiceCloner (RVC only, HiFi-GAN lazy)...")
403
  cloner = VoiceCloner()
404
+ print(f"[STARTUP] Ready. model_loaded={cloner.model_loaded}")
405
+
406
+ demo = gr.Blocks(title="NumberBlocks One Voice Cloner")
407
+
408
+ with demo:
409
+ gr.Markdown("# 🎤 NumberBlocks One Voice Cloner")
410
+ gr.Markdown("RVC v2 Model (60.7MB) + HiFi-GAN Vocoder | Upload audio → convert to One's voice")
411
+
412
+ with gr.Tab("Voice Conversion"):
413
+ with gr.Row():
414
+ input_audio = gr.Audio(label="Upload Audio", type="filepath")
415
+ output_audio = gr.Audio(label="Result", type="filepath")
416
+ pitch_slider = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)")
417
+ convert_btn = gr.Button("🎤 Convert Voice", variant="primary")
418
+ status_text = gr.Textbox(label="Status")
419
+ convert_btn.click(
420
+ fn=cloner.process_audio,
421
+ inputs=[input_audio, pitch_slider],
422
+ outputs=[output_audio, status_text],
423
+ )
424
 
425
+ with gr.Tab("Random Sample"):
426
+ rand_audio = gr.Audio(label="Result", type="filepath")
427
+ rand_status = gr.Textbox(label="Status")
428
+ rand_btn = gr.Button("🎲 Generate Random", variant="primary")
429
+ rand_btn.click(
430
+ fn=cloner.generate_random,
431
+ inputs=[],
432
+ outputs=[rand_audio, rand_status],
433
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
 
435
  if __name__ == "__main__":
436
  demo.launch(server_name="0.0.0.0", server_port=7860)