ayf3 committed on
Commit
c92f551
·
verified ·
1 Parent(s): e215365

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +94 -219
app.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
- NumberBlocks One Voice Cloner - V3 (librosa-free)
4
- torchaudio + scipy 替代 librosa,避免 numba 兼容问题
5
  """
6
 
7
  import os
@@ -13,9 +13,9 @@ import torch
13
  import torch.nn as nn
14
  import torch.nn.functional as F
15
  import torchaudio
16
- import gradio as gr
17
  from pathlib import Path
18
  from huggingface_hub import hf_hub_download, HfApi
 
19
 
20
  # ============================================================
21
  # Model definitions - VITS-like RVC Model
@@ -44,24 +44,6 @@ class PosteriorEncoder(nn.Module):
44
  return m, logs
45
 
46
 
47
- class ResidualCouplingBlock(nn.Module):
48
- def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
49
- super().__init__()
50
- self.flows = nn.ModuleList()
51
- for _ in range(n_flows):
52
- self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
53
- self.flows.append(Flip())
54
-
55
- def forward(self, x, reverse=False):
56
- if not reverse:
57
- for flow in self.flows:
58
- x, _ = flow(x, reverse=reverse)
59
- else:
60
- for flow in reversed(self.flows):
61
- x = flow(x, reverse=reverse)
62
- return x
63
-
64
-
65
  class ResidualCouplingLayer(nn.Module):
66
  def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
67
  super().__init__()
@@ -103,6 +85,24 @@ class Flip(nn.Module):
103
  return torch.flip(x, [1])
104
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  class Decoder(nn.Module):
107
  def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
108
  super().__init__()
@@ -125,7 +125,7 @@ class Decoder(nn.Module):
125
 
126
 
127
  class RVCModel(nn.Module):
128
- """VITS-like RVC v3.0 Model (5.3M params)"""
129
  def __init__(self, n_mels=80, hidden_channels=192):
130
  super().__init__()
131
  self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
@@ -151,7 +151,7 @@ class RVCModel(nn.Module):
151
 
152
 
153
  # ============================================================
154
- # HiFi-GAN Vocoder Definition
155
  # ============================================================
156
 
157
  class ResBlock1(nn.Module):
@@ -175,7 +175,6 @@ class ResBlock1(nn.Module):
175
 
176
 
177
  class HiFiGANGenerator(nn.Module):
178
- """HiFi-GAN Generator (Universal V1 compatible)"""
179
  def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
180
  upsample_kernel_sizes=(16, 16, 4, 4),
181
  upsample_initial_channel=512,
@@ -183,13 +182,10 @@ class HiFiGANGenerator(nn.Module):
183
  resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
184
  super().__init__()
185
  self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
186
-
187
  self.num_upsamples = len(upsample_rates)
188
  self.num_kernels = len(resblock_kernel_sizes)
189
-
190
  self.ups = nn.ModuleList()
191
  self.resblocks = nn.ModuleList()
192
-
193
  ch = upsample_initial_channel
194
  for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
195
  ch_new = ch // 2
@@ -197,7 +193,6 @@ class HiFiGANGenerator(nn.Module):
197
  for _, (rk, rd) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
198
  self.resblocks.append(ResBlock1(ch_new, rk, rd))
199
  ch = ch_new
200
-
201
  self.conv_post = nn.Sequential(
202
  nn.LeakyReLU(0.1),
203
  nn.Conv1d(ch, 1, 7, padding=3),
@@ -218,50 +213,28 @@ class HiFiGANGenerator(nn.Module):
218
 
219
 
220
  # ============================================================
221
- # Mel-spectrogram utilities (torchaudio-based, no librosa)
222
  # ============================================================
223
 
224
- def make_mel_transform(sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80, f_min=0.0, f_max=None):
225
- """Create torchaudio mel spectrogram transform"""
226
- return torchaudio.transforms.MelSpectrogram(
227
- sample_rate=sample_rate,
228
- n_fft=n_fft,
229
- hop_length=hop_length,
230
- n_mels=n_mels,
231
- f_min=f_min,
232
- f_max=f_max if f_max else float(sample_rate // 2),
233
- power=2.0,
234
- norm=None,
235
- mel_scale="htk",
236
- )
237
-
238
  def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
239
- """Compute log mel spectrogram using torchaudio"""
240
- mel_transform = make_mel_transform(sample_rate, n_fft, hop_length, n_mels)
 
 
 
241
  mel = mel_transform(y)
242
  mel = torch.log(torch.clamp(mel, min=1e-5))
243
  return mel
244
 
 
245
  def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024, hop_length=256, n_iter=32):
246
- """Griffin-Lim: mel -> audio using torchaudio"""
247
- # Create inverse mel transform
248
  inverse_mel = torchaudio.transforms.InverseMelScale(
249
- n_stft=n_fft // 2 + 1,
250
- n_mels=mel.shape[0],
251
- sample_rate=sample_rate,
252
- f_min=0,
253
- f_max=float(sample_rate // 2),
254
- mel_scale="htk",
255
  )
256
- # Convert from log mel back to power spectrogram
257
  mel_power = torch.exp(mel)
258
  spec = inverse_mel(mel_power)
259
- # Griffin-Lim
260
- griffin_lim = torchaudio.transforms.GriffinLim(
261
- n_fft=n_fft,
262
- hop_length=hop_length,
263
- n_iter=n_iter,
264
- )
265
  audio = griffin_lim(spec)
266
  return audio.numpy()
267
 
@@ -282,59 +255,36 @@ class VoiceCloner:
282
  self.load_models()
283
 
284
  def load_models(self):
285
- """Load RVC model + HiFi-GAN vocoder"""
286
  print("Loading RVC model...")
287
  try:
288
  model_path = hf_hub_download(
289
- repo_id=self.dataset_id,
290
- filename="models/one_voice_rvc_v2.pth",
291
- repo_type="dataset"
292
  )
293
-
294
  ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
295
-
296
- if isinstance(ckpt, dict) and 'model' in ckpt:
297
- state_dict = ckpt['model']
298
- elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
299
- state_dict = ckpt['state_dict']
300
- else:
301
- state_dict = ckpt
302
-
303
  hidden_ch = 192
304
  for k, v in state_dict.items():
305
  if 'enc_p.pre.weight' in k:
306
  hidden_ch = v.shape[0]
307
  break
308
-
309
  self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
310
  self.rvc_model.load_state_dict(state_dict, strict=False)
311
  self.rvc_model.eval()
312
  print(f"RVC model loaded (hidden={hidden_ch})")
313
-
314
  except Exception as e:
315
  print(f"RVC model load failed: {e}")
316
- self.rvc_model = None
317
 
318
  print("Loading HiFi-GAN vocoder...")
319
  try:
320
- hifigan_path = self._get_hifigan()
321
- if hifigan_path:
322
- ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
323
- if isinstance(ckpt, dict) and 'generator' in ckpt:
324
- state_dict = ckpt['generator']
325
- elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
326
- state_dict = {k.replace('generator.', ''): v
327
- for k, v in ckpt['state_dict'].items()
328
- if k.startswith('generator.')}
329
- else:
330
- state_dict = ckpt
331
-
332
- self.hifigan = HiFiGANGenerator()
333
- self.hifigan.load_state_dict(state_dict, strict=False)
334
- self.hifigan.eval()
335
- print("HiFi-GAN vocoder loaded")
336
- else:
337
- print("HiFi-GAN not available, will use Griffin-Lim fallback")
338
  except Exception as e:
339
  print(f"HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
340
  self.hifigan = None
@@ -348,79 +298,38 @@ class VoiceCloner:
348
  print(f"Found {len(self.samples)} sample audio files")
349
  except Exception as e:
350
  print(f"Could not list samples: {e}")
351
- self.samples = []
352
 
353
  self.model_loaded = self.rvc_model is not None
354
 
355
- def _get_hifigan(self):
356
- """Get HiFi-GAN model"""
357
- try:
358
- path = hf_hub_download(
359
- repo_id="jik876/hifi-gan",
360
- filename="UNIVERSAL_V1/g_02500000",
361
- )
362
- return path
363
- except:
364
- pass
365
- try:
366
- path = hf_hub_download(
367
- repo_id="facebook/hifigan-universal-v1",
368
- filename="hifigan.pt",
369
- )
370
- return path
371
- except:
372
- pass
373
- return None
374
-
375
- def mel_to_audio_hifigan(self, mel):
376
- """Convert mel spectrogram to audio using HiFi-GAN"""
377
- with torch.no_grad():
378
- audio = self.hifigan(mel.unsqueeze(0))
379
- return audio.squeeze(0).squeeze(0).cpu().numpy()
380
-
381
  def process_audio(self, input_audio, pitch_shift=0):
382
- """Process audio through RVC model + vocoder"""
383
  if not self.model_loaded:
384
  return None, "Model not loaded"
385
-
386
  if input_audio is None:
387
  return None, "Please upload an audio file"
388
-
389
  try:
390
- # Load audio with torchaudio
391
  y, sr = torchaudio.load(input_audio)
392
- # Convert to mono
393
  if y.shape[0] > 1:
394
  y = y.mean(dim=0)
395
  else:
396
  y = y.squeeze(0)
397
- # Resample
398
  if sr != self.sample_rate:
399
- resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
400
- y = resampler(y)
401
  sr = self.sample_rate
402
 
403
- # Pitch shift using torchaudio (simple resample trick)
404
  if pitch_shift != 0:
405
- # Pitch shift by resampling: shift up by N semitones = speed up by 2^(N/12)
406
  factor = 2.0 ** (abs(pitch_shift) / 12.0)
407
- if pitch_shift > 0:
408
- new_len = int(len(y) / factor)
409
- else:
410
- new_len = int(len(y) * factor)
411
- y = torch.nn.functional.interpolate(
412
- y.unsqueeze(0).unsqueeze(0), size=new_len, mode='linear'
413
- ).squeeze(0).squeeze(0)
414
-
415
- # Trim silence (simple energy-based)
416
  energy = y ** 2
417
- window_size = int(0.1 * sr) # 100ms window
418
  if len(energy) > window_size:
419
  kernel = torch.ones(window_size) / window_size
420
- smooth_energy = torch.nn.functional.conv1d(
421
  energy.unsqueeze(0).unsqueeze(0), kernel.unsqueeze(0).unsqueeze(0), padding=window_size // 2
422
  ).squeeze()
423
- threshold = smooth_energy.max() * (10 ** (-20 / 10)) # -20dB
424
  active = torch.where(smooth_energy > threshold)[0]
425
  if len(active) > 0:
426
  y = y[active[0]:active[-1] + 1]
@@ -436,109 +345,75 @@ class VoiceCloner:
436
  mel_out = mel_out.squeeze(0)
437
 
438
  if self.hifigan is not None:
439
- audio_out = self.mel_to_audio_hifigan(mel_out)
 
 
440
  vocoder_name = "HiFi-GAN"
441
  else:
442
  audio_out = mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
443
  vocoder_name = "Griffin-Lim"
444
 
445
  audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
446
-
447
  output_path = tempfile.mktemp(suffix='.wav')
448
  sf.write(output_path, audio_out, self.sample_rate)
449
-
450
- return output_path, f"Success ({vocoder_name}) | Input: {len(y)/sr:.1f}s -> Output: {len(audio_out)/self.sample_rate:.1f}s"
451
-
452
  except Exception as e:
453
  import traceback
454
  traceback.print_exc()
455
- return None, f"Conversion failed: {str(e)}"
456
 
457
  def generate_random(self):
458
- """Generate audio from a random sample"""
459
  if not self.samples:
460
  return None, "No samples available"
461
-
462
  try:
463
  sample = random.choice(self.samples)
464
- sample_path = hf_hub_download(
465
- repo_id=self.dataset_id,
466
- filename=sample,
467
- repo_type="dataset"
468
- )
469
  output, msg = self.process_audio(sample_path)
470
  if output:
471
  return output, f"{msg}\nSample: {Path(sample).name}"
472
  return output, msg
473
  except Exception as e:
474
- return None, f"Generation failed: {str(e)}"
475
 
476
 
477
  # ============================================================
478
- # Gradio UI
479
  # ============================================================
480
 
481
  print("Initializing NumberBlocks One Voice Cloner...")
482
  cloner = VoiceCloner()
483
 
484
- with gr.Blocks(
485
- title="NumberBlocks One Voice",
486
- theme=gr.themes.Soft(),
487
- ) as demo:
488
- gr.Markdown("# NumberBlocks One Voice Cloner")
489
- gr.Markdown("RVC v2 Model (60.7MB) + HiFi-GAN Vocoder")
490
-
491
- with gr.Tab("Voice Conversion"):
492
- gr.Markdown("### Upload audio -> Convert to One's voice")
493
- with gr.Row():
494
- with gr.Column():
495
- vc_input = gr.Audio(label="Upload Audio", type="filepath")
496
- vc_pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)")
497
- vc_btn = gr.Button("Convert", variant="primary")
498
- with gr.Column():
499
- vc_output = gr.Audio(label="Result", type="filepath")
500
- vc_status = gr.Textbox(label="Status")
501
-
502
- vc_btn.click(
503
- fn=cloner.process_audio,
504
- inputs=[vc_input, vc_pitch],
505
- outputs=[vc_output, vc_status]
506
- )
507
-
508
- with gr.Tab("Random Sample"):
509
- gr.Markdown("### Random sample + RVC conversion")
510
- rand_btn = gr.Button("Generate Random", variant="primary")
511
- rand_output = gr.Audio(label="Result", type="filepath")
512
- rand_status = gr.Textbox(label="Status")
513
-
514
- rand_btn.click(
515
- fn=cloner.generate_random,
516
- inputs=[],
517
- outputs=[rand_output, rand_status]
518
- )
519
-
520
- with gr.Tab("About"):
521
- model_status = "Loaded" if cloner.model_loaded else "Not loaded"
522
- hifigan_status = "HiFi-GAN" if cloner.hifigan else "Griffin-Lim (fallback)"
523
- gr.Markdown(f"""
524
- ### NumberBlocks One Voice Cloner V3 (librosa-free)
525
-
526
- - **Model**: RVC v3.0 (VITS-like, 5.3M params, 60.7MB)
527
- - **Vocoder**: {hifigan_status}
528
- - **Sample Rate**: 40kHz
529
- - **Model Status**: {model_status}
530
- - **Training Data**: 100 source files -> 1,334 chunks, 500 steps
531
- - **Dataset**: [ayf3/numberblocks-one-voice-dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
532
-
533
- **Features**:
534
- - Upload audio -> One voice conversion
535
- - Random sample generation
536
- - Pitch adjustment (-12 to +12 semitones)
537
- - HiFi-GAN high quality vocoder
538
-
539
- **Limitations**:
540
- - CPU inference (slow)
541
- - Input recommended < 10 seconds
542
- """)
543
-
544
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
1
  #!/usr/bin/env python3
2
  """
3
+ NumberBlocks One Voice Cloner - V4 Minimal
4
+ Ultra-simple UI to avoid gradio schema issues
5
  """
6
 
7
  import os
 
13
  import torch.nn as nn
14
  import torch.nn.functional as F
15
  import torchaudio
 
16
  from pathlib import Path
17
  from huggingface_hub import hf_hub_download, HfApi
18
+ import gradio as gr
19
 
20
  # ============================================================
21
  # Model definitions - VITS-like RVC Model
 
44
  return m, logs
45
 
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  class ResidualCouplingLayer(nn.Module):
48
  def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
49
  super().__init__()
 
85
  return torch.flip(x, [1])
86
 
87
 
88
class ResidualCouplingBlock(nn.Module):
    """Ordered stack of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.

    The forward direction walks the stack front to back; the reverse
    direction walks it back to front.
    """

    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
        super().__init__()
        stages = []
        for _ in range(n_flows):
            stages.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
            stages.append(Flip())
        self.flows = nn.ModuleList(stages)

    def forward(self, x, reverse=False):
        """Run ``x`` through every flow and return the transformed tensor."""
        if reverse:
            for stage in reversed(self.flows):
                x = stage(x, reverse=reverse)
            return x
        for stage in self.flows:
            # NOTE(review): the forward direction unpacks a 2-tuple from every
            # stage, Flip included - confirm Flip.forward returns a pair here.
            x, _ = stage(x, reverse=reverse)
        return x
104
+
105
+
106
  class Decoder(nn.Module):
107
  def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
108
  super().__init__()
 
125
 
126
 
127
  class RVCModel(nn.Module):
128
+ """VITS-like RVC v3.0 Model"""
129
  def __init__(self, n_mels=80, hidden_channels=192):
130
  super().__init__()
131
  self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
 
151
 
152
 
153
  # ============================================================
154
+ # HiFi-GAN Vocoder
155
  # ============================================================
156
 
157
  class ResBlock1(nn.Module):
 
175
 
176
 
177
  class HiFiGANGenerator(nn.Module):
 
178
  def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
179
  upsample_kernel_sizes=(16, 16, 4, 4),
180
  upsample_initial_channel=512,
 
182
  resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
183
  super().__init__()
184
  self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
 
185
  self.num_upsamples = len(upsample_rates)
186
  self.num_kernels = len(resblock_kernel_sizes)
 
187
  self.ups = nn.ModuleList()
188
  self.resblocks = nn.ModuleList()
 
189
  ch = upsample_initial_channel
190
  for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
191
  ch_new = ch // 2
 
193
  for _, (rk, rd) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
194
  self.resblocks.append(ResBlock1(ch_new, rk, rd))
195
  ch = ch_new
 
196
  self.conv_post = nn.Sequential(
197
  nn.LeakyReLU(0.1),
198
  nn.Conv1d(ch, 1, 7, padding=3),
 
213
 
214
 
215
  # ============================================================
216
+ # Mel utilities
217
  # ============================================================
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
220
+ mel_transform = torchaudio.transforms.MelSpectrogram(
221
+ sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length,
222
+ n_mels=n_mels, f_min=0.0, f_max=float(sample_rate // 2),
223
+ power=2.0, norm=None, mel_scale="htk",
224
+ )
225
  mel = mel_transform(y)
226
  mel = torch.log(torch.clamp(mel, min=1e-5))
227
  return mel
228
 
229
+
230
  def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024, hop_length=256, n_iter=32):
 
 
231
  inverse_mel = torchaudio.transforms.InverseMelScale(
232
+ n_stft=n_fft // 2 + 1, n_mels=mel.shape[0],
233
+ sample_rate=sample_rate, f_min=0, f_max=float(sample_rate // 2), mel_scale="htk",
 
 
 
 
234
  )
 
235
  mel_power = torch.exp(mel)
236
  spec = inverse_mel(mel_power)
237
+ griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft, hop_length=hop_length, n_iter=n_iter)
 
 
 
 
 
238
  audio = griffin_lim(spec)
239
  return audio.numpy()
240
 
 
255
  self.load_models()
256
 
257
  def load_models(self):
 
258
  print("Loading RVC model...")
259
  try:
260
  model_path = hf_hub_download(
261
+ repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
 
 
262
  )
 
263
  ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
264
+ state_dict = ckpt.get('model', ckpt.get('state_dict', ckpt))
 
 
 
 
 
 
 
265
  hidden_ch = 192
266
  for k, v in state_dict.items():
267
  if 'enc_p.pre.weight' in k:
268
  hidden_ch = v.shape[0]
269
  break
 
270
  self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
271
  self.rvc_model.load_state_dict(state_dict, strict=False)
272
  self.rvc_model.eval()
273
  print(f"RVC model loaded (hidden={hidden_ch})")
 
274
  except Exception as e:
275
  print(f"RVC model load failed: {e}")
 
276
 
277
  print("Loading HiFi-GAN vocoder...")
278
  try:
279
+ hifigan_path = hf_hub_download(repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000")
280
+ ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
281
+ state_dict = ckpt.get('generator', ckpt.get('state_dict', ckpt))
282
+ if any(k.startswith('generator.') for k in state_dict):
283
+ state_dict = {k.replace('generator.', ''): v for k, v in state_dict.items() if k.startswith('generator.')}
284
+ self.hifigan = HiFiGANGenerator()
285
+ self.hifigan.load_state_dict(state_dict, strict=False)
286
+ self.hifigan.eval()
287
+ print("HiFi-GAN vocoder loaded")
 
 
 
 
 
 
 
 
 
288
  except Exception as e:
289
  print(f"HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
290
  self.hifigan = None
 
298
  print(f"Found {len(self.samples)} sample audio files")
299
  except Exception as e:
300
  print(f"Could not list samples: {e}")
 
301
 
302
  self.model_loaded = self.rvc_model is not None
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  def process_audio(self, input_audio, pitch_shift=0):
 
305
  if not self.model_loaded:
306
  return None, "Model not loaded"
 
307
  if input_audio is None:
308
  return None, "Please upload an audio file"
 
309
  try:
 
310
  y, sr = torchaudio.load(input_audio)
 
311
  if y.shape[0] > 1:
312
  y = y.mean(dim=0)
313
  else:
314
  y = y.squeeze(0)
 
315
  if sr != self.sample_rate:
316
+ y = torchaudio.transforms.Resample(sr, self.sample_rate)(y)
 
317
  sr = self.sample_rate
318
 
 
319
  if pitch_shift != 0:
 
320
  factor = 2.0 ** (abs(pitch_shift) / 12.0)
321
+ new_len = int(len(y) / factor) if pitch_shift > 0 else int(len(y) * factor)
322
+ y = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=new_len, mode='linear').squeeze(0).squeeze(0)
323
+
324
+ # Trim silence
 
 
 
 
 
325
  energy = y ** 2
326
+ window_size = int(0.1 * sr)
327
  if len(energy) > window_size:
328
  kernel = torch.ones(window_size) / window_size
329
+ smooth_energy = F.conv1d(
330
  energy.unsqueeze(0).unsqueeze(0), kernel.unsqueeze(0).unsqueeze(0), padding=window_size // 2
331
  ).squeeze()
332
+ threshold = smooth_energy.max() * (10 ** (-20 / 10))
333
  active = torch.where(smooth_energy > threshold)[0]
334
  if len(active) > 0:
335
  y = y[active[0]:active[-1] + 1]
 
345
  mel_out = mel_out.squeeze(0)
346
 
347
  if self.hifigan is not None:
348
+ with torch.no_grad():
349
+ audio_out = self.hifigan(mel_out.unsqueeze(0))
350
+ audio_out = audio_out.squeeze(0).squeeze(0).cpu().numpy()
351
  vocoder_name = "HiFi-GAN"
352
  else:
353
  audio_out = mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
354
  vocoder_name = "Griffin-Lim"
355
 
356
  audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
 
357
  output_path = tempfile.mktemp(suffix='.wav')
358
  sf.write(output_path, audio_out, self.sample_rate)
359
+ return output_path, f"Success ({vocoder_name}) | {len(y)/sr:.1f}s -> {len(audio_out)/self.sample_rate:.1f}s"
 
 
360
  except Exception as e:
361
  import traceback
362
  traceback.print_exc()
363
+ return None, f"Error: {str(e)}"
364
 
365
  def generate_random(self):
 
366
  if not self.samples:
367
  return None, "No samples available"
 
368
  try:
369
  sample = random.choice(self.samples)
370
+ sample_path = hf_hub_download(repo_id=self.dataset_id, filename=sample, repo_type="dataset")
 
 
 
 
371
  output, msg = self.process_audio(sample_path)
372
  if output:
373
  return output, f"{msg}\nSample: {Path(sample).name}"
374
  return output, msg
375
  except Exception as e:
376
+ return None, f"Error: {str(e)}"
377
 
378
 
379
  # ============================================================
380
+ # Gradio UI - Minimal version using Interface (not Blocks)
381
  # ============================================================
382
 
383
  print("Initializing NumberBlocks One Voice Cloner...")
384
  cloner = VoiceCloner()
385
 
386
+ # Use simple gr.Interface instead of Blocks to avoid schema issues
387
+ vc_interface = gr.Interface(
388
+ fn=cloner.process_audio,
389
+ inputs=[
390
+ gr.Audio(label="Upload Audio", type="filepath"),
391
+ gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)"),
392
+ ],
393
+ outputs=[
394
+ gr.Audio(label="Result", type="filepath"),
395
+ gr.Textbox(label="Status"),
396
+ ],
397
+ title="NumberBlocks One Voice Cloner",
398
+ description="RVC v2 Model (60.7MB) + HiFi-GAN Vocoder | Upload audio to convert to One's voice",
399
+ allow_flagging="never",
400
+ )
401
+
402
+ rand_interface = gr.Interface(
403
+ fn=cloner.generate_random,
404
+ inputs=[],
405
+ outputs=[
406
+ gr.Audio(label="Result", type="filepath"),
407
+ gr.Textbox(label="Status"),
408
+ ],
409
+ title="Random Sample Generation",
410
+ description="Generate from random dataset sample + RVC conversion",
411
+ allow_flagging="never",
412
+ )
413
+
414
+ demo = gr.TabbedInterface(
415
+ [vc_interface, rand_interface],
416
+ ["Voice Conversion", "Random Sample"],
417
+ )
418
+
419
+ demo.launch(server_name="0.0.0.0", server_port=7860)