ayf3 committed on
Commit
52c31e0
·
verified ·
1 Parent(s): 65cc5a2

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +181 -164
app.py CHANGED
@@ -1,7 +1,11 @@
1
  #!/usr/bin/env python3
2
  """
3
- NumberBlocks One Voice Cloner - V6 Robust
4
- Fixes: user creation in Dockerfile, lazy-load HiFi-GAN, startup-timeout protection
 
 
 
 
5
  """
6
 
7
  import os
@@ -17,137 +21,124 @@ from pathlib import Path
17
  from huggingface_hub import hf_hub_download, HfApi
18
  import gradio as gr
19
 
20
- print("=== NumberBlocks One Voice Cloner V6 ===")
21
 
22
  # ============================================================
23
- # Model Definitions
24
  # ============================================================
25
 
26
- class PosteriorEncoder(nn.Module):
27
- def __init__(self, in_channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
28
  super().__init__()
29
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
30
- self.enc = nn.ModuleList()
31
- for _ in range(n_layers):
32
- self.enc.append(nn.Sequential(
33
- nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
34
- padding=(kernel_size - 1) * dilation_rate // 2,
35
- dilation=dilation_rate),
36
- nn.GLU(dim=1),
37
- ))
38
- self.proj = nn.Conv1d(hidden_channels, hidden_channels * 2, 1)
 
39
 
40
  def forward(self, x):
41
- x = self.pre(x)
42
- for layer in self.enc:
43
- x = x + layer(x)
44
- stats = self.proj(x)
45
- m, logs = stats.chunk(2, dim=1)
46
- return m, logs
 
 
 
47
 
48
 
49
- class ResidualCouplingLayer(nn.Module):
50
- def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
51
  super().__init__()
52
- self.pre = nn.Conv1d(channels, hidden_channels, 1)
53
- self.enc = nn.ModuleList()
54
- for _ in range(n_layers):
55
- self.enc.append(nn.Sequential(
56
- nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
57
- padding=(kernel_size - 1) * dilation_rate // 2,
58
- dilation=dilation_rate),
59
- nn.GLU(dim=1),
60
- ))
61
- self.post = nn.Conv1d(hidden_channels, channels * 2, 1)
62
- self.post.weight.data.zero_()
63
- self.post.bias.data.zero_()
64
-
65
- def forward(self, x, reverse=False):
66
- h = self.pre(x)
67
- for layer in self.enc:
68
- h = h + layer(h)
69
- stats = self.post(h)
70
- m, logs = stats.chunk(2, dim=1)
71
- if not reverse:
72
- log_s = torch.clamp(logs, -5.0, 5.0)
73
- y = m + x * torch.exp(log_s)
74
- logdet = torch.sum(log_s)
75
- return y, logdet
76
- else:
77
- log_s = torch.clamp(logs, -5.0, 5.0)
78
- y = (x - m) * torch.exp(-log_s)
79
- return y
80
-
81
 
82
- class Flip(nn.Module):
83
- def forward(self, x, reverse=False):
84
- if not reverse:
85
- return torch.flip(x, [1]), 0
86
- else:
87
- return torch.flip(x, [1])
88
 
89
 
90
- class ResidualCouplingBlock(nn.Module):
91
- def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
92
  super().__init__()
93
- self.flows = nn.ModuleList()
94
- for _ in range(n_flows):
95
- self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
96
- self.flows.append(Flip())
 
 
 
97
 
98
- def forward(self, x, reverse=False):
 
 
 
 
99
  if not reverse:
100
- for flow in self.flows:
101
- x, _ = flow(x, reverse=reverse)
 
 
102
  else:
103
- for flow in reversed(self.flows):
104
- x = flow(x, reverse=reverse)
105
- return x
106
 
107
 
108
  class Decoder(nn.Module):
109
- def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
110
  super().__init__()
111
- self.pre = nn.Conv1d(hidden_channels, hidden_channels, 1)
112
- self.dec = nn.ModuleList()
113
- for _ in range(n_layers):
114
- self.dec.append(nn.Sequential(
115
- nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
116
- padding=(kernel_size - 1) * dilation_rate // 2,
117
- dilation=dilation_rate),
118
- nn.GLU(dim=1),
119
- ))
120
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
121
 
122
  def forward(self, x):
123
- x = self.pre(x)
124
- for layer in self.dec:
125
- x = x + layer(x)
126
- return self.proj(x)
 
 
127
 
128
 
129
  class RVCModel(nn.Module):
130
- def __init__(self, n_mels=80, hidden_channels=192):
131
  super().__init__()
132
- self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
133
- self.flow = ResidualCouplingBlock(hidden_channels, hidden_channels)
134
- self.dec = Decoder(hidden_channels, n_mels)
135
  self.n_mels = n_mels
 
 
 
 
136
 
137
  def forward(self, mel):
138
- m, logs = self.enc_p(mel)
139
- z = m + torch.randn_like(logs) * torch.exp(logs) * 0.0
140
- z_p = self.flow(z)
 
141
  z_back = self.flow(z_p, reverse=True)
142
- mel_out = self.dec(z_back)
143
  return mel_out
144
 
145
  def infer(self, mel, noise_scale=0.0):
146
- m, logs = self.enc_p(mel)
147
- z = m + torch.randn_like(logs) * torch.exp(logs) * noise_scale
148
- z_p = self.flow(z)
 
149
  z_back = self.flow(z_p, reverse=True)
150
- mel_out = self.dec(z_back)
151
  return mel_out
152
 
153
 
@@ -214,13 +205,16 @@ class HiFiGANGenerator(nn.Module):
214
 
215
 
216
  # ============================================================
217
- # Mel utilities (no librosa)
218
  # ============================================================
219
 
220
- def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
 
 
 
221
  mel_transform = torchaudio.transforms.MelSpectrogram(
222
- sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length,
223
- n_mels=n_mels, f_min=0.0, f_max=float(sample_rate // 2),
224
  power=2.0, norm=None, mel_scale="htk",
225
  )
226
  mel = mel_transform(y)
@@ -228,58 +222,55 @@ def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
228
  return mel
229
 
230
 
231
- def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024, hop_length=256, n_iter=32):
232
  inverse_mel = torchaudio.transforms.InverseMelScale(
233
- n_stft=n_fft // 2 + 1, n_mels=mel.shape[0],
234
- sample_rate=sample_rate, f_min=0, f_max=float(sample_rate // 2), mel_scale="htk",
235
  )
236
  mel_power = torch.exp(mel)
237
  spec = inverse_mel(mel_power)
238
- griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft, hop_length=hop_length, n_iter=n_iter)
239
- audio = griffin_lim(spec)
240
  return audio.numpy()
241
 
242
 
243
  # ============================================================
244
- # Inference Engine - with lazy loading
245
  # ============================================================
246
 
247
  class VoiceCloner:
248
  def __init__(self):
249
- self.device = torch.device('cpu')
250
  self.rvc_model = None
251
- self.hifigan = None # lazy loaded
252
  self._hifigan_loaded = False
253
- self.sample_rate = 40000
254
- self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
255
  self.model_loaded = False
256
- self.samples = None # lazy loaded
257
- self._load_rvc_only()
 
258
 
259
- def _load_rvc_only(self):
260
- """Load only the RVC model at startup (fast)"""
261
- print("[STARTUP] Loading RVC model...")
262
  try:
263
  model_path = hf_hub_download(
264
  repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
265
  )
266
- ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
267
- state_dict = ckpt.get('model', ckpt.get('state_dict', ckpt))
268
- hidden_ch = 192
269
- for k, v in state_dict.items():
270
- if 'enc_p.pre.weight' in k:
271
- hidden_ch = v.shape[0]
272
- break
273
- self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
274
- self.rvc_model.load_state_dict(state_dict, strict=False)
275
- self.rvc_model.eval()
276
  self.model_loaded = True
277
- print(f"[STARTUP] RVC model loaded OK (hidden={hidden_ch})")
278
  except Exception as e:
279
  print(f"[STARTUP] RVC model load FAILED: {e}")
 
 
280
 
281
  def _ensure_hifigan(self):
282
- """Lazy-load HiFi-GAN on first inference request"""
283
  if self._hifigan_loaded:
284
  return
285
  self._hifigan_loaded = True
@@ -288,56 +279,90 @@ class VoiceCloner:
288
  hifigan_path = hf_hub_download(
289
  repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000"
290
  )
291
- ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
292
- state_dict = ckpt.get('generator', ckpt.get('state_dict', ckpt))
293
- if any(k.startswith('generator.') for k in state_dict):
294
- state_dict = {k.replace('generator.', ''): v for k, v in state_dict.items() if k.startswith('generator.')}
295
  self.hifigan = HiFiGANGenerator()
296
  self.hifigan.load_state_dict(state_dict, strict=False)
297
  self.hifigan.eval()
298
- print("[LAZY] HiFi-GAN loaded OK")
299
  except Exception as e:
300
- print(f"[LAZY] HiFi-GAN FAILED (Griffin-Lim fallback): {e}")
301
  self.hifigan = None
302
 
303
  def _ensure_samples(self):
304
- """Lazy-load sample list"""
305
  if self.samples is not None:
306
  return
307
  self.samples = []
308
  try:
309
  api = HfApi()
310
  files = api.list_repo_files(self.dataset_id, repo_type="dataset")
311
- self.samples = [f for f in files if f.startswith('models/top_')
312
- and f.endswith('.wav')
313
- and '_p+' not in f and '_p-' not in f and '_s+' not in f]
 
314
  print(f"[LAZY] Found {len(self.samples)} samples")
315
  except Exception as e:
316
  print(f"[LAZY] Could not list samples: {e}")
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  def process_audio(self, input_audio, pitch_shift=0):
319
  if not self.model_loaded:
320
  return None, "Model not loaded. Check logs."
321
  if input_audio is None:
322
  return None, "Please upload an audio file."
323
 
324
- # Lazy load vocoder on first real request
325
  self._ensure_hifigan()
326
 
327
  try:
328
- # Use soundfile directly to avoid torchaudio torchcodec backend issues
329
- audio_data, sr = sf.read(input_audio, dtype='float32')
330
  if audio_data.ndim > 1:
331
  audio_data = audio_data.mean(axis=1)
332
  y = torch.from_numpy(audio_data)
333
- if sr != self.sample_rate:
334
- y = torchaudio.transforms.Resample(sr, self.sample_rate)(y)
335
- sr = self.sample_rate
336
 
337
  if pitch_shift != 0:
338
  factor = 2.0 ** (abs(pitch_shift) / 12.0)
339
  new_len = int(len(y) / factor) if pitch_shift > 0 else int(len(y) * factor)
340
- y = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=new_len, mode='linear').squeeze(0).squeeze(0)
341
 
342
  # Trim silence
343
  energy = y ** 2
@@ -352,29 +377,21 @@ class VoiceCloner:
352
  if len(active) > 0:
353
  y = y[active[0]:active[-1] + 1]
354
 
355
- max_len = 10 * self.sample_rate
356
  if len(y) > max_len:
357
  y = y[:max_len]
358
 
359
- mel = compute_mel(y, sample_rate=self.sample_rate, n_mels=80)
360
 
361
  with torch.no_grad():
362
  mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
363
  mel_out = mel_out.squeeze(0)
364
 
365
- if self.hifigan is not None:
366
- with torch.no_grad():
367
- audio_out = self.hifigan(mel_out.unsqueeze(0))
368
- audio_out = audio_out.squeeze(0).squeeze(0).cpu().numpy()
369
- vocoder_name = "HiFi-GAN"
370
- else:
371
- audio_out = mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
372
- vocoder_name = "Griffin-Lim"
373
-
374
  audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
375
- output_path = tempfile.mktemp(suffix='.wav')
376
- sf.write(output_path, audio_out, self.sample_rate)
377
- return output_path, f"✅ {vocoder_name} | {len(y)/sr:.1f}s → {len(audio_out)/self.sample_rate:.1f}s"
378
  except Exception as e:
379
  import traceback
380
  traceback.print_exc()
@@ -399,15 +416,15 @@ class VoiceCloner:
399
  # Gradio UI
400
  # ============================================================
401
 
402
- print("[STARTUP] Creating VoiceCloner (RVC only, HiFi-GAN lazy)...")
403
  cloner = VoiceCloner()
404
  print(f"[STARTUP] Ready. model_loaded={cloner.model_loaded}")
405
 
406
- demo = gr.Blocks(title="NumberBlocks One Voice Cloner")
407
 
408
  with demo:
409
- gr.Markdown("# 🎤 NumberBlocks One Voice Cloner")
410
- gr.Markdown("RVC v2 Model (60.7MB) + HiFi-GAN Vocoder | Upload audio → convert to One's voice")
411
 
412
  with gr.Tab("Voice Conversion"):
413
  with gr.Row():
 
1
  #!/usr/bin/env python3
2
  """
3
+ NumberBlocks One Voice Cloner - V7 Architecture Fix
4
+ CRITICAL FIX: Model classes now match the actual checkpoint architecture.
5
+ - n_mels=128 (was 80), hidden=256 (was 192), enc_out=512, z_channels=192
6
+ - Encoder: 5 Conv+BN+LayerNorm (not PosteriorEncoder)
7
+ - Flow: single AffineCouplingFlow (not ResidualCouplingBlock)
8
+ - Decoder: 5 Conv+BN (not generic Decoder)
9
  """
10
 
11
  import os
 
21
  from huggingface_hub import hf_hub_download, HfApi
22
  import gradio as gr
23
 
24
+ print("=== NumberBlocks One Voice Cloner V7 (Architecture Fix) ===")
25
 
26
  # ============================================================
27
+ # CORRECT Model Architecture
28
  # ============================================================
29
 
30
class Encoder(nn.Module):
    """Five Conv1d+BatchNorm1d+ReLU stages followed by a LayerNorm over channels.

    The sub-module attribute names (``conv1``..``conv5``, ``bn1``..``bn5``,
    ``ln``) and their registration order are kept exactly, because they
    determine the module's ``state_dict`` keys.
    """

    def __init__(self, in_channels=128, hidden=256, out_channels=512):
        super().__init__()
        # (input width, output width, kernel size) for each stage; padding of
        # kernel // 2 keeps the time axis length unchanged.
        layout = (
            (in_channels, hidden, 5),
            (hidden, hidden, 5),
            (hidden, hidden, 5),
            (hidden, out_channels, 5),
            (out_channels, out_channels, 3),
        )
        for idx, (c_in, c_out, kernel) in enumerate(layout, start=1):
            setattr(self, f"conv{idx}", nn.Conv1d(c_in, c_out, kernel, padding=kernel // 2))
            setattr(self, f"bn{idx}", nn.BatchNorm1d(c_out))
        self.ln = nn.LayerNorm(out_channels)

    def forward(self, x):
        """Run the five stages on ``x`` and layer-normalise along dim 1."""
        for idx in range(1, 6):
            conv = getattr(self, f"conv{idx}")
            bn = getattr(self, f"bn{idx}")
            x = F.relu(bn(conv(x)))
        # LayerNorm acts on the last axis, so move channels there and back.
        return self.ln(x.transpose(1, 2)).transpose(1, 2)
55
 
56
 
57
class Posterior(nn.Module):
    """Pointwise projection of encoder features into two latent statistics."""

    def __init__(self, in_channels=512, z_channels=192):
        super().__init__()
        # A single 1x1 convolution emits both statistics stacked on dim 1.
        self.conv = nn.Conv1d(in_channels, z_channels * 2, 1)

    def forward(self, x):
        """Return ``(mu, logvar)``: the two channel-wise halves of ``conv(x)``."""
        stats = self.conv(x)
        mu, logvar = torch.chunk(stats, 2, dim=1)
        return mu, logvar
 
 
66
 
67
 
68
class AffineCouplingFlow(nn.Module):
    """Single affine coupling layer over the channel axis.

    The input is split into two channel halves. The first half passes through
    unchanged and drives a small 1x1-conv network that predicts a log-scale
    ``s`` (clamped to [-5, 5]) and a shift ``b`` for the second half.

    Args:
        z_channels: Number of latent channels; must be a positive even number
            so both halves have the same width.
        hidden: Width of the conditioning network.

    Raises:
        ValueError: If ``z_channels`` is not a positive even number.
    """

    def __init__(self, z_channels=192, hidden=256):
        super().__init__()
        # An odd or too-small channel count would build a network whose input
        # width disagrees with what ``chunk`` produces, failing only later in
        # ``forward`` with an opaque shape error.
        if z_channels < 2 or z_channels % 2 != 0:
            raise ValueError(
                f"z_channels must be a positive even number, got {z_channels}"
            )
        self.net = nn.Sequential(
            nn.Conv1d(z_channels // 2, hidden, 1),
            nn.ReLU(),
            nn.Conv1d(hidden, hidden, 1),
            nn.ReLU(),
            nn.Conv1d(hidden, z_channels, 1),
        )

    def forward(self, z, reverse=False):
        """Apply the coupling transform or its inverse.

        Args:
            z: Tensor split in half along dim 1.
            reverse: When true, invert the transform.

        Returns:
            ``(z_out, logdet)`` in the forward direction, where ``logdet`` is
            the sum of ``s`` over every element (a scalar tensor); only
            ``z_out`` when ``reverse`` is true.
        """
        z1, z2 = z.chunk(2, dim=1)
        s, b = self.net(z1).chunk(2, dim=1)
        s = torch.clamp(s, -5.0, 5.0)
        if reverse:
            z2_new = (z2 - b) * torch.exp(-s)
        else:
            z2_new = z2 * torch.exp(s) + b
        z_out = torch.cat([z1, z2_new], dim=1)
        if reverse:
            return z_out
        return z_out, torch.sum(s)
93
 
94
 
95
class Decoder(nn.Module):
    """Four Conv1d+BatchNorm1d+ReLU stages followed by a 1x1 output convolution.

    Attribute names (``conv1``..``conv5``, ``bn1``..``bn4``) and their
    registration order are preserved since they define the ``state_dict`` keys.
    """

    def __init__(self, in_channels=192, out_channels=128):
        super().__init__()
        # (input width, output width, kernel size); padding keeps the length.
        plan = (
            (in_channels, 512, 5),
            (512, 512, 5),
            (512, 256, 5),
            (256, 256, 3),
        )
        for n, (c_in, c_out, kernel) in enumerate(plan, start=1):
            setattr(self, f"conv{n}", nn.Conv1d(c_in, c_out, kernel, padding=kernel // 2))
            setattr(self, f"bn{n}", nn.BatchNorm1d(c_out))
        # The final projection has no normalisation or activation.
        self.conv5 = nn.Conv1d(256, out_channels, 1)

    def forward(self, x):
        """Map ``x`` to ``out_channels`` channels, leaving the last axis length unchanged."""
        for n in range(1, 5):
            conv = getattr(self, f"conv{n}")
            bn = getattr(self, f"bn{n}")
            x = F.relu(bn(conv(x)))
        return self.conv5(x)
115
 
116
 
117
class RVCModel(nn.Module):
    """Mel-to-mel model: encoder -> posterior -> coupling flow -> decoder.

    Args:
        n_mels: Number of mel bins on input and output.
        hidden: Hidden width of the encoder and of the flow's conditioning net.
        enc_out: Channel width of the encoder output.
        z_channels: Number of latent channels.
    """

    def __init__(self, n_mels=128, hidden=256, enc_out=512, z_channels=192):
        super().__init__()
        self.n_mels = n_mels
        self.encoder = Encoder(n_mels, hidden, enc_out)
        self.posterior = Posterior(enc_out, z_channels)
        self.flow = AffineCouplingFlow(z_channels, hidden)
        self.decoder = Decoder(z_channels, n_mels)

    def forward(self, mel):
        """Deterministic pass; identical to ``infer(mel, noise_scale=0.0)``."""
        return self.infer(mel, noise_scale=0.0)

    def infer(self, mel, noise_scale=0.0):
        """Convert ``mel`` and return the decoded mel tensor.

        Args:
            mel: Input passed straight to the encoder.
            noise_scale: Multiplier on the sampled latent noise. ``0`` gives
                the deterministic output ``mu``.

        Returns:
            The decoder output for the (optionally perturbed) latent.
        """
        h = self.encoder(mel)
        mu, logvar = self.posterior(h)
        if noise_scale != 0:
            # NOTE(review): the scale is exp(logvar), not exp(0.5 * logvar);
            # kept as-is to match existing behaviour -- confirm against the
            # training code.
            z = mu + torch.randn_like(logvar) * torch.exp(logvar) * noise_scale
        else:
            # Skip the noise term entirely: multiplying by 0.0 turns an
            # overflowed exp(logvar) (inf) into NaN and wastes RNG draws.
            z = mu
        # Forward then inverse through the same flow; kept so the numerical
        # path matches the previous implementation.
        z_p, _ = self.flow(z)
        z_back = self.flow(z_p, reverse=True)
        return self.decoder(z_back)
143
 
144
 
 
205
 
206
 
207
  # ============================================================
208
+ # Mel utilities
209
  # ============================================================
210
 
211
+ SAMPLE_RATE = 40000
212
+ N_MELS = 128 # MATCHES MODEL
213
+
214
+ def compute_mel(y, sr=SAMPLE_RATE):
215
  mel_transform = torchaudio.transforms.MelSpectrogram(
216
+ sample_rate=sr, n_fft=1024, hop_length=256,
217
+ n_mels=N_MELS, f_min=0.0, f_max=float(sr // 2),
218
  power=2.0, norm=None, mel_scale="htk",
219
  )
220
  mel = mel_transform(y)
 
222
  return mel
223
 
224
 
225
def mel_to_audio_griffinlim(mel, sr=SAMPLE_RATE, n_iter=60):
    """Invert a log-mel spectrogram to a waveform with Griffin-Lim.

    ``mel`` is exponentiated, mapped back to a linear-frequency spectrogram
    with ``InverseMelScale`` (``N_MELS`` bins, HTK scale, 0..sr/2 Hz), and then
    phase-reconstructed with ``n_iter`` Griffin-Lim iterations. Returns a
    NumPy array.
    """
    n_fft = 1024
    to_linear = torchaudio.transforms.InverseMelScale(
        n_stft=n_fft // 2 + 1,
        n_mels=N_MELS,
        sample_rate=sr,
        f_min=0,
        f_max=float(sr // 2),
        mel_scale="htk",
    )
    linear_power = to_linear(torch.exp(mel))
    reconstruct = torchaudio.transforms.GriffinLim(
        n_fft=n_fft, hop_length=256, n_iter=n_iter
    )
    return reconstruct(linear_power).numpy()
235
 
236
 
237
  # ============================================================
238
+ # Inference Engine
239
  # ============================================================
240
 
241
  class VoiceCloner:
242
def __init__(self):
    """Set up empty state on CPU, then load the RVC model immediately."""
    self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
    self.device = torch.device("cpu")
    # Nothing is loaded yet; the vocoder and the sample list are filled in
    # later by the _ensure_* helpers.
    self.rvc_model = None
    self.hifigan = None
    self.samples = None
    self._hifigan_loaded = False
    self.model_loaded = False
    self._load_rvc()
251
 
252
def _load_rvc(self):
    """Download the RVC checkpoint and load it into a fresh ``RVCModel``.

    On success ``self.rvc_model`` holds the model in eval mode and
    ``self.model_loaded`` is set to True. Any failure is logged with a
    traceback and leaves both attributes untouched, so the app can still
    start and report the problem.
    """
    print("[STARTUP] Loading RVC model (V7 correct architecture)...")
    try:
        model_path = hf_hub_download(
            repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
        )
        # SECURITY: weights_only=False unpickles arbitrary Python objects from
        # the downloaded file. Only acceptable while the dataset repo is
        # trusted; switch to weights_only=True if the checkpoint allows it.
        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
        if "model_state_dict" not in ckpt:
            # Say what was actually found instead of a bare KeyError.
            raise KeyError(
                f"checkpoint has no 'model_state_dict' entry (found: {list(ckpt)})"
            )
        sd = ckpt["model_state_dict"]

        model = RVCModel(n_mels=128, hidden=256, enc_out=512, z_channels=192)
        # strict=True raises on any key mismatch, so reaching the next line
        # means the architecture matches the checkpoint.
        result = model.load_state_dict(sd, strict=True)
        print(f"[STARTUP] strict=True: missing={result.missing_keys}, unexpected={result.unexpected_keys}")
        model.eval()
        self.rvc_model = model
        self.model_loaded = True
        # Report the real parameter count rather than a hard-coded figure.
        n_params = sum(p.numel() for p in model.parameters())
        print(f"[STARTUP] RVC model loaded OK ({n_params:,} params, strict=True)")
    except Exception as e:
        print(f"[STARTUP] RVC model load FAILED: {e}")
        import traceback
        traceback.print_exc()
272
 
273
  def _ensure_hifigan(self):
 
274
  if self._hifigan_loaded:
275
  return
276
  self._hifigan_loaded = True
 
279
  hifigan_path = hf_hub_download(
280
  repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000"
281
  )
282
+ ckpt = torch.load(hifigan_path, map_location="cpu", weights_only=False)
283
+ state_dict = ckpt.get("generator", ckpt.get("state_dict", ckpt))
284
+ if any(k.startswith("generator.") for k in state_dict):
285
+ state_dict = {k.replace("generator.", ""): v for k, v in state_dict.items() if k.startswith("generator.")}
286
  self.hifigan = HiFiGANGenerator()
287
  self.hifigan.load_state_dict(state_dict, strict=False)
288
  self.hifigan.eval()
289
+ print("[LAZY] HiFi-GAN loaded OK (Griffin-Lim fallback for mel conversion)")
290
  except Exception as e:
291
+ print(f"[LAZY] HiFi-GAN FAILED: {e}")
292
  self.hifigan = None
293
 
294
def _ensure_samples(self):
    """Populate ``self.samples`` from the dataset repo on first use.

    Prefers ``audio/*_cleaned.wav`` files; if none exist, falls back to the
    first ten other ``audio/*.wav`` files. A listing failure is logged and
    leaves the list empty.
    """
    if self.samples is not None:
        return
    # Mark as attempted up front so a failed listing is not retried.
    self.samples = []
    try:
        files = HfApi().list_repo_files(self.dataset_id, repo_type="dataset")
        wavs = [name for name in files if name.startswith("audio/") and name.endswith(".wav")]
        cleaned = [name for name in wavs if name.endswith("_cleaned.wav")]
        if cleaned:
            self.samples = cleaned
        else:
            raw = [name for name in wavs if not name.endswith("_cleaned.wav")]
            self.samples = raw[:10]
        print(f"[LAZY] Found {len(self.samples)} samples")
    except Exception as e:
        print(f"[LAZY] Could not list samples: {e}")
308
 
309
def _mel_to_audio(self, mel_out):
    """Convert the model's mel output back to a waveform.

    The RVC model outputs 128-bin mel @ 40kHz while HiFi-GAN expects 80-bin
    mel @ 22.05kHz, so the HiFi-GAN path is:
    Griffin-Lim(128bin@40k) -> audio -> resample(22.05k) -> mel(80bin) -> HiFi-GAN.

    Args:
        mel_out: Mel tensor handed to ``mel_to_audio_griffinlim``.

    Returns:
        ``(audio, sample_rate, vocoder_name)``. This is the HiFi-GAN result at
        22050 Hz when the vocoder is available and succeeds, otherwise the
        Griffin-Lim result at ``SAMPLE_RATE``.
    """
    # Both paths start from the Griffin-Lim waveform. Compute it once so a
    # HiFi-GAN failure does not repeat the whole iterative reconstruction.
    audio_gl = mel_to_audio_griffinlim(mel_out, sr=SAMPLE_RATE)
    if self.hifigan is None:
        return audio_gl, SAMPLE_RATE, "Griffin-Lim"

    hifigan_sr = 22050
    try:
        audio_tensor = torch.from_numpy(audio_gl).float()

        # Resample 40kHz -> 22.05kHz.
        resampler = torchaudio.transforms.Resample(SAMPLE_RATE, hifigan_sr)
        audio_22k = resampler(audio_tensor)

        # Compute the 80-bin log-mel @ 22.05kHz that HiFi-GAN consumes.
        mel_80 = torchaudio.transforms.MelSpectrogram(
            sample_rate=hifigan_sr, n_fft=1024, hop_length=256,
            n_mels=80, f_min=0.0, f_max=8000.0,
            power=2.0, norm=None, mel_scale="htk",
        )(audio_22k)
        mel_80 = torch.log(torch.clamp(mel_80, min=1e-5))

        with torch.no_grad():
            audio_out = self.hifigan(mel_80.unsqueeze(0))
        audio_out = audio_out.squeeze(0).squeeze(0).cpu().numpy()
        return audio_out, hifigan_sr, "HiFi-GAN+GL"
    except Exception as e:
        print(f"HiFi-GAN pipeline failed, falling back to Griffin-Lim: {e}")
        # Reuse the waveform already reconstructed above.
        return audio_gl, SAMPLE_RATE, "Griffin-Lim"
344
+
345
  def process_audio(self, input_audio, pitch_shift=0):
346
  if not self.model_loaded:
347
  return None, "Model not loaded. Check logs."
348
  if input_audio is None:
349
  return None, "Please upload an audio file."
350
 
 
351
  self._ensure_hifigan()
352
 
353
  try:
354
+ audio_data, sr = sf.read(input_audio, dtype="float32")
 
355
  if audio_data.ndim > 1:
356
  audio_data = audio_data.mean(axis=1)
357
  y = torch.from_numpy(audio_data)
358
+ if sr != SAMPLE_RATE:
359
+ y = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(y)
360
+ sr = SAMPLE_RATE
361
 
362
  if pitch_shift != 0:
363
  factor = 2.0 ** (abs(pitch_shift) / 12.0)
364
  new_len = int(len(y) / factor) if pitch_shift > 0 else int(len(y) * factor)
365
+ y = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=new_len, mode="linear").squeeze(0).squeeze(0)
366
 
367
  # Trim silence
368
  energy = y ** 2
 
377
  if len(active) > 0:
378
  y = y[active[0]:active[-1] + 1]
379
 
380
+ max_len = 10 * SAMPLE_RATE
381
  if len(y) > max_len:
382
  y = y[:max_len]
383
 
384
+ mel = compute_mel(y, sr=SAMPLE_RATE)
385
 
386
  with torch.no_grad():
387
  mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
388
  mel_out = mel_out.squeeze(0)
389
 
390
+ audio_out, out_sr, vocoder_name = self._mel_to_audio(mel_out)
 
 
 
 
 
 
 
 
391
  audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
392
+ output_path = tempfile.mktemp(suffix=".wav")
393
+ sf.write(output_path, audio_out, out_sr)
394
+ return output_path, f"✅ {vocoder_name} | {len(y)/SAMPLE_RATE:.1f}s → {len(audio_out)/out_sr:.1f}s | Model: strict=True, 128-mel"
395
  except Exception as e:
396
  import traceback
397
  traceback.print_exc()
 
416
  # Gradio UI
417
  # ============================================================
418
 
419
+ print("[STARTUP] Creating VoiceCloner (V7 correct architecture)...")
420
  cloner = VoiceCloner()
421
  print(f"[STARTUP] Ready. model_loaded={cloner.model_loaded}")
422
 
423
+ demo = gr.Blocks(title="NumberBlocks One Voice Cloner V7")
424
 
425
  with demo:
426
+ gr.Markdown("# 🎤 NumberBlocks One Voice Cloner V7")
427
+ gr.Markdown("RVC v2 Model (60.7MB, strict=True, 128-mel) + HiFi-GAN Vocoder | Upload audio → convert to One's voice")
428
 
429
  with gr.Tab("Voice Conversion"):
430
  with gr.Row():