Numberblocks1Voice

Sleeping

App Files Files Community

ayf3 committed on Apr 2

Commit

52c31e0

verified ·

1 Parent(s): 65cc5a2

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +181 -164

app.py CHANGED Viewed

@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 """
-NumberBlocks One Voice Cloner - V6 Robust
-Fixes: user creation in Dockerfile, lazy-load HiFi-GAN, startup-timeout protection
 """
 import os
@@ -17,137 +21,124 @@ from pathlib import Path
 from huggingface_hub import hf_hub_download, HfApi
 import gradio as gr
-print("=== NumberBlocks One Voice Cloner V6 ===")
 # ============================================================
-# Model Definitions
 # ============================================================
-class PosteriorEncoder(nn.Module):
-    def __init__(self, in_channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = nn.ModuleList()
-        for _ in range(n_layers):
-            self.enc.append(nn.Sequential(
-                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
-                          padding=(kernel_size - 1) * dilation_rate // 2,
-                          dilation=dilation_rate),
-                nn.GLU(dim=1),
-            ))
-        self.proj = nn.Conv1d(hidden_channels, hidden_channels * 2, 1)
     def forward(self, x):
-        x = self.pre(x)
-        for layer in self.enc:
-            x = x + layer(x)
-        stats = self.proj(x)
-        m, logs = stats.chunk(2, dim=1)
-        return m, logs
-class ResidualCouplingLayer(nn.Module):
-    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
-        self.pre = nn.Conv1d(channels, hidden_channels, 1)
-        self.enc = nn.ModuleList()
-        for _ in range(n_layers):
-            self.enc.append(nn.Sequential(
-                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
-                          padding=(kernel_size - 1) * dilation_rate // 2,
-                          dilation=dilation_rate),
-                nn.GLU(dim=1),
-            ))
-        self.post = nn.Conv1d(hidden_channels, channels * 2, 1)
-        self.post.weight.data.zero_()
-        self.post.bias.data.zero_()
-    def forward(self, x, reverse=False):
-        h = self.pre(x)
-        for layer in self.enc:
-            h = h + layer(h)
-        stats = self.post(h)
-        m, logs = stats.chunk(2, dim=1)
-        if not reverse:
-            log_s = torch.clamp(logs, -5.0, 5.0)
-            y = m + x * torch.exp(log_s)
-            logdet = torch.sum(log_s)
-            return y, logdet
-        else:
-            log_s = torch.clamp(logs, -5.0, 5.0)
-            y = (x - m) * torch.exp(-log_s)
-            return y
-class Flip(nn.Module):
-    def forward(self, x, reverse=False):
-        if not reverse:
-            return torch.flip(x, [1]), 0
-        else:
-            return torch.flip(x, [1])
-class ResidualCouplingBlock(nn.Module):
-    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
         super().__init__()
-        self.flows = nn.ModuleList()
-        for _ in range(n_flows):
-            self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
-            self.flows.append(Flip())
-    def forward(self, x, reverse=False):
         if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, reverse=reverse)
         else:
-            for flow in reversed(self.flows):
-                x = flow(x, reverse=reverse)
-        return x
 class Decoder(nn.Module):
-    def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
-        self.pre = nn.Conv1d(hidden_channels, hidden_channels, 1)
-        self.dec = nn.ModuleList()
-        for _ in range(n_layers):
-            self.dec.append(nn.Sequential(
-                nn.Conv1d(hidden_channels, hidden_channels, kernel_size,
-                          padding=(kernel_size - 1) * dilation_rate // 2,
-                          dilation=dilation_rate),
-                nn.GLU(dim=1),
-            ))
-        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
     def forward(self, x):
-        x = self.pre(x)
-        for layer in self.dec:
-            x = x + layer(x)
-        return self.proj(x)
 class RVCModel(nn.Module):
-    def __init__(self, n_mels=80, hidden_channels=192):
         super().__init__()
-        self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
-        self.flow = ResidualCouplingBlock(hidden_channels, hidden_channels)
-        self.dec = Decoder(hidden_channels, n_mels)
         self.n_mels = n_mels
     def forward(self, mel):
-        m, logs = self.enc_p(mel)
-        z = m + torch.randn_like(logs) * torch.exp(logs) * 0.0
-        z_p = self.flow(z)
         z_back = self.flow(z_p, reverse=True)
-        mel_out = self.dec(z_back)
         return mel_out
     def infer(self, mel, noise_scale=0.0):
-        m, logs = self.enc_p(mel)
-        z = m + torch.randn_like(logs) * torch.exp(logs) * noise_scale
-        z_p = self.flow(z)
         z_back = self.flow(z_p, reverse=True)
-        mel_out = self.dec(z_back)
         return mel_out
@@ -214,13 +205,16 @@ class HiFiGANGenerator(nn.Module):
 # ============================================================
-# Mel utilities (no librosa)
 # ============================================================
-def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
     mel_transform = torchaudio.transforms.MelSpectrogram(
-        sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length,
-        n_mels=n_mels, f_min=0.0, f_max=float(sample_rate // 2),
         power=2.0, norm=None, mel_scale="htk",
     )
     mel = mel_transform(y)
@@ -228,58 +222,55 @@ def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
     return mel
-def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024, hop_length=256, n_iter=32):
     inverse_mel = torchaudio.transforms.InverseMelScale(
-        n_stft=n_fft // 2 + 1, n_mels=mel.shape[0],
-        sample_rate=sample_rate, f_min=0, f_max=float(sample_rate // 2), mel_scale="htk",
     )
     mel_power = torch.exp(mel)
     spec = inverse_mel(mel_power)
-    griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft, hop_length=hop_length, n_iter=n_iter)
-    audio = griffin_lim(spec)
     return audio.numpy()
 # ============================================================
-# Inference Engine - with lazy loading
 # ============================================================
 class VoiceCloner:
     def __init__(self):
-        self.device = torch.device('cpu')
         self.rvc_model = None
-        self.hifigan = None  # lazy loaded
         self._hifigan_loaded = False
-        self.sample_rate = 40000
-        self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
         self.model_loaded = False
-        self.samples = None  # lazy loaded
-        self._load_rvc_only()
-    def _load_rvc_only(self):
-        """Load only the RVC model at startup (fast)"""
-        print("[STARTUP] Loading RVC model...")
         try:
             model_path = hf_hub_download(
                 repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
             )
-            ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
-            state_dict = ckpt.get('model', ckpt.get('state_dict', ckpt))
-            hidden_ch = 192
-            for k, v in state_dict.items():
-                if 'enc_p.pre.weight' in k:
-                    hidden_ch = v.shape[0]
-                    break
-            self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
-            self.rvc_model.load_state_dict(state_dict, strict=False)
-            self.rvc_model.eval()
             self.model_loaded = True
-            print(f"[STARTUP] RVC model loaded OK (hidden={hidden_ch})")
         except Exception as e:
             print(f"[STARTUP] RVC model load FAILED: {e}")
     def _ensure_hifigan(self):
-        """Lazy-load HiFi-GAN on first inference request"""
         if self._hifigan_loaded:
             return
         self._hifigan_loaded = True
@@ -288,56 +279,90 @@ class VoiceCloner:
             hifigan_path = hf_hub_download(
                 repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000"
             )
-            ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
-            state_dict = ckpt.get('generator', ckpt.get('state_dict', ckpt))
-            if any(k.startswith('generator.') for k in state_dict):
-                state_dict = {k.replace('generator.', ''): v for k, v in state_dict.items() if k.startswith('generator.')}
             self.hifigan = HiFiGANGenerator()
             self.hifigan.load_state_dict(state_dict, strict=False)
             self.hifigan.eval()
-            print("[LAZY] HiFi-GAN loaded OK")
         except Exception as e:
-            print(f"[LAZY] HiFi-GAN FAILED (Griffin-Lim fallback): {e}")
             self.hifigan = None
     def _ensure_samples(self):
-        """Lazy-load sample list"""
         if self.samples is not None:
             return
         self.samples = []
         try:
             api = HfApi()
             files = api.list_repo_files(self.dataset_id, repo_type="dataset")
-            self.samples = [f for f in files if f.startswith('models/top_')
-                           and f.endswith('.wav')
-                           and '_p+' not in f and '_p-' not in f and '_s+' not in f]
             print(f"[LAZY] Found {len(self.samples)} samples")
         except Exception as e:
             print(f"[LAZY] Could not list samples: {e}")
     def process_audio(self, input_audio, pitch_shift=0):
         if not self.model_loaded:
             return None, "Model not loaded. Check logs."
         if input_audio is None:
             return None, "Please upload an audio file."
-        # Lazy load vocoder on first real request
         self._ensure_hifigan()
         try:
-            # Use soundfile directly to avoid torchaudio torchcodec backend issues
-            audio_data, sr = sf.read(input_audio, dtype='float32')
             if audio_data.ndim > 1:
                 audio_data = audio_data.mean(axis=1)
             y = torch.from_numpy(audio_data)
-            if sr != self.sample_rate:
-                y = torchaudio.transforms.Resample(sr, self.sample_rate)(y)
-                sr = self.sample_rate
             if pitch_shift != 0:
                 factor = 2.0 ** (abs(pitch_shift) / 12.0)
                 new_len = int(len(y) / factor) if pitch_shift > 0 else int(len(y) * factor)
-                y = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=new_len, mode='linear').squeeze(0).squeeze(0)
             # Trim silence
             energy = y ** 2
@@ -352,29 +377,21 @@ class VoiceCloner:
                 if len(active) > 0:
                     y = y[active[0]:active[-1] + 1]
-            max_len = 10 * self.sample_rate
             if len(y) > max_len:
                 y = y[:max_len]
-            mel = compute_mel(y, sample_rate=self.sample_rate, n_mels=80)
             with torch.no_grad():
                 mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
                 mel_out = mel_out.squeeze(0)
-            if self.hifigan is not None:
-                with torch.no_grad():
-                    audio_out = self.hifigan(mel_out.unsqueeze(0))
-                audio_out = audio_out.squeeze(0).squeeze(0).cpu().numpy()
-                vocoder_name = "HiFi-GAN"
-            else:
-                audio_out = mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
-                vocoder_name = "Griffin-Lim"
             audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
-            output_path = tempfile.mktemp(suffix='.wav')
-            sf.write(output_path, audio_out, self.sample_rate)
-            return output_path, f"✅ {vocoder_name} | {len(y)/sr:.1f}s → {len(audio_out)/self.sample_rate:.1f}s"
         except Exception as e:
             import traceback
             traceback.print_exc()
@@ -399,15 +416,15 @@ class VoiceCloner:
 # Gradio UI
 # ============================================================
-print("[STARTUP] Creating VoiceCloner (RVC only, HiFi-GAN lazy)...")
 cloner = VoiceCloner()
 print(f"[STARTUP] Ready. model_loaded={cloner.model_loaded}")
-demo = gr.Blocks(title="NumberBlocks One Voice Cloner")
 with demo:
-    gr.Markdown("# 🎤 NumberBlocks One Voice Cloner")
-    gr.Markdown("RVC v2 Model (60.7MB) + HiFi-GAN Vocoder | Upload audio → convert to One's voice")
     with gr.Tab("Voice Conversion"):
         with gr.Row():

 #!/usr/bin/env python3
 """
+NumberBlocks One Voice Cloner - V7 Architecture Fix
+CRITICAL FIX: Model classes now match the actual checkpoint architecture.
+- n_mels=128 (was 80), hidden=256 (was 192), enc_out=512, z_channels=192
+- Encoder: 5 Conv+BN+LayerNorm (not PosteriorEncoder)
+- Flow: single AffineCouplingFlow (not ResidualCouplingBlock)
+- Decoder: 5 Conv+BN (not generic Decoder)
 """
 import os
 from huggingface_hub import hf_hub_download, HfApi
 import gradio as gr
+print("=== NumberBlocks One Voice Cloner V7 (Architecture Fix) ===")
 # ============================================================
+# CORRECT Model Architecture
 # ============================================================
+class Encoder(nn.Module):
+    """Convolutional encoder: five Conv1d + BatchNorm1d + ReLU stages, then LayerNorm.
+
+    Channel widths run in_channels -> hidden -> hidden -> hidden ->
+    out_channels -> out_channels. Kernel sizes 5/5/5/5/3 are paired with
+    padding 2/2/2/2/1, so with the default stride of 1 every convolution
+    preserves the length axis.
+
+    The attribute names (conv1..conv5, bn1..bn5, ln) determine the state-dict
+    keys, so renaming them changes which checkpoints can be loaded.
+    """
+    def __init__(self, in_channels=128, hidden=256, out_channels=512):
         super().__init__()
+        self.conv1 = nn.Conv1d(in_channels, hidden, 5, padding=2)
+        self.bn1 = nn.BatchNorm1d(hidden)
+        self.conv2 = nn.Conv1d(hidden, hidden, 5, padding=2)
+        self.bn2 = nn.BatchNorm1d(hidden)
+        self.conv3 = nn.Conv1d(hidden, hidden, 5, padding=2)
+        self.bn3 = nn.BatchNorm1d(hidden)
+        self.conv4 = nn.Conv1d(hidden, out_channels, 5, padding=2)
+        self.bn4 = nn.BatchNorm1d(out_channels)
+        self.conv5 = nn.Conv1d(out_channels, out_channels, 3, padding=1)
+        self.bn5 = nn.BatchNorm1d(out_channels)
+        self.ln = nn.LayerNorm(out_channels)
     def forward(self, x):
+        """Encode ``x``, a 3-D tensor with ``in_channels`` on dim 1.
+
+        Returns a tensor with ``out_channels`` on dim 1 and the input's
+        length on dim 2.
+        """
+        x = F.relu(self.bn1(self.conv1(x)))
+        x = F.relu(self.bn2(self.conv2(x)))
+        x = F.relu(self.bn3(self.conv3(x)))
+        x = F.relu(self.bn4(self.conv4(x)))
+        x = F.relu(self.bn5(self.conv5(x)))
+        # LayerNorm normalises the last axis, so move channels there and back.
+        x = x.permute(0, 2, 1)
+        x = self.ln(x)
+        x = x.permute(0, 2, 1)
+        return x
+class Posterior(nn.Module):
+    """Kernel-size-1 convolution projecting features to ``2 * z_channels`` channels."""
+    def __init__(self, in_channels=512, z_channels=192):
         super().__init__()
+        self.conv = nn.Conv1d(in_channels, z_channels * 2, 1)
+    def forward(self, x):
+        """Return ``(mu, logvar)``: the two halves of the projection along dim 1."""
+        h = self.conv(x)
+        mu, logvar = h.chunk(2, dim=1)
+        return mu, logvar
+class AffineCouplingFlow(nn.Module):
+    """Single affine coupling layer conditioned on the first half of the channels.
+
+    The input is split in two along dim 1. The first half (z1) passes through
+    unchanged and feeds ``net``, whose output is split into a log-scale ``s``
+    and a shift ``b`` that transform the second half (z2).
+    NOTE(review): the channel counts only line up when ``z_channels`` is even.
+    """
+    def __init__(self, z_channels=192, hidden=256):
         super().__init__()
+        self.net = nn.Sequential(
+            nn.Conv1d(z_channels // 2, hidden, 1),
+            nn.ReLU(),
+            nn.Conv1d(hidden, hidden, 1),
+            nn.ReLU(),
+            nn.Conv1d(hidden, z_channels, 1),
+        )
+    def forward(self, z, reverse=False):
+        """Apply the coupling (default) or its inverse (``reverse=True``).
+
+        Forward returns ``(z_out, logdet)``, where ``logdet`` is the sum of
+        the clamped log-scales over every element of ``s``. Reverse returns
+        only the tensor, so callers must unpack according to ``reverse``.
+        """
+        z1, z2 = z.chunk(2, dim=1)
+        sb = self.net(z1)
+        s, b = sb.chunk(2, dim=1)
+        # Bound the log-scale so exp(s) stays within [e^-5, e^5].
+        s = torch.clamp(s, -5.0, 5.0)
         if not reverse:
+            z2_new = z2 * torch.exp(s) + b
+            z_out = torch.cat([z1, z2_new], dim=1)
+            logdet = torch.sum(s)
+            return z_out, logdet
         else:
+            # Exact inverse of the branch above: z1 is untouched, so s and b match.
+            z2_new = (z2 - b) * torch.exp(-s)
+            z_out = torch.cat([z1, z2_new], dim=1)
+            return z_out
 class Decoder(nn.Module):
+    """Convolutional decoder: four Conv1d + BatchNorm1d + ReLU stages and a final 1x1 conv.
+
+    Channel widths run in_channels -> 512 -> 512 -> 256 -> 256 -> out_channels;
+    the intermediate widths are fixed rather than configurable. Kernel sizes
+    5/5/5/3 are paired with padding 2/2/2/1, so the length axis is preserved.
+    The last convolution (conv5) has no normalisation or activation after it.
+    """
+    def __init__(self, in_channels=192, out_channels=128):
         super().__init__()
+        self.conv1 = nn.Conv1d(in_channels, 512, 5, padding=2)
+        self.bn1 = nn.BatchNorm1d(512)
+        self.conv2 = nn.Conv1d(512, 512, 5, padding=2)
+        self.bn2 = nn.BatchNorm1d(512)
+        self.conv3 = nn.Conv1d(512, 256, 5, padding=2)
+        self.bn3 = nn.BatchNorm1d(256)
+        self.conv4 = nn.Conv1d(256, 256, 3, padding=1)
+        self.bn4 = nn.BatchNorm1d(256)
+        self.conv5 = nn.Conv1d(256, out_channels, 1)
     def forward(self, x):
+        """Map ``x`` (``in_channels`` on dim 1) to ``out_channels`` channels."""
+        x = F.relu(self.bn1(self.conv1(x)))
+        x = F.relu(self.bn2(self.conv2(x)))
+        x = F.relu(self.bn3(self.conv3(x)))
+        x = F.relu(self.bn4(self.conv4(x)))
+        x = self.conv5(x)
+        return x
 class RVCModel(nn.Module):
+    """Mel-to-mel model: encoder -> posterior -> coupling flow -> decoder.
+
+    NOTE(review): both ``forward`` and ``infer`` apply the flow and then
+    immediately apply its inverse. Given the coupling layer's definition the
+    two steps cancel (up to floating-point rounding), so the decoder is
+    effectively fed ``z`` -- confirm this round trip is intended.
+    NOTE(review): the noise term is scaled by ``exp(logvar)``; if ``logvar``
+    really is a log-variance the standard deviation would be
+    ``exp(0.5 * logvar)`` -- confirm against the training code.
+    """
+    def __init__(self, n_mels=128, hidden=256, enc_out=512, z_channels=192):
         super().__init__()
         self.n_mels = n_mels
+        self.encoder = Encoder(n_mels, hidden, enc_out)
+        self.posterior = Posterior(enc_out, z_channels)
+        self.flow = AffineCouplingFlow(z_channels, hidden)
+        self.decoder = Decoder(z_channels, n_mels)
     def forward(self, mel):
+        """Run the full pipeline with the sampling noise multiplied by 0.0.
+
+        ``z`` therefore equals ``mu`` whenever ``exp(logvar)`` is finite.
+        Returns the decoder output (``n_mels`` channels on dim 1).
+        """
+        h = self.encoder(mel)
+        mu, logvar = self.posterior(h)
+        z = mu + torch.randn_like(logvar) * torch.exp(logvar) * 0.0
+        z_p, _ = self.flow(z)
         z_back = self.flow(z_p, reverse=True)
+        mel_out = self.decoder(z_back)
         return mel_out
     def infer(self, mel, noise_scale=0.0):
+        """Same pipeline as ``forward`` with a caller-chosen ``noise_scale``.
+
+        With the default ``noise_scale=0.0`` this computes the same
+        expression as ``forward``.
+        """
+        h = self.encoder(mel)
+        mu, logvar = self.posterior(h)
+        z = mu + torch.randn_like(logvar) * torch.exp(logvar) * noise_scale
+        z_p, _ = self.flow(z)
         z_back = self.flow(z_p, reverse=True)
+        mel_out = self.decoder(z_back)
         return mel_out
 # ============================================================
+# Mel utilities
 # ============================================================
+SAMPLE_RATE = 40000
+N_MELS = 128  # MATCHES MODEL
+def compute_mel(y, sr=SAMPLE_RATE):
     mel_transform = torchaudio.transforms.MelSpectrogram(
+        sample_rate=sr, n_fft=1024, hop_length=256,
+        n_mels=N_MELS, f_min=0.0, f_max=float(sr // 2),
         power=2.0, norm=None, mel_scale="htk",
     )
     mel = mel_transform(y)
     return mel
+def mel_to_audio_griffinlim(mel, sr=SAMPLE_RATE, n_iter=60):
+    """Reconstruct a waveform from a log-mel spectrogram with Griffin-Lim.
+
+    ``mel`` is exponentiated first, i.e. it is treated as natural-log mel
+    values. The result is mapped to 513 (= 1024 // 2 + 1) linear-frequency
+    bins by ``InverseMelScale`` and inverted by ``GriffinLim`` (n_fft=1024,
+    hop_length=256, ``n_iter`` iterations).
+
+    Returns the waveform as a NumPy array via ``Tensor.numpy()``.
+    NOTE(review): the mel-bin count comes from the module constant
+    ``N_MELS``, so ``mel`` presumably needs N_MELS bins on its frequency
+    axis -- confirm.
+    """
     inverse_mel = torchaudio.transforms.InverseMelScale(
+        n_stft=1024 // 2 + 1, n_mels=N_MELS,
+        sample_rate=sr, f_min=0, f_max=float(sr // 2), mel_scale="htk",
     )
     mel_power = torch.exp(mel)
     spec = inverse_mel(mel_power)
+    gl = torchaudio.transforms.GriffinLim(n_fft=1024, hop_length=256, n_iter=n_iter)
+    audio = gl(spec)
     return audio.numpy()
 # ============================================================
+# Inference Engine
 # ============================================================
 class VoiceCloner:
     def __init__(self):
+        """Initialise empty state, then load the RVC model right away.
+
+        The vocoder and the sample list start as None; the
+        ``_hifigan_loaded`` flag and the ``samples is not None`` check let the
+        ``_ensure_*`` helpers fill them in on demand.
+        """
+        self.device = torch.device("cpu")
         self.rvc_model = None
+        self.hifigan = None
         self._hifigan_loaded = False
         self.model_loaded = False
+        self.samples = None
+        self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
+        # Runs during construction; it downloads the checkpoint, so creating a
+        # VoiceCloner performs network I/O.
+        self._load_rvc()
+    def _load_rvc(self):
+        """Download the RVC checkpoint and load it into a fresh ``RVCModel``.
+
+        On success sets ``self.rvc_model`` (in eval mode) and
+        ``self.model_loaded = True``. Any exception is printed with its
+        traceback and swallowed, leaving ``model_loaded`` False.
+        """
+        print("[STARTUP] Loading RVC model (V7 correct architecture)...")
         try:
             model_path = hf_hub_download(
                 repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
             )
+            # SECURITY NOTE(review): weights_only=False lets torch.load unpickle
+            # arbitrary Python objects; only acceptable while the checkpoint
+            # source is trusted.
+            ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
+            sd = ckpt["model_state_dict"]
+            model = RVCModel(n_mels=128, hidden=256, enc_out=512, z_channels=192)
+            # strict=True raises on any key mismatch, so when the next print is
+            # reached both lists are empty.
+            result = model.load_state_dict(sd, strict=True)
+            print(f"[STARTUP] strict=True: missing={result.missing_keys}, unexpected={result.unexpected_keys}")
+            model.eval()
+            self.rvc_model = model
             self.model_loaded = True
+            # NOTE(review): the parameter count below is a literal, not computed
+            # from the loaded model.
+            print(f"[STARTUP] RVC model loaded OK (5,296,064 params, strict=True)")
         except Exception as e:
             print(f"[STARTUP] RVC model load FAILED: {e}")
+            import traceback
+            traceback.print_exc()
     def _ensure_hifigan(self):
         if self._hifigan_loaded:
             return
         self._hifigan_loaded = True
             hifigan_path = hf_hub_download(
                 repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000"
             )
+            ckpt = torch.load(hifigan_path, map_location="cpu", weights_only=False)
+            state_dict = ckpt.get("generator", ckpt.get("state_dict", ckpt))
+            if any(k.startswith("generator.") for k in state_dict):
+                state_dict = {k.replace("generator.", ""): v for k, v in state_dict.items() if k.startswith("generator.")}
             self.hifigan = HiFiGANGenerator()
             self.hifigan.load_state_dict(state_dict, strict=False)
             self.hifigan.eval()
+            print("[LAZY] HiFi-GAN loaded OK (Griffin-Lim fallback for mel conversion)")
         except Exception as e:
+            print(f"[LAZY] HiFi-GAN FAILED: {e}")
             self.hifigan = None
     def _ensure_samples(self):
+        """Populate ``self.samples`` with dataset file paths on first use.
+
+        Later calls return immediately. If listing the repository fails the
+        error is printed and ``self.samples`` stays ``[]``, so the listing is
+        not retried on subsequent calls.
+        """
         if self.samples is not None:
             return
         self.samples = []
         try:
             api = HfApi()
             files = api.list_repo_files(self.dataset_id, repo_type="dataset")
+            # Look for cleaned audio files as samples
+            self.samples = [f for f in files if f.startswith("audio/") and f.endswith("_cleaned.wav")]
+            # Fallback: no cleaned files under audio/, so take up to ten other
+            # .wav files from the same folder.
+            if not self.samples:
+                self.samples = [f for f in files if f.startswith("audio/") and f.endswith(".wav") and not f.endswith("_cleaned.wav")][:10]
             print(f"[LAZY] Found {len(self.samples)} samples")
         except Exception as e:
             print(f"[LAZY] Could not list samples: {e}")
+    def _mel_to_audio(self, mel_out):
+        """Convert mel spectrogram back to audio.
+        RVC model outputs 128-bin mel @ 40kHz.
+        HiFi-GAN expects 80-bin mel @ 22.05kHz.
+        Pipeline: Griffin-Lim(128bin@40k) → audio → resample(22.05k) → mel(80bin) → HiFi-GAN → audio
+
+        Returns an ``(audio, sample_rate, vocoder_name)`` tuple:
+        ``(..., 22050, "HiFi-GAN+GL")`` when the HiFi-GAN path succeeds,
+        otherwise ``(..., SAMPLE_RATE, "Griffin-Lim")``. The Griffin-Lim path
+        is taken both when ``self.hifigan`` is None and when the HiFi-GAN
+        path raises.
+        """
+        if self.hifigan is not None:
+            try:
+                # Step 1: Griffin-Lim to get rough audio at 40kHz
+                audio_gl = mel_to_audio_griffinlim(mel_out, sr=SAMPLE_RATE)
+                audio_tensor = torch.from_numpy(audio_gl).float()
+                # Step 2: Resample 40kHz → 22.05kHz
+                resampler = torchaudio.transforms.Resample(SAMPLE_RATE, 22050)
+                audio_22k = resampler(audio_tensor)
+                # Step 3: Compute 80-bin mel @ 22.05kHz for HiFi-GAN
+                mel_80 = torchaudio.transforms.MelSpectrogram(
+                    sample_rate=22050, n_fft=1024, hop_length=256,
+                    n_mels=80, f_min=0.0, f_max=8000.0,
+                    power=2.0, norm=None, mel_scale="htk",
+                )(audio_22k)
+                # Floor at 1e-5 before the log so silent bins do not become -inf.
+                mel_80 = torch.log(torch.clamp(mel_80, min=1e-5))
+                # Step 4: HiFi-GAN
+                with torch.no_grad():
+                    audio_out = self.hifigan(mel_80.unsqueeze(0))
+                audio_out = audio_out.squeeze(0).squeeze(0).cpu().numpy()
+                return audio_out, 22050, "HiFi-GAN+GL"
+            except Exception as e:
+                # Best effort: log the failure and fall through to the fallback below.
+                print(f"HiFi-GAN pipeline failed, falling back to Griffin-Lim: {e}")
+        # Fallback: Griffin-Lim only
+        audio_out = mel_to_audio_griffinlim(mel_out, sr=SAMPLE_RATE)
+        return audio_out, SAMPLE_RATE, "Griffin-Lim"
     def process_audio(self, input_audio, pitch_shift=0):
         if not self.model_loaded:
             return None, "Model not loaded. Check logs."
         if input_audio is None:
             return None, "Please upload an audio file."
         self._ensure_hifigan()
         try:
+            audio_data, sr = sf.read(input_audio, dtype="float32")
             if audio_data.ndim > 1:
                 audio_data = audio_data.mean(axis=1)
             y = torch.from_numpy(audio_data)
+            if sr != SAMPLE_RATE:
+                y = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(y)
+                sr = SAMPLE_RATE
             if pitch_shift != 0:
                 factor = 2.0 ** (abs(pitch_shift) / 12.0)
                 new_len = int(len(y) / factor) if pitch_shift > 0 else int(len(y) * factor)
+                y = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=new_len, mode="linear").squeeze(0).squeeze(0)
             # Trim silence
             energy = y ** 2
                 if len(active) > 0:
                     y = y[active[0]:active[-1] + 1]
+            max_len = 10 * SAMPLE_RATE
             if len(y) > max_len:
                 y = y[:max_len]
+            mel = compute_mel(y, sr=SAMPLE_RATE)
             with torch.no_grad():
                 mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
                 mel_out = mel_out.squeeze(0)
+            audio_out, out_sr, vocoder_name = self._mel_to_audio(mel_out)
             audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
+            output_path = tempfile.mktemp(suffix=".wav")
+            sf.write(output_path, audio_out, out_sr)
+            return output_path, f"✅ {vocoder_name} | {len(y)/SAMPLE_RATE:.1f}s → {len(audio_out)/out_sr:.1f}s | Model: strict=True, 128-mel"
         except Exception as e:
             import traceback
             traceback.print_exc()
 # Gradio UI
 # ============================================================
+print("[STARTUP] Creating VoiceCloner (V7 correct architecture)...")
 cloner = VoiceCloner()
 print(f"[STARTUP] Ready. model_loaded={cloner.model_loaded}")
+demo = gr.Blocks(title="NumberBlocks One Voice Cloner V7")
 with demo:
+    gr.Markdown("# 🎤 NumberBlocks One Voice Cloner V7")
+    gr.Markdown("RVC v2 Model (60.7MB, strict=True, 128-mel) + HiFi-GAN Vocoder | Upload audio → convert to One's voice")
     with gr.Tab("Voice Conversion"):
         with gr.Row():