Numberblocks1Voice

Sleeping

App Files Community

ayf3 committed on Apr 2

Commit

c92f551

verified ·

1 Parent(s): e215365

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +94 -219

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
-NumberBlocks One Voice Cloner - V3 (librosa-free)
-用 torchaudio + scipy 替代 librosa，避免 numba 兼容问题
 """
 import os
@@ -13,9 +13,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
-import gradio as gr
 from pathlib import Path
 from huggingface_hub import hf_hub_download, HfApi
 # ============================================================
 # 模型定义 - VITS-like RVC Model
@@ -44,24 +44,6 @@ class PosteriorEncoder(nn.Module):
         return m, logs
-class ResidualCouplingBlock(nn.Module):
-    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
-        super().__init__()
-        self.flows = nn.ModuleList()
-        for _ in range(n_flows):
-            self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
-            self.flows.append(Flip())
-    def forward(self, x, reverse=False):
-        if not reverse:
-            for flow in self.flows:
-                x, _ = flow(x, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, reverse=reverse)
-        return x
 class ResidualCouplingLayer(nn.Module):
     def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
@@ -103,6 +85,24 @@ class Flip(nn.Module):
             return torch.flip(x, [1])
 class Decoder(nn.Module):
     def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
@@ -125,7 +125,7 @@ class Decoder(nn.Module):
 class RVCModel(nn.Module):
-    """VITS-like RVC v3.0 Model (5.3M params)"""
     def __init__(self, n_mels=80, hidden_channels=192):
         super().__init__()
         self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
@@ -151,7 +151,7 @@ class RVCModel(nn.Module):
 # ============================================================
-# HiFi-GAN Vocoder Definition
 # ============================================================
 class ResBlock1(nn.Module):
@@ -175,7 +175,6 @@ class ResBlock1(nn.Module):
 class HiFiGANGenerator(nn.Module):
-    """HiFi-GAN Generator (Universal V1 compatible)"""
     def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
                  upsample_kernel_sizes=(16, 16, 4, 4),
                  upsample_initial_channel=512,
@@ -183,13 +182,10 @@ class HiFiGANGenerator(nn.Module):
                  resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
         super().__init__()
         self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
         self.num_upsamples = len(upsample_rates)
         self.num_kernels = len(resblock_kernel_sizes)
         self.ups = nn.ModuleList()
         self.resblocks = nn.ModuleList()
         ch = upsample_initial_channel
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             ch_new = ch // 2
@@ -197,7 +193,6 @@ class HiFiGANGenerator(nn.Module):
             for _, (rk, rd) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                 self.resblocks.append(ResBlock1(ch_new, rk, rd))
             ch = ch_new
         self.conv_post = nn.Sequential(
             nn.LeakyReLU(0.1),
             nn.Conv1d(ch, 1, 7, padding=3),
@@ -218,50 +213,28 @@ class HiFiGANGenerator(nn.Module):
 # ============================================================
-# Mel-spectrogram utilities (torchaudio-based, no librosa)
 # ============================================================
-def make_mel_transform(sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80, f_min=0.0, f_max=None):
-    """Create torchaudio mel spectrogram transform"""
-    return torchaudio.transforms.MelSpectrogram(
-        sample_rate=sample_rate,
-        n_fft=n_fft,
-        hop_length=hop_length,
-        n_mels=n_mels,
-        f_min=f_min,
-        f_max=f_max if f_max else float(sample_rate // 2),
-        power=2.0,
-        norm=None,
-        mel_scale="htk",
-    )
 def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
-    """Compute log mel spectrogram using torchaudio"""
-    mel_transform = make_mel_transform(sample_rate, n_fft, hop_length, n_mels)
     mel = mel_transform(y)
     mel = torch.log(torch.clamp(mel, min=1e-5))
     return mel
 def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024, hop_length=256, n_iter=32):
-    """Griffin-Lim: mel -> audio using torchaudio"""
-    # Create inverse mel transform
     inverse_mel = torchaudio.transforms.InverseMelScale(
-        n_stft=n_fft // 2 + 1,
-        n_mels=mel.shape[0],
-        sample_rate=sample_rate,
-        f_min=0,
-        f_max=float(sample_rate // 2),
-        mel_scale="htk",
     )
-    # Convert from log mel back to power spectrogram
     mel_power = torch.exp(mel)
     spec = inverse_mel(mel_power)
-    # Griffin-Lim
-    griffin_lim = torchaudio.transforms.GriffinLim(
-        n_fft=n_fft,
-        hop_length=hop_length,
-        n_iter=n_iter,
-    )
     audio = griffin_lim(spec)
     return audio.numpy()
@@ -282,59 +255,36 @@ class VoiceCloner:
         self.load_models()
     def load_models(self):
-        """Load RVC model + HiFi-GAN vocoder"""
         print("Loading RVC model...")
         try:
             model_path = hf_hub_download(
-                repo_id=self.dataset_id,
-                filename="models/one_voice_rvc_v2.pth",
-                repo_type="dataset"
             )
             ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
-            if isinstance(ckpt, dict) and 'model' in ckpt:
-                state_dict = ckpt['model']
-            elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
-                state_dict = ckpt['state_dict']
-            else:
-                state_dict = ckpt
             hidden_ch = 192
             for k, v in state_dict.items():
                 if 'enc_p.pre.weight' in k:
                     hidden_ch = v.shape[0]
                     break
             self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
             self.rvc_model.load_state_dict(state_dict, strict=False)
             self.rvc_model.eval()
             print(f"RVC model loaded (hidden={hidden_ch})")
         except Exception as e:
             print(f"RVC model load failed: {e}")
-            self.rvc_model = None
         print("Loading HiFi-GAN vocoder...")
         try:
-            hifigan_path = self._get_hifigan()
-            if hifigan_path:
-                ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
-                if isinstance(ckpt, dict) and 'generator' in ckpt:
-                    state_dict = ckpt['generator']
-                elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
-                    state_dict = {k.replace('generator.', ''): v
-                                  for k, v in ckpt['state_dict'].items()
-                                  if k.startswith('generator.')}
-                else:
-                    state_dict = ckpt
-                self.hifigan = HiFiGANGenerator()
-                self.hifigan.load_state_dict(state_dict, strict=False)
-                self.hifigan.eval()
-                print("HiFi-GAN vocoder loaded")
-            else:
-                print("HiFi-GAN not available, will use Griffin-Lim fallback")
         except Exception as e:
             print(f"HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
             self.hifigan = None
@@ -348,79 +298,38 @@ class VoiceCloner:
             print(f"Found {len(self.samples)} sample audio files")
         except Exception as e:
             print(f"Could not list samples: {e}")
-            self.samples = []
         self.model_loaded = self.rvc_model is not None
-    def _get_hifigan(self):
-        """Get HiFi-GAN model"""
-        try:
-            path = hf_hub_download(
-                repo_id="jik876/hifi-gan",
-                filename="UNIVERSAL_V1/g_02500000",
-            )
-            return path
-        except:
-            pass
-        try:
-            path = hf_hub_download(
-                repo_id="facebook/hifigan-universal-v1",
-                filename="hifigan.pt",
-            )
-            return path
-        except:
-            pass
-        return None
-    def mel_to_audio_hifigan(self, mel):
-        """Convert mel spectrogram to audio using HiFi-GAN"""
-        with torch.no_grad():
-            audio = self.hifigan(mel.unsqueeze(0))
-        return audio.squeeze(0).squeeze(0).cpu().numpy()
     def process_audio(self, input_audio, pitch_shift=0):
-        """Process audio through RVC model + vocoder"""
         if not self.model_loaded:
             return None, "Model not loaded"
         if input_audio is None:
             return None, "Please upload an audio file"
         try:
-            # Load audio with torchaudio
             y, sr = torchaudio.load(input_audio)
-            # Convert to mono
             if y.shape[0] > 1:
                 y = y.mean(dim=0)
             else:
                 y = y.squeeze(0)
-            # Resample
             if sr != self.sample_rate:
-                resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
-                y = resampler(y)
                 sr = self.sample_rate
-            # Pitch shift using torchaudio (simple resample trick)
             if pitch_shift != 0:
-                # Pitch shift by resampling: shift up by N semitones = speed up by 2^(N/12)
                 factor = 2.0 ** (abs(pitch_shift) / 12.0)
-                if pitch_shift > 0:
-                    new_len = int(len(y) / factor)
-                else:
-                    new_len = int(len(y) * factor)
-                y = torch.nn.functional.interpolate(
-                    y.unsqueeze(0).unsqueeze(0), size=new_len, mode='linear'
-                ).squeeze(0).squeeze(0)
-            # Trim silence (simple energy-based)
             energy = y ** 2
-            window_size = int(0.1 * sr)  # 100ms window
             if len(energy) > window_size:
                 kernel = torch.ones(window_size) / window_size
-                smooth_energy = torch.nn.functional.conv1d(
                     energy.unsqueeze(0).unsqueeze(0), kernel.unsqueeze(0).unsqueeze(0), padding=window_size // 2
                 ).squeeze()
-                threshold = smooth_energy.max() * (10 ** (-20 / 10))  # -20dB
                 active = torch.where(smooth_energy > threshold)[0]
                 if len(active) > 0:
                     y = y[active[0]:active[-1] + 1]
@@ -436,109 +345,75 @@ class VoiceCloner:
                 mel_out = mel_out.squeeze(0)
             if self.hifigan is not None:
-                audio_out = self.mel_to_audio_hifigan(mel_out)
                 vocoder_name = "HiFi-GAN"
             else:
                 audio_out = mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
                 vocoder_name = "Griffin-Lim"
             audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
             output_path = tempfile.mktemp(suffix='.wav')
             sf.write(output_path, audio_out, self.sample_rate)
-            return output_path, f"Success ({vocoder_name}) | Input: {len(y)/sr:.1f}s -> Output: {len(audio_out)/self.sample_rate:.1f}s"
         except Exception as e:
             import traceback
             traceback.print_exc()
-            return None, f"Conversion failed: {str(e)}"
     def generate_random(self):
-        """Generate audio from a random sample"""
         if not self.samples:
             return None, "No samples available"
         try:
             sample = random.choice(self.samples)
-            sample_path = hf_hub_download(
-                repo_id=self.dataset_id,
-                filename=sample,
-                repo_type="dataset"
-            )
             output, msg = self.process_audio(sample_path)
             if output:
                 return output, f"{msg}\nSample: {Path(sample).name}"
             return output, msg
         except Exception as e:
-            return None, f"Generation failed: {str(e)}"
 # ============================================================
-# Gradio UI
 # ============================================================
 print("Initializing NumberBlocks One Voice Cloner...")
 cloner = VoiceCloner()
-with gr.Blocks(
-    title="NumberBlocks One Voice",
-    theme=gr.themes.Soft(),
-) as demo:
-    gr.Markdown("# NumberBlocks One Voice Cloner")
-    gr.Markdown("RVC v2 Model (60.7MB) + HiFi-GAN Vocoder")
-    with gr.Tab("Voice Conversion"):
-        gr.Markdown("### Upload audio -> Convert to One's voice")
-        with gr.Row():
-            with gr.Column():
-                vc_input = gr.Audio(label="Upload Audio", type="filepath")
-                vc_pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)")
-                vc_btn = gr.Button("Convert", variant="primary")
-            with gr.Column():
-                vc_output = gr.Audio(label="Result", type="filepath")
-                vc_status = gr.Textbox(label="Status")
-        vc_btn.click(
-            fn=cloner.process_audio,
-            inputs=[vc_input, vc_pitch],
-            outputs=[vc_output, vc_status]
-        )
-    with gr.Tab("Random Sample"):
-        gr.Markdown("### Random sample + RVC conversion")
-        rand_btn = gr.Button("Generate Random", variant="primary")
-        rand_output = gr.Audio(label="Result", type="filepath")
-        rand_status = gr.Textbox(label="Status")
-        rand_btn.click(
-            fn=cloner.generate_random,
-            inputs=[],
-            outputs=[rand_output, rand_status]
-        )
-    with gr.Tab("About"):
-        model_status = "Loaded" if cloner.model_loaded else "Not loaded"
-        hifigan_status = "HiFi-GAN" if cloner.hifigan else "Griffin-Lim (fallback)"
-        gr.Markdown(f"""
-        ### NumberBlocks One Voice Cloner V3 (librosa-free)
-        - **Model**: RVC v3.0 (VITS-like, 5.3M params, 60.7MB)
-        - **Vocoder**: {hifigan_status}
-        - **Sample Rate**: 40kHz
-        - **Model Status**: {model_status}
-        - **Training Data**: 100 source files -> 1,334 chunks, 500 steps
-        - **Dataset**: [ayf3/numberblocks-one-voice-dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
-        **Features**:
-        - Upload audio -> One voice conversion
-        - Random sample generation
-        - Pitch adjustment (-12 to +12 semitones)
-        - HiFi-GAN high quality vocoder
-        **Limitations**:
-        - CPU inference (slow)
-        - Input recommended < 10 seconds
-        """)
-demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

 #!/usr/bin/env python3
 """
+NumberBlocks One Voice Cloner - V4 Minimal
+Ultra-simple UI to avoid gradio schema issues
 """
 import os
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
 from pathlib import Path
 from huggingface_hub import hf_hub_download, HfApi
+import gradio as gr
 # ============================================================
 # 模型定义 - VITS-like RVC Model
         return m, logs
 class ResidualCouplingLayer(nn.Module):
     def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
             return torch.flip(x, [1])
+class ResidualCouplingBlock(nn.Module):
+    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
+        super().__init__()
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers))
+            self.flows.append(Flip())
+    def forward(self, x, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, reverse=reverse)
+        return x
 class Decoder(nn.Module):
     def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
         super().__init__()
 class RVCModel(nn.Module):
+    """VITS-like RVC v3.0 Model"""
     def __init__(self, n_mels=80, hidden_channels=192):
         super().__init__()
         self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
 # ============================================================
+# HiFi-GAN Vocoder
 # ============================================================
 class ResBlock1(nn.Module):
 class HiFiGANGenerator(nn.Module):
     def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
                  upsample_kernel_sizes=(16, 16, 4, 4),
                  upsample_initial_channel=512,
                  resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
         super().__init__()
         self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
         self.num_upsamples = len(upsample_rates)
         self.num_kernels = len(resblock_kernel_sizes)
         self.ups = nn.ModuleList()
         self.resblocks = nn.ModuleList()
         ch = upsample_initial_channel
         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
             ch_new = ch // 2
             for _, (rk, rd) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                 self.resblocks.append(ResBlock1(ch_new, rk, rd))
             ch = ch_new
         self.conv_post = nn.Sequential(
             nn.LeakyReLU(0.1),
             nn.Conv1d(ch, 1, 7, padding=3),
 # ============================================================
+# Mel utilities
 # ============================================================
 def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
+    mel_transform = torchaudio.transforms.MelSpectrogram(
+        sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length,
+        n_mels=n_mels, f_min=0.0, f_max=float(sample_rate // 2),
+        power=2.0, norm=None, mel_scale="htk",
+    )
     mel = mel_transform(y)
     mel = torch.log(torch.clamp(mel, min=1e-5))
     return mel
 def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024, hop_length=256, n_iter=32):
     inverse_mel = torchaudio.transforms.InverseMelScale(
+        n_stft=n_fft // 2 + 1, n_mels=mel.shape[0],
+        sample_rate=sample_rate, f_min=0, f_max=float(sample_rate // 2), mel_scale="htk",
     )
     mel_power = torch.exp(mel)
     spec = inverse_mel(mel_power)
+    griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft, hop_length=hop_length, n_iter=n_iter)
     audio = griffin_lim(spec)
     return audio.numpy()
         self.load_models()
     def load_models(self):
         print("Loading RVC model...")
         try:
             model_path = hf_hub_download(
+                repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
             )
             ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
+            state_dict = ckpt.get('model', ckpt.get('state_dict', ckpt))
             hidden_ch = 192
             for k, v in state_dict.items():
                 if 'enc_p.pre.weight' in k:
                     hidden_ch = v.shape[0]
                     break
             self.rvc_model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
             self.rvc_model.load_state_dict(state_dict, strict=False)
             self.rvc_model.eval()
             print(f"RVC model loaded (hidden={hidden_ch})")
         except Exception as e:
             print(f"RVC model load failed: {e}")
         print("Loading HiFi-GAN vocoder...")
         try:
+            hifigan_path = hf_hub_download(repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000")
+            ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
+            state_dict = ckpt.get('generator', ckpt.get('state_dict', ckpt))
+            if any(k.startswith('generator.') for k in state_dict):
+                state_dict = {k.replace('generator.', ''): v for k, v in state_dict.items() if k.startswith('generator.')}
+            self.hifigan = HiFiGANGenerator()
+            self.hifigan.load_state_dict(state_dict, strict=False)
+            self.hifigan.eval()
+            print("HiFi-GAN vocoder loaded")
         except Exception as e:
             print(f"HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
             self.hifigan = None
             print(f"Found {len(self.samples)} sample audio files")
         except Exception as e:
             print(f"Could not list samples: {e}")
         self.model_loaded = self.rvc_model is not None
     def process_audio(self, input_audio, pitch_shift=0):
         if not self.model_loaded:
             return None, "Model not loaded"
         if input_audio is None:
             return None, "Please upload an audio file"
         try:
             y, sr = torchaudio.load(input_audio)
             if y.shape[0] > 1:
                 y = y.mean(dim=0)
             else:
                 y = y.squeeze(0)
             if sr != self.sample_rate:
+                y = torchaudio.transforms.Resample(sr, self.sample_rate)(y)
                 sr = self.sample_rate
             if pitch_shift != 0:
                 factor = 2.0 ** (abs(pitch_shift) / 12.0)
+                new_len = int(len(y) / factor) if pitch_shift > 0 else int(len(y) * factor)
+                y = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=new_len, mode='linear').squeeze(0).squeeze(0)
+            # Trim silence
             energy = y ** 2
+            window_size = int(0.1 * sr)
             if len(energy) > window_size:
                 kernel = torch.ones(window_size) / window_size
+                smooth_energy = F.conv1d(
                     energy.unsqueeze(0).unsqueeze(0), kernel.unsqueeze(0).unsqueeze(0), padding=window_size // 2
                 ).squeeze()
+                threshold = smooth_energy.max() * (10 ** (-20 / 10))
                 active = torch.where(smooth_energy > threshold)[0]
                 if len(active) > 0:
                     y = y[active[0]:active[-1] + 1]
                 mel_out = mel_out.squeeze(0)
             if self.hifigan is not None:
+                with torch.no_grad():
+                    audio_out = self.hifigan(mel_out.unsqueeze(0))
+                audio_out = audio_out.squeeze(0).squeeze(0).cpu().numpy()
                 vocoder_name = "HiFi-GAN"
             else:
                 audio_out = mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
                 vocoder_name = "Griffin-Lim"
             audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
             output_path = tempfile.mktemp(suffix='.wav')
             sf.write(output_path, audio_out, self.sample_rate)
+            return output_path, f"Success ({vocoder_name}) | {len(y)/sr:.1f}s -> {len(audio_out)/self.sample_rate:.1f}s"
         except Exception as e:
             import traceback
             traceback.print_exc()
+            return None, f"Error: {str(e)}"
     def generate_random(self):
         if not self.samples:
             return None, "No samples available"
         try:
             sample = random.choice(self.samples)
+            sample_path = hf_hub_download(repo_id=self.dataset_id, filename=sample, repo_type="dataset")
             output, msg = self.process_audio(sample_path)
             if output:
                 return output, f"{msg}\nSample: {Path(sample).name}"
             return output, msg
         except Exception as e:
+            return None, f"Error: {str(e)}"
 # ============================================================
+# Gradio UI - Minimal version using Interface (not Blocks)
 # ============================================================
 print("Initializing NumberBlocks One Voice Cloner...")
 cloner = VoiceCloner()
+# Use simple gr.Interface instead of Blocks to avoid schema issues
+vc_interface = gr.Interface(
+    fn=cloner.process_audio,
+    inputs=[
+        gr.Audio(label="Upload Audio", type="filepath"),
+        gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)"),
+    ],
+    outputs=[
+        gr.Audio(label="Result", type="filepath"),
+        gr.Textbox(label="Status"),
+    ],
+    title="NumberBlocks One Voice Cloner",
+    description="RVC v2 Model (60.7MB) + HiFi-GAN Vocoder | Upload audio to convert to One's voice",
+    allow_flagging="never",
+)
+rand_interface = gr.Interface(
+    fn=cloner.generate_random,
+    inputs=[],
+    outputs=[
+        gr.Audio(label="Result", type="filepath"),
+        gr.Textbox(label="Status"),
+    ],
+    title="Random Sample Generation",
+    description="Generate from random dataset sample + RVC conversion",
+    allow_flagging="never",
+)
+demo = gr.TabbedInterface(
+    [vc_interface, rand_interface],
+    ["Voice Conversion", "Random Sample"],
+)
+demo.launch(server_name="0.0.0.0", server_port=7860)