#!/usr/bin/env python3
"""
NumberBlocks One Voice Cloner - RVC Inference with HiFi-GAN Vocoder
Uses the trained RVC v2 model + pretrained HiFi-GAN universal vocoder for high-quality synthesis.
"""
import os, json
import gradio as gr
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# ════════════════════════════════════════════════════════════
# HiFi-GAN Generator (exact match to pretrained weights)
# ════════════════════════════════════════════════════════════

class HiFiGANResBlock(nn.Module):
    """Residual block of the HiFi-GAN generator (``ResBlock1`` layout).

    For every entry of ``dilation_sizes`` the block holds a pair of
    weight-normalised convolutions: a dilated one (``convs1``) followed by a
    non-dilated one (``convs2``). Each pair is wrapped in a residual
    connection, and the time dimension is preserved (odd ``kernel_size``).

    The parameter names (``convs1.N`` / ``convs2.N``) and weight shapes are
    what the pretrained checkpoint is keyed on, so they must not change.

    Args:
        channels: Number of input and output channels.
        kernel_size: Kernel width shared by all convolutions in the block.
        dilation_sizes: Dilation rates, one per ``convs1`` layer.
    """

    def __init__(self, channels, kernel_size, dilation_sizes):
        super().__init__()
        # Padding is applied by hand with F.pad in forward(); zero-padding by
        # (kernel_size - 1) * dilation // 2 on both sides is the usual "same"
        # padding and keeps the sequence length unchanged.
        self.paddings1 = []
        self.convs1 = nn.ModuleList()
        for d in dilation_sizes:
            self.convs1.append(nn.utils.weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=0)))
            self.paddings1.append((kernel_size - 1) * d // 2)
        # The second convolution of each pair is NOT dilated in the reference
        # ResBlock1 (dilation=1). Using the dilated rate here would still load
        # the pretrained weights (same shapes) but evaluate them with a
        # different receptive field than they were trained with.
        self.paddings2 = []
        self.convs2 = nn.ModuleList()
        for _ in dilation_sizes:
            self.convs2.append(nn.utils.weight_norm(nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=0)))
            self.paddings2.append((kernel_size - 1) // 2)

    def forward(self, x):
        """Apply every (dilated conv, plain conv) pair with a residual add.

        Args:
            x: Tensor whose last two dimensions are ``(channels, time)``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        for c1, p1, c2, p2 in zip(self.convs1, self.paddings1, self.convs2, self.paddings2):
            xt = F.leaky_relu(x, 0.1)
            xt = F.pad(xt, (p1, p1))
            xt = c1(xt)
            xt = F.leaky_relu(xt, 0.1)
            xt = F.pad(xt, (p2, p2))
            xt = c2(xt)
            x = xt + x
        return x


class HiFiGANGenerator(nn.Module):
    """HiFi-GAN generator: mel spectrogram in, waveform out.

    Layer attribute names (``conv_pre``, ``ups``, ``resblocks``,
    ``conv_post``) define the ``state_dict`` keys used by the pretrained
    checkpoint and must stay as they are.

    Args:
        config: Mapping read for the keys ``upsample_initial_channel``,
            ``upsample_rates``, ``upsample_kernel_sizes``,
            ``resblock_kernel_sizes`` and ``resblock_dilation_sizes``.
            The input is fixed at 80 mel channels.
    """

    def __init__(self, config):
        super().__init__()
        initial_channels = config["upsample_initial_channel"]
        self.num_kernels = len(config["resblock_kernel_sizes"])
        self.conv_pre = nn.utils.weight_norm(
            nn.Conv1d(80, initial_channels, 7, 1, padding=3)
        )

        # Every upsampling stage halves the channel count.
        self.ups = nn.ModuleList()
        for stage, (u, k) in enumerate(zip(config["upsample_rates"], config["upsample_kernel_sizes"])):
            ch_in = initial_channels // (2 ** stage)
            ch_out = ch_in // 2
            self.ups.append(
                nn.utils.weight_norm(
                    nn.ConvTranspose1d(ch_in, ch_out, k, u, padding=(k - u) // 2)
                )
            )

        # ``num_kernels`` parallel residual blocks per upsampling stage, stored
        # flat: stage i owns indices [i * num_kernels, (i + 1) * num_kernels).
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = initial_channels // (2 ** (stage + 1))
            for k, d in zip(config["resblock_kernel_sizes"], config["resblock_dilation_sizes"]):
                self.resblocks.append(HiFiGANResBlock(ch, k, d))

        ch_out = initial_channels // (2 ** len(self.ups))
        self.conv_post = nn.utils.weight_norm(nn.Conv1d(ch_out, 1, 7, 1, padding=3))

    def forward(self, mel):
        """Synthesize a waveform from a mel spectrogram.

        Args:
            mel: Tensor with 80 channels in the channel dimension,
                e.g. ``(batch, 80, frames)``.

        Returns:
            Tensor with a single channel and values in ``[-1, 1]`` (tanh).
        """
        x = self.conv_pre(mel)
        for i, up in enumerate(self.ups):
            x = F.leaky_relu(x, 0.1)
            x = up(x)
            # Average the outputs of this stage's parallel residual blocks.
            xs = 0
            for j in range(self.num_kernels):
                xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        # The reference HiFi-GAN generator uses the *default* leaky_relu slope
        # (0.01) for this last activation, unlike the 0.1 used everywhere
        # else; pretrained weights expect that.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x


# ════════════════════════════════════════════════════════════
# Voice Model (VITS-like from training)
# ════════════════════════════════════════════════════════════

class VoiceModel(nn.Module):
    """VITS-like mel-to-mel network: encoder -> posterior -> flow -> decoder.

    The sub-module attribute names (``encoder.seq``, ``posterior.conv``,
    ``flow.net``, ``decoder``) determine the ``state_dict`` keys, so they are
    kept exactly as they are to stay loadable from a saved checkpoint.

    Args:
        n_mels: Number of mel channels the encoder consumes.
        hd: Hidden width; the encoder ends at ``2 * hd`` channels.

    NOTE(review): the latent size (192) and the decoder output size (80) are
    hard-coded and do not follow ``n_mels``.
    """

    def __init__(self, n_mels, hd):
        super().__init__()
        self.encoder = self._build_encoder(n_mels, hd)
        self.posterior = self._build_posterior(hd)
        self.flow = self._build_flow(hd)
        self.decoder = self._build_decoder(hd)

    def forward(self, mel):
        """Run the full encoder -> posterior -> flow -> decoder chain.

        Without this method ``model(mel)`` falls through to
        ``nn.Module.forward`` and raises ``NotImplementedError``.

        Args:
            mel: Tensor of shape ``(batch, n_mels, frames)``.

        Returns:
            Tuple ``(mel_out, m, logs)``: the decoded 80-channel mel and the
            posterior mean / log-scale, each 192 channels wide.
        """
        hidden = self.encoder(mel)
        z, m, logs = self.posterior(hidden)
        z = self.flow(z)
        mel_out = self.decoder(z)
        return mel_out, m, logs

    def _build_encoder(self, n_mels, hd):
        """Build the conv stack (Conv1d + BatchNorm + ReLU) ending in a LayerNorm."""
        layers = []
        ch_in = n_mels
        for ch_out, ks in [(hd, 5), (hd, 5), (hd, 5), (hd * 2, 5), (hd * 2, 3)]:
            layers.extend([nn.utils.weight_norm(nn.Conv1d(ch_in, ch_out, ks, padding=ks // 2)),
                           nn.BatchNorm1d(ch_out), nn.ReLU()])
            ch_in = ch_out
        layers.append(nn.LayerNorm(hd * 2))

        class Enc(nn.Module):
            def __init__(self, seq):
                super().__init__()
                self.seq = seq

            def forward(self, x):
                # All layers but the last work on (batch, channels, time).
                x = self.seq[:-1](x)
                # LayerNorm normalises the last dim, so move channels there
                # and back again.
                x = self.seq[-1](x.transpose(1, 2)).transpose(1, 2)
                return x

        return Enc(nn.Sequential(*layers))

    def _build_posterior(self, hd):
        """Build the 1x1 conv that predicts mean / log-scale and samples ``z``."""
        class Post(nn.Module):
            def __init__(self):
                super().__init__()
                self.conv = nn.utils.weight_norm(nn.Conv1d(hd * 2, 384, 1))

            def forward(self, x):
                stats = self.conv(x)
                # 384 channels = 192 for the mean + 192 for the log-scale.
                m, logs = torch.split(stats, 192, dim=1)
                z = m + torch.randn_like(m) * torch.exp(logs)
                return z, m, logs

        return Post()

    def _build_flow(self, hd):
        """Build the additive transform conditioned on the first 96 channels."""
        class Flow(nn.Module):
            def __init__(self):
                super().__init__()
                self.net = nn.Sequential(nn.Conv1d(96, hd, 1), nn.ReLU(), nn.Conv1d(hd, hd, 1), nn.ReLU(), nn.Conv1d(hd, 192, 1))

            def forward(self, z):
                # Only the first half drives the shift; the unpacking also
                # enforces that z has exactly 192 channels.
                z1, _ = torch.split(z, 96, dim=1)
                return z + self.net(z1)

        return Flow()

    def _build_decoder(self, hd):
        """Build the conv stack mapping the 192-channel latent to an 80-channel mel."""
        layers = []
        ch_in = 192
        for ch_out, ks in [(hd * 2, 5), (hd * 2, 5), (hd, 5), (hd, 3)]:
            layers.extend([nn.utils.weight_norm(nn.Conv1d(ch_in, ch_out, ks, padding=ks // 2)),
                           nn.BatchNorm1d(ch_out), nn.ReLU()])
            ch_in = ch_out
        layers.append(nn.utils.weight_norm(nn.Conv1d(hd, 80, 1)))
        return nn.Sequential(*layers)


# ════════════════════════════════════════════════════════════
# Model Loading
# ════════════════════════════════════════════════════════════

def download_file(repo_id, filename, repo_type="dataset"):
    """Download one file from the Hugging Face Hub.

    Args:
        repo_id: Hub repository identifier.
        filename: Path of the file inside the repository.
        repo_type: Repository kind passed through to the Hub client
            (defaults to ``"dataset"``).

    Returns:
        Whatever ``hf_hub_download`` returns; callers in this file use it as
        a local file path.
    """
    # Imported here, so huggingface_hub is only needed once a download is
    # actually requested.
    from huggingface_hub import hf_hub_download

    request = {"repo_id": repo_id, "filename": filename, "repo_type": repo_type}
    return hf_hub_download(**request)

def load_hifigan():
    """Download and instantiate the pretrained HiFi-GAN vocoder.

    Returns:
        Tuple ``(vocoder, config)``: the generator in eval mode with the
        checkpoint's ``"generator"`` weights loaded, and the parsed
        ``config.json`` dictionary.
    """
    repo = "ORI-Muchim/HiFi-GAN_44100hz_universal"
    config_file = download_file(repo, "config.json", repo_type="model")
    checkpoint_file = download_file(repo, "g_02500000", repo_type="model")

    with open(config_file) as handle:
        hfg_cfg = json.load(handle)

    vocoder = HiFiGANGenerator(hfg_cfg)
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects from a downloaded file; only safe while the repo is trusted.
    checkpoint = torch.load(checkpoint_file, map_location="cpu", weights_only=False)
    vocoder.load_state_dict(checkpoint["generator"])
    vocoder.eval()
    return vocoder, hfg_cfg

def load_voice_model():
    """Download the trained voice checkpoint and build the model from it.

    The checkpoint is expected to hold ``'config'`` (with ``'n_mels'`` and
    ``'hidden_dim'``) and ``'model_state_dict'``.

    Returns:
        Tuple ``(model, cfg)``: the ``VoiceModel`` in eval mode and the
        checkpoint's config dictionary.
    """
    model_file = download_file("ayf3/numberblocks-one-voice-dataset", "models/one_voice_rvc_v2.pth")
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects from a downloaded file; only safe while the repo is trusted.
    ckpt = torch.load(model_file, map_location="cpu", weights_only=False)
    cfg = ckpt['config']
    sd = ckpt['model_state_dict']
    model = VoiceModel(cfg['n_mels'], cfg['hidden_dim'])

    # strict=False tolerates key mismatches, but a mismatch means some layers
    # silently keep their random initialisation, so report it instead of
    # hiding it.
    result = model.load_state_dict(sd, strict=False)
    if result.missing_keys:
        print(f"Warning: {len(result.missing_keys)} voice-model parameter(s) missing from checkpoint: {result.missing_keys}")
    if result.unexpected_keys:
        print(f"Warning: {len(result.unexpected_keys)} unexpected checkpoint key(s) ignored: {result.unexpected_keys}")

    model.eval()
    return model, cfg


# ════════════════════════════════════════════════════════════
# Audio Processing
# ════════════════════════════════════════════════════════════

def mel_spectrogram(audio, sr, n_mels=80, hop_length=256, win_length=1024, n_fft=1024):
    """Compute a dB-scaled mel spectrogram with librosa.

    Args:
        audio: NumPy array of samples; cast to float32 before analysis.
        sr: Sample rate of ``audio``.
        n_mels: Number of mel bands.
        hop_length: Hop between analysis frames, in samples.
        win_length: Window length, in samples.
        n_fft: FFT size.

    Returns:
        The power mel spectrogram converted to dB relative to its own
        maximum (``ref=np.max``). The upper band edge is fixed at 8000 Hz.
    """
    import librosa

    samples = audio.astype(np.float32)
    power_mel = librosa.feature.melspectrogram(
        y=samples,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mels=n_mels,
        fmax=8000,
    )
    return librosa.power_to_db(power_mel, ref=np.max)

def mel_to_audio_hifigan(vocoder, mel_tensor):
    """Run ``vocoder`` on ``mel_tensor`` without gradients.

    Args:
        vocoder: Callable taking the mel tensor and returning a tensor.
        mel_tensor: Input passed to ``vocoder`` unchanged.

    Returns:
        The vocoder output with all size-1 dimensions removed, as a NumPy
        array on the CPU.
    """
    with torch.no_grad():
        waveform = vocoder(mel_tensor)
    flattened = waveform.squeeze()
    return flattened.cpu().numpy()


# ════════════════════════════════════════════════════════════
# Globals & Init
# ════════════════════════════════════════════════════════════

# Module-level model handles; they stay None until init_models() fills them.
voice_model = None
voice_config = None
hifigan = None
hifigan_config = None


def init_models():
    """Load the voice model and the HiFi-GAN vocoder on first use.

    Each model is loaded only while its global is still ``None``, so repeated
    calls after a successful load do nothing.
    """
    global voice_model, voice_config, hifigan, hifigan_config

    voice_pending = voice_model is None
    if voice_pending:
        print("Loading voice model...")
        loaded_voice = load_voice_model()
        voice_model, voice_config = loaded_voice
        print("Voice model loaded.")

    vocoder_pending = hifigan is None
    if vocoder_pending:
        print("Loading HiFi-GAN vocoder...")
        loaded_vocoder = load_hifigan()
        hifigan, hifigan_config = loaded_vocoder
        print("HiFi-GAN vocoder loaded.")


# ════════════════════════════════════════════════════════════
# Core Functions
# ════════════════════════════════════════════════════════════

def convert_voice(audio_input, transpose=0):
    """Convert an uploaded recording with the voice model and the vocoder.

    Args:
        audio_input: ``(sample_rate, samples)`` pair as delivered by the
            Gradio audio component, or ``None`` when nothing was provided.
            Multi-dimensional ``samples`` are averaged over axis 1.
        transpose: Number of mel bins to roll the normalised spectrogram by
            (truncated to an integer).

    Returns:
        ``((44100, waveform_float32), status_message)`` on success, or
        ``(None, error_message)`` when there is no audio to process.
    """
    # Reject missing input before paying for model downloads / loading.
    if audio_input is None:
        return None, "❌ 请上传音频文件"

    target_sr = 44100
    sr_in, data = audio_input[0], audio_input[1]
    data = np.asarray(data, dtype=np.float32)

    # Down-mix BEFORE resampling: with (samples, channels) input,
    # librosa.resample works along the last axis and would otherwise
    # resample across the channels instead of across time.
    if data.ndim > 1:
        data = data.mean(axis=1)

    if data.size == 0:
        return None, "❌ 请上传音频文件"

    init_models()
    import librosa

    # Resample to 44100 for HiFi-GAN
    if sr_in != target_sr:
        data = librosa.resample(data, orig_sr=sr_in, target_sr=target_sr)

    # Trim to max 30 seconds
    data = data[: target_sr * 30]

    # Compute mel spectrogram and standardise it (zero mean, unit variance)
    mel = mel_spectrogram(data, target_sr)
    mel_norm = (mel - mel.mean()) / (mel.std() + 1e-8)

    # np.roll needs an integer shift; the slider may hand over a float.
    # NOTE(review): this rolls mel bins cyclically, so bins pushed off one
    # edge reappear at the other; it is not a true semitone shift.
    shift = int(transpose)
    if shift != 0:
        mel_norm = np.roll(mel_norm, shift, axis=0)

    # Voice model timbre transfer
    with torch.no_grad():
        mel_tensor = torch.as_tensor(mel_norm, dtype=torch.float32).unsqueeze(0)
        mel_out, _, _ = voice_model(mel_tensor)
        # Clamp on the batched tensor; a bare squeeze() here would also drop
        # the time axis when the mel has a single frame.
        mel_out = torch.clamp(mel_out.float().cpu(), -4.0, 4.0)

    # HiFi-GAN vocoding (mel_to_audio_hifigan already disables gradients)
    audio_out = np.atleast_1d(mel_to_audio_hifigan(hifigan, mel_out))

    # Peak-normalise to 0.85 full scale, leaving pure silence untouched.
    mx = np.max(np.abs(audio_out))
    if mx > 0:
        audio_out = audio_out / mx * 0.85

    return (44100, audio_out.astype(np.float32)), \
        f"✅ 转换完成! (HiFi-GAN vocoder)\n输入: {len(data)/44100:.1f}s → 输出: {len(audio_out)/44100:.1f}s"


def generate_sample():
    """Synthesize audio from a random latent.

    Draws a ``(1, 192, 400)`` Gaussian latent scaled by 0.5, passes it through
    the voice model's flow and decoder, clips the resulting mel to
    ``[-4, 4]``, vocodes it and peak-normalises the waveform to 0.85.

    Returns:
        ``((44100, waveform_float32), status_message)``.
    """
    init_models()

    frame_count = 400
    with torch.no_grad():
        latent = torch.randn(1, 192, frame_count) * 0.5
        mel_out = voice_model.decoder(voice_model.flow(latent))

    mel_clipped = np.clip(mel_out.squeeze().cpu().numpy(), -4.0, 4.0)
    vocoder_input = torch.FloatTensor(mel_clipped).unsqueeze(0)

    with torch.no_grad():
        audio_out = mel_to_audio_hifigan(hifigan, vocoder_input)

    # Scale so the loudest sample sits at 0.85; skip pure silence.
    peak = np.max(np.abs(audio_out))
    if peak > 0:
        audio_out = audio_out / peak * 0.85

    status = f"✅ 生成完成! (HiFi-GAN vocoder)\n时长: {len(audio_out)/44100:.1f}s"
    return (44100, audio_out.astype(np.float32)), status


# ════════════════════════════════════════════════════════════
# Gradio UI
# ════════════════════════════════════════════════════════════

# Build the three-tab Gradio interface. Components are created in layout
# order inside the `with` blocks, so statement order here is significant.
with gr.Blocks(title="🎙️ NumberBlocks One Voice", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.HTML("""
    <div style="text-align:center; margin-bottom:1rem">
        <h1 style="color:#ff6b6b">🎙️ NumberBlocks One 语音克隆</h1>
        <p>RVC v2 Model + HiFi-GAN Vocoder — High Quality Voice Conversion</p>
    </div>
    """)

    # Tab 1: upload or record audio and run convert_voice on it.
    with gr.Tab("🔊 Voice Conversion"):
        gr.Markdown("上传音频，将其转换为 One 的声音（使用 HiFi-GAN 神经声码器提升音质）")
        audio_in = gr.Audio(label="输入音频", sources=["upload", "microphone"])
        # The slider value is passed to convert_voice as its `transpose` argument.
        pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift (semitones)")
        convert_btn = gr.Button("🔄 转换", variant="primary")
        audio_out = gr.Audio(label="输出音频 (HiFi-GAN)")
        status = gr.Textbox(label="状态")
        # convert_voice returns (audio, status text), matching the two outputs.
        convert_btn.click(convert_voice, [audio_in, pitch], [audio_out, status])

    # Tab 2: generate audio from a random latent via generate_sample (no inputs).
    with gr.Tab("🎵 Sample Generation"):
        gr.Markdown("生成 One 的随机语音样本（使用 HiFi-GAN 神经声码器）")
        gen_btn = gr.Button("🎵 生成样本", variant="primary")
        gen_out = gr.Audio(label="生成音频 (HiFi-GAN)")
        gen_status = gr.Textbox(label="状态")
        gen_btn.click(generate_sample, outputs=[gen_out, gen_status])

    # Tab 3: static information about the models and links.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### Model Info
        - **Voice Model**: VITS-like (Encoder + Posterior + Flow + Decoder) — 5.3M params
        - **Vocoder**: HiFi-GAN Universal (44100Hz) — 928K params
        - **Sample Rate**: 44100 Hz
        - **Training Data**: 100 source files, 1,334 chunks
        - **Training Steps**: 500

        ### What's New
        - ✅ Integrated HiFi-GAN neural vocoder replacing overlap-add
        - ✅ Significantly improved audio quality and naturalness
        - ✅ Proper mel→audio conversion with learned upsampling

        ### Links
        - [Dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
        - [Training Space](https://huggingface.co/spaces/ayf3/rvc-cpu-trainer)
        """)

# Start the server on all interfaces, port 7860.
# NOTE(review): this runs at import time (there is no __main__ guard).
demo.launch(server_name="0.0.0.0", server_port=7860)