"""
VieNeu-TTS v3 Turbo — Hugging Face ZeroGPU Space
================================================
Vietnamese text-to-speech, 48 kHz, with built-in voices + instant voice cloning
+ a multi-speaker conversation tab. Shows generation speed / RTF per request.
ZeroGPU notes:
  * The model is placed on ``cuda`` at module import (ZeroGPU runs a CUDA
    emulation outside ``@spaces.GPU`` so this is the recommended, fastest path).
  * Real GPU compute happens only inside the ``@spaces.GPU`` decorated functions.
"""
import os
import re
import time
import logging

import numpy as np
import soundfile as sf
import gradio as gr
import spaces

# ── Logging request infer của người dùng (xem ở tab "Logs" của Space) ──────────
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger("vieneu.space")


def _log_infer(mode: str, text: str, *, voice=None, cloned: bool = False) -> None:
    """Log một request infer: chế độ, giọng/clone, độ dài và nội dung text.

    Dùng repr() để text (kể cả hội thoại nhiều dòng) gói gọn trên một dòng log.
    """
    src = "clone" if cloned else f"voice={voice}"
    logger.info("[%s] %s | chars=%d | text=%r", mode, src, len(text or ""), text)

from vieneu import Vieneu
from vieneu_utils.core_utils import split_text_into_chunks
from vieneu_utils.phonemize_text import phonemize_text_with_emotions

# ── TEMPORARY: work around ZeroGPU "No CUDA GPUs are available" at import ──────
# Recent `spaces` releases do not let safetensors materialize tensors directly
# on CUDA when running outside @spaces.GPU. We force load_model to load the
# weights onto the CPU; the library (_apply_dtype) then moves them with
# .to(cuda) itself — ZeroGPU can defer that .to() path, so it is valid at
# import time. (The permanent fix lives in vieneu's hub_load_v3_turbo.)
# This must be patched BEFORE Vieneu(...) is called.
import vieneu._v3_turbo_engine.hub_load_v3_turbo as _hub_load

# Keep a handle on the unpatched loader so the wrapper can delegate to it.
_orig_load_model = _hub_load.load_model

def _load_model_cpu(model, filename, *args, **kwargs):
    """Call the original ``load_model`` with any ``device`` keyword removed.

    All other positional and keyword arguments are forwarded unchanged, and the
    original loader's return value is returned as-is.

    NOTE(review): only a *keyword* ``device`` is stripped; a device passed
    positionally through ``*args`` would still reach the original loader.
    """
    kwargs.pop("device", None)          # drop device -> load onto the CPU
    return _orig_load_model(model, filename, *args, **kwargs)

# Install the wrapper on the module so later lookups of ``load_model`` there see it.
_hub_load.load_model = _load_model_cpu

# ── Load model once, on GPU (CUDA emulation makes this valid at startup) ───────
# This runs at import time; the handlers below all use this single module-level
# ``tts`` instance instead of constructing their own.
print("⏳ Loading VieNeu-TTS v3 Turbo (PyTorch / CUDA) ...")
tts = Vieneu(
    mode="v3turbo",
    device="cuda",       # ZeroGPU: keep weights on cuda from the start
    backend="pytorch",   # force the PyTorch engine (ONNX is the CPU-only path)
    hf_token=os.getenv("HF_TOKEN"),  # None when the HF_TOKEN variable is unset
)
print("✅ Model ready.")

# Output sample rate reported by the engine; used for every returned audio tuple.
SR = tts.sample_rate
PRESET_VOICES = tts.list_preset_voices()                  # [(label, voice_id), ...]
# (label, value) pairs in the shape the voice dropdown takes as ``choices``.
VOICE_CHOICES = [(label, vid) for label, vid in PRESET_VOICES]
VOICE_IDS = [vid for _, vid in PRESET_VOICES]
VOICE_SET = set(VOICE_IDS)                                # O(1) membership checks
DEFAULT_VOICE = tts._default_voice or (VOICE_IDS[0] if VOICE_IDS else None)
# Default voice shown in the read-text / stream dropdown.
DEFAULT_UI_VOICE = "Ngọc Linh" if "Ngọc Linh" in VOICE_SET else DEFAULT_VOICE
# Two distinct voices for the default conversation demo, if available.
# The preferred picks are positions 5 and 8 of the preset list. Each index is
# checked against the actual list length, so a model that ships fewer preset
# voices falls back gracefully instead of raising IndexError at import time.
SPK_A = VOICE_IDS[5] if len(VOICE_IDS) > 5 else DEFAULT_VOICE
if len(VOICE_IDS) > 8:
    SPK_B = VOICE_IDS[8]
else:
    # Fewer than nine voices: take the first one that differs from SPK_A, or
    # reuse SPK_A when no second voice exists.
    SPK_B = next((vid for vid in VOICE_IDS if vid != SPK_A), SPK_A)

# Initial value of the single-speaker text box; it includes bracketed tags
# such as ``[hắng giọng]`` and ``[cười]``.
DEFAULT_TEXT = (
    "Xin chào mọi người! [hắng giọng] Như bạn đang nghe thấy đấy, tốc độ xử lý của mình "
    "cực kỳ nhanh và mượt mà, giúp phản hồi gần như ngay lập tức theo thời gian thực. "
    "Chính vì vậy, mình rất phù hợp để ứng dụng trực tiếp vào các hệ thống Chatbot thông minh, "
    "trợ lý ảo, hoặc làm tổng đài viên tự động cho các doanh nghiệp. Tiện lợi quá đúng không ạ? "
    "[cười] Hi vọng phiên bản nâng cấp v3 này sẽ mang lại trải nghiệm tuyệt vời cho dự án của bạn."
)

# Initial value of the conversation script box: four ``Speaker: line`` rows that
# alternate between the two demo voices SPK_A and SPK_B.
DEFAULT_DIALOGUE = (
    f"{SPK_A}: Chào bạn, dạo này công việc thế nào rồi?\n"
    f"{SPK_B}: Mình vẫn ổn, cảm ơn bạn đã hỏi thăm nhé!\n"
    f"{SPK_A}: Nghe nói bạn vừa hoàn thành một dự án mới, chúc mừng nha!\n"
    f"{SPK_B}: Cảm ơn nhiều, đúng là một hành trình thú vị đấy [cười].\n"
)


def _stats_md(elapsed: float, n_samples: int) -> str:
    """Format a three-line Markdown report of generation speed.

    Args:
        elapsed: Wall-clock generation time in seconds.
        n_samples: Number of audio samples produced (converted to seconds
            using the module-level sample rate ``SR``).

    Returns:
        Markdown with the generation time, the audio duration, and the
        real-time factor (``elapsed / duration``) together with its inverse.
        Either ratio is reported as 0 when its denominator is not positive.
    """
    audio_s = n_samples / SR if SR else 0.0

    if audio_s > 0:
        rtf = elapsed / audio_s
    else:
        rtf = 0.0

    if elapsed > 0:
        speed = audio_s / elapsed
    else:
        speed = 0.0

    report_lines = [
        f"⏱️ Thời gian sinh: **{elapsed:.2f}s**",
        f"🔊 Độ dài audio: **{audio_s:.2f}s**",
        f"⚡ RTF: **{rtf:.3f}** (×{speed:.1f} so với thời gian thực)",
    ]
    return "\n".join(report_lines)


def _gpu_duration(text, *args, **kwargs):
    """Dynamic ZeroGPU budget: scale with text length, capped at 3 minutes."""
    n = len(text or "")
    return int(min(180, 30 + n // 8))


# Each streamed chunk is generated in full and yielded as ONE consistent piece.
# Gradio's streaming audio wants chunks that are consistent and **>1 second**
# (smaller/uneven pieces stutter and stop), so we target ~2 s of audio per chunk —
# small enough to start fast, large enough to play back smoothly.
# The value is the ``max_chars`` text budget that ``stream_synthesize`` hands to
# ``split_text_into_chunks``.
# NOTE(review): how many seconds of audio 140 characters produces is not visible
# in this file — confirm it actually matches the ~2 s target described above.
_STREAM_CHUNK_CHARS = 140


def _check_ref_len(path):
    """Return a Gradio update that shows or hides the reference-length warning.

    The warning becomes visible only when ``path`` points to an audio file whose
    duration, as reported by ``soundfile``, exceeds 5.5 seconds. With no path,
    or when the file cannot be inspected, the warning is hidden.
    """
    if path:
        try:
            seconds = sf.info(path).duration
        except Exception:
            # Best effort: an unreadable clip simply produces no warning.
            pass
        else:
            if seconds > 5.5:
                warning = (
                    f"⚠️ Audio mẫu đang dài **{seconds:.1f} giây**. Hãy cắt còn "
                    f"**3–5 giây** (một câu nói rõ, ít ồn) để clone giọng tốt nhất — "
                    f"audio quá dài thường cho kết quả kém hơn."
                )
                return gr.update(visible=True, value=warning)
    return gr.update(visible=False)


# ── Single-speaker synthesis ──────────────────────────────────────────────────
@spaces.GPU(duration=_gpu_duration)
def synthesize(text, voice, ref_audio, temperature, top_k, top_p,
               repetition_penalty, max_new_frames, max_chars):
    """Synthesize ``text`` in one pass and report how fast it was generated.

    Args:
        text: Text to read; surrounding whitespace is stripped.
        voice: Preset voice, used only when no reference clip is given.
        ref_audio: Reference clip for voice cloning; when truthy it takes
            precedence over ``voice``.
        temperature, top_k, top_p, repetition_penalty, max_new_frames, max_chars:
            Generation settings, cast to ``float``/``int`` and forwarded to
            ``tts.infer``.

    Returns:
        ``((SR, waveform), stats_markdown)`` where ``waveform`` is a float32
        NumPy array.

    Raises:
        gr.Error: If ``text`` is empty after stripping.
    """
    text = text.strip() if text else ""
    if not text:
        raise gr.Error("Vui lòng nhập văn bản cần đọc.")
    _log_infer("full", text, voice=voice, cloned=bool(ref_audio))

    # No ``emotion`` is passed: a preset voice is selected through ``voice=<name>``
    # (resolved to its reserved speaker token inside the engine), and for cloning
    # the engine falls back to its own "natural" default.
    sampling = {
        "temperature": float(temperature),
        "top_k": int(top_k),
        "top_p": float(top_p),
        "repetition_penalty": float(repetition_penalty),
        "max_new_frames": int(max_new_frames),
        "max_chars": int(max_chars),
    }
    # An uploaded reference clip wins over the preset voice → voice cloning.
    speaker = {"ref_audio": ref_audio} if ref_audio else {"voice": voice}

    started = time.perf_counter()
    audio = tts.infer(text, **speaker, **sampling)
    elapsed = time.perf_counter() - started

    audio = np.asarray(audio, dtype=np.float32)
    return (SR, audio), _stats_md(elapsed, len(audio))


# ── Conversation (multi-speaker) synthesis ────────────────────────────────────
# ``<label><colon><text>``; both the ASCII colon and the full-width ``：`` work.
_LINE_RE = re.compile(r"^\s*([^:：]+)[:：]\s*(.*)$")


def _voice_key(name):
    """Return a comparison key for a voice name.

    The name is NFC-normalized, case-folded, and has its whitespace collapsed,
    so spellings that look the same to a user compare equal.
    """
    import unicodedata  # local import: only this helper needs it

    return " ".join(unicodedata.normalize("NFC", str(name)).casefold().split())


def _parse_dialogue(script):
    """Parse ``Speaker: line`` rows into ``[(voice_id, text), ...]``.

    Blank rows and rows whose text part is empty are skipped. A row without a
    colon is spoken in full by the default voice.

    The speaker label is matched against the built-in voices, first exactly and
    then tolerantly (Unicode NFC form, case, and repeated whitespace are
    ignored). The tolerant step matters for Vietnamese names: text typed or
    pasted in decomposed (NFD) form is byte-different from the stored voice name
    and would otherwise silently fall back to the default voice. The returned
    ``voice_id`` is always the canonical entry from ``VOICE_SET``; unknown labels
    still fall back to ``DEFAULT_VOICE``.

    NOTE(review): a label-less sentence that contains a colon (e.g. a clock
    time) is still split at its first colon, and the part before it is treated
    as an unknown label and dropped.

    Args:
        script: The whole conversation script; ``None`` is treated as empty.

    Returns:
        A list of ``(voice_id, text)`` tuples in script order.
    """
    # Tolerant lookup table; sorted so a key collision resolves deterministically.
    by_key = {}
    for vid in sorted(VOICE_SET, key=str):
        by_key.setdefault(_voice_key(vid), vid)

    turns = []
    for raw in (script or "").splitlines():
        raw = raw.strip()
        if not raw:
            continue
        m = _LINE_RE.match(raw)
        if m:
            spk, txt = m.group(1).strip(), m.group(2).strip()
            if spk in VOICE_SET:
                voice = spk  # exact match always wins
            else:
                voice = by_key.get(_voice_key(spk), DEFAULT_VOICE)
        else:
            voice, txt = DEFAULT_VOICE, raw
        if txt:
            turns.append((voice, txt))
    return turns


@spaces.GPU(duration=_gpu_duration)
def synthesize_conversation(script, silence_s, temperature, max_new_frames):
    """Synthesize a multi-speaker script into one continuous waveform.

    Each parsed turn is generated with its own voice, and a block of silence is
    placed between consecutive turns (never before the first or after the last).

    Args:
        script: Conversation text, one ``Speaker: line`` row per turn.
        silence_s: Pause between turns in seconds; negative values count as 0.
        temperature: Sampling temperature forwarded to ``tts.infer``.
        max_new_frames: Per-turn frame limit forwarded to ``tts.infer``.

    Returns:
        ``((SR, waveform), info_markdown)``; the Markdown is the speed report
        followed by the number of turns.

    Raises:
        gr.Error: If the script contains no usable turn.
    """
    turns = _parse_dialogue(script)
    if not turns:
        raise gr.Error("Vui lòng nhập hội thoại (mỗi dòng: 'Tên giọng: nội dung').")
    _log_infer("conversation", (script or "").strip(), voice=f"{len(turns)} turns")

    pause_s = max(0.0, float(silence_s))
    pause = np.zeros(int(pause_s * SR), dtype=np.float32)

    segments = []
    started = time.perf_counter()
    for idx, (voice, line) in enumerate(turns):
        # Separate this turn from the previous one; nothing precedes turn 0.
        if idx and pause.size:
            segments.append(pause)
        spoken = tts.infer(
            line,
            voice=voice,                 # → reserved speaker token for this voice
            temperature=float(temperature),
            max_new_frames=int(max_new_frames),
        )
        segments.append(np.asarray(spoken, dtype=np.float32))
    elapsed = time.perf_counter() - started

    if segments:
        full = np.concatenate(segments)
    else:
        full = np.zeros(0, dtype=np.float32)
    info = _stats_md(elapsed, len(full)) + f"\n👥 Số lượt thoại: **{len(turns)}**"
    return (SR, full), info


# ── Streaming ─────────────────────────────────────────────────────────────────
# One ``@spaces.GPU`` generator holds the GPU for the whole request and yields one
# fully-generated chunk at a time. We deliberately do NOT forward the engine's
# frame-level pieces (~0.3-0.7 s, uneven) — Gradio's streaming audio stutters on
# sub-second/uneven chunks. Instead each ~2 s sentence chunk is generated in full
# and yielded as a single consistent piece. With RTF ~0.5 a ~2 s chunk is ready in
# ~1 s, so while one chunk plays the next is already buffered → gapless playback.
@spaces.GPU(duration=_gpu_duration)
def stream_synthesize(text, voice, ref_audio, temperature, top_k, top_p,
                      repetition_penalty, max_new_frames, max_chars):
    """Generate speech chunk by chunk, yielding each finished chunk.

    The text is split with a budget of ``_STREAM_CHUNK_CHARS`` characters per
    chunk; every chunk is phonemized, synthesized in full, and yielded as
    ``(SR, float32 waveform)``. Chunks that produce no samples are skipped.

    ``max_chars`` is part of the signature (the handler shares its input list
    with ``synthesize``) but is not used here.

    Raises:
        gr.Error: If ``text`` is empty after stripping.
    """
    text = text.strip() if text else ""
    if not text:
        raise gr.Error("Vui lòng nhập văn bản cần đọc.")
    _log_infer("stream", text, voice=voice, cloned=bool(ref_audio))

    # Resolve the speaker once, up front: a reference clip is encoded for
    # cloning; otherwise the preset voice yields its cached codes + token id.
    if ref_audio:
        ref_codes = tts.encode_reference(ref_audio)
        voice_token_id = None
    else:
        ref_codes, voice_token_id = tts._resolve_v3_ref(voice, None, None)

    for sentence in split_text_into_chunks(text, max_chars=_STREAM_CHUNK_CHARS):
        phonemes = phonemize_text_with_emotions(sentence)
        audio = tts.engine.infer(
            text="",
            phonemes=phonemes,
            ref_codes=ref_codes,
            voice_token_id=voice_token_id,
            emotion="natural",
            temperature=float(temperature),
            top_k=int(top_k),
            top_p=float(top_p),
            repetition_penalty=float(repetition_penalty),
            max_new_frames=int(max_new_frames),
        )
        audio = np.asarray(audio, dtype=np.float32)
        if not audio.size:
            continue
        yield (SR, audio)


# ── UI ────────────────────────────────────────────────────────────────────────
# Static HTML banner rendered at the top of the page via ``gr.HTML``: the title,
# links to the model card and the GitHub repository, and a one-line tagline.
HEADER = """
<div style="text-align:center;padding:22px;border-radius:14px;
     background:linear-gradient(135deg,#0f172a 0%,#1e293b 100%);color:#fff;margin-bottom:18px;">
  <div style="font-size:2.1rem;font-weight:800;">🦜 VieNeu-TTS <span
       style="background:-webkit-linear-gradient(45deg,#60A5FA,#22D3EE);
       -webkit-background-clip:text;-webkit-text-fill-color:transparent;">v3 Turbo</span></div>
  <div style="margin-top:8px;font-size:0.95rem;">
     <a href="https://huggingface.co/pnnbao-ump/VieNeu-TTS-v3-Turbo" target="_blank"
        style="color:#60A5FA;text-decoration:none;font-weight:600;">🤗 Model card</a>
     &nbsp;·&nbsp;
     <a href="https://github.com/pnnbao97/VieNeu-TTS" target="_blank"
        style="color:#60A5FA;text-decoration:none;font-weight:600;">💻 GitHub repo</a>
  </div>
  <div style="opacity:.85;margin-top:6px;">
     Text-to-Speech tiếng Việt · 48&nbsp;kHz · giọng dựng sẵn + nhân bản giọng + hội thoại
  </div>
</div>
"""

# Markdown list of every preset voice id, shown in the conversation tab so users
# know which speaker labels they can type.
VOICE_HINT = "**Giọng dựng sẵn:** " + " · ".join(f"`{v}`" for v in VOICE_IDS)

# Visual theme passed to the Blocks app below.
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="cyan", neutral_hue="slate")

# The whole UI: a banner, two top-level tabs (single speaker, conversation) and a
# footer. Components are created in layout order, so statement order matters here.
with gr.Blocks(theme=theme, title="VieNeu-TTS v3 Turbo") as demo:
    gr.HTML(HEADER)

    with gr.Tabs():
        # ── Tab 1: single speaker ───────────────────────────────────────────
        with gr.Tab("📝 Đọc văn bản"):
            gr.Markdown(
                "Chèn tag cảm xúc trong văn bản (thử nghiệm): `[cười]`, `[thở dài]`, `[hắng giọng]`."
            )
            with gr.Row():
                # Left column: text, voice source, and advanced sampling options.
                with gr.Column(scale=3):
                    text_in = gr.Textbox(
                        label="Văn bản", value=DEFAULT_TEXT, lines=8,
                        placeholder="Nhập văn bản tiếng Việt...",
                    )
                    with gr.Tabs():
                        with gr.Tab("Giọng dựng sẵn"):
                            voice_in = gr.Dropdown(
                                label="Chọn giọng", choices=VOICE_CHOICES, value=DEFAULT_UI_VOICE,
                            )
                        with gr.Tab("Nhân bản giọng"):
                            gr.Markdown(
                                "### ⏱️ Audio mẫu nên dài **3–5 giây**\n"
                                "Dùng **một câu nói rõ ràng, ít tiếng ồn**. "
                                "**Đừng** tải lên file dài (cả đoạn/cả bài) — audio càng dài "
                                "clone càng dễ sai giọng và méo tiếng."
                            )
                            ref_audio_in = gr.Audio(
                                label="Audio mẫu (3–5 giây)", type="filepath",
                                sources=["upload", "microphone"],
                            )
                            # Hidden until _check_ref_len decides the clip is too long.
                            ref_warn = gr.Markdown(visible=False)
                            gr.Markdown(
                                "_Có audio mẫu ở đây sẽ **ghi đè** giọng dựng sẵn. "
                                "Xoá audio để quay lại giọng dựng sẵn._"
                            )
                            # Re-evaluate the length warning whenever the clip changes.
                            ref_audio_in.change(_check_ref_len, ref_audio_in, ref_warn)
                    with gr.Accordion("Tuỳ chọn nâng cao", open=False):
                        with gr.Row():
                            temperature_in = gr.Slider(0.1, 1.5, value=0.8, step=0.05, label="temperature")
                            top_p_in = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
                        with gr.Row():
                            top_k_in = gr.Slider(1, 100, value=25, step=1, label="top_k")
                            rep_pen_in = gr.Slider(1.0, 2.0, value=1.2, step=0.05, label="repetition_penalty")
                        with gr.Row():
                            max_frames_in = gr.Slider(50, 1200, value=300, step=10, label="max_new_frames (mỗi đoạn)")
                            max_chars_in = gr.Slider(64, 400, value=256, step=8, label="max_chars (cắt đoạn)")
                # Right column: outputs for the full and the streaming mode.
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.Tab("🔊 Tạo (đầy đủ)"):
                            audio_out = gr.Audio(label="Kết quả", type="numpy")
                            stats_out = gr.Markdown(label="Tốc độ sinh / RTF")
                            run_btn = gr.Button("Tạo giọng nói", variant="primary")
                        with gr.Tab("⚡ Stream"):
                            stream_audio_out = gr.Audio(
                                label="Phát trực tiếp", streaming=True, autoplay=True,
                            )
                            with gr.Row():
                                stream_btn = gr.Button("⚡ Phát trực tiếp", variant="primary")
                                stop_btn = gr.Button("⏹ Dừng", variant="stop")
                            gr.Markdown(
                                "_Phát dần theo từng câu (~2 giây) ngay khi sinh xong; câu sau "
                                "được sinh sẵn trong lúc câu trước đang phát nên nghe liền mạch._"
                            )

            # Full mode: one call returns the finished audio plus the speed report.
            run_btn.click(
                fn=synthesize,
                inputs=[text_in, voice_in, ref_audio_in, temperature_in, top_k_in,
                        top_p_in, rep_pen_in, max_frames_in, max_chars_in],
                outputs=[audio_out, stats_out],
            )
            # Stream mode: same inputs; each chunk yielded by the generator is sent
            # to the streaming player. The event handle is kept for cancellation.
            stream_evt = stream_btn.click(
                fn=stream_synthesize,
                inputs=[text_in, voice_in, ref_audio_in, temperature_in, top_k_in,
                        top_p_in, rep_pen_in, max_frames_in, max_chars_in],
                outputs=stream_audio_out,
            )
            # The stop button runs no function of its own; it only cancels the
            # in-flight stream event.
            stop_btn.click(fn=None, cancels=stream_evt)
            # Clickable examples fill only the text box and the voice dropdown.
            gr.Examples(
                examples=[
                    [DEFAULT_TEXT, DEFAULT_UI_VOICE],
                    ["Hello everyone! [hắng giọng] As you can hear, my processing speed is incredibly fast and smooth, allowing for near-instant responses in real time. Because of this, I’m a perfect fit for direct integration into smart chatbots, virtual assistants, or automated call centers for businesses. Pretty convenient, right? [cười] I hope this version three upgrade brings an amazing experience to your project.", DEFAULT_UI_VOICE],
                ],
                inputs=[text_in, voice_in],
            )

        # ── Tab 2: conversation ─────────────────────────────────────────────
        with gr.Tab("💬 Hội thoại"):
            gr.Markdown(
                "Mỗi dòng theo dạng **`Tên giọng: nội dung`**. "
                "Tên không khớp sẽ dùng giọng mặc định.\n\n" + VOICE_HINT
            )
            with gr.Row():
                with gr.Column(scale=3):
                    conv_in = gr.Textbox(
                        label="Kịch bản hội thoại", value=DEFAULT_DIALOGUE, lines=12,
                        placeholder=f"{SPK_A}: ...\n{SPK_B}: ...",
                    )
                    with gr.Accordion("Tuỳ chọn nâng cao", open=False):
                        with gr.Row():
                            conv_silence_in = gr.Slider(0.0, 1.5, value=0.35, step=0.05, label="Khoảng lặng giữa lượt (giây)")
                            conv_temp_in = gr.Slider(0.1, 1.5, value=0.8, step=0.05, label="temperature")
                        conv_frames_in = gr.Slider(50, 1200, value=300, step=10, label="max_new_frames (mỗi lượt)")
                    conv_btn = gr.Button("🎭 Tạo hội thoại", variant="primary")
                with gr.Column(scale=2):
                    conv_audio_out = gr.Audio(label="Kết quả hội thoại", type="numpy")
                    conv_stats_out = gr.Markdown(label="Tốc độ sinh / RTF")

            # Input order must match synthesize_conversation's positional parameters.
            conv_btn.click(
                fn=synthesize_conversation,
                inputs=[conv_in, conv_silence_in, conv_temp_in, conv_frames_in],
                outputs=[conv_audio_out, conv_stats_out],
            )

    # Footer with links to the model card and the source repository.
    gr.Markdown(
        "Model: [pnnbao-ump/VieNeu-TTS-v3-Turbo]"
        "(https://huggingface.co/pnnbao-ump/VieNeu-TTS-v3-Turbo) · "
        "Code: [github.com/pnnbao97/VieNeu-TTS](https://github.com/pnnbao97/VieNeu-TTS)"
    )


# Script entry point: enable Gradio's request queue, then start the server.
# Nothing is launched when this module is merely imported.
if __name__ == "__main__":
    demo.queue().launch()