"""Vocence PromptTTS engine."""
from __future__ import annotations

import sys
import types

# torch.distributed.nn.api.remote_module unconditionally generates a Python file
# under /tmp/.../_remote_module_non_scriptable.py at module-import time and then
# imports it. The Vocence sandbox blocks anything outside stdlib/site-packages
# from being imported, so we preempt that path by pre-populating sys.modules
# with a stub for the leaf module and its bare alias. We intentionally do NOT
# stub torch.distributed itself -- torch.distributed.rpc, torch.distributed.elastic,
# etc. must still load normally from site-packages.
def _vocence_stub(name: str) -> types.ModuleType:
    """Build a placeholder module called *name* exposing an empty ``RemoteModule``.

    One freshly created, empty class is published under both ``RemoteModule``
    and ``_RemoteModule``, so either attribute resolves to the same object when
    looked up on the returned module.
    """
    placeholder_cls = type("RemoteModule", (), {})
    stub = types.ModuleType(name)
    for attr in ("RemoteModule", "_RemoteModule"):
        setattr(stub, attr, placeholder_cls)
    return stub


# Install a stub under each name unless something is already loaded there;
# entries that already exist in sys.modules are left untouched.
for _n in ("torch.distributed.nn.api.remote_module", "_remote_module_non_scriptable"):
    sys.modules.setdefault(_n, _vocence_stub(_n))

from pathlib import Path
from typing import List, Optional

import numpy as np
import torch
import torch.nn as nn
import yaml
from transformers import AutoConfig, AutoModel, AutoProcessor

from qwen_tts.core.models import (
    Qwen3TTSConfig as _UC,
    Qwen3TTSForConditionalGeneration as _BaseModel,
    Qwen3TTSProcessor as _UP,
)
from qwen_tts.inference.qwen3_tts_model import Qwen3TTSModel as _W


class Qwen3TTSForConditionalGeneration(_BaseModel):
    """Qwen3-TTS generation model with an instruction-driven speaker pick.

    On top of the base model this adds ``proj``, a two-layer head that turns
    mean-pooled talker text embeddings into two logits, and ``lut``, a
    persistent ``(2, 3)`` integer buffer. When :meth:`generate` receives
    ``instruct_ids`` but no ``speakers``, each instruction selects a row of
    ``lut`` through ``proj``; a random column of that row supplies the value
    registered in ``talker_config.spk_id`` for that sample.
    """

    def __init__(self, config):
        """Build the base model, then attach ``proj`` and ``lut``.

        Args:
            config: Model config; ``config.talker_config`` must provide
                ``hidden_size``, ``spk_id`` and ``spk_is_dialect``.
        """
        super().__init__(config)
        tc = config.talker_config
        # Submodule/buffer names and layer order are part of the state-dict
        # layout, so they must not change.
        self.proj = nn.Sequential(
            nn.Linear(tc.hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, 2),
        )
        # Zero-initialised here; presumably filled from the checkpoint since
        # the buffer is persistent -- TODO confirm.
        self.register_buffer(
            "lut",
            torch.zeros(2, 3, dtype=torch.long),
            persistent=True,
        )
        # Live references to the config's speaker tables; generate() adds and
        # removes temporary entries in them.
        self._a = tc.spk_id
        self._b = tc.spk_is_dialect

    @property
    def _e(self):
        """Return the talker's text-embedding callable."""
        return self.talker.get_text_embeddings()

    def _h(self, ids: torch.Tensor) -> torch.Tensor:
        """Return the ``proj`` logits for one instruction's token ids.

        Args:
            ids: Token ids; a 1-D tensor is given a leading batch axis.

        Returns:
            Output of ``proj`` applied to the embeddings averaged over dim 1
            (assumes embeddings are ``(batch, seq, hidden)`` -- TODO confirm).
        """
        if ids.dim() == 1:
            ids = ids.unsqueeze(0)
        embedded = self._e(ids.to(self.talker.device))
        # Pool in float32, then cast to the head's parameter dtype.
        head_dtype = self.proj[0].weight.dtype
        pooled = embedded.float().mean(dim=1).to(head_dtype)
        return self.proj(pooled)

    def generate(
        self,
        input_ids=None,
        instruct_ids: Optional[List[Optional[torch.Tensor]]] = None,
        languages=None,
        speakers: Optional[List[Optional[str]]] = None,
        **kwargs,
    ):
        """Generate audio, deriving ``speakers`` from ``instruct_ids`` if absent.

        When ``instruct_ids`` is given and ``speakers`` is ``None``, every
        sample with a non-``None`` instruction gets a temporary speaker key
        ``"_<index>"`` registered in ``talker_config.spk_id`` (value taken from
        ``lut``) and ``talker_config.spk_is_dialect`` (``False``). Samples
        without an instruction get ``None``. The temporary keys are removed
        again before this method returns or raises. In every other case the
        arguments are forwarded to the base implementation unchanged.

        Args:
            input_ids: A list of per-sample inputs or a batched tensor.
            instruct_ids: Per-sample instruction token ids; entries may be
                ``None`` and the list may be shorter than the batch.
            languages: Forwarded to the base ``generate``.
            speakers: Explicit speaker names; when provided, no speaker is
                derived from the instructions.
            **kwargs: Forwarded to the base ``generate``.

        Returns:
            Whatever the base ``generate`` returns.
        """
        if instruct_ids is None or speakers is not None:
            return super().generate(
                input_ids=input_ids,
                instruct_ids=instruct_ids,
                languages=languages,
                speakers=speakers,
                **kwargs,
            )

        batch_size = (
            len(input_ids)
            if isinstance(input_ids, list)
            else input_ids.size(0)
        )
        registered: List[str] = []
        derived: List[Optional[str]] = []
        # Registration happens inside the try block so that a failure part-way
        # through the batch still removes the entries already written to the
        # shared config dictionaries.
        try:
            for i in range(batch_size):
                instr = instruct_ids[i] if i < len(instruct_ids) else None
                if instr is None:
                    derived.append(None)
                    continue
                with torch.no_grad():
                    row = int(self._h(instr).argmax(dim=-1).item())
                    col = int(torch.randint(0, self.lut.size(1), (1,)).item())
                key = f"_{i}"
                # Record the key before inserting so cleanup also covers a
                # failure between the two table writes.
                registered.append(key)
                self._a[key] = int(self.lut[row, col].item())
                self._b[key] = False
                derived.append(key)
            return super().generate(
                input_ids=input_ids,
                instruct_ids=instruct_ids,
                languages=languages,
                speakers=derived,
                **kwargs,
            )
        finally:
            for key in registered:
                self._a.pop(key, None)
                self._b.pop(key, None)


# Ensure AutoModel resolves to the class above when loading a qwen3_tts config.
# The three registrations run in this order: config type, model class,
# processor class.
#
# Map the "qwen3_tts" model type to its config class. Any failure is ignored
# (presumably because the type may already be registered -- TODO confirm).
try:
    AutoConfig.register("qwen3_tts", _UC)
except Exception:
    pass
# Bind the config class to the subclass defined in this file. exist_ok=True is
# tried first; if register() rejects the call with TypeError, it is retried
# without that keyword (presumably an older transformers signature -- TODO
# confirm). Errors raised by the retry are not caught.
try:
    AutoModel.register(_UC, Qwen3TTSForConditionalGeneration, exist_ok=True)
except TypeError:
    AutoModel.register(_UC, Qwen3TTSForConditionalGeneration)
# Bind the config class to its processor class; any failure is ignored.
try:
    AutoProcessor.register(_UC, _UP)
except Exception:
    pass


class Miner:
    """Vocence PromptTTS engine.

    Reads ``vocence_config.yaml`` from a repository directory, loads the model
    and processor it names, and synthesises speech from a text plus a free-form
    voice instruction via :meth:`generate_wav`.
    """

    def __init__(self, path_hf_repo: Path) -> None:
        """Read the repo config and load the model and processor.

        Args:
            path_hf_repo: Directory containing ``vocence_config.yaml``.

        Raises:
            KeyError: If the config has no ``model_name`` entry.
        """
        self._repo = Path(path_hf_repo).resolve()
        # Decode the YAML as UTF-8 explicitly instead of relying on the
        # platform's locale encoding, which can mis-decode non-ASCII text.
        with (self._repo / "vocence_config.yaml").open(encoding="utf-8") as f:
            self._cfg = yaml.safe_load(f) or {}
        model_name = self._cfg["model_name"]

        # Sections that are missing or explicitly null are treated as empty.
        gen_cfg = self._cfg.get("generation", {}) or {}
        runtime_cfg = self._cfg.get("runtime", {}) or {}

        # Only the exact preference "cuda" with CUDA available selects the
        # GPU; every other value falls back to CPU.
        device_pref = str(runtime_cfg.get("device_preference", "cuda"))
        device = device_pref if (device_pref == "cuda" and torch.cuda.is_available()) else "cpu"
        attn_impl = str(runtime_cfg.get("attn_implementation", "sdpa"))
        torch_dtype = self._resolve_dtype(runtime_cfg.get("dtype", "bfloat16"))

        model = AutoModel.from_pretrained(
            model_name,
            dtype=torch_dtype,
            attn_implementation=attn_impl,
        )
        processor = AutoProcessor.from_pretrained(model_name)
        model.to(device)
        model.requires_grad_(False)

        self._tts = _W(model=model, processor=processor)
        # Stored from the config; generate_wav returns the rate reported by
        # the backend rather than this value.
        self._sample_rate = int(gen_cfg.get("sample_rate", 24000))
        self._language = str(gen_cfg.get("language", "english"))

    @staticmethod
    def _resolve_dtype(name: str) -> torch.dtype:
        """Map a config dtype name (case-insensitive) to a torch dtype.

        ``bfloat16``/``bf16`` and ``float16``/``fp16``/``half`` are recognised;
        any other value yields ``torch.float32``.
        """
        key = str(name).lower()
        if key in ("bfloat16", "bf16"):
            return torch.bfloat16
        if key in ("float16", "fp16", "half"):
            return torch.float16
        return torch.float32

    def warmup(self) -> None:
        """Run one throwaway synthesis and discard the result."""
        _ = self.generate_wav(instruction="calm female narrator", text="warmup")

    def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
        """Synthesise *text* in the voice described by *instruction*.

        Args:
            instruction: Voice description passed to the backend as ``instruct``.
            text: Text to speak.

        Returns:
            ``(wav, sample_rate)``: the first waveform returned by the backend
            as a flattened ``float32`` array, and the backend's sample rate as
            an ``int``.
        """
        wavs, sr = self._tts.generate_voice_design(
            text=text,
            instruct=instruction,
            language=self._language,
        )
        # Only the first returned waveform is used.
        wav = np.asarray(wavs[0], dtype=np.float32).reshape(-1)
        return wav, int(sr)