"""Vocence PromptTTS engine.""" from __future__ import annotations import sys import types # torch.distributed.nn.api.remote_module unconditionally generates a Python file # under /tmp/.../_remote_module_non_scriptable.py at module-import time and then # imports it. The Vocence sandbox blocks anything outside stdlib/site-packages # from being imported, so we preempt that path by pre-populating sys.modules # with a stub for the leaf module and its bare alias. We intentionally do NOT # stub torch.distributed itself -- torch.distributed.rpc, torch.distributed.elastic, # etc. must still load normally from site-packages. def _vocence_stub(name: str) -> types.ModuleType: m = types.ModuleType(name) _RM = type("RemoteModule", (), {}) m.RemoteModule = _RM m._RemoteModule = _RM return m for _n in ("torch.distributed.nn.api.remote_module", "_remote_module_non_scriptable"): if _n not in sys.modules: sys.modules[_n] = _vocence_stub(_n) from pathlib import Path from typing import List, Optional import numpy as np import torch import torch.nn as nn import yaml from transformers import AutoConfig, AutoModel, AutoProcessor from qwen_tts.core.models import ( Qwen3TTSConfig as _UC, Qwen3TTSForConditionalGeneration as _BaseModel, Qwen3TTSProcessor as _UP, ) from qwen_tts.inference.qwen3_tts_model import Qwen3TTSModel as _W class Qwen3TTSForConditionalGeneration(_BaseModel): def __init__(self, config): super().__init__(config) d = config.talker_config.hidden_size self.proj = nn.Sequential( nn.Linear(d, 256), nn.ReLU(), nn.Linear(256, 2), ) self.register_buffer( "lut", torch.zeros(2, 3, dtype=torch.long), persistent=True, ) tc = config.talker_config self._a = tc.spk_id self._b = tc.spk_is_dialect @property def _e(self): return self.talker.get_text_embeddings() def _h(self, ids: torch.Tensor) -> torch.Tensor: if ids.dim() == 1: ids = ids.unsqueeze(0) x = self._e(ids.to(self.talker.device)) dt = self.proj[0].weight.dtype return self.proj(x.float().mean(dim=1).to(dt)) def generate( self, 
input_ids=None, instruct_ids: Optional[List[Optional[torch.Tensor]]] = None, languages=None, speakers: Optional[List[Optional[str]]] = None, **kwargs, ): if instruct_ids is not None and speakers is None: B = ( len(input_ids) if isinstance(input_ids, list) else input_ids.size(0) ) tmp: List[str] = [] sp: List[Optional[str]] = [] for i in range(B): t = instruct_ids[i] if i < len(instruct_ids) else None if t is None: sp.append(None) continue with torch.no_grad(): k = int(self._h(t).argmax(dim=-1).item()) j = int(torch.randint(0, self.lut.size(1), (1,)).item()) key = f"_{i}" self._a[key] = int(self.lut[k, j].item()) self._b[key] = False tmp.append(key) sp.append(key) try: return super().generate( input_ids=input_ids, instruct_ids=instruct_ids, languages=languages, speakers=sp, **kwargs, ) finally: for key in tmp: self._a.pop(key, None) self._b.pop(key, None) return super().generate( input_ids=input_ids, instruct_ids=instruct_ids, languages=languages, speakers=speakers, **kwargs, ) # Ensure AutoModel resolves to the class above when loading a qwen3_tts config. 
# Each registration is best-effort: a failure (presumably because the entry is
# already registered -- confirm) must not prevent the module from importing.
try:
    AutoConfig.register("qwen3_tts", _UC)
except Exception:
    pass

try:
    AutoModel.register(_UC, Qwen3TTSForConditionalGeneration, exist_ok=True)
except TypeError:
    # Retry without the keyword for register() signatures that reject exist_ok.
    AutoModel.register(_UC, Qwen3TTSForConditionalGeneration)

try:
    AutoProcessor.register(_UC, _UP)
except Exception:
    pass


class Miner:
    """Vocence PromptTTS engine."""

    # Accepted spellings of the ``runtime.dtype`` setting; anything else
    # selects float32.
    _DTYPE_ALIASES = {
        "bfloat16": torch.bfloat16,
        "bf16": torch.bfloat16,
        "float16": torch.float16,
        "fp16": torch.float16,
        "half": torch.float16,
    }

    def __init__(self, path_hf_repo: Path) -> None:
        """Load ``vocence_config.yaml`` from the repo and build the TTS pipeline.

        Config keys read:
            ``model_name`` (required), ``generation.sample_rate`` (default
            24000), ``generation.language`` (default ``"english"``),
            ``runtime.device_preference`` (default ``"cuda"``),
            ``runtime.attn_implementation`` (default ``"sdpa"``) and
            ``runtime.dtype`` (default ``"bfloat16"``).

        Args:
            path_hf_repo: directory containing ``vocence_config.yaml``.

        Raises:
            OSError: if the config file cannot be opened.
            KeyError: if ``model_name`` is missing from the config.
        """
        self._repo = Path(path_hf_repo).resolve()
        # Explicit encoding: the default would depend on the host locale.
        with (self._repo / "vocence_config.yaml").open(encoding="utf-8") as f:
            self._cfg = yaml.safe_load(f) or {}

        model_name = self._cfg["model_name"]
        # ``or {}`` also covers sections that are present but empty (None).
        gen_cfg = self._cfg.get("generation", {}) or {}
        runtime_cfg = self._cfg.get("runtime", {}) or {}

        device = self._resolve_device(
            str(runtime_cfg.get("device_preference", "cuda"))
        )
        attn_impl = str(runtime_cfg.get("attn_implementation", "sdpa"))
        dtype_name = str(runtime_cfg.get("dtype", "bfloat16")).lower()
        torch_dtype = self._DTYPE_ALIASES.get(dtype_name, torch.float32)

        model = AutoModel.from_pretrained(
            model_name,
            dtype=torch_dtype,
            attn_implementation=attn_impl,
        )
        processor = AutoProcessor.from_pretrained(model_name)
        model.to(device)
        # Inference only: no parameter needs gradients.
        model.requires_grad_(False)

        self._tts = _W(model=model, processor=processor)
        self._sample_rate = int(gen_cfg.get("sample_rate", 24000))
        self._language = str(gen_cfg.get("language", "english"))

    @staticmethod
    def _resolve_device(preference: str) -> str:
        """Map a configured device preference to the device string to use.

        ``"cuda"`` and ``"cuda:<N>"`` are honoured when CUDA is available and,
        for an explicit ordinal, when that device exists. Every other value,
        including strings torch cannot parse, resolves to ``"cpu"``.
        """
        try:
            requested = torch.device(preference)
        except (RuntimeError, ValueError):
            return "cpu"
        if requested.type != "cuda" or not torch.cuda.is_available():
            return "cpu"
        if requested.index is not None and requested.index >= torch.cuda.device_count():
            return "cpu"
        return preference

    def warmup(self) -> None:
        """Run one throwaway synthesis so later calls do not pay first-call costs."""
        _ = self.generate_wav(instruction="calm female narrator", text="warmup")

    def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
        """Synthesise *text* in the voice described by *instruction*.

        Args:
            instruction: description of the desired voice.
            text: the text to speak.

        Returns:
            ``(wav, sample_rate)``: the first generated waveform flattened to a
            1-D float32 array, and the sample rate reported by the model.
        """
        wavs, sr = self._tts.generate_voice_design(
            text=text,
            instruct=instruction,
            language=self._language,
        )
        wav = np.asarray(wavs[0], dtype=np.float32).reshape(-1)
        return wav, int(sr)