"""Higgs Audio v2 — Vocence ``Miner`` (Chutes ``POST /speak``: ``instruction`` + ``text``).

Loads **generation** weights from ``path_hf_repo`` (HF snapshot root passed by the chute).

Loads **audio tokenizer** from, in order: ``HIGGS_AUDIO_TOKENIZER_REPO`` env,
``vocence_config.yaml`` ``runtime.audio_tokenizer_repo``, a local directory
``higgs-audio-v2-tokenizer`` or ``audio_tokenizer`` under the repo (with ``config.json``),
else Hub ``eustlb/higgs-audio-v2-tokenizer``.

The Hub repo ``bosonai/higgs-audio-v2-tokenizer`` ships weights that do not match
``HiggsAudioV2TokenizerModel`` for this stack; prefer ``eustlb`` tokenizer weights unless
you know your files match.

Optional env: ``HIGGS_MODEL_REPO`` — if set, overrides ``path_hf_repo`` for the generation
model only.
"""

from __future__ import annotations

import logging
import os
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import (
    AutoProcessor,
    HiggsAudioV2ForConditionalGeneration,
    HiggsAudioV2TokenizerModel,
)

logger = logging.getLogger(__name__)


def default_hf_repo_root() -> Path:
    """Return the directory containing this file (HF snapshot layout: config, weights, …)."""
    return Path(__file__).resolve().parent


def _config_section(cfg: dict[str, Any], key: str) -> dict[str, Any]:
    """Return ``cfg[key]`` when it is a mapping, otherwise an empty dict.

    A section that is missing, ``null``, or of the wrong type (e.g. ``runtime: true``)
    is treated as "not configured" so that callers can safely use ``.get`` on the result.
    """
    section = cfg.get(key)
    return section if isinstance(section, dict) else {}


def _load_yaml_config(repo: Path) -> dict[str, Any]:
    """Best-effort load of ``vocence_config.yaml`` from *repo*.

    Returns the parsed top-level mapping, or ``{}`` when the file is absent, PyYAML is
    not installed, the file cannot be read/parsed, or the top-level value is not a
    mapping. Failures never raise; they are logged so that a broken config does not
    silently fall back to defaults.
    """
    path = repo / "vocence_config.yaml"
    if not path.is_file():
        return {}
    try:
        # Imported lazily so PyYAML stays an optional dependency.
        import yaml

        with path.open(encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except Exception:
        # Deliberately broad: config loading is best-effort, but leave a trace.
        logger.warning("Could not load %s; using default settings.", path, exc_info=True)
        return {}
    return data if isinstance(data, dict) else {}


def _resolve_audio_tokenizer_source(repo: Path, cfg: dict[str, Any]) -> str:
    """Pick the audio tokenizer source (Hub id or local path).

    Precedence: ``HIGGS_AUDIO_TOKENIZER_REPO`` env var, then
    ``runtime.audio_tokenizer_repo`` in *cfg*, then a local ``higgs-audio-v2-tokenizer``
    or ``audio_tokenizer`` directory under *repo* that contains ``config.json``, and
    finally the Hub repo ``eustlb/higgs-audio-v2-tokenizer``.
    """
    from_env = (os.environ.get("HIGGS_AUDIO_TOKENIZER_REPO") or "").strip()
    if from_env:
        return from_env

    configured = _config_section(cfg, "runtime").get("audio_tokenizer_repo")
    if isinstance(configured, str) and configured.strip():
        return configured.strip()

    for name in ("higgs-audio-v2-tokenizer", "audio_tokenizer"):
        candidate = (repo / name).resolve()
        if candidate.is_dir() and (candidate / "config.json").is_file():
            return str(candidate)

    return "eustlb/higgs-audio-v2-tokenizer"


def _resolve_generation_model_id(repo: Path) -> str:
    """Return ``HIGGS_MODEL_REPO`` when set and non-blank, else *repo* as a string."""
    return (os.environ.get("HIGGS_MODEL_REPO") or "").strip() or str(repo)


class Miner:
    """Higgs Audio v2: ``generate_wav(instruction, text)`` → mono float32 PCM + sample rate."""

    def __init__(self, path_hf_repo: Path | str | os.PathLike[str] | None = None) -> None:
        """Read the optional YAML config and load processor, audio tokenizer and model.

        Args:
            path_hf_repo: Snapshot root holding the generation weights and, optionally,
                ``vocence_config.yaml``. Defaults to the directory containing this file.
        """
        self._repo_path = (
            Path(path_hf_repo).resolve()
            if path_hf_repo is not None
            else default_hf_repo_root()
        )
        self._cfg = _load_yaml_config(self._repo_path)

        # Sections of the wrong type are treated as empty instead of crashing on ``.get``.
        gen = _config_section(self._cfg, "generation")
        lim = _config_section(self._cfg, "limits")
        self._max_new_tokens = int(gen.get("max_new_tokens", 1000))
        self._do_sample = bool(gen.get("do_sample", False))
        self._sampling_rate = int(gen.get("sampling_rate", 24000))
        self._max_instruction = int(lim.get("max_instruction_chars", 600))
        self._max_text = int(lim.get("max_text_chars", 2000))

        model_id = _resolve_generation_model_id(self._repo_path)
        tok_src = _resolve_audio_tokenizer_source(self._repo_path, self._cfg)

        self._processor = AutoProcessor.from_pretrained(model_id, device_map="auto")
        # The audio tokenizer may come from a different source than the generation
        # weights (see the module docstring), so it is attached explicitly.
        self._processor.audio_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
            tok_src,
            device_map="auto",
        )
        self._model = HiggsAudioV2ForConditionalGeneration.from_pretrained(
            model_id,
            device_map="auto",
        )
        self._model.eval()

    def _truncate(self, s: str, cap: int) -> str:
        """Strip *s* (``None`` counts as empty) and cut it to at most *cap* characters."""
        # Slicing is already a no-op when the string is shorter than ``cap``.
        return (s or "").strip()[:cap]

    def _conversation(self, instruction: str, text: str) -> list[dict[str, Any]]:
        """Map Vocence ``instruction`` (style / scene) + ``text`` (words to speak) to Higgs chat roles.

        A blank (or ``None``) instruction falls back to a neutral default scene.
        """
        scene = (
            (instruction or "").strip()
            or "Audio is recorded from a quiet room with a neutral voice."
        )
        return [
            {
                "role": "system",
                "content": [{"type": "text", "text": "Generate audio following instruction."}],
            },
            {
                "role": "scene",
                "content": [{"type": "text", "text": scene}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": text}],
            },
        ]

    def warmup(self) -> None:
        """Run one short generation and discard the result."""
        self.generate_wav(
            "A clear, neutral speaking style with moderate pace.",
            "Warmup.",
        )

    def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
        """Synthesize *text* in the style described by *instruction*.

        Both inputs are stripped and truncated to the configured character limits
        before being sent to the model.

        Args:
            instruction: Style / scene description; blank selects the default scene.
            text: Words to speak; must contain non-whitespace characters.

        Returns:
            ``(wav, sampling_rate)`` where ``wav`` is a flattened 1-D ``float32`` array
            (scaled down to a peak of 1.0 if it exceeded that) and ``sampling_rate`` is
            the configured ``generation.sampling_rate``.

        Raises:
            ValueError: If *text* is empty or whitespace-only.
            RuntimeError: If decoding yields no audio.
        """
        if not (text or "").strip():
            raise ValueError("text must be non-empty")

        ins = self._truncate(instruction, self._max_instruction)
        body = self._truncate(text, self._max_text)

        inputs = self._processor.apply_chat_template(
            self._conversation(ins, body),
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            processor_kwargs={"sampling_rate": self._sampling_rate},
        ).to(self._model.device)

        with torch.no_grad():
            outputs = self._model.generate(
                **inputs,
                max_new_tokens=self._max_new_tokens,
                do_sample=self._do_sample,
            )

        decoded_list = self._processor.batch_decode(outputs)
        if not decoded_list:
            raise RuntimeError("Higgs batch_decode returned no audio.")

        wav_t = decoded_list[0]
        # Tensor-like results are moved to CPU first; anything else goes through NumPy.
        if hasattr(wav_t, "detach"):
            wav = wav_t.detach().cpu().float().numpy()
        else:
            wav = np.asarray(wav_t, dtype=np.float32)
        wav = np.reshape(wav, (-1,)).astype(np.float32, copy=False)

        # Only attenuate: audio already within [-1, 1] is returned untouched.
        peak = float(np.max(np.abs(wav))) if wav.size else 0.0
        if peak > 1.0:
            wav = wav / peak
        return wav, int(self._sampling_rate)