Spaces:

openbmb
/

VoxCPM-Demo

Running

liuxin Cursor committed 12 days ago

Commit

03b4e88

1 Parent(s): 0e68e0f

refactor: switch to remote nanovllm API with text normalization

Replace local GPU inference (voxcpm, funasr, modelscope) with remote
nanovllm API calls for TTS, ASR, and denoising. Add client-side text
normalization via wetext. Preserve request logging with active request
counting and detailed payload fields.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (2) hide show

app.py +386 -359
requirements.txt +4 -20

app.py CHANGED Viewed

@@ -1,28 +1,22 @@
 import json
 import logging
 import os
 import sys
 import tempfile
 from datetime import datetime, timezone
 from pathlib import Path
-from threading import Lock, Semaphore
 from typing import Optional, Tuple
 import gradio as gr
 import numpy as np
-import spaces
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["OPENBLAS_NUM_THREADS"] = "4"
-os.environ["OMP_NUM_THREADS"] = "4"
-os.environ["MKL_NUM_THREADS"] = "4"
-import torch
-import torch._dynamo
-torch._dynamo.config.disable = True
-torch.set_float32_matmul_precision("high")
-DEFAULT_MODEL_REF = "openbmb/VoxCPM2"
 logging.basicConfig(
     level=logging.INFO,
@@ -30,39 +24,13 @@ logging.basicConfig(
     handlers=[logging.StreamHandler(sys.stdout)],
 )
 logger = logging.getLogger(__name__)
 DEFAULT_ASR_MODEL_REF = "FunAudioLLM/SenseVoiceSmall"
-DEFAULT_ZIPENHANCER_MODEL = "iic/speech_zipenhancer_ans_multiloss_16k_base"
 MAX_REFERENCE_AUDIO_SECONDS = 50.0
 _persistent_root = None
 _request_log_dir = None
-def _configure_cache_dirs() -> None:
-    global _persistent_root, _request_log_dir
-    persistent_root = Path(os.environ.get("SPACE_PERSISTENT_ROOT", "/data")).expanduser()
-    if not persistent_root.exists():
-        logger.info("Persistent storage not detected. Request logs disabled.")
-        return
-    logs_dir = Path(
-        os.environ.get("REQUEST_LOG_DIR", str(persistent_root / "logs"))
-    ).expanduser()
-    logs_dir.mkdir(parents=True, exist_ok=True)
-    _persistent_root = persistent_root
-    _request_log_dir = logs_dir
-    logger.info(f"Persistent storage detected at {persistent_root}")
-    logger.info(f"Request logs will be written to daily files under {_request_log_dir}")
-_configure_cache_dirs()
-_asr_model = None
-_voxcpm_model = None
-_denoiser = None
-_asr_lock = Lock()
-_model_lock = Lock()
-_denoiser_lock = Lock()
-_denoise_semaphore = Semaphore(int(os.environ.get("DENOISE_MAX_CONCURRENT", "1")))
 _active_generation_requests = 0
 _active_generation_lock = Lock()
@@ -92,105 +60,244 @@ def _get_bool_env(name: str, default: bool) -> bool:
     raise ValueError(f"Invalid boolean env: {name}={value!r}")
-def _resolve_model_ref() -> str:
-    value = os.environ.get("HF_REPO_ID", "").strip()
-    if value:
-        return value
-    return DEFAULT_MODEL_REF
-def _resolve_asr_model_ref() -> str:
-    return DEFAULT_ASR_MODEL_REF
-def _resolve_zipenhancer_model_ref() -> str:
-    for env_name in ("ZIPENHANCER_MODEL_ID", "ZIPENHANCER_MODEL_PATH"):
-        value = os.environ.get(env_name, "").strip()
-        if value:
-            return value
-    return DEFAULT_ZIPENHANCER_MODEL
-class _ZipEnhancer:
-    def __init__(self, model_ref: str):
-        import torchaudio
-        from modelscope.pipelines import pipeline
-        from modelscope.utils.constant import Tasks
-        self._torchaudio = torchaudio
-        self.model_ref = model_ref
-        self._pipeline = pipeline(Tasks.acoustic_noise_suppression, model=model_ref)
-    def _normalize_loudness(self, wav_path: str) -> None:
-        audio, sr = self._torchaudio.load(wav_path)
-        loudness = self._torchaudio.functional.loudness(audio, sr)
-        normalized_audio = self._torchaudio.functional.gain(audio, -20 - loudness)
-        self._torchaudio.save(wav_path, normalized_audio, sr)
-    def enhance(self, input_path: str) -> str:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-            output_path = tmp_file.name
-        try:
-            self._pipeline(input_path, output_path=output_path)
-            self._normalize_loudness(output_path)
-            return output_path
-        except Exception:
-            if os.path.exists(output_path):
-                try:
-                    os.unlink(output_path)
-                except OSError:
-                    pass
-            raise
-def get_denoiser():
-    global _denoiser
-    if _denoiser is not None:
-        return _denoiser
-    with _denoiser_lock:
-        if _denoiser is not None:
-            return _denoiser
-        model_ref = _resolve_zipenhancer_model_ref()
-        logger.info(f"Loading ZipEnhancer denoiser from {model_ref} ...")
-        _denoiser = _ZipEnhancer(model_ref)
-        logger.info("ZipEnhancer denoiser loaded.")
-    return _denoiser
-def _extract_asr_text(asr_result) -> str:
-    if not asr_result:
-        return ""
-    first_item = asr_result[0]
-    if isinstance(first_item, dict):
-        return str(first_item.get("text", "")).split("|>")[-1].strip()
-    return ""
-def _get_audio_duration_seconds(audio_path: str) -> float:
-    import soundfile as sf
-    info = sf.info(audio_path)
-    return float(info.frames) / float(info.samplerate)
-def _begin_generation_request() -> None:
-    global _active_generation_requests
-    with _active_generation_lock:
-        _active_generation_requests += 1
-def _end_generation_request() -> None:
-    global _active_generation_requests
-    with _active_generation_lock:
-        _active_generation_requests = max(0, _active_generation_requests - 1)
-def _get_active_generation_requests() -> int:
-    with _active_generation_lock:
-        return _active_generation_requests
 def _validate_reference_audio_duration(
@@ -201,46 +308,180 @@ def _validate_reference_audio_duration(
         raise gr.Error(_get_i18n_text("reference_audio_too_long_error", request))
-def _prepare_reference_audio_path(
-    audio_path: Optional[str],
-    *,
-    denoise: bool,
-    request: Optional[gr.Request] = None,
-) -> tuple[Optional[str], Optional[str]]:
-    """Returns (usable_audio_path, temp_path_to_cleanup)."""
-    if audio_path is None or not audio_path.strip():
-        return None, None
-    _validate_reference_audio_duration(audio_path, request)
-    if not denoise:
-        return audio_path, None
-    logger.info("Applying ZipEnhancer denoising to reference audio ...")
-    acquired = _denoise_semaphore.acquire(timeout=30)
-    if not acquired:
-        raise gr.Error(_get_i18n_text("denoise_busy_error", request))
     try:
-        temp_path = get_denoiser().enhance(audio_path)
-        return temp_path, temp_path
-    except Exception as exc:
-        logger.exception("ZipEnhancer denoising failed")
-        raise gr.Error(_get_i18n_text("denoise_failed_error", request)) from exc
-    finally:
-        _denoise_semaphore.release()
-def _safe_prompt_wav_recognition(
-    use_prompt_text: bool, prompt_wav: Optional[str], request: Optional[gr.Request] = None
 ) -> str:
     try:
-        return prompt_wav_recognition(use_prompt_text, prompt_wav)
     except Exception as exc:
-        logger.warning(f"ASR recognition failed: {exc}")
-        raise gr.Error(_get_i18n_text("asr_failed_error", request)) from exc
-# ---------- Inline i18n (en + zh-CN only) ----------
 _USAGE_INSTRUCTIONS_EN = (
     "**VoxCPM2 — Three Modes of Speech Generation:**\n\n"
@@ -399,15 +640,7 @@ def _get_i18n_text(key: str, request: Optional[gr.Request] = None) -> str:
     )
-def _append_request_log(payload: dict) -> None:
-    if _request_log_dir is None:
-        return
-    now = datetime.now(timezone.utc)
-    record = {"timestamp": now.isoformat(), **payload}
-    log_path = _request_log_dir / f"{now.date().isoformat()}.jsonl"
-    with log_path.open("a", encoding="utf-8") as fp:
-        fp.write(json.dumps(record, ensure_ascii=False) + "\n")
 DEFAULT_TARGET_TEXT = (
     "VoxCPM2 is a creative multilingual TTS model from ModelBest, "
@@ -471,212 +704,6 @@ _APP_THEME = gr.themes.Soft(
     font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
 )
-def get_asr_model():
-    global _asr_model
-    if _asr_model is not None:
-        return _asr_model
-    with _asr_lock:
-        if _asr_model is not None:
-            return _asr_model
-        from funasr import AutoModel
-        from huggingface_hub import snapshot_download
-        device = os.environ.get("ASR_DEVICE", "cpu").strip() or "cpu"
-        asr_model_ref = _resolve_asr_model_ref()
-        logger.info(f"Downloading ASR model from Hugging Face: {asr_model_ref}")
-        asr_model_path = snapshot_download(repo_id=asr_model_ref)
-        logger.info(f"Loading ASR model on {device} ...")
-        _asr_model = AutoModel(
-            model=asr_model_path,
-            disable_update=True,
-            log_level="INFO",
-            device=device,
-        )
-        logger.info("ASR model loaded.")
-    return _asr_model
-# ---------- VoxCPM model (single-process, ZeroGPU compatible) ----------
-def get_voxcpm_model():
-    global _voxcpm_model
-    if _voxcpm_model is not None:
-        return _voxcpm_model
-    with _model_lock:
-        if _voxcpm_model is not None:
-            return _voxcpm_model
-        from voxcpm import VoxCPM
-        model_ref = _resolve_model_ref()
-        logger.info(f"Loading VoxCPM model from {model_ref} ...")
-        _voxcpm_model = VoxCPM.from_pretrained(model_ref, load_denoiser=False)
-        logger.info("VoxCPM model loaded.")
-    return _voxcpm_model
-# ---------- GPU-accelerated inference ----------
-def prompt_wav_recognition(use_prompt_text: bool, prompt_wav: Optional[str]) -> str:
-    if not use_prompt_text or prompt_wav is None or not prompt_wav.strip():
-        return ""
-    asr_model = get_asr_model()
-    res = asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
-    return _extract_asr_text(res)
-def _float_audio_to_int16(wav: np.ndarray) -> np.ndarray:
-    clipped = np.clip(wav, -1.0, 1.0)
-    return (clipped * 32767.0).astype(np.int16, copy=False)
-def _generate_tts_audio_once(
-    text_input: str,
-    control_instruction: str = "",
-    reference_wav_path_input: Optional[str] = None,
-    use_prompt_text: bool = False,
-    prompt_text_input: str = "",
-    cfg_value_input: float = 2.0,
-    do_normalize: bool = True,
-    denoise: bool = True,
-    request: Optional[gr.Request] = None,
-) -> Tuple[int, np.ndarray]:
-    temp_audio_path = None
-    try:
-        model = get_voxcpm_model()
-        text = (text_input or "").strip()
-        if len(text) == 0:
-            raise ValueError("Please input text to synthesize.")
-        control = (control_instruction or "").strip()
-        final_text = f"({control}){text}" if control and not use_prompt_text else text
-        ref_path, temp_audio_path = _prepare_reference_audio_path(
-            reference_wav_path_input,
-            denoise=bool(denoise),
-            request=request,
-        )
-        prompt_text_clean = (prompt_text_input or "").strip()
-        if use_prompt_text and ref_path is None:
-            raise ValueError("Ultimate Cloning Mode requires a reference audio clip.")
-        if use_prompt_text and not prompt_text_clean:
-            raise ValueError(
-                "Ultimate Cloning Mode requires a transcript. Please wait for ASR or fill it in manually."
-            )
-        if not use_prompt_text:
-            prompt_text_clean = ""
-        generate_kwargs = dict(
-            text=final_text,
-            cfg_value=float(cfg_value_input),
-            inference_timesteps=_get_int_env("VOXCPM_INFERENCE_TIMESTEPS", 10),
-        )
-        if use_prompt_text and ref_path:
-            logger.info("[Ultimate Cloning] reference audio + transcript")
-            generate_kwargs["prompt_wav_path"] = ref_path
-            generate_kwargs["prompt_text"] = prompt_text_clean
-            generate_kwargs["reference_wav_path"] = ref_path
-        elif ref_path:
-            logger.info("[Controllable Cloning] reference audio only")
-            generate_kwargs["reference_wav_path"] = ref_path
-        else:
-            logger.info(f"[Voice Design] control: {control[:50] if control else 'None'}")
-        logger.info(f"Generating: '{final_text[:80]}...'")
-        wav = model.generate(**generate_kwargs)
-        if wav is None or len(wav) == 0:
-            raise RuntimeError("The model returned no audio.")
-        wav = np.asarray(wav, dtype=np.float32)
-        wav = _float_audio_to_int16(wav)
-        return (int(model.tts_model.sample_rate), wav)
-    finally:
-        if temp_audio_path and os.path.exists(temp_audio_path):
-            try:
-                os.unlink(temp_audio_path)
-            except OSError:
-                pass
-@spaces.GPU(duration=300)
-def generate_tts_audio(
-    text_input: str,
-    control_instruction: str = "",
-    reference_wav_path_input: Optional[str] = None,
-    use_prompt_text: bool = False,
-    prompt_text_input: str = "",
-    cfg_value_input: float = 2.0,
-    do_normalize: bool = True,
-    denoise: bool = True,
-    request: Optional[gr.Request] = None,
-) -> Tuple[int, np.ndarray]:
-    _begin_generation_request()
-    request_payload = {
-        "event": "tts_request",
-        "ui_language": _resolve_ui_language(request),
-        "text": (text_input or "").strip(),
-        "control_instruction": (control_instruction or "").strip(),
-        "use_prompt_text": bool(use_prompt_text),
-        "prompt_text": (prompt_text_input or "").strip(),
-        "cfg_value": float(cfg_value_input),
-        "do_normalize": bool(do_normalize),
-        "denoise": bool(denoise),
-        "has_reference_audio": bool(reference_wav_path_input and reference_wav_path_input.strip()),
-    }
-    if request_payload["has_reference_audio"]:
-        try:
-            request_payload["reference_audio_duration_seconds"] = round(
-                _get_audio_duration_seconds(reference_wav_path_input), 3
-            )
-        except Exception as exc:
-            request_payload["reference_audio_duration_error"] = str(exc)
-    try:
-        try:
-            result = _generate_tts_audio_once(
-                text_input=text_input,
-                control_instruction=control_instruction,
-                reference_wav_path_input=reference_wav_path_input,
-                use_prompt_text=use_prompt_text,
-                prompt_text_input=prompt_text_input,
-                cfg_value_input=cfg_value_input,
-                do_normalize=do_normalize,
-                denoise=denoise,
-                request=request,
-            )
-            try:
-                _append_request_log({**request_payload, "status": "success"})
-            except Exception as exc:
-                logger.warning(f"Failed to append request log: {exc}")
-            return result
-        except (ValueError, gr.Error) as exc:
-            try:
-                _append_request_log(
-                    {**request_payload, "status": "rejected", "error": str(exc)}
-                )
-            except Exception as log_exc:
-                logger.warning(f"Failed to append request log: {log_exc}")
-            if isinstance(exc, gr.Error):
-                raise
-            raise gr.Error(str(exc)) from exc
-        except Exception as exc:
-            logger.exception("Generation failed")
-            try:
-                _append_request_log({**request_payload, "status": "error", "error": str(exc)})
-            except Exception as log_exc:
-                logger.warning(f"Failed to append request log: {log_exc}")
-            raise gr.Error(_get_i18n_text("backend_retry_error", request)) from exc
-    finally:
-        _end_generation_request()
 # ---------- UI ----------

+import base64
 import json
 import logging
 import os
+import re
 import sys
 import tempfile
 from datetime import datetime, timezone
 from pathlib import Path
+from threading import Lock
 from typing import Optional, Tuple
 import gradio as gr
 import numpy as np
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ.setdefault("OPENBLAS_NUM_THREADS", "4")
+os.environ.setdefault("OMP_NUM_THREADS", "4")
+os.environ.setdefault("MKL_NUM_THREADS", "4")
 logging.basicConfig(
     level=logging.INFO,
     handlers=[logging.StreamHandler(sys.stdout)],
 )
 logger = logging.getLogger(__name__)
+NANOVLLM_API_BASE = os.environ.get("NANOVLLM_API_BASE", "http://47.85.48.143:8000").rstrip("/")
 DEFAULT_ASR_MODEL_REF = "FunAudioLLM/SenseVoiceSmall"
 MAX_REFERENCE_AUDIO_SECONDS = 50.0
 _persistent_root = None
 _request_log_dir = None
 _active_generation_requests = 0
 _active_generation_lock = Lock()
     raise ValueError(f"Invalid boolean env: {name}={value!r}")
+# ---------- Request Logging ----------
+def _configure_cache_dirs() -> None:
+    global _persistent_root, _request_log_dir
+    persistent_root = Path(os.environ.get("SPACE_PERSISTENT_ROOT", "/data")).expanduser()
+    if not persistent_root.exists():
+        logger.info("Persistent storage not detected. Request logs disabled.")
+        return
+    logs_dir = Path(
+        os.environ.get("REQUEST_LOG_DIR", str(persistent_root / "logs"))
+    ).expanduser()
+    logs_dir.mkdir(parents=True, exist_ok=True)
+    _persistent_root = persistent_root
+    _request_log_dir = logs_dir
+    logger.info(f"Persistent storage detected at {persistent_root}")
+    logger.info(f"Request logs will be written to daily files under {_request_log_dir}")
+_configure_cache_dirs()
+def _append_request_log(payload: dict) -> None:
+    if _request_log_dir is None:
+        return
+    now = datetime.now(timezone.utc)
+    record = {"timestamp": now.isoformat(), **payload}
+    log_path = _request_log_dir / f"{now.date().isoformat()}.jsonl"
+    with log_path.open("a", encoding="utf-8") as fp:
+        fp.write(json.dumps(record, ensure_ascii=False) + "\n")
+def _begin_generation_request() -> None:
+    global _active_generation_requests
+    with _active_generation_lock:
+        _active_generation_requests += 1
+def _end_generation_request() -> None:
+    global _active_generation_requests
+    with _active_generation_lock:
+        _active_generation_requests = max(0, _active_generation_requests - 1)
+def _get_active_generation_requests() -> int:
+    with _active_generation_lock:
+        return _active_generation_requests
+# ---------- Remote ASR & Denoise via HTTP API ----------
+def _api_asr(audio_path: str) -> str:
+    """Call POST /asr on the nanovllm server to transcribe audio."""
+    import requests
+    path = Path(audio_path)
+    wav_b64 = base64.b64encode(path.read_bytes()).decode("utf-8")
+    wav_fmt = path.suffix.lstrip(".").lower() or "wav"
+    resp = requests.post(
+        f"{NANOVLLM_API_BASE}/asr",
+        json={"wav_base64": wav_b64, "wav_format": wav_fmt},
+        timeout=60,
+    )
+    resp.raise_for_status()
+    return resp.json().get("text", "")
+def _api_denoise(audio_path: str) -> str:
+    """Call POST /denoise on the nanovllm server, return path to denoised temp file."""
+    import requests
+    path = Path(audio_path)
+    wav_b64 = base64.b64encode(path.read_bytes()).decode("utf-8")
+    wav_fmt = path.suffix.lstrip(".").lower() or "wav"
+    resp = requests.post(
+        f"{NANOVLLM_API_BASE}/denoise",
+        json={"wav_base64": wav_b64, "wav_format": wav_fmt},
+        timeout=120,
+    )
+    resp.raise_for_status()
+    denoised_b64 = resp.json()["wav_base64"]
+    denoised_bytes = base64.b64decode(denoised_b64)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    tmp.write(denoised_bytes)
+    tmp.close()
+    return tmp.name
+# ---------- Text Normalization (CPU-only, from VoxCPM text_normalize.py) ----------
+_chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]+")
+_text_normalizer = None
+_text_normalizer_lock = Lock()
+def _contains_chinese(text: str) -> bool:
+    return bool(_chinese_char_pattern.search(text))
+def _replace_corner_mark(text: str) -> str:
+    text = text.replace("\u00b2", "\u5e73\u65b9")
+    text = text.replace("\u00b3", "\u7acb\u65b9")
+    text = text.replace("\u221a", "\u6839\u53f7")
+    text = text.replace("\u2248", "\u7ea6\u7b49\u4e8e")
+    text = text.replace("<", "\u5c0f\u4e8e")
+    return text
+def _remove_bracket(text: str) -> str:
+    text = text.replace("\uff08", " ").replace("\uff09", " ")
+    text = text.replace("\u3010", " ").replace("\u3011", " ")
+    text = text.replace("\u2018", "").replace("\u2019", "")
+    text = text.replace("\u2014\u2014", " ")
+    return text
+def _spell_out_number(text: str, inflect_parser) -> str:
+    new_text = []
+    st = None
+    for i, c in enumerate(text):
+        if not c.isdigit():
+            if st is not None:
+                num_str = inflect_parser.number_to_words(text[st:i])
+                new_text.append(num_str)
+                st = None
+            new_text.append(c)
+        else:
+            if st is None:
+                st = i
+    if st is not None and st < len(text):
+        num_str = inflect_parser.number_to_words(text[st:])
+        new_text.append(num_str)
+    return "".join(new_text)
+def _replace_blank(text: str) -> str:
+    out_str = []
+    for i, c in enumerate(text):
+        if c == " ":
+            if (
+                i + 1 < len(text) and text[i + 1].isascii() and text[i + 1] != " "
+                and i - 1 >= 0 and text[i - 1].isascii() and text[i - 1] != " "
+            ):
+                out_str.append(c)
+        else:
+            out_str.append(c)
+    return "".join(out_str)
+def _clean_markdown(md_text: str) -> str:
+    import regex
+    md_text = re.sub(r"```.*?```", "", md_text, flags=re.DOTALL)
+    md_text = re.sub(r"`[^`]*`", "", md_text)
+    md_text = re.sub(r"!\[[^\]]*\]\([^\)]+\)", "", md_text)
+    md_text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", md_text)
+    md_text = re.sub(r"^(\s*)-\s+", r"\1", md_text, flags=re.MULTILINE)
+    md_text = re.sub(r"<[^>]+>", "", md_text)
+    md_text = re.sub(r"^#{1,6}\s*", "", md_text, flags=re.MULTILINE)
+    md_text = re.sub(r"\n\s*\n", "\n", md_text)
+    md_text = md_text.strip()
+    return md_text
+def _clean_text(text: str) -> str:
+    import regex
+    text = _clean_markdown(text)
+    text = regex.compile(r"\p{Emoji_Presentation}|\p{Emoji}\uFE0F", flags=regex.UNICODE).sub("", text)
+    text = text.replace("\n", " ").replace("\t", " ")
+    text = text.replace("\u201c", '"').replace("\u201d", '"')
+    return text
+def _get_text_normalizer():
+    global _text_normalizer
+    if _text_normalizer is not None:
+        return _text_normalizer
+    with _text_normalizer_lock:
+        if _text_normalizer is not None:
+            return _text_normalizer
+        from wetext import Normalizer
+        import inflect
+        _text_normalizer = {
+            "zh_tn": Normalizer(lang="zh", operator="tn", remove_erhua=True),
+            "en_tn": Normalizer(lang="en", operator="tn"),
+            "inflect": inflect.engine(),
+        }
+        logger.info("TextNormalizer loaded.")
+    return _text_normalizer
+def normalize_text(text: str) -> str:
+    """Normalize text (numbers, dates, abbreviations) for TTS input."""
+    tn = _get_text_normalizer()
+    lang = "zh" if _contains_chinese(text) else "en"
+    text = _clean_text(text)
+    if lang == "zh":
+        text = text.replace("=", "\u7b49\u4e8e")
+        if re.search(r"([\d$%^*_+\u2265\u2264\u2260\u00d7\u00f7?=])", text):
+            text = re.sub(r"(?<=[a-zA-Z0-9])-(?=\d)", " - ", text)
+        text = tn["zh_tn"].normalize(text)
+        text = _replace_blank(text)
+        text = _replace_corner_mark(text)
+        text = _remove_bracket(text)
+    else:
+        text = tn["en_tn"].normalize(text)
+        text = _spell_out_number(text, tn["inflect"])
+    return text
+def _safe_prompt_wav_recognition(
+    use_prompt_text: bool, prompt_wav: Optional[str], request: Optional[gr.Request] = None
+) -> str:
+    if not use_prompt_text or prompt_wav is None or not prompt_wav.strip():
+        return ""
+    try:
+        return _api_asr(prompt_wav)
+    except Exception as exc:
+        logger.warning(f"ASR recognition failed: {exc}")
+        raise gr.Error(_get_i18n_text("asr_failed_error", request)) from exc
+# ---------- Audio helpers ----------
+def _get_audio_duration_seconds(audio_path: str) -> float:
+    import soundfile as sf
+    info = sf.info(audio_path)
+    return float(info.frames) / float(info.samplerate)
 def _validate_reference_audio_duration(
         raise gr.Error(_get_i18n_text("reference_audio_too_long_error", request))
+# ---------- Nano-vLLM HTTP API Client ----------
+def _api_generate(payload: dict) -> str:
+    """Call POST /generate, receive streaming MP3, save to temp file and return path."""
+    import requests
+    url = f"{NANOVLLM_API_BASE}/generate"
+    logger.info(f"Calling {url} ...")
+    resp = requests.post(url, json=payload, stream=True, timeout=300)
+    resp.raise_for_status()
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
     try:
+        for chunk in resp.iter_content(chunk_size=64 * 1024):
+            tmp.write(chunk)
+        tmp.close()
+        return tmp.name
+    except Exception:
+        tmp.close()
+        if os.path.exists(tmp.name):
+            os.unlink(tmp.name)
+        raise
+def _api_get_info() -> dict:
+    import requests
+    resp = requests.get(f"{NANOVLLM_API_BASE}/info", timeout=10)
+    resp.raise_for_status()
+    return resp.json()
+# ---------- Generation via HTTP API ----------
+def generate_tts_audio(
+    text_input: str,
+    control_instruction: str = "",
+    reference_wav_path_input: Optional[str] = None,
+    use_prompt_text: bool = False,
+    prompt_text_input: str = "",
+    cfg_value_input: float = 2.0,
+    do_normalize: bool = True,
+    denoise: bool = True,
+    request: Optional[gr.Request] = None,
 ) -> str:
+    _begin_generation_request()
+    request_payload = {
+        "event": "tts_request",
+        "ui_language": _resolve_ui_language(request),
+        "text": (text_input or "").strip(),
+        "control_instruction": (control_instruction or "").strip(),
+        "use_prompt_text": bool(use_prompt_text),
+        "prompt_text": (prompt_text_input or "").strip(),
+        "cfg_value": float(cfg_value_input),
+        "do_normalize": bool(do_normalize),
+        "denoise": bool(denoise),
+        "has_reference_audio": bool(reference_wav_path_input and reference_wav_path_input.strip()),
+    }
+    if request_payload["has_reference_audio"]:
+        try:
+            request_payload["reference_audio_duration_seconds"] = round(
+                _get_audio_duration_seconds(reference_wav_path_input), 3
+            )
+        except Exception as exc:
+            request_payload["reference_audio_duration_error"] = str(exc)
     try:
+        text = (text_input or "").strip()
+        if not text:
+            raise ValueError("Please input text to synthesize.")
+        control = (control_instruction or "").strip()
+        final_text = f"({control}){text}" if control and not use_prompt_text else text
+        if do_normalize:
+            try:
+                original = final_text
+                final_text = normalize_text(final_text)
+                if final_text != original:
+                    logger.info(f"Text normalized: '{original[:60]}' -> '{final_text[:60]}'")
+            except Exception as exc:
+                logger.warning(f"Text normalization failed, using original: {exc}")
+        prompt_text_clean = (prompt_text_input or "").strip()
+        if use_prompt_text and not reference_wav_path_input:
+            raise ValueError("Ultimate Cloning Mode requires a reference audio clip.")
+        if use_prompt_text and not prompt_text_clean:
+            raise ValueError(
+                "Ultimate Cloning Mode requires a transcript. "
+                "Please wait for ASR or fill it in manually."
+            )
+        if not use_prompt_text:
+            prompt_text_clean = ""
+        has_ref = reference_wav_path_input and reference_wav_path_input.strip()
+        if has_ref:
+            _validate_reference_audio_duration(reference_wav_path_input, request)
+        denoised_tmp = None
+        api_payload: dict = {
+            "target_text": final_text,
+            "cfg_value": float(cfg_value_input),
+        }
+        try:
+            if has_ref:
+                actual_ref_path = reference_wav_path_input
+                if denoise:
+                    logger.info("Applying server-side denoise to reference audio ...")
+                    try:
+                        denoised_tmp = _api_denoise(reference_wav_path_input)
+                        actual_ref_path = denoised_tmp
+                        logger.info("Denoise completed.")
+                    except Exception as exc:
+                        logger.warning(f"Denoise failed, using original audio: {exc}")
+                ref_path = Path(actual_ref_path)
+                wav_b64 = base64.b64encode(ref_path.read_bytes()).decode("utf-8")
+                wav_fmt = ref_path.suffix.lstrip(".").lower() or "wav"
+                if use_prompt_text:
+                    logger.info("[Ultimate Cloning] reference audio + transcript")
+                    api_payload["prompt_wav_base64"] = wav_b64
+                    api_payload["prompt_wav_format"] = wav_fmt
+                    api_payload["prompt_text"] = prompt_text_clean
+                    api_payload["ref_audio_wav_base64"] = wav_b64
+                    api_payload["ref_audio_wav_format"] = wav_fmt
+                else:
+                    logger.info("[Controllable Cloning] reference audio only")
+                    api_payload["ref_audio_wav_base64"] = wav_b64
+                    api_payload["ref_audio_wav_format"] = wav_fmt
+            else:
+                logger.info(f"[Voice Design] control: {control[:50] if control else 'None'}")
+            logger.info(f"Generating: '{final_text[:80]}...'")
+            mp3_path = _api_generate(api_payload)
+        finally:
+            if denoised_tmp and os.path.exists(denoised_tmp):
+                try:
+                    os.unlink(denoised_tmp)
+                except OSError:
+                    pass
+        try:
+            _append_request_log({**request_payload, "status": "success"})
+        except Exception as exc:
+            logger.warning(f"Failed to append request log: {exc}")
+        return mp3_path
+    except (ValueError, gr.Error) as exc:
+        try:
+            _append_request_log({**request_payload, "status": "rejected", "error": str(exc)})
+        except Exception:
+            pass
+        if isinstance(exc, gr.Error):
+            raise
+        raise gr.Error(str(exc)) from exc
     except Exception as exc:
+        logger.exception("Generation failed")
+        try:
+            _append_request_log({**request_payload, "status": "error", "error": str(exc)})
+        except Exception:
+            pass
+        raise gr.Error(_get_i18n_text("backend_retry_error", request)) from exc
+    finally:
+        _end_generation_request()
+# ---------- Inline i18n (en + zh-CN) ----------
 _USAGE_INSTRUCTIONS_EN = (
     "**VoxCPM2 — Three Modes of Speech Generation:**\n\n"
     )
+# ---------- Theme & CSS ----------
 DEFAULT_TARGET_TEXT = (
     "VoxCPM2 is a creative multilingual TTS model from ModelBest, "
     font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
 )
 # ---------- UI ----------

requirements.txt CHANGED Viewed

@@ -1,23 +1,7 @@
 gradio==6.0.0
-huggingface-hub
-funasr
-modelscope>=1.22.0
 numpy>=1.21.0
-torch==2.5.1
-torchaudio==2.5.1
-voxcpm
-transformers>=4.51.0
-addict
-simplejson
-sortedcontainers
-xxhash
-tqdm
-librosa
-pydantic
 soundfile>=0.13.1
-torchcodec
-packaging
-psutil
-ninja
-setuptools
-wheel

 gradio==6.0.0
+inflect
 numpy>=1.21.0
+regex
+requests
 soundfile>=0.13.1
+wetext