liuxin Cursor committed on
Commit
03b4e88
·
1 Parent(s): 0e68e0f

refactor: switch to remote nanovllm API with text normalization

Browse files

Replace local GPU inference (voxcpm, funasr, modelscope) with remote
nanovllm API calls for TTS, ASR, and denoising. Add client-side text
normalization via wetext. Preserve request logging with active request
counting and detailed payload fields.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (2) hide show
  1. app.py +386 -359
  2. requirements.txt +4 -20
app.py CHANGED
@@ -1,28 +1,22 @@
 
1
  import json
2
  import logging
3
  import os
 
4
  import sys
5
  import tempfile
6
  from datetime import datetime, timezone
7
  from pathlib import Path
8
- from threading import Lock, Semaphore
9
  from typing import Optional, Tuple
10
 
11
  import gradio as gr
12
  import numpy as np
13
- import spaces
14
 
15
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
- os.environ["OPENBLAS_NUM_THREADS"] = "4"
17
- os.environ["OMP_NUM_THREADS"] = "4"
18
- os.environ["MKL_NUM_THREADS"] = "4"
19
-
20
- import torch
21
- import torch._dynamo
22
- torch._dynamo.config.disable = True
23
- torch.set_float32_matmul_precision("high")
24
-
25
- DEFAULT_MODEL_REF = "openbmb/VoxCPM2"
26
 
27
  logging.basicConfig(
28
  level=logging.INFO,
@@ -30,39 +24,13 @@ logging.basicConfig(
30
  handlers=[logging.StreamHandler(sys.stdout)],
31
  )
32
  logger = logging.getLogger(__name__)
 
 
33
  DEFAULT_ASR_MODEL_REF = "FunAudioLLM/SenseVoiceSmall"
34
- DEFAULT_ZIPENHANCER_MODEL = "iic/speech_zipenhancer_ans_multiloss_16k_base"
35
  MAX_REFERENCE_AUDIO_SECONDS = 50.0
 
36
  _persistent_root = None
37
  _request_log_dir = None
38
-
39
-
40
- def _configure_cache_dirs() -> None:
41
- global _persistent_root, _request_log_dir
42
- persistent_root = Path(os.environ.get("SPACE_PERSISTENT_ROOT", "/data")).expanduser()
43
- if not persistent_root.exists():
44
- logger.info("Persistent storage not detected. Request logs disabled.")
45
- return
46
-
47
- logs_dir = Path(
48
- os.environ.get("REQUEST_LOG_DIR", str(persistent_root / "logs"))
49
- ).expanduser()
50
- logs_dir.mkdir(parents=True, exist_ok=True)
51
- _persistent_root = persistent_root
52
- _request_log_dir = logs_dir
53
- logger.info(f"Persistent storage detected at {persistent_root}")
54
- logger.info(f"Request logs will be written to daily files under {_request_log_dir}")
55
-
56
-
57
- _configure_cache_dirs()
58
-
59
- _asr_model = None
60
- _voxcpm_model = None
61
- _denoiser = None
62
- _asr_lock = Lock()
63
- _model_lock = Lock()
64
- _denoiser_lock = Lock()
65
- _denoise_semaphore = Semaphore(int(os.environ.get("DENOISE_MAX_CONCURRENT", "1")))
66
  _active_generation_requests = 0
67
  _active_generation_lock = Lock()
68
 
@@ -92,105 +60,244 @@ def _get_bool_env(name: str, default: bool) -> bool:
92
  raise ValueError(f"Invalid boolean env: {name}={value!r}")
93
 
94
 
95
- def _resolve_model_ref() -> str:
96
- value = os.environ.get("HF_REPO_ID", "").strip()
97
- if value:
98
- return value
99
- return DEFAULT_MODEL_REF
100
 
101
 
102
- def _resolve_asr_model_ref() -> str:
103
- return DEFAULT_ASR_MODEL_REF
 
 
 
 
104
 
 
 
 
 
 
 
 
 
105
 
106
- def _resolve_zipenhancer_model_ref() -> str:
107
- for env_name in ("ZIPENHANCER_MODEL_ID", "ZIPENHANCER_MODEL_PATH"):
108
- value = os.environ.get(env_name, "").strip()
109
- if value:
110
- return value
111
- return DEFAULT_ZIPENHANCER_MODEL
112
 
 
113
 
114
- class _ZipEnhancer:
115
- def __init__(self, model_ref: str):
116
- import torchaudio
117
- from modelscope.pipelines import pipeline
118
- from modelscope.utils.constant import Tasks
119
 
120
- self._torchaudio = torchaudio
121
- self.model_ref = model_ref
122
- self._pipeline = pipeline(Tasks.acoustic_noise_suppression, model=model_ref)
 
 
 
 
 
123
 
124
- def _normalize_loudness(self, wav_path: str) -> None:
125
- audio, sr = self._torchaudio.load(wav_path)
126
- loudness = self._torchaudio.functional.loudness(audio, sr)
127
- normalized_audio = self._torchaudio.functional.gain(audio, -20 - loudness)
128
- self._torchaudio.save(wav_path, normalized_audio, sr)
129
 
130
- def enhance(self, input_path: str) -> str:
131
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
132
- output_path = tmp_file.name
133
- try:
134
- self._pipeline(input_path, output_path=output_path)
135
- self._normalize_loudness(output_path)
136
- return output_path
137
- except Exception:
138
- if os.path.exists(output_path):
139
- try:
140
- os.unlink(output_path)
141
- except OSError:
142
- pass
143
- raise
144
 
145
 
146
- def get_denoiser():
147
- global _denoiser
148
- if _denoiser is not None:
149
- return _denoiser
150
 
151
- with _denoiser_lock:
152
- if _denoiser is not None:
153
- return _denoiser
154
 
155
- model_ref = _resolve_zipenhancer_model_ref()
156
- logger.info(f"Loading ZipEnhancer denoiser from {model_ref} ...")
157
- _denoiser = _ZipEnhancer(model_ref)
158
- logger.info("ZipEnhancer denoiser loaded.")
159
- return _denoiser
160
 
161
 
162
- def _extract_asr_text(asr_result) -> str:
163
- if not asr_result:
164
- return ""
165
 
166
- first_item = asr_result[0]
167
- if isinstance(first_item, dict):
168
- return str(first_item.get("text", "")).split("|>")[-1].strip()
169
- return ""
170
 
 
 
 
171
 
172
- def _get_audio_duration_seconds(audio_path: str) -> float:
173
- import soundfile as sf
 
174
 
175
- info = sf.info(audio_path)
176
- return float(info.frames) / float(info.samplerate)
 
 
 
 
 
177
 
178
 
179
- def _begin_generation_request() -> None:
180
- global _active_generation_requests
181
- with _active_generation_lock:
182
- _active_generation_requests += 1
183
 
 
 
 
184
 
185
- def _end_generation_request() -> None:
186
- global _active_generation_requests
187
- with _active_generation_lock:
188
- _active_generation_requests = max(0, _active_generation_requests - 1)
 
 
189
 
 
 
190
 
191
- def _get_active_generation_requests() -> int:
192
- with _active_generation_lock:
193
- return _active_generation_requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
 
196
  def _validate_reference_audio_duration(
@@ -201,46 +308,180 @@ def _validate_reference_audio_duration(
201
  raise gr.Error(_get_i18n_text("reference_audio_too_long_error", request))
202
 
203
 
204
- def _prepare_reference_audio_path(
205
- audio_path: Optional[str],
206
- *,
207
- denoise: bool,
208
- request: Optional[gr.Request] = None,
209
- ) -> tuple[Optional[str], Optional[str]]:
210
- """Returns (usable_audio_path, temp_path_to_cleanup)."""
211
- if audio_path is None or not audio_path.strip():
212
- return None, None
213
 
214
- _validate_reference_audio_duration(audio_path, request)
 
 
215
 
216
- if not denoise:
217
- return audio_path, None
218
 
219
- logger.info("Applying ZipEnhancer denoising to reference audio ...")
220
- acquired = _denoise_semaphore.acquire(timeout=30)
221
- if not acquired:
222
- raise gr.Error(_get_i18n_text("denoise_busy_error", request))
223
  try:
224
- temp_path = get_denoiser().enhance(audio_path)
225
- return temp_path, temp_path
226
- except Exception as exc:
227
- logger.exception("ZipEnhancer denoising failed")
228
- raise gr.Error(_get_i18n_text("denoise_failed_error", request)) from exc
229
- finally:
230
- _denoise_semaphore.release()
 
 
231
 
232
 
233
- def _safe_prompt_wav_recognition(
234
- use_prompt_text: bool, prompt_wav: Optional[str], request: Optional[gr.Request] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  ) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  try:
237
- return prompt_wav_recognition(use_prompt_text, prompt_wav)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  except Exception as exc:
239
- logger.warning(f"ASR recognition failed: {exc}")
240
- raise gr.Error(_get_i18n_text("asr_failed_error", request)) from exc
 
 
 
 
 
 
241
 
242
 
243
- # ---------- Inline i18n (en + zh-CN only) ----------
 
244
 
245
  _USAGE_INSTRUCTIONS_EN = (
246
  "**VoxCPM2 — Three Modes of Speech Generation:**\n\n"
@@ -399,15 +640,7 @@ def _get_i18n_text(key: str, request: Optional[gr.Request] = None) -> str:
399
  )
400
 
401
 
402
- def _append_request_log(payload: dict) -> None:
403
- if _request_log_dir is None:
404
- return
405
-
406
- now = datetime.now(timezone.utc)
407
- record = {"timestamp": now.isoformat(), **payload}
408
- log_path = _request_log_dir / f"{now.date().isoformat()}.jsonl"
409
- with log_path.open("a", encoding="utf-8") as fp:
410
- fp.write(json.dumps(record, ensure_ascii=False) + "\n")
411
 
412
  DEFAULT_TARGET_TEXT = (
413
  "VoxCPM2 is a creative multilingual TTS model from ModelBest, "
@@ -471,212 +704,6 @@ _APP_THEME = gr.themes.Soft(
471
  font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
472
  )
473
 
474
- def get_asr_model():
475
- global _asr_model
476
- if _asr_model is not None:
477
- return _asr_model
478
- with _asr_lock:
479
- if _asr_model is not None:
480
- return _asr_model
481
- from funasr import AutoModel
482
- from huggingface_hub import snapshot_download
483
-
484
- device = os.environ.get("ASR_DEVICE", "cpu").strip() or "cpu"
485
- asr_model_ref = _resolve_asr_model_ref()
486
- logger.info(f"Downloading ASR model from Hugging Face: {asr_model_ref}")
487
- asr_model_path = snapshot_download(repo_id=asr_model_ref)
488
- logger.info(f"Loading ASR model on {device} ...")
489
- _asr_model = AutoModel(
490
- model=asr_model_path,
491
- disable_update=True,
492
- log_level="INFO",
493
- device=device,
494
- )
495
- logger.info("ASR model loaded.")
496
- return _asr_model
497
-
498
-
499
- # ---------- VoxCPM model (single-process, ZeroGPU compatible) ----------
500
-
501
-
502
- def get_voxcpm_model():
503
- global _voxcpm_model
504
- if _voxcpm_model is not None:
505
- return _voxcpm_model
506
-
507
- with _model_lock:
508
- if _voxcpm_model is not None:
509
- return _voxcpm_model
510
-
511
- from voxcpm import VoxCPM
512
-
513
- model_ref = _resolve_model_ref()
514
- logger.info(f"Loading VoxCPM model from {model_ref} ...")
515
- _voxcpm_model = VoxCPM.from_pretrained(model_ref, load_denoiser=False)
516
- logger.info("VoxCPM model loaded.")
517
- return _voxcpm_model
518
-
519
-
520
- # ---------- GPU-accelerated inference ----------
521
-
522
-
523
- def prompt_wav_recognition(use_prompt_text: bool, prompt_wav: Optional[str]) -> str:
524
- if not use_prompt_text or prompt_wav is None or not prompt_wav.strip():
525
- return ""
526
-
527
- asr_model = get_asr_model()
528
- res = asr_model.generate(input=prompt_wav, language="auto", use_itn=True)
529
- return _extract_asr_text(res)
530
-
531
-
532
- def _float_audio_to_int16(wav: np.ndarray) -> np.ndarray:
533
- clipped = np.clip(wav, -1.0, 1.0)
534
- return (clipped * 32767.0).astype(np.int16, copy=False)
535
-
536
-
537
- def _generate_tts_audio_once(
538
- text_input: str,
539
- control_instruction: str = "",
540
- reference_wav_path_input: Optional[str] = None,
541
- use_prompt_text: bool = False,
542
- prompt_text_input: str = "",
543
- cfg_value_input: float = 2.0,
544
- do_normalize: bool = True,
545
- denoise: bool = True,
546
- request: Optional[gr.Request] = None,
547
- ) -> Tuple[int, np.ndarray]:
548
- temp_audio_path = None
549
- try:
550
- model = get_voxcpm_model()
551
-
552
- text = (text_input or "").strip()
553
- if len(text) == 0:
554
- raise ValueError("Please input text to synthesize.")
555
-
556
- control = (control_instruction or "").strip()
557
- final_text = f"({control}){text}" if control and not use_prompt_text else text
558
-
559
- ref_path, temp_audio_path = _prepare_reference_audio_path(
560
- reference_wav_path_input,
561
- denoise=bool(denoise),
562
- request=request,
563
- )
564
-
565
- prompt_text_clean = (prompt_text_input or "").strip()
566
- if use_prompt_text and ref_path is None:
567
- raise ValueError("Ultimate Cloning Mode requires a reference audio clip.")
568
- if use_prompt_text and not prompt_text_clean:
569
- raise ValueError(
570
- "Ultimate Cloning Mode requires a transcript. Please wait for ASR or fill it in manually."
571
- )
572
- if not use_prompt_text:
573
- prompt_text_clean = ""
574
-
575
- generate_kwargs = dict(
576
- text=final_text,
577
- cfg_value=float(cfg_value_input),
578
- inference_timesteps=_get_int_env("VOXCPM_INFERENCE_TIMESTEPS", 10),
579
- )
580
-
581
- if use_prompt_text and ref_path:
582
- logger.info("[Ultimate Cloning] reference audio + transcript")
583
- generate_kwargs["prompt_wav_path"] = ref_path
584
- generate_kwargs["prompt_text"] = prompt_text_clean
585
- generate_kwargs["reference_wav_path"] = ref_path
586
- elif ref_path:
587
- logger.info("[Controllable Cloning] reference audio only")
588
- generate_kwargs["reference_wav_path"] = ref_path
589
- else:
590
- logger.info(f"[Voice Design] control: {control[:50] if control else 'None'}")
591
-
592
- logger.info(f"Generating: '{final_text[:80]}...'")
593
- wav = model.generate(**generate_kwargs)
594
-
595
- if wav is None or len(wav) == 0:
596
- raise RuntimeError("The model returned no audio.")
597
-
598
- wav = np.asarray(wav, dtype=np.float32)
599
- wav = _float_audio_to_int16(wav)
600
- return (int(model.tts_model.sample_rate), wav)
601
- finally:
602
- if temp_audio_path and os.path.exists(temp_audio_path):
603
- try:
604
- os.unlink(temp_audio_path)
605
- except OSError:
606
- pass
607
-
608
-
609
- @spaces.GPU(duration=300)
610
- def generate_tts_audio(
611
- text_input: str,
612
- control_instruction: str = "",
613
- reference_wav_path_input: Optional[str] = None,
614
- use_prompt_text: bool = False,
615
- prompt_text_input: str = "",
616
- cfg_value_input: float = 2.0,
617
- do_normalize: bool = True,
618
- denoise: bool = True,
619
- request: Optional[gr.Request] = None,
620
- ) -> Tuple[int, np.ndarray]:
621
- _begin_generation_request()
622
- request_payload = {
623
- "event": "tts_request",
624
- "ui_language": _resolve_ui_language(request),
625
- "text": (text_input or "").strip(),
626
- "control_instruction": (control_instruction or "").strip(),
627
- "use_prompt_text": bool(use_prompt_text),
628
- "prompt_text": (prompt_text_input or "").strip(),
629
- "cfg_value": float(cfg_value_input),
630
- "do_normalize": bool(do_normalize),
631
- "denoise": bool(denoise),
632
- "has_reference_audio": bool(reference_wav_path_input and reference_wav_path_input.strip()),
633
- }
634
- if request_payload["has_reference_audio"]:
635
- try:
636
- request_payload["reference_audio_duration_seconds"] = round(
637
- _get_audio_duration_seconds(reference_wav_path_input), 3
638
- )
639
- except Exception as exc:
640
- request_payload["reference_audio_duration_error"] = str(exc)
641
-
642
- try:
643
- try:
644
- result = _generate_tts_audio_once(
645
- text_input=text_input,
646
- control_instruction=control_instruction,
647
- reference_wav_path_input=reference_wav_path_input,
648
- use_prompt_text=use_prompt_text,
649
- prompt_text_input=prompt_text_input,
650
- cfg_value_input=cfg_value_input,
651
- do_normalize=do_normalize,
652
- denoise=denoise,
653
- request=request,
654
- )
655
- try:
656
- _append_request_log({**request_payload, "status": "success"})
657
- except Exception as exc:
658
- logger.warning(f"Failed to append request log: {exc}")
659
- return result
660
- except (ValueError, gr.Error) as exc:
661
- try:
662
- _append_request_log(
663
- {**request_payload, "status": "rejected", "error": str(exc)}
664
- )
665
- except Exception as log_exc:
666
- logger.warning(f"Failed to append request log: {log_exc}")
667
- if isinstance(exc, gr.Error):
668
- raise
669
- raise gr.Error(str(exc)) from exc
670
- except Exception as exc:
671
- logger.exception("Generation failed")
672
- try:
673
- _append_request_log({**request_payload, "status": "error", "error": str(exc)})
674
- except Exception as log_exc:
675
- logger.warning(f"Failed to append request log: {log_exc}")
676
- raise gr.Error(_get_i18n_text("backend_retry_error", request)) from exc
677
- finally:
678
- _end_generation_request()
679
-
680
 
681
  # ---------- UI ----------
682
 
 
1
+ import base64
2
  import json
3
  import logging
4
  import os
5
+ import re
6
  import sys
7
  import tempfile
8
  from datetime import datetime, timezone
9
  from pathlib import Path
10
+ from threading import Lock
11
  from typing import Optional, Tuple
12
 
13
  import gradio as gr
14
  import numpy as np
 
15
 
16
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
17
+ os.environ.setdefault("OPENBLAS_NUM_THREADS", "4")
18
+ os.environ.setdefault("OMP_NUM_THREADS", "4")
19
+ os.environ.setdefault("MKL_NUM_THREADS", "4")
 
 
 
 
 
 
 
20
 
21
  logging.basicConfig(
22
  level=logging.INFO,
 
24
  handlers=[logging.StreamHandler(sys.stdout)],
25
  )
26
  logger = logging.getLogger(__name__)
27
+
28
+ NANOVLLM_API_BASE = os.environ.get("NANOVLLM_API_BASE", "http://47.85.48.143:8000").rstrip("/")
29
  DEFAULT_ASR_MODEL_REF = "FunAudioLLM/SenseVoiceSmall"
 
30
  MAX_REFERENCE_AUDIO_SECONDS = 50.0
31
+
32
  _persistent_root = None
33
  _request_log_dir = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  _active_generation_requests = 0
35
  _active_generation_lock = Lock()
36
 
 
60
  raise ValueError(f"Invalid boolean env: {name}={value!r}")
61
 
62
 
63
+ # ---------- Request Logging ----------
 
 
 
 
64
 
65
 
66
def _configure_cache_dirs() -> None:
    """Detect persistent storage and prepare the request-log directory.

    The storage root comes from ``SPACE_PERSISTENT_ROOT`` (default ``/data``).
    If that directory does not exist, request logging stays disabled and the
    module globals are left untouched. Otherwise the log directory
    (``REQUEST_LOG_DIR``, default ``<root>/logs``) is created if needed and
    both ``_persistent_root`` and ``_request_log_dir`` are set.
    """
    global _persistent_root, _request_log_dir

    root_setting = os.environ.get("SPACE_PERSISTENT_ROOT", "/data")
    persistent_root = Path(root_setting).expanduser()
    if not persistent_root.exists():
        logger.info("Persistent storage not detected. Request logs disabled.")
        return

    # The default log location lives under the persistent root.
    log_setting = os.environ.get("REQUEST_LOG_DIR", str(persistent_root / "logs"))
    resolved_logs = Path(log_setting).expanduser()
    resolved_logs.mkdir(parents=True, exist_ok=True)

    _persistent_root = persistent_root
    _request_log_dir = resolved_logs
    logger.info(f"Persistent storage detected at {persistent_root}")
    logger.info(f"Request logs will be written to daily files under {_request_log_dir}")
81
 
 
 
 
 
 
 
82
 
83
+ _configure_cache_dirs()
84
 
 
 
 
 
 
85
 
86
def _append_request_log(payload: dict) -> None:
    """Append one JSON line for a request to the current UTC day's log file.

    Does nothing when ``_request_log_dir`` is ``None`` (logging disabled).
    The written record is ``payload`` with a leading ``timestamp`` key; a
    ``timestamp`` key inside ``payload`` overrides the generated value.
    """
    if _request_log_dir is None:
        return

    stamp = datetime.now(timezone.utc)
    entry = {"timestamp": stamp.isoformat()}
    entry.update(payload)

    # One file per UTC calendar day, e.g. 2024-01-31.jsonl
    daily_file = _request_log_dir / f"{stamp.date().isoformat()}.jsonl"
    with daily_file.open("a", encoding="utf-8") as fp:
        fp.write(json.dumps(entry, ensure_ascii=False) + "\n")
94
 
 
 
 
 
 
95
 
96
def _begin_generation_request() -> None:
    """Increment the active-generation counter while holding its lock."""
    global _active_generation_requests
    with _active_generation_lock:
        _active_generation_requests = _active_generation_requests + 1
 
 
 
 
 
 
 
 
 
 
100
 
101
 
102
def _end_generation_request() -> None:
    """Decrement the active-generation counter, clamping it at zero."""
    global _active_generation_requests
    with _active_generation_lock:
        remaining = _active_generation_requests - 1
        # Never let the counter go negative, even on an unmatched call.
        _active_generation_requests = remaining if remaining > 0 else 0
106
 
 
 
 
107
 
108
def _get_active_generation_requests() -> int:
    """Return the current active-generation counter, read under its lock."""
    with _active_generation_lock:
        snapshot = _active_generation_requests
    return snapshot
 
 
111
 
112
 
113
+ # ---------- Remote ASR & Denoise via HTTP API ----------
 
 
114
 
 
 
 
 
115
 
116
def _api_asr(audio_path: str) -> str:
    """Transcribe an audio file via ``POST /asr`` on the nanovllm server.

    The file is sent base64-encoded together with its format, taken from the
    lower-cased file extension (``"wav"`` when there is no extension).

    Returns the ``text`` field of the JSON response, or ``""`` if the field
    is absent. Raises whatever ``requests`` raises on transport failure or a
    non-2xx status.
    """
    import requests

    source = Path(audio_path)
    encoded_audio = base64.b64encode(source.read_bytes()).decode("utf-8")
    extension = source.suffix.lstrip(".").lower()

    request_body = {
        "wav_base64": encoded_audio,
        "wav_format": extension or "wav",
    }
    response = requests.post(
        f"{NANOVLLM_API_BASE}/asr",
        json=request_body,
        timeout=60,
    )
    response.raise_for_status()
    return response.json().get("text", "")
131
 
132
 
133
def _api_denoise(audio_path: str) -> str:
    """Denoise an audio file via ``POST /denoise`` on the nanovllm server.

    The file is sent base64-encoded together with its format, taken from the
    lower-cased file extension (``"wav"`` when there is no extension). The
    ``wav_base64`` field of the JSON response is decoded and written to a new
    ``.wav`` temporary file.

    Returns:
        Path of the temporary file holding the denoised audio. The caller
        owns this file and is responsible for deleting it.

    Raises:
        Whatever ``requests`` raises on transport failure or a non-2xx
        status; ``KeyError`` if the response has no ``wav_base64`` field.
    """
    import requests

    path = Path(audio_path)
    wav_b64 = base64.b64encode(path.read_bytes()).decode("utf-8")
    wav_fmt = path.suffix.lstrip(".").lower() or "wav"

    resp = requests.post(
        f"{NANOVLLM_API_BASE}/denoise",
        json={"wav_base64": wav_b64, "wav_format": wav_fmt},
        timeout=120,
    )
    resp.raise_for_status()

    denoised_bytes = base64.b64decode(resp.json()["wav_base64"])

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        # The context manager guarantees the handle is closed even if the
        # write fails (e.g. disk full).
        with tmp:
            tmp.write(denoised_bytes)
    except Exception:
        # Do not leave a partial file behind: nobody would ever clean it up.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass
        raise
    return tmp.name
155
+
156
+
157
+ # ---------- Text Normalization (CPU-only, from VoxCPM text_normalize.py) ----------
158
+
159
+ _chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]+")
160
+ _text_normalizer = None
161
+ _text_normalizer_lock = Lock()
162
+
163
+
164
+ def _contains_chinese(text: str) -> bool:
165
+ return bool(_chinese_char_pattern.search(text))
166
+
167
+
168
+ def _replace_corner_mark(text: str) -> str:
169
+ text = text.replace("\u00b2", "\u5e73\u65b9")
170
+ text = text.replace("\u00b3", "\u7acb\u65b9")
171
+ text = text.replace("\u221a", "\u6839\u53f7")
172
+ text = text.replace("\u2248", "\u7ea6\u7b49\u4e8e")
173
+ text = text.replace("<", "\u5c0f\u4e8e")
174
+ return text
175
+
176
+
177
+ def _remove_bracket(text: str) -> str:
178
+ text = text.replace("\uff08", " ").replace("\uff09", " ")
179
+ text = text.replace("\u3010", " ").replace("\u3011", " ")
180
+ text = text.replace("\u2018", "").replace("\u2019", "")
181
+ text = text.replace("\u2014\u2014", " ")
182
+ return text
183
+
184
+
185
+ def _spell_out_number(text: str, inflect_parser) -> str:
186
+ new_text = []
187
+ st = None
188
+ for i, c in enumerate(text):
189
+ if not c.isdigit():
190
+ if st is not None:
191
+ num_str = inflect_parser.number_to_words(text[st:i])
192
+ new_text.append(num_str)
193
+ st = None
194
+ new_text.append(c)
195
+ else:
196
+ if st is None:
197
+ st = i
198
+ if st is not None and st < len(text):
199
+ num_str = inflect_parser.number_to_words(text[st:])
200
+ new_text.append(num_str)
201
+ return "".join(new_text)
202
+
203
+
204
+ def _replace_blank(text: str) -> str:
205
+ out_str = []
206
+ for i, c in enumerate(text):
207
+ if c == " ":
208
+ if (
209
+ i + 1 < len(text) and text[i + 1].isascii() and text[i + 1] != " "
210
+ and i - 1 >= 0 and text[i - 1].isascii() and text[i - 1] != " "
211
+ ):
212
+ out_str.append(c)
213
+ else:
214
+ out_str.append(c)
215
+ return "".join(out_str)
216
+
217
+
218
+ def _clean_markdown(md_text: str) -> str:
219
+ import regex
220
+
221
+ md_text = re.sub(r"```.*?```", "", md_text, flags=re.DOTALL)
222
+ md_text = re.sub(r"`[^`]*`", "", md_text)
223
+ md_text = re.sub(r"!\[[^\]]*\]\([^\)]+\)", "", md_text)
224
+ md_text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", md_text)
225
+ md_text = re.sub(r"^(\s*)-\s+", r"\1", md_text, flags=re.MULTILINE)
226
+ md_text = re.sub(r"<[^>]+>", "", md_text)
227
+ md_text = re.sub(r"^#{1,6}\s*", "", md_text, flags=re.MULTILINE)
228
+ md_text = re.sub(r"\n\s*\n", "\n", md_text)
229
+ md_text = md_text.strip()
230
+ return md_text
231
+
232
+
233
def _clean_text(text: str) -> str:
    """Remove markup and emoji and flatten whitespace/quotes for TTS input.

    Steps: strip Markdown via ``_clean_markdown``, delete emoji matched by
    the ``regex`` Unicode property pattern, turn newlines and tabs into
    spaces, and replace curly double quotes with straight ones.
    """
    import regex

    without_markup = _clean_markdown(text)

    emoji_pattern = regex.compile(
        r"\p{Emoji_Presentation}|\p{Emoji}\uFE0F", flags=regex.UNICODE
    )
    without_emoji = emoji_pattern.sub("", without_markup)

    single_line = without_emoji.replace("\n", " ").replace("\t", " ")
    return single_line.replace("\u201c", '"').replace("\u201d", '"')
241
+
242
+
243
def _get_text_normalizer():
    """Return the shared normalizer bundle, building it on first use.

    The bundle is a dict with keys ``zh_tn`` and ``en_tn`` (``wetext``
    normalizers) and ``inflect`` (an ``inflect`` engine). ``wetext`` and
    ``inflect`` are imported only when the bundle is first built.
    """
    global _text_normalizer

    if _text_normalizer is None:
        with _text_normalizer_lock:
            # Re-check under the lock: another thread may have built the
            # bundle while this one was waiting.
            if _text_normalizer is None:
                from wetext import Normalizer
                import inflect

                _text_normalizer = {
                    "zh_tn": Normalizer(lang="zh", operator="tn", remove_erhua=True),
                    "en_tn": Normalizer(lang="en", operator="tn"),
                    "inflect": inflect.engine(),
                }
                logger.info("TextNormalizer loaded.")
    return _text_normalizer
260
+
261
+
262
def normalize_text(text: str) -> str:
    """Normalize text (numbers, dates, abbreviations) for TTS input.

    The language is chosen from the raw input: Chinese if it contains any
    character matched by ``_contains_chinese``, otherwise English. The text
    is then cleaned with ``_clean_text`` and run through the matching
    normalizer plus language-specific post-processing.
    """
    tn = _get_text_normalizer()
    # Decide the language before cleaning, on the text exactly as given.
    is_chinese = _contains_chinese(text)
    text = _clean_text(text)

    if not is_chinese:
        english = tn["en_tn"].normalize(text)
        return _spell_out_number(english, tn["inflect"])

    text = text.replace("=", "\u7b49\u4e8e")
    if re.search(r"([\d$%^*_+\u2265\u2264\u2260\u00d7\u00f7?=])", text):
        # Pad a hyphen that directly precedes a digit with spaces.
        text = re.sub(r"(?<=[a-zA-Z0-9])-(?=\d)", " - ", text)
    text = tn["zh_tn"].normalize(text)
    text = _replace_blank(text)
    text = _replace_corner_mark(text)
    return _remove_bracket(text)
279
+
280
+
281
def _safe_prompt_wav_recognition(
    use_prompt_text: bool, prompt_wav: Optional[str], request: Optional[gr.Request] = None
) -> str:
    """Transcribe the prompt audio via the remote ASR endpoint.

    Returns ``""`` without calling the server when transcript mode is off or
    no (non-blank) audio path is given. Any failure of the ASR call is logged
    and re-raised as a ``gr.Error`` with the localized ``asr_failed_error``
    message.
    """
    has_audio = prompt_wav is not None and bool(prompt_wav.strip())
    if not (use_prompt_text and has_audio):
        return ""

    try:
        transcript = _api_asr(prompt_wav)
    except Exception as exc:
        logger.warning(f"ASR recognition failed: {exc}")
        raise gr.Error(_get_i18n_text("asr_failed_error", request)) from exc
    return transcript
291
+
292
+
293
+ # ---------- Audio helpers ----------
294
+
295
+
296
def _get_audio_duration_seconds(audio_path: str) -> float:
    """Return the duration of an audio file in seconds.

    Computed as frame count divided by sample rate, both read from the
    file's metadata with ``soundfile.info``.
    """
    import soundfile as sf

    metadata = sf.info(audio_path)
    frame_count = float(metadata.frames)
    sample_rate = float(metadata.samplerate)
    return frame_count / sample_rate
301
 
302
 
303
  def _validate_reference_audio_duration(
 
308
  raise gr.Error(_get_i18n_text("reference_audio_too_long_error", request))
309
 
310
 
311
+ # ---------- Nano-vLLM HTTP API Client ----------
312
+
 
 
 
 
 
 
 
313
 
314
def _api_generate(payload: dict) -> str:
    """Call ``POST /generate`` and save the streamed MP3 to a temp file.

    Args:
        payload: JSON-serializable request body for the nanovllm server.

    Returns:
        Path of a new ``.mp3`` temporary file holding the response body.
        The caller owns the file.

    Raises:
        Whatever ``requests`` raises on transport failure or a non-2xx
        status, or any error hit while writing the file (in which case the
        partial file is removed first).
    """
    import requests

    # NOTE(review): the module-level default for NANOVLLM_API_BASE is a
    # plain-HTTP URL; confirm deployments override it, since payloads can
    # carry user text and reference audio.
    url = f"{NANOVLLM_API_BASE}/generate"
    logger.info(f"Calling {url} ...")

    # With stream=True the connection is only released once the body is
    # fully consumed or the response is closed; the context manager closes
    # it on every path, including HTTP errors and write failures.
    with requests.post(url, json=payload, stream=True, timeout=300) as resp:
        resp.raise_for_status()

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        try:
            with tmp:
                for chunk in resp.iter_content(chunk_size=64 * 1024):
                    tmp.write(chunk)
        except Exception:
            # Remove the partial download; a cleanup failure must not mask
            # the original error.
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
            raise
    return tmp.name
335
 
336
 
337
def _api_get_info() -> dict:
    """Fetch ``GET /info`` from the nanovllm server and return the parsed JSON.

    Raises whatever ``requests`` raises on transport failure or a non-2xx
    status.
    """
    import requests

    info_url = f"{NANOVLLM_API_BASE}/info"
    response = requests.get(info_url, timeout=10)
    response.raise_for_status()
    return response.json()
343
+
344
+
345
+ # ---------- Generation via HTTP API ----------
346
+
347
+
348
def _log_request_outcome(
    request_payload: dict, status: str, error: Optional[str] = None
) -> None:
    """Best-effort request-log write; a failure is reported but never raised."""
    record = {**request_payload, "status": status}
    if error is not None:
        record["error"] = error
    try:
        _append_request_log(record)
    except Exception as exc:
        logger.warning(f"Failed to append request log: {exc}")


def generate_tts_audio(
    text_input: str,
    control_instruction: str = "",
    reference_wav_path_input: Optional[str] = None,
    use_prompt_text: bool = False,
    prompt_text_input: str = "",
    cfg_value_input: float = 2.0,
    do_normalize: bool = True,
    denoise: bool = True,
    request: Optional[gr.Request] = None,
) -> str:
    """Synthesize speech through the remote ``/generate`` endpoint.

    Three modes are selected from the inputs:

    * reference audio + ``use_prompt_text``: audio and transcript are sent
      as the prompt, and the audio also as the reference;
    * reference audio only: the audio is sent as the reference;
    * no reference audio: only the text (optionally prefixed with the
      control instruction in parentheses) is sent.

    Args:
        text_input: Text to synthesize; must not be blank.
        control_instruction: Optional instruction prepended as ``(…)`` when
            transcript mode is off.
        reference_wav_path_input: Path to the reference audio, if any.
        use_prompt_text: Whether to send ``prompt_text_input`` as transcript.
        prompt_text_input: Transcript of the reference audio.
        cfg_value_input: Value forwarded as ``cfg_value``.
        do_normalize: Run ``normalize_text`` on the final text; on failure
            the un-normalized text is used.
        denoise: Denoise the reference audio remotely first; on failure the
            original audio is used.
        request: Gradio request, used for localized messages and logging.

    Returns:
        Path of the generated MP3 temporary file.

    Raises:
        gr.Error: For invalid input (original message preserved) or any
            backend failure (localized ``backend_retry_error`` message).
    """
    _begin_generation_request()
    # The outer try/finally guarantees the active-request counter is
    # decremented even if building the log payload below raises.
    try:
        has_ref = bool(reference_wav_path_input and reference_wav_path_input.strip())
        request_payload = {
            "event": "tts_request",
            "ui_language": _resolve_ui_language(request),
            "text": (text_input or "").strip(),
            "control_instruction": (control_instruction or "").strip(),
            "use_prompt_text": bool(use_prompt_text),
            "prompt_text": (prompt_text_input or "").strip(),
            "cfg_value": float(cfg_value_input),
            "do_normalize": bool(do_normalize),
            "denoise": bool(denoise),
            "has_reference_audio": has_ref,
        }
        if has_ref:
            try:
                request_payload["reference_audio_duration_seconds"] = round(
                    _get_audio_duration_seconds(reference_wav_path_input), 3
                )
            except Exception as exc:
                # Duration is informational only; record why it is missing.
                request_payload["reference_audio_duration_error"] = str(exc)

        try:
            text = (text_input or "").strip()
            if not text:
                raise ValueError("Please input text to synthesize.")

            control = (control_instruction or "").strip()
            final_text = f"({control}){text}" if control and not use_prompt_text else text

            if do_normalize:
                try:
                    original = final_text
                    final_text = normalize_text(final_text)
                    if final_text != original:
                        logger.info(f"Text normalized: '{original[:60]}' -> '{final_text[:60]}'")
                except Exception as exc:
                    logger.warning(f"Text normalization failed, using original: {exc}")

            prompt_text_clean = (prompt_text_input or "").strip()
            # Use the stripped check so a whitespace-only path is rejected
            # here instead of silently falling through to Voice Design mode.
            if use_prompt_text and not has_ref:
                raise ValueError("Ultimate Cloning Mode requires a reference audio clip.")
            if use_prompt_text and not prompt_text_clean:
                raise ValueError(
                    "Ultimate Cloning Mode requires a transcript. "
                    "Please wait for ASR or fill it in manually."
                )
            if not use_prompt_text:
                prompt_text_clean = ""

            if has_ref:
                _validate_reference_audio_duration(reference_wav_path_input, request)

            denoised_tmp = None
            api_payload: dict = {
                "target_text": final_text,
                "cfg_value": float(cfg_value_input),
            }

            try:
                if has_ref:
                    actual_ref_path = reference_wav_path_input
                    if denoise:
                        logger.info("Applying server-side denoise to reference audio ...")
                        try:
                            denoised_tmp = _api_denoise(reference_wav_path_input)
                            actual_ref_path = denoised_tmp
                            logger.info("Denoise completed.")
                        except Exception as exc:
                            # Denoising is optional: fall back to the raw clip.
                            logger.warning(f"Denoise failed, using original audio: {exc}")

                    ref_path = Path(actual_ref_path)
                    wav_b64 = base64.b64encode(ref_path.read_bytes()).decode("utf-8")
                    wav_fmt = ref_path.suffix.lstrip(".").lower() or "wav"

                    if use_prompt_text:
                        logger.info("[Ultimate Cloning] reference audio + transcript")
                        api_payload["prompt_wav_base64"] = wav_b64
                        api_payload["prompt_wav_format"] = wav_fmt
                        api_payload["prompt_text"] = prompt_text_clean
                    else:
                        logger.info("[Controllable Cloning] reference audio only")
                    api_payload["ref_audio_wav_base64"] = wav_b64
                    api_payload["ref_audio_wav_format"] = wav_fmt
                else:
                    logger.info(f"[Voice Design] control: {control[:50] if control else 'None'}")

                logger.info(f"Generating: '{final_text[:80]}...'")
                mp3_path = _api_generate(api_payload)
            finally:
                # The denoised copy is only needed to build the payload.
                if denoised_tmp and os.path.exists(denoised_tmp):
                    try:
                        os.unlink(denoised_tmp)
                    except OSError:
                        pass

            _log_request_outcome(request_payload, "success")
            return mp3_path

        except (ValueError, gr.Error) as exc:
            _log_request_outcome(request_payload, "rejected", str(exc))
            if isinstance(exc, gr.Error):
                raise
            raise gr.Error(str(exc)) from exc
        except Exception as exc:
            logger.exception("Generation failed")
            _log_request_outcome(request_payload, "error", str(exc))
            raise gr.Error(_get_i18n_text("backend_retry_error", request)) from exc
    finally:
        _end_generation_request()
481
 
482
 
483
+ # ---------- Inline i18n (en + zh-CN) ----------
484
+
485
 
486
  _USAGE_INSTRUCTIONS_EN = (
487
  "**VoxCPM2 — Three Modes of Speech Generation:**\n\n"
 
640
  )
641
 
642
 
643
+ # ---------- Theme & CSS ----------
 
 
 
 
 
 
 
 
644
 
645
  DEFAULT_TARGET_TEXT = (
646
  "VoxCPM2 is a creative multilingual TTS model from ModelBest, "
 
704
  font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
705
  )
706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
 
708
  # ---------- UI ----------
709
 
requirements.txt CHANGED
@@ -1,23 +1,7 @@
1
  gradio==6.0.0
2
- huggingface-hub
3
- funasr
4
- modelscope>=1.22.0
5
  numpy>=1.21.0
6
- torch==2.5.1
7
- torchaudio==2.5.1
8
- voxcpm
9
- transformers>=4.51.0
10
- addict
11
- simplejson
12
- sortedcontainers
13
- xxhash
14
- tqdm
15
- librosa
16
- pydantic
17
  soundfile>=0.13.1
18
- torchcodec
19
- packaging
20
- psutil
21
- ninja
22
- setuptools
23
- wheel
 
1
  gradio==6.0.0
2
+ inflect
 
 
3
  numpy>=1.21.0
4
+ regex
5
+ requests
 
 
 
 
 
 
 
 
 
6
  soundfile>=0.13.1
7
+ wetext