omnivoice-taiwanese-hakka

Sleeping

Li Wei Chen committed on Apr 22

Commit

80cdf12

1 Parent(s): 14e913b

fix: correct G2P duration estimation and remove punctuation spaces

When G2P is enabled, estimate the audio duration from the original Chinese
text instead of the pinyin output. The tone-number digits in the pinyin are
weighted 3.5 each (versus 3.0 per CJK character), which inflates the
estimate. Also strip the spaces around punctuation in the G2P output.

Files changed (1) hide show

app.py +25 -6

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 import logging
 import os
 from dataclasses import dataclass
 from typing import Any
@@ -158,7 +159,10 @@ def apply_g2p(text: str, dialect: str) -> str:
     lang_group = DIALECT_TO_LANG_GROUP.get(dialect, "hak_sx")
     result = g2p(text, lang_group=lang_group, pronunciation_type="pinyin")
-    return " ".join(result.pronunciations).upper()
 def validate_inputs(
@@ -208,11 +212,9 @@ def synthesize(
         return None, startup_status()
     try:
-        input_text = text.strip()
         g2p_note = ""
-        if use_g2p:
-            input_text = apply_g2p(input_text, dialect)
-            g2p_note = f"；G2P 轉換：{input_text}"
         generation_config = RUNTIME.generation_config_cls(
             num_step=int(num_step),
@@ -226,6 +228,21 @@ def synthesize(
             ref_text=ref_text.strip(),
             preprocess_prompt=True,
         )
         generate_kwargs: dict[str, Any] = {
             "text": input_text,
             "voice_clone_prompt": voice_clone_prompt,
@@ -233,7 +250,9 @@ def synthesize(
             "generation_config": generation_config,
             "language": "zh",
         }
-        if speed != DEFAULT_SPEED:
             generate_kwargs["speed"] = float(speed)
         audio = RUNTIME.model.generate(**generate_kwargs)

 import logging
 import os
+import re
 from dataclasses import dataclass
 from typing import Any
     lang_group = DIALECT_TO_LANG_GROUP.get(dialect, "hak_sx")
     result = g2p(text, lang_group=lang_group, pronunciation_type="pinyin")
+    joined = " ".join(result.pronunciations).upper()
+    joined = re.sub(r"\s+([，。！？；：、…「」『』【】〔〕（）])", r"\1", joined)
+    joined = re.sub(r"([，。！？；：、…「」『』【】〔〕（）])\s+", r"\1", joined)
+    return joined
 def validate_inputs(
         return None, startup_status()
     try:
+        original_text = text.strip()
         g2p_note = ""
+        duration_override = None
         generation_config = RUNTIME.generation_config_cls(
             num_step=int(num_step),
             ref_text=ref_text.strip(),
             preprocess_prompt=True,
         )
+        if use_g2p:
+            input_text = apply_g2p(original_text, dialect)
+            g2p_note = f"；G2P 轉換：{input_text}"
+            # Estimate duration from original Chinese text to avoid weight inflation
+            # caused by tone number digits (weight 3.5) in the G2P output.
+            num_ref_tokens = voice_clone_prompt.ref_audio_tokens.size(-1)
+            frame_rate = RUNTIME.model.audio_tokenizer.config.frame_rate
+            est_frames = RUNTIME.model.duration_estimator.estimate_duration(
+                original_text, voice_clone_prompt.ref_text, num_ref_tokens
+            )
+            duration_override = est_frames / float(speed) / frame_rate
+        else:
+            input_text = original_text
         generate_kwargs: dict[str, Any] = {
             "text": input_text,
             "voice_clone_prompt": voice_clone_prompt,
             "generation_config": generation_config,
             "language": "zh",
         }
+        if duration_override is not None:
+            generate_kwargs["duration"] = duration_override
+        elif speed != DEFAULT_SPEED:
             generate_kwargs["speed"] = float(speed)
         audio = RUNTIME.model.generate(**generate_kwargs)