matthewliu0302
/

grit_v1

@@ -17,12 +17,7 @@ tags:
 # vocence_miner_v8
-A naturalness-first prompt-driven TTS, built on top of `magma90909/vocence_miner_v7`. Two things distinguish this checkpoint:
-* **British English coverage.** Phrasings like *"A man with a British English accent"*, *"A Scottish woman, conversational"*, *"a Welsh narrator"* land on a real distribution rather than slipping back to neutral US English.
-* **Conversational subtlety.** Tuned for everyday delivery — *"speaking warmly"*, *"softly sad"*, *"with a touch of anger, controlled"* — rather than theatrical intensity. The model deliberately steps back when you don't ask for drama.
-24 kHz mono WAV output, single forward call, no reference audio, no PEFT runtime. Everything ships in this repo.
 ## Generate

 # vocence_miner_v8
+A naturalness-first prompt-driven TTS, built on top of `magma90909/vocence_miner_v8`.
 ## Generate

miner.py CHANGED Viewed

@@ -58,6 +58,85 @@ class _RuntimeOpts:
             flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
         )
 class Miner:
     """Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""
@@ -97,7 +176,7 @@ class Miner:
             raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")
     def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
-        prompt = self._truncate(instruction, self.opts.max_instruction_chars)
         body = self._truncate(text, self.opts.max_text_chars)
         wavs, sample_rate = self.model.generate_voice_design(

             flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
         )
+_SPEED_ADVERBS = {
+    "slow": "slowly",
+    "normal": "at a normal pace",
+    "fast": "quickly",
+}
+_EMOTION_ADVERBS = {
+    "neutral": "in a neutral manner",
+    "happy": "happily",
+    "sad": "sadly",
+    "angry": "angrily",
+    "calm": "calmly",
+    "excited": "excitedly",
+    "serious": "seriously",
+    "fearful": "fearfully",
+}
+_ACCENT_NAMES = {
+    "us": "American",
+    "uk": "British",
+    "au": "Australian",
+    "in": "Indian",
+    "neutral": "neutral",
+    "other": "neutral",
+}
+_AGE_PHRASES = {
+    "child": "child",
+    "young_adult": "young adult",
+    "adult": "adult",
+    "senior": "senior",
+}
+def _structured_to_natural(instruction: str) -> str:
+    """Rewrite 'gender: X | pitch: Y | ...' as a natural-language sentence.
+    Pass-through for any input that is not a key:value pipe-separated string
+    (so warmup() and ad-hoc callers can still hand in plain prose)."""
+    if "|" not in instruction or ":" not in instruction:
+        return instruction
+    parts: dict[str, str] = {}
+    for chunk in instruction.split("|"):
+        if ":" not in chunk:
+            continue
+        k, v = chunk.split(":", 1)
+        parts[k.strip().lower()] = v.strip().lower()
+    if not any(k in parts for k in ("gender", "pitch", "speed", "age_group",
+                                    "emotion", "tone", "accent")):
+        return instruction
+    age = _AGE_PHRASES.get(parts.get("age_group", "adult"),
+                           parts.get("age_group", "adult").replace("_", " "))
+    gender = parts.get("gender", "neutral")
+    tone = parts.get("tone", "casual")
+    pitch = parts.get("pitch", "mid")
+    speed_raw = parts.get("speed", "normal")
+    emotion_raw = parts.get("emotion", "neutral")
+    accent_raw = parts.get("accent", "neutral")
+    speed_adv = _SPEED_ADVERBS.get(speed_raw, f"at a {speed_raw} pace")
+    emotion_adv = _EMOTION_ADVERBS.get(emotion_raw, f"in a {emotion_raw} manner")
+    accent = _ACCENT_NAMES.get(accent_raw, accent_raw)
+    def _a(word: str) -> str:
+        return "an" if word and word[0].lower() in "aeiou" else "a"
+    if gender == "neutral":
+        speaker = f"{_a(age).capitalize()} {age} speaker"
+    else:
+        speaker = f"{_a(age).capitalize()} {age} {gender} speaker"
+    return (
+        f"{speaker} with {_a(tone)} {tone} tone speaks {speed_adv} "
+        f"and {emotion_adv} at {_a(pitch)} {pitch} pitch, "
+        f"with {_a(accent)} {accent} accent."
+    )
 class Miner:
     """Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""
             raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")
     def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
+        prompt = _structured_to_natural(instruction)
         body = self._truncate(text, self.opts.max_text_chars)
         wavs, sample_rate = self.model.generate_voice_design(

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:62868f7d7e199a4279a944f1c412845a5eba8ea758ac3160c0a0230acf86b2f4
-size 3833402520

 version https://git-lfs.github.com/spec/v1
+oid sha256:a3c2016ed27450e0844af38eba8ecb62341b7ea505849fa4e0c3b671e162fa5a
+size 3833402584