Text-to-Speech
Transformers
Safetensors
Qwen3-TTS
English
text-generation
tts
prompttts
qwen3-tts
voice-design
vocence
british-english
uk-accent
Instructions to use matthewliu0302/grit_v1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use matthewliu0302/grit_v1 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-to-speech", model="matthewliu0302/grit_v1")

# Load model directly
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("matthewliu0302/grit_v1", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Commit ·
1a78361
1
Parent(s): 4236e3c
fine tune from v8
Browse files
- README.md +1 -6
- miner.py +80 -1
- model.safetensors +2 -2
README.md
CHANGED
|
@@ -17,12 +17,7 @@ tags:
|
|
| 17 |
|
| 18 |
# vocence_miner_v8
|
| 19 |
|
| 20 |
-
A naturalness-first prompt-driven TTS, built on top of `magma90909/
|
| 21 |
-
|
| 22 |
-
* **British English coverage.** Phrasings like *"A man with a British English accent"*, *"A Scottish woman, conversational"*, *"a Welsh narrator"* land on a real distribution rather than slipping back to neutral US English.
|
| 23 |
-
* **Conversational subtlety.** Tuned for everyday delivery — *"speaking warmly"*, *"softly sad"*, *"with a touch of anger, controlled"* — rather than theatrical intensity. The model deliberately steps back when you don't ask for drama.
|
| 24 |
-
|
| 25 |
-
24 kHz mono WAV output, single forward call, no reference audio, no PEFT runtime. Everything ships in this repo.
|
| 26 |
|
| 27 |
## Generate
|
| 28 |
|
|
|
|
| 17 |
|
| 18 |
# vocence_miner_v8
|
| 19 |
|
| 20 |
+
A naturalness-first prompt-driven TTS, built on top of `magma90909/vocence_miner_v8`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
## Generate
|
| 23 |
|
miner.py
CHANGED
|
@@ -58,6 +58,85 @@ class _RuntimeOpts:
|
|
| 58 |
flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
|
| 59 |
)
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
class Miner:
|
| 63 |
"""Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""
|
|
@@ -97,7 +176,7 @@ class Miner:
|
|
| 97 |
raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")
|
| 98 |
|
| 99 |
def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
|
| 100 |
-
prompt =
|
| 101 |
body = self._truncate(text, self.opts.max_text_chars)
|
| 102 |
|
| 103 |
wavs, sample_rate = self.model.generate_voice_design(
|
|
|
|
| 58 |
flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
|
| 59 |
)
|
| 60 |
|
| 61 |
+
_SPEED_ADVERBS = {
|
| 62 |
+
"slow": "slowly",
|
| 63 |
+
"normal": "at a normal pace",
|
| 64 |
+
"fast": "quickly",
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
_EMOTION_ADVERBS = {
|
| 68 |
+
"neutral": "in a neutral manner",
|
| 69 |
+
"happy": "happily",
|
| 70 |
+
"sad": "sadly",
|
| 71 |
+
"angry": "angrily",
|
| 72 |
+
"calm": "calmly",
|
| 73 |
+
"excited": "excitedly",
|
| 74 |
+
"serious": "seriously",
|
| 75 |
+
"fearful": "fearfully",
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
_ACCENT_NAMES = {
|
| 79 |
+
"us": "American",
|
| 80 |
+
"uk": "British",
|
| 81 |
+
"au": "Australian",
|
| 82 |
+
"in": "Indian",
|
| 83 |
+
"neutral": "neutral",
|
| 84 |
+
"other": "neutral",
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
_AGE_PHRASES = {
|
| 88 |
+
"child": "child",
|
| 89 |
+
"young_adult": "young adult",
|
| 90 |
+
"adult": "adult",
|
| 91 |
+
"senior": "senior",
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _structured_to_natural(instruction: str) -> str:
|
| 96 |
+
"""Rewrite 'gender: X | pitch: Y | ...' as a natural-language sentence.
|
| 97 |
+
|
| 98 |
+
Pass-through for any input that is not a key:value pipe-separated string
|
| 99 |
+
(so warmup() and ad-hoc callers can still hand in plain prose)."""
|
| 100 |
+
if "|" not in instruction or ":" not in instruction:
|
| 101 |
+
return instruction
|
| 102 |
+
|
| 103 |
+
parts: dict[str, str] = {}
|
| 104 |
+
for chunk in instruction.split("|"):
|
| 105 |
+
if ":" not in chunk:
|
| 106 |
+
continue
|
| 107 |
+
k, v = chunk.split(":", 1)
|
| 108 |
+
parts[k.strip().lower()] = v.strip().lower()
|
| 109 |
+
|
| 110 |
+
if not any(k in parts for k in ("gender", "pitch", "speed", "age_group",
|
| 111 |
+
"emotion", "tone", "accent")):
|
| 112 |
+
return instruction
|
| 113 |
+
|
| 114 |
+
age = _AGE_PHRASES.get(parts.get("age_group", "adult"),
|
| 115 |
+
parts.get("age_group", "adult").replace("_", " "))
|
| 116 |
+
gender = parts.get("gender", "neutral")
|
| 117 |
+
tone = parts.get("tone", "casual")
|
| 118 |
+
pitch = parts.get("pitch", "mid")
|
| 119 |
+
speed_raw = parts.get("speed", "normal")
|
| 120 |
+
emotion_raw = parts.get("emotion", "neutral")
|
| 121 |
+
accent_raw = parts.get("accent", "neutral")
|
| 122 |
+
|
| 123 |
+
speed_adv = _SPEED_ADVERBS.get(speed_raw, f"at a {speed_raw} pace")
|
| 124 |
+
emotion_adv = _EMOTION_ADVERBS.get(emotion_raw, f"in a {emotion_raw} manner")
|
| 125 |
+
accent = _ACCENT_NAMES.get(accent_raw, accent_raw)
|
| 126 |
+
|
| 127 |
+
def _a(word: str) -> str:
|
| 128 |
+
return "an" if word and word[0].lower() in "aeiou" else "a"
|
| 129 |
+
|
| 130 |
+
if gender == "neutral":
|
| 131 |
+
speaker = f"{_a(age).capitalize()} {age} speaker"
|
| 132 |
+
else:
|
| 133 |
+
speaker = f"{_a(age).capitalize()} {age} {gender} speaker"
|
| 134 |
+
|
| 135 |
+
return (
|
| 136 |
+
f"{speaker} with {_a(tone)} {tone} tone speaks {speed_adv} "
|
| 137 |
+
f"and {emotion_adv} at {_a(pitch)} {pitch} pitch, "
|
| 138 |
+
f"with {_a(accent)} {accent} accent."
|
| 139 |
+
)
|
| 140 |
|
| 141 |
class Miner:
|
| 142 |
"""Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""
|
|
|
|
| 176 |
raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")
|
| 177 |
|
| 178 |
def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
|
| 179 |
+
prompt = _structured_to_natural(instruction)
|
| 180 |
body = self._truncate(text, self.opts.max_text_chars)
|
| 181 |
|
| 182 |
wavs, sample_rate = self.model.generate_voice_design(
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3c2016ed27450e0844af38eba8ecb62341b7ea505849fa4e0c3b671e162fa5a
|
| 3 |
+
size 3833402584
|