matthewliu0302 committed on
Commit
1a78361
·
1 Parent(s): 4236e3c

fine tune from v8

Browse files
Files changed (3) hide show
  1. README.md +1 -6
  2. miner.py +80 -1
  3. model.safetensors +2 -2
README.md CHANGED
@@ -17,12 +17,7 @@ tags:
17
 
18
  # vocence_miner_v8
19
 
20
- A naturalness-first prompt-driven TTS, built on top of `magma90909/vocence_miner_v7`. Two things distinguish this checkpoint:
21
-
22
- * **British English coverage.** Phrasings like *"A man with a British English accent"*, *"A Scottish woman, conversational"*, *"a Welsh narrator"* land on a real distribution rather than slipping back to neutral US English.
23
- * **Conversational subtlety.** Tuned for everyday delivery — *"speaking warmly"*, *"softly sad"*, *"with a touch of anger, controlled"* — rather than theatrical intensity. The model deliberately steps back when you don't ask for drama.
24
-
25
- 24 kHz mono WAV output, single forward call, no reference audio, no PEFT runtime. Everything ships in this repo.
26
 
27
  ## Generate
28
 
 
17
 
18
  # vocence_miner_v8
19
 
20
+ A naturalness-first prompt-driven TTS, built on top of `magma90909/vocence_miner_v8`.
 
 
 
 
 
21
 
22
  ## Generate
23
 
miner.py CHANGED
@@ -58,6 +58,85 @@ class _RuntimeOpts:
58
  flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
59
  )
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  class Miner:
63
  """Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""
@@ -97,7 +176,7 @@ class Miner:
97
  raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")
98
 
99
  def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
100
- prompt = self._truncate(instruction, self.opts.max_instruction_chars)
101
  body = self._truncate(text, self.opts.max_text_chars)
102
 
103
  wavs, sample_rate = self.model.generate_voice_design(
 
58
  flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
59
  )
60
 
61
+ _SPEED_ADVERBS = {
62
+ "slow": "slowly",
63
+ "normal": "at a normal pace",
64
+ "fast": "quickly",
65
+ }
66
+
67
+ _EMOTION_ADVERBS = {
68
+ "neutral": "in a neutral manner",
69
+ "happy": "happily",
70
+ "sad": "sadly",
71
+ "angry": "angrily",
72
+ "calm": "calmly",
73
+ "excited": "excitedly",
74
+ "serious": "seriously",
75
+ "fearful": "fearfully",
76
+ }
77
+
78
+ _ACCENT_NAMES = {
79
+ "us": "American",
80
+ "uk": "British",
81
+ "au": "Australian",
82
+ "in": "Indian",
83
+ "neutral": "neutral",
84
+ "other": "neutral",
85
+ }
86
+
87
+ _AGE_PHRASES = {
88
+ "child": "child",
89
+ "young_adult": "young adult",
90
+ "adult": "adult",
91
+ "senior": "senior",
92
+ }
93
+
94
+
95
+ def _structured_to_natural(instruction: str) -> str:
96
+ """Rewrite 'gender: X | pitch: Y | ...' as a natural-language sentence.
97
+
98
+ Pass-through for any input that is not a key:value pipe-separated string
99
+ (so warmup() and ad-hoc callers can still hand in plain prose)."""
100
+ if "|" not in instruction or ":" not in instruction:
101
+ return instruction
102
+
103
+ parts: dict[str, str] = {}
104
+ for chunk in instruction.split("|"):
105
+ if ":" not in chunk:
106
+ continue
107
+ k, v = chunk.split(":", 1)
108
+ parts[k.strip().lower()] = v.strip().lower()
109
+
110
+ if not any(k in parts for k in ("gender", "pitch", "speed", "age_group",
111
+ "emotion", "tone", "accent")):
112
+ return instruction
113
+
114
+ age = _AGE_PHRASES.get(parts.get("age_group", "adult"),
115
+ parts.get("age_group", "adult").replace("_", " "))
116
+ gender = parts.get("gender", "neutral")
117
+ tone = parts.get("tone", "casual")
118
+ pitch = parts.get("pitch", "mid")
119
+ speed_raw = parts.get("speed", "normal")
120
+ emotion_raw = parts.get("emotion", "neutral")
121
+ accent_raw = parts.get("accent", "neutral")
122
+
123
+ speed_adv = _SPEED_ADVERBS.get(speed_raw, f"at a {speed_raw} pace")
124
+ emotion_adv = _EMOTION_ADVERBS.get(emotion_raw, f"in a {emotion_raw} manner")
125
+ accent = _ACCENT_NAMES.get(accent_raw, accent_raw)
126
+
127
+ def _a(word: str) -> str:
128
+ return "an" if word and word[0].lower() in "aeiou" else "a"
129
+
130
+ if gender == "neutral":
131
+ speaker = f"{_a(age).capitalize()} {age} speaker"
132
+ else:
133
+ speaker = f"{_a(age).capitalize()} {age} {gender} speaker"
134
+
135
+ return (
136
+ f"{speaker} with {_a(tone)} {tone} tone speaks {speed_adv} "
137
+ f"and {emotion_adv} at {_a(pitch)} {pitch} pitch, "
138
+ f"with {_a(accent)} {accent} accent."
139
+ )
140
 
141
  class Miner:
142
  """Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""
 
176
  raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")
177
 
178
  def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
179
+ prompt = _structured_to_natural(instruction)
180
  body = self._truncate(text, self.opts.max_text_chars)
181
 
182
  wavs, sample_rate = self.model.generate_voice_design(
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62868f7d7e199a4279a944f1c412845a5eba8ea758ac3160c0a0230acf86b2f4
3
- size 3833402520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3c2016ed27450e0844af38eba8ecb62341b7ea505849fa4e0c3b671e162fa5a
3
+ size 3833402584