matthewliu0302 committed on
Commit
09a0f63
·
1 Parent(s): 311edf9

adapt to new guideline

Browse files
Files changed (3) hide show
  1. README.md +0 -72
  2. miner.py +32 -31
  3. vocence_config.yaml +3 -0
README.md CHANGED
@@ -24,75 +24,3 @@ A naturalness-first prompt-driven TTS, built on top of `magma90909/vocence_miner
24
 
25
  24 kHz mono WAV output, single forward call, no reference audio, no PEFT runtime. Everything ships in this repo.
26
 
27
- ## Generate
28
-
29
- ```bash
30
- pip install qwen-tts transformers torch soundfile
31
- ```
32
-
33
- ```python
34
- from qwen_tts import Qwen3TTSModel
35
- import soundfile as sf
36
-
37
- m = Qwen3TTSModel.from_pretrained("magma90909/vocence_miner_v8")
38
-
39
- wavs, sr = m.generate_voice_design(
40
- text="The train to Edinburgh departs from platform four.",
41
- instruct="A man with a British English accent, calm and natural.",
42
- language="english",
43
- )
44
- sf.write("out.wav", wavs[0], sr)
45
- ```
46
-
47
- `demo.py` walks through three preset prompts.
48
-
49
- ## How to write `instruct`
50
-
51
- The model responds best to **subtle, conversational** language — not intensifiers like *"intensely sad"* or *"nearly shouting"*. Stack these elements freely:
52
-
53
- | Layer | Phrasings |
54
- |-------|-----------|
55
- | Accent / region | *British English*, *Scottish*, *Welsh*, *Northern Irish*, *Irish*, *unspecified* |
56
- | Gender | *a man*, *a woman*, *a British woman* |
57
- | Mood | *speaking warmly*, *softly sad*, *quietly pleased*, *with a touch of anger* |
58
- | Persona | *bedtime storyteller, soft and warm*; *news anchor, professional and neutral*; *meditation guide, soft and serene* |
59
- | Pace | *unhurried*, *brisk steady*, *naturally measured* |
60
-
61
- Some example prompts that work well:
62
-
63
- ```
64
- A British man speaks calmly and naturally.
65
- A woman with a Scottish accent, in an everyday speaking tone.
66
- A man, softly sad, calm and unhurried.
67
- A British news anchor, professional and neutral, at a brisk steady pace.
68
- A clear, neutral voice reading the sentence.
69
- ```
70
-
71
- ## Best-fit and not-fit
72
-
73
- **Best at:**
74
- * Natural, everyday English — both US and UK
75
- * Bedtime storyteller / news anchor / meditation guide style reads
76
- * Conversational sadness, warmth, mild anger, gentle pleasure
77
-
78
- **Less suited for:**
79
- * Theatrical / caricatured delivery (loud anger, shouted joy, dramatic sadness)
80
- * Extreme intensifier prompts ("nearly shouting", "intensely sad") — the model intentionally tones these down
81
- * Languages other than English
82
-
83
- CC BY-NC-SA 4.0 — research and non-commercial use only.
84
-
85
- ## Files
86
-
87
- ```
88
- model.safetensors # merged Talker weights (3.6 GB)
89
- speech_tokenizer/ # Qwen3 12 Hz audio codec (~650 MB)
90
- tokenizer.json + ... # text tokenizer
91
- config.json + ... # model configs
92
- miner.py # Vocence engine
93
- chute_config.yml # Chutes build (TEE / pro_6000)
94
- vocence_config.yaml # runtime knobs
95
- demo.py # quick smoke test
96
- ```
97
-
98
- The Vocence files make this repo deployable on **Bittensor SN78 (Vocence)** via the canonical Vocence/Chutes wrapper without modification.
 
24
 
25
  24 kHz mono WAV output, single forward call, no reference audio, no PEFT runtime. Everything ships in this repo.
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
miner.py CHANGED
@@ -8,7 +8,8 @@ snapshot and then drives it through the contract:
8
  generate_wav(instruction: str, text: str) -> tuple[np.ndarray, int]
9
 
10
  All weights, the audio codec, and the tokenizer ship together in the snapshot —
11
- nothing is fetched at runtime.
 
12
  """
13
  from __future__ import annotations
14
 
@@ -18,6 +19,7 @@ from pathlib import Path
18
  from typing import Any
19
 
20
  import numpy as np
 
21
 
22
 
23
  _REPO_REQUIRED_FILE = "config.json"
@@ -30,29 +32,22 @@ class _RuntimeOpts:
30
 
31
  language: str = "English"
32
  sample_rate: int = 24000
33
- max_instruction_chars: int = 600
34
- max_text_chars: int = 2000
35
  device_pref: str = "cuda"
36
  dtype_pref: str = "bfloat16"
37
  flash_attention_2: bool = False
38
 
39
  @classmethod
40
- def from_repo(cls, repo: Path) -> "_RuntimeOpts":
41
- cfg_path = repo / _RUNTIME_CONFIG_FILE
42
- if not cfg_path.is_file():
43
- return cls()
44
- from yaml import safe_load
45
-
46
- with cfg_path.open("r", encoding="utf-8") as fh:
47
- data = safe_load(fh) or {}
48
  runtime = data.get("runtime") or {}
49
  generation = data.get("generation") or {}
50
  limits = data.get("limits") or {}
51
  return cls(
52
- language=str(limits.get("default_language") or runtime.get("default_language") or "English"),
 
 
 
 
53
  sample_rate=int(generation.get("sample_rate", 24000)),
54
- max_instruction_chars=int(limits.get("max_instruction_chars", 600)),
55
- max_text_chars=int(limits.get("max_text_chars", 2000)),
56
  device_pref=str(runtime.get("device_preference", "cuda")).lower(),
57
  dtype_pref=str(runtime.get("dtype", "bfloat16")).lower(),
58
  flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
@@ -60,7 +55,7 @@ class _RuntimeOpts:
60
 
61
 
62
  class Miner:
63
- """Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""
64
 
65
  WARMUP_BUDGET_S = 180.0
66
 
@@ -70,8 +65,13 @@ class Miner:
70
  raise FileNotFoundError(
71
  f"Snapshot incomplete: {self.repo / _REPO_REQUIRED_FILE} not found"
72
  )
73
- self.opts = _RuntimeOpts.from_repo(self.repo)
74
- self.model = self._build_model()
 
 
 
 
 
75
 
76
  def __repr__(self) -> str:
77
  return f"<Miner repo={self.repo.name} language={self.opts.language!r}>"
@@ -94,15 +94,16 @@ class Miner:
94
  worker.start()
95
  worker.join(timeout=self.WARMUP_BUDGET_S)
96
  if not outcome["ok"]:
97
- raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")
 
 
98
 
99
  def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
100
- prompt = self._truncate(instruction, self.opts.max_instruction_chars)
101
- body = self._truncate(text, self.opts.max_text_chars)
102
-
103
  wavs, sample_rate = self.model.generate_voice_design(
104
- text=body,
105
- instruct=prompt,
106
  language=self.opts.language,
107
  )
108
  if not wavs or wavs[0] is None:
@@ -115,10 +116,6 @@ class Miner:
115
  # Internal #
116
  # ------------------------------------------------------------------ #
117
 
118
- @staticmethod
119
- def _truncate(value: str, limit: int) -> str:
120
- return value[:limit] if limit and limit > 0 else value
121
-
122
  @staticmethod
123
  def _coerce_mono_float32(arr: Any) -> np.ndarray:
124
  wave = np.asarray(arr, dtype=np.float32)
@@ -126,24 +123,28 @@ class Miner:
126
  wave = wave.mean(axis=1)
127
  return wave
128
 
129
- def _build_model(self):
130
  import torch
131
  from qwen_tts import Qwen3TTSModel
132
 
133
  cuda_available = bool(torch.cuda.is_available())
134
- device_map = "cuda:0" if (self.opts.device_pref == "cuda" and cuda_available) else "cpu"
 
 
135
  torch_dtype = (
136
  torch.bfloat16
137
  if (self.opts.dtype_pref == "bfloat16" and cuda_available)
138
  else torch.float32
139
  )
140
 
141
- attempt_order = ("flash_attention_2", "sdpa") if self.opts.flash_attention_2 else ("sdpa",)
 
 
142
  last_error: BaseException | None = None
143
  for attn in attempt_order:
144
  try:
145
  model = Qwen3TTSModel.from_pretrained(
146
- pretrained_model_name_or_path=str(self.repo),
147
  device_map=device_map,
148
  dtype=torch_dtype,
149
  attn_implementation=attn,
 
8
  generate_wav(instruction: str, text: str) -> tuple[np.ndarray, int]
9
 
10
  All weights, the audio codec, and the tokenizer ship together in the snapshot —
11
+ nothing is fetched at runtime. The HF cache is pre-populated by the wrapper, so
12
+ ``from_pretrained(model_name)`` resolves from disk without hitting the network.
13
  """
14
  from __future__ import annotations
15
 
 
19
  from typing import Any
20
 
21
  import numpy as np
22
+ import yaml
23
 
24
 
25
  _REPO_REQUIRED_FILE = "config.json"
 
32
 
33
  language: str = "English"
34
  sample_rate: int = 24000
 
 
35
  device_pref: str = "cuda"
36
  dtype_pref: str = "bfloat16"
37
  flash_attention_2: bool = False
38
 
39
  @classmethod
40
+ def from_config(cls, data: dict) -> "_RuntimeOpts":
 
 
 
 
 
 
 
41
  runtime = data.get("runtime") or {}
42
  generation = data.get("generation") or {}
43
  limits = data.get("limits") or {}
44
  return cls(
45
+ language=str(
46
+ limits.get("default_language")
47
+ or runtime.get("default_language")
48
+ or "English"
49
+ ),
50
  sample_rate=int(generation.get("sample_rate", 24000)),
 
 
51
  device_pref=str(runtime.get("device_preference", "cuda")).lower(),
52
  dtype_pref=str(runtime.get("dtype", "bfloat16")).lower(),
53
  flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
 
55
 
56
 
57
  class Miner:
58
+ """Loads merged Qwen3-TTS weights and serves the Vocence API."""
59
 
60
  WARMUP_BUDGET_S = 180.0
61
 
 
65
  raise FileNotFoundError(
66
  f"Snapshot incomplete: {self.repo / _REPO_REQUIRED_FILE} not found"
67
  )
68
+
69
+ with (self.repo / _RUNTIME_CONFIG_FILE).open("r", encoding="utf-8") as fh:
70
+ cfg = yaml.safe_load(fh) or {}
71
+ model_name = cfg["model_name"]
72
+
73
+ self.opts = _RuntimeOpts.from_config(cfg)
74
+ self.model = self._build_model(model_name)
75
 
76
  def __repr__(self) -> str:
77
  return f"<Miner repo={self.repo.name} language={self.opts.language!r}>"
 
94
  worker.start()
95
  worker.join(timeout=self.WARMUP_BUDGET_S)
96
  if not outcome["ok"]:
97
+ raise RuntimeError(
98
+ f"Miner warmup did not complete: {outcome['err'] or 'timeout'}"
99
+ )
100
 
101
  def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
102
+ # The validator's `instruction` and `text` are passed verbatim to the model,
103
+ # per MINER_GUIDE section 8b.C — no truncation / normalization / rewriting.
 
104
  wavs, sample_rate = self.model.generate_voice_design(
105
+ text=text,
106
+ instruct=instruction,
107
  language=self.opts.language,
108
  )
109
  if not wavs or wavs[0] is None:
 
116
  # Internal #
117
  # ------------------------------------------------------------------ #
118
 
 
 
 
 
119
  @staticmethod
120
  def _coerce_mono_float32(arr: Any) -> np.ndarray:
121
  wave = np.asarray(arr, dtype=np.float32)
 
123
  wave = wave.mean(axis=1)
124
  return wave
125
 
126
+ def _build_model(self, model_name):
127
  import torch
128
  from qwen_tts import Qwen3TTSModel
129
 
130
  cuda_available = bool(torch.cuda.is_available())
131
+ device_map = (
132
+ "cuda:0" if (self.opts.device_pref == "cuda" and cuda_available) else "cpu"
133
+ )
134
  torch_dtype = (
135
  torch.bfloat16
136
  if (self.opts.dtype_pref == "bfloat16" and cuda_available)
137
  else torch.float32
138
  )
139
 
140
+ attempt_order = (
141
+ ("flash_attention_2", "sdpa") if self.opts.flash_attention_2 else ("sdpa",)
142
+ )
143
  last_error: BaseException | None = None
144
  for attn in attempt_order:
145
  try:
146
  model = Qwen3TTSModel.from_pretrained(
147
+ model_name,
148
  device_map=device_map,
149
  dtype=torch_dtype,
150
  attn_implementation=attn,
vocence_config.yaml CHANGED
@@ -1,4 +1,7 @@
1
  # Miner + /health metadata. Weights live in this HF repo (no runtime model_id).
 
 
 
2
  runtime:
3
  adapter: "qwen3_tts_repo_snapshot"
4
  device_preference: "cuda"
 
1
  # Miner + /health metadata. Weights live in this HF repo (no runtime model_id).
2
+ # Required: must match the model_name committed on chain.
3
+ model_name: "matthewliu0302/grit_v4"
4
+
5
  runtime:
6
  adapter: "qwen3_tts_repo_snapshot"
7
  device_preference: "cuda"