adapt to new guideline

Browse files

Files changed (3) hide show

README.md +0 -72
miner.py +32 -31
vocence_config.yaml +3 -0

README.md CHANGED Viewed

@@ -24,75 +24,3 @@ A naturalness-first prompt-driven TTS, built on top of `magma90909/vocence_miner
 24 kHz mono WAV output, single forward call, no reference audio, no PEFT runtime. Everything ships in this repo.
-## Generate
-```bash
-pip install qwen-tts transformers torch soundfile
-```
-```python
-from qwen_tts import Qwen3TTSModel
-import soundfile as sf
-m = Qwen3TTSModel.from_pretrained("magma90909/vocence_miner_v8")
-wavs, sr = m.generate_voice_design(
-    text="The train to Edinburgh departs from platform four.",
-    instruct="A man with a British English accent, calm and natural.",
-    language="english",
-)
-sf.write("out.wav", wavs[0], sr)
-```
-`demo.py` walks through three preset prompts.
-## How to write `instruct`
-The model responds best to **subtle, conversational** language — not intensifiers like *"intensely sad"* or *"nearly shouting"*. Stack these elements freely:
-| Layer | Phrasings |
-|-------|-----------|
-| Accent / region | *British English*, *Scottish*, *Welsh*, *Northern Irish*, *Irish*, *unspecified* |
-| Gender | *a man*, *a woman*, *a British woman* |
-| Mood | *speaking warmly*, *softly sad*, *quietly pleased*, *with a touch of anger* |
-| Persona | *bedtime storyteller, soft and warm*; *news anchor, professional and neutral*; *meditation guide, soft and serene* |
-| Pace | *unhurried*, *brisk steady*, *naturally measured* |
-Some example prompts that work well:
-```
-A British man speaks calmly and naturally.
-A woman with a Scottish accent, in an everyday speaking tone.
-A man, softly sad, calm and unhurried.
-A British news anchor, professional and neutral, at a brisk steady pace.
-A clear, neutral voice reading the sentence.
-```
-## Best-fit and not-fit
-**Best at:**
-* Natural, everyday English — both US and UK
-* Bedtime storyteller / news anchor / meditation guide style reads
-* Conversational sadness, warmth, mild anger, gentle pleasure
-**Less suited for:**
-* Theatrical / caricatured delivery (loud anger, shouted joy, dramatic sadness)
-* Extreme intensifier prompts ("nearly shouting", "intensely sad") — the model intentionally tones these down
-* Languages other than English
-CC BY-NC-SA 4.0 — research and non-commercial use only.
-## Files
-```
-model.safetensors            # merged Talker weights (3.6 GB)
-speech_tokenizer/            # Qwen3 12 Hz audio codec (~650 MB)
-tokenizer.json + ...         # text tokenizer
-config.json + ...            # model configs
-miner.py                     # Vocence engine
-chute_config.yml             # Chutes build (TEE / pro_6000)
-vocence_config.yaml          # runtime knobs
-demo.py                      # quick smoke test
-```
-The Vocence files make this repo deployable on **Bittensor SN78 (Vocence)** via the canonical Vocence/Chutes wrapper without modification.


24
25	24 kHz mono WAV output, single forward call, no reference audio, no PEFT runtime. Everything ships in this repo.
26

miner.py CHANGED Viewed

@@ -8,7 +8,8 @@ snapshot and then drives it through the contract:
     generate_wav(instruction: str, text: str) -> tuple[np.ndarray, int]
 All weights, the audio codec, and the tokenizer ship together in the snapshot —
-nothing is fetched at runtime.
 """
 from __future__ import annotations
@@ -18,6 +19,7 @@ from pathlib import Path
 from typing import Any
 import numpy as np
 _REPO_REQUIRED_FILE = "config.json"
@@ -30,29 +32,22 @@ class _RuntimeOpts:
     language: str = "English"
     sample_rate: int = 24000
-    max_instruction_chars: int = 600
-    max_text_chars: int = 2000
     device_pref: str = "cuda"
     dtype_pref: str = "bfloat16"
     flash_attention_2: bool = False
     @classmethod
-    def from_repo(cls, repo: Path) -> "_RuntimeOpts":
-        cfg_path = repo / _RUNTIME_CONFIG_FILE
-        if not cfg_path.is_file():
-            return cls()
-        from yaml import safe_load
-        with cfg_path.open("r", encoding="utf-8") as fh:
-            data = safe_load(fh) or {}
         runtime = data.get("runtime") or {}
         generation = data.get("generation") or {}
         limits = data.get("limits") or {}
         return cls(
-            language=str(limits.get("default_language") or runtime.get("default_language") or "English"),
             sample_rate=int(generation.get("sample_rate", 24000)),
-            max_instruction_chars=int(limits.get("max_instruction_chars", 600)),
-            max_text_chars=int(limits.get("max_text_chars", 2000)),
             device_pref=str(runtime.get("device_preference", "cuda")).lower(),
             dtype_pref=str(runtime.get("dtype", "bfloat16")).lower(),
             flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
@@ -60,7 +55,7 @@ class _RuntimeOpts:
 class Miner:
-    """Loads merged Qwen3-TTS weights from the snapshot and serves the Vocence API."""
     WARMUP_BUDGET_S = 180.0
@@ -70,8 +65,13 @@ class Miner:
             raise FileNotFoundError(
                 f"Snapshot incomplete: {self.repo / _REPO_REQUIRED_FILE} not found"
             )
-        self.opts = _RuntimeOpts.from_repo(self.repo)
-        self.model = self._build_model()
     def __repr__(self) -> str:
         return f"<Miner repo={self.repo.name} language={self.opts.language!r}>"
@@ -94,15 +94,16 @@ class Miner:
         worker.start()
         worker.join(timeout=self.WARMUP_BUDGET_S)
         if not outcome["ok"]:
-            raise RuntimeError(f"Miner warmup did not complete: {outcome['err'] or 'timeout'}")
     def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
-        prompt = self._truncate(instruction, self.opts.max_instruction_chars)
-        body = self._truncate(text, self.opts.max_text_chars)
         wavs, sample_rate = self.model.generate_voice_design(
-            text=body,
-            instruct=prompt,
             language=self.opts.language,
         )
         if not wavs or wavs[0] is None:
@@ -115,10 +116,6 @@ class Miner:
     # Internal                                                            #
     # ------------------------------------------------------------------ #
-    @staticmethod
-    def _truncate(value: str, limit: int) -> str:
-        return value[:limit] if limit and limit > 0 else value
     @staticmethod
     def _coerce_mono_float32(arr: Any) -> np.ndarray:
         wave = np.asarray(arr, dtype=np.float32)
@@ -126,24 +123,28 @@ class Miner:
             wave = wave.mean(axis=1)
         return wave
-    def _build_model(self):
         import torch
         from qwen_tts import Qwen3TTSModel
         cuda_available = bool(torch.cuda.is_available())
-        device_map = "cuda:0" if (self.opts.device_pref == "cuda" and cuda_available) else "cpu"
         torch_dtype = (
             torch.bfloat16
             if (self.opts.dtype_pref == "bfloat16" and cuda_available)
             else torch.float32
         )
-        attempt_order = ("flash_attention_2", "sdpa") if self.opts.flash_attention_2 else ("sdpa",)
         last_error: BaseException | None = None
         for attn in attempt_order:
             try:
                 model = Qwen3TTSModel.from_pretrained(
-                    pretrained_model_name_or_path=str(self.repo),
                     device_map=device_map,
                     dtype=torch_dtype,
                     attn_implementation=attn,

     generate_wav(instruction: str, text: str) -> tuple[np.ndarray, int]
 All weights, the audio codec, and the tokenizer ship together in the snapshot —
+nothing is fetched at runtime. The HF cache is pre-populated by the wrapper, so
+``from_pretrained(model_name)`` resolves from disk without hitting the network.
 """
 from __future__ import annotations
 from typing import Any
 import numpy as np
+import yaml
 _REPO_REQUIRED_FILE = "config.json"
     language: str = "English"
     sample_rate: int = 24000
     device_pref: str = "cuda"
     dtype_pref: str = "bfloat16"
     flash_attention_2: bool = False
     @classmethod
+    def from_config(cls, data: dict) -> "_RuntimeOpts":
         runtime = data.get("runtime") or {}
         generation = data.get("generation") or {}
         limits = data.get("limits") or {}
         return cls(
+            language=str(
+                limits.get("default_language")
+                or runtime.get("default_language")
+                or "English"
+            ),
             sample_rate=int(generation.get("sample_rate", 24000)),
             device_pref=str(runtime.get("device_preference", "cuda")).lower(),
             dtype_pref=str(runtime.get("dtype", "bfloat16")).lower(),
             flash_attention_2=bool(runtime.get("use_flash_attention_2", False)),
 class Miner:
+    """Loads merged Qwen3-TTS weights and serves the Vocence API."""
     WARMUP_BUDGET_S = 180.0
             raise FileNotFoundError(
                 f"Snapshot incomplete: {self.repo / _REPO_REQUIRED_FILE} not found"
             )
+        with (self.repo / _RUNTIME_CONFIG_FILE).open("r", encoding="utf-8") as fh:
+            cfg = yaml.safe_load(fh) or {}
+        model_name = cfg["model_name"]
+        self.opts = _RuntimeOpts.from_config(cfg)
+        self.model = self._build_model(model_name)
     def __repr__(self) -> str:
         return f"<Miner repo={self.repo.name} language={self.opts.language!r}>"
         worker.start()
         worker.join(timeout=self.WARMUP_BUDGET_S)
         if not outcome["ok"]:
+            raise RuntimeError(
+                f"Miner warmup did not complete: {outcome['err'] or 'timeout'}"
+            )
     def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
+        # The validator's `instruction` and `text` are passed verbatim to the model,
+        # per MINER_GUIDE section 8b.C — no truncation / normalization / rewriting.
         wavs, sample_rate = self.model.generate_voice_design(
+            text=text,
+            instruct=instruction,
             language=self.opts.language,
         )
         if not wavs or wavs[0] is None:
     # Internal                                                            #
     # ------------------------------------------------------------------ #
     @staticmethod
     def _coerce_mono_float32(arr: Any) -> np.ndarray:
         wave = np.asarray(arr, dtype=np.float32)
             wave = wave.mean(axis=1)
         return wave
+    def _build_model(self, model_name):
         import torch
         from qwen_tts import Qwen3TTSModel
         cuda_available = bool(torch.cuda.is_available())
+        device_map = (
+            "cuda:0" if (self.opts.device_pref == "cuda" and cuda_available) else "cpu"
+        )
         torch_dtype = (
             torch.bfloat16
             if (self.opts.dtype_pref == "bfloat16" and cuda_available)
             else torch.float32
         )
+        attempt_order = (
+            ("flash_attention_2", "sdpa") if self.opts.flash_attention_2 else ("sdpa",)
+        )
         last_error: BaseException | None = None
         for attn in attempt_order:
             try:
                 model = Qwen3TTSModel.from_pretrained(
+                    model_name,
                     device_map=device_map,
                     dtype=torch_dtype,
                     attn_implementation=attn,

vocence_config.yaml CHANGED Viewed

@@ -1,4 +1,7 @@
 # Miner + /health metadata. Weights live in this HF repo (no runtime model_id).
 runtime:
   adapter: "qwen3_tts_repo_snapshot"
   device_preference: "cuda"

 # Miner + /health metadata. Weights live in this HF repo and are loaded at runtime via the `model_name` key below.
+# Required: must match the model_name committed on chain.
+model_name: "matthewliu0302/grit_v4"
 runtime:
   adapter: "qwen3_tts_repo_snapshot"
   device_preference: "cuda"