Adapt to Vocence update

Browse files

Files changed (5) hide show

__pycache__/miner.cpython-312.pyc +0 -0
chute_config.yml +1 -1
miner.py +29 -95
snac_model/{pytorch_model.bin → model.safetensors} +2 -2
vocence_config.yaml +2 -1

__pycache__/miner.cpython-312.pyc ADDED Viewed

Binary file (11.8 kB). View file

chute_config.yml CHANGED Viewed

@@ -4,7 +4,7 @@
 Image:
   from_base: parachutes/base-python:3.12.9
   run_command:
-    - pip install torch torchaudio transformers accelerate huggingface_hub pyyaml soundfile snac
   set_workdir: /app
 NodeSelector:

 Image:
   from_base: parachutes/base-python:3.12.9
   run_command:
+    - pip install torch torchaudio transformers accelerate huggingface_hub pyyaml soundfile snac safetensors
   set_workdir: /app
 NodeSelector:

miner.py CHANGED Viewed

@@ -4,6 +4,8 @@ from pathlib import Path
 import numpy as np
 import torch
 from snac import SNAC
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -21,8 +23,8 @@ BOS_ID = 128000
 TEXT_EOT_ID = 128009
-def build_prompt(tokenizer, description: str, text: str) -> str:
-    """Build formatted prompt for Maya1."""
     soh_token = tokenizer.decode([SOH_ID])
     eoh_token = tokenizer.decode([EOH_ID])
     soa_token = tokenizer.decode([SOA_ID])
@@ -30,7 +32,7 @@ def build_prompt(tokenizer, description: str, text: str) -> str:
     eot_token = tokenizer.decode([TEXT_EOT_ID])
     bos_token = tokenizer.bos_token
-    formatted_text = f'<description="{description}"> {text}'
     prompt = (
         soh_token + bos_token + formatted_text + eot_token +
@@ -85,89 +87,19 @@ def unpack_snac_from_7(snac_tokens: list) -> list:
     return [l1, l2, l3]
-def format_description(description: str) -> str:
-    parts = description.strip().split("|")
-    data = {}
-    # Parse into dict
-    for part in parts:
-        if ":" in part:
-            key, value = part.split(":", 1)
-            data[key.strip()] = value.strip()
-    # Build components
-    gender = data.get("gender", "")
-    age_group = data.get("age_group", "")
-    accent = data.get("accent", "")
-    pitch = data.get("pitch", "")
-    speed = data.get("speed", "")
-    emotion = data.get("emotion", "")
-    tone = data.get("tone", "")
-    # Convert to natural language
-    sentence1 = f"Realistic {gender} voice"
-    if age_group == "senior":
-        sentence1 += " in the 40s age"
-    elif age_group == "adult":
-        sentence1 += " in the 30s age"
-    elif age_group == "young_adult":
-        sentence1 += " in the 20s age"
-    else:
-        sentence1 += " in the 20s age"
-    if accent:
-        if accent.lower() == "us":
-            accent = "American"
-        elif accent.lower() == "uk":
-            accent = "British"
-        elif accent.lower() == "au":
-            accent = "Australian"
-        elif accent.lower() == "in":
-            accent = "Indian"
-        elif accent.lower() == "neutral":
-            accent = "Asian American"
-        elif accent.lower() == "other":
-            accent = "American"
-        sentence1 += f" with {accent.lower()} accent"
-    sentence2_parts = []
-    if pitch:
-        sentence2_parts.append(f"{pitch.capitalize()} pitch")
-    if emotion:
-        # Emotion: neutral, energetic, excited, sad, sarcastic, dry
-        if emotion.lower() == "happy":
-            emotion = "energetic"
-        elif emotion.lower() == "angry":
-            emotion = "sarcastic"
-        elif emotion.lower() == "calm":
-            emotion = "neutral"
-        elif emotion.lower() == "serious":
-            emotion = "dry"
-        elif emotion.lower() == "fearful":
-            emotion = "sad"
-        sentence2_parts.append(f"{emotion} timbre")
-    if speed:
-        if speed.lower() == "normal":
-            speed = "conversational"
-        sentence2_parts.append(f"{speed} pacing")
-    if tone:
-        # Timbre: `deep`, `warm`, `gravelly`, `smooth`, `raspy`, `nasally`, `throaty`, `harsh`
-        if tone.lower() == "cold":
-            tone = "harsh"
-        elif tone.lower() == "friendly":
-            tone = "warm"
-        elif tone.lower() == "formal":
-            tone = "smooth"
-        elif tone.lower() == "casual":
-            tone = "gravelly"
-        elif tone.lower() == "authoritative":
-            tone = "throaty"
-        sentence2_parts.append(f"{tone} tone")
-    sentence2 = ", ".join(sentence2_parts)
-    return sentence1 + ". " + sentence2 + "."
 class Miner:
@@ -177,34 +109,36 @@ class Miner:
         self._repo_path = Path(path_hf_repo).resolve()
         self._device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model = AutoModelForCausalLM.from_pretrained(
-            str(self._repo_path),
             torch_dtype=torch.bfloat16,
             device_map="auto",
             trust_remote_code=True,
         )
         self.tokenizer = AutoTokenizer.from_pretrained(
-            str(self._repo_path),
             trust_remote_code=True,
         )
-        snac_path = self._repo_path / "snac_model"
-        if snac_path.exists():
-            self.snac_model = SNAC.from_pretrained(str(snac_path)).eval()
-        else:
-            self.snac_model = SNAC.from_pretrained("snac_model").eval()
         if torch.cuda.is_available():
             self.snac_model = self.snac_model.to("cuda")
     def warmup(self) -> None:
         _ = self.generate_wav(
-            instruction="| gender: male | pitch: mid | speed: normal | age_group: adult | emotion: calm | tone: formal | accent: us",
             text="This is a warmup utterance for the voice engine.",
         )
     def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
-        description = format_description(instruction)
-        prompt = build_prompt(self.tokenizer, description, text)
         inputs = self.tokenizer(prompt, return_tensors="pt")
         if torch.cuda.is_available():

 import numpy as np
 import torch
+import yaml
+from safetensors.torch import load_file
 from snac import SNAC
 from transformers import AutoModelForCausalLM, AutoTokenizer
 TEXT_EOT_ID = 128009
+def build_prompt(tokenizer, instruction: str, text: str) -> str:
+    """Build Maya1 prompt: control tokens + verbatim instruction/text."""
     soh_token = tokenizer.decode([SOH_ID])
     eoh_token = tokenizer.decode([EOH_ID])
     soa_token = tokenizer.decode([SOA_ID])
     eot_token = tokenizer.decode([TEXT_EOT_ID])
     bos_token = tokenizer.bos_token
+    formatted_text = f'<description="{instruction}"> {text}'
     prompt = (
         soh_token + bos_token + formatted_text + eot_token +
     return [l1, l2, l3]
+def _load_snac(repo_path: Path) -> SNAC:
+    """Load SNAC decoder weights from repo-local safetensors (no .bin)."""
+    snac_dir = repo_path / "snac_model"
+    weights_path = snac_dir / "model.safetensors"
+    config_path = snac_dir / "config.json"
+    if not weights_path.is_file() or not config_path.is_file():
+        raise FileNotFoundError(
+            f"SNAC assets missing under {snac_dir}: need config.json and model.safetensors"
+        )
+    model = SNAC.from_config(str(config_path))
+    model.load_state_dict(load_file(weights_path, device="cpu"))
+    return model.eval()
 class Miner:
         self._repo_path = Path(path_hf_repo).resolve()
         self._device = "cuda" if torch.cuda.is_available() else "cpu"
+        with (self._repo_path / "vocence_config.yaml").open() as f:
+            config = yaml.safe_load(f) or {}
+        model_name = config["model_name"]
         self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
             torch_dtype=torch.bfloat16,
             device_map="auto",
             trust_remote_code=True,
         )
         self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
             trust_remote_code=True,
         )
+        self.snac_model = _load_snac(self._repo_path)
         if torch.cuda.is_available():
             self.snac_model = self.snac_model.to("cuda")
     def warmup(self) -> None:
         _ = self.generate_wav(
+            instruction=(
+                "A calm adult male speaker with an American accent, mid-pitched voice, "
+                "normal speaking pace, and a formal tone."
+            ),
             text="This is a warmup utterance for the voice engine.",
         )
     def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
+        prompt = build_prompt(self.tokenizer, instruction, text)
         inputs = self.tokenizer(prompt, return_tensors="pt")
         if torch.cuda.is_available():

snac_model/{pytorch_model.bin → model.safetensors} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4b8164cc6606bfa627f1a784734c1e539891518f1191ed9194fe1e3b9b4bff40
-size 79488254

 version https://git-lfs.github.com/spec/v1
+oid sha256:248cee7e77b5b8f7968515517371408963c26ad074800742531ca1faae5857a8
+size 79404024

vocence_config.yaml CHANGED Viewed

@@ -1,4 +1,5 @@
-# Optional PromptTTS settings read by your miner.py. Example values.
 runtime:
   adapter: "example"

+# Required: must match the model_name committed on chain.
+model_name: "ranupthestairs/vocence-tts"
 runtime:
   adapter: "example"