matthewliu0302 committed on
Commit
cdb447a
·
1 Parent(s): b497527

adapt to vocence update

Browse files
__pycache__/miner.cpython-312.pyc ADDED
Binary file (11.8 kB). View file
 
chute_config.yml CHANGED
@@ -4,7 +4,7 @@
4
  Image:
5
  from_base: parachutes/base-python:3.12.9
6
  run_command:
7
- - pip install torch torchaudio transformers accelerate huggingface_hub pyyaml soundfile snac
8
  set_workdir: /app
9
 
10
  NodeSelector:
 
4
  Image:
5
  from_base: parachutes/base-python:3.12.9
6
  run_command:
7
+ - pip install torch torchaudio transformers accelerate huggingface_hub pyyaml soundfile snac safetensors
8
  set_workdir: /app
9
 
10
  NodeSelector:
miner.py CHANGED
@@ -4,6 +4,8 @@ from pathlib import Path
4
 
5
  import numpy as np
6
  import torch
 
 
7
  from snac import SNAC
8
  from transformers import AutoModelForCausalLM, AutoTokenizer
9
 
@@ -21,8 +23,8 @@ BOS_ID = 128000
21
  TEXT_EOT_ID = 128009
22
 
23
 
24
- def build_prompt(tokenizer, description: str, text: str) -> str:
25
- """Build formatted prompt for Maya1."""
26
  soh_token = tokenizer.decode([SOH_ID])
27
  eoh_token = tokenizer.decode([EOH_ID])
28
  soa_token = tokenizer.decode([SOA_ID])
@@ -30,7 +32,7 @@ def build_prompt(tokenizer, description: str, text: str) -> str:
30
  eot_token = tokenizer.decode([TEXT_EOT_ID])
31
  bos_token = tokenizer.bos_token
32
 
33
- formatted_text = f'<description="{description}"> {text}'
34
 
35
  prompt = (
36
  soh_token + bos_token + formatted_text + eot_token +
@@ -85,89 +87,19 @@ def unpack_snac_from_7(snac_tokens: list) -> list:
85
  return [l1, l2, l3]
86
 
87
 
88
- def format_description(description: str) -> str:
89
- parts = description.strip().split("|")
90
- data = {}
91
-
92
- # Parse into dict
93
- for part in parts:
94
- if ":" in part:
95
- key, value = part.split(":", 1)
96
- data[key.strip()] = value.strip()
97
 
98
- # Build components
99
- gender = data.get("gender", "")
100
- age_group = data.get("age_group", "")
101
- accent = data.get("accent", "")
102
- pitch = data.get("pitch", "")
103
- speed = data.get("speed", "")
104
- emotion = data.get("emotion", "")
105
- tone = data.get("tone", "")
106
-
107
- # Convert to natural language
108
- sentence1 = f"Realistic {gender} voice"
109
-
110
- if age_group == "senior":
111
- sentence1 += " in the 40s age"
112
- elif age_group == "adult":
113
- sentence1 += " in the 30s age"
114
- elif age_group == "young_adult":
115
- sentence1 += " in the 20s age"
116
- else:
117
- sentence1 += " in the 20s age"
118
-
119
- if accent:
120
- if accent.lower() == "us":
121
- accent = "American"
122
- elif accent.lower() == "uk":
123
- accent = "British"
124
- elif accent.lower() == "au":
125
- accent = "Australian"
126
- elif accent.lower() == "in":
127
- accent = "Indian"
128
- elif accent.lower() == "neutral":
129
- accent = "Asian American"
130
- elif accent.lower() == "other":
131
- accent = "American"
132
- sentence1 += f" with {accent.lower()} accent"
133
-
134
- sentence2_parts = []
135
- if pitch:
136
- sentence2_parts.append(f"{pitch.capitalize()} pitch")
137
- if emotion:
138
- # Emotion: neutral, energetic, excited, sad, sarcastic, dry
139
- if emotion.lower() == "happy":
140
- emotion = "energetic"
141
- elif emotion.lower() == "angry":
142
- emotion = "sarcastic"
143
- elif emotion.lower() == "calm":
144
- emotion = "neutral"
145
- elif emotion.lower() == "serious":
146
- emotion = "dry"
147
- elif emotion.lower() == "fearful":
148
- emotion = "sad"
149
- sentence2_parts.append(f"{emotion} timbre")
150
- if speed:
151
- if speed.lower() == "normal":
152
- speed = "conversational"
153
- sentence2_parts.append(f"{speed} pacing")
154
- if tone:
155
- # Timbre: `deep`, `warm`, `gravelly`, `smooth`, `raspy`, `nasally`, `throaty`, `harsh`
156
- if tone.lower() == "cold":
157
- tone = "harsh"
158
- elif tone.lower() == "friendly":
159
- tone = "warm"
160
- elif tone.lower() == "formal":
161
- tone = "smooth"
162
- elif tone.lower() == "casual":
163
- tone = "gravelly"
164
- elif tone.lower() == "authoritative":
165
- tone = "throaty"
166
- sentence2_parts.append(f"{tone} tone")
167
-
168
- sentence2 = ", ".join(sentence2_parts)
169
-
170
- return sentence1 + ". " + sentence2 + "."
171
 
172
 
173
  class Miner:
@@ -177,34 +109,36 @@ class Miner:
177
  self._repo_path = Path(path_hf_repo).resolve()
178
  self._device = "cuda" if torch.cuda.is_available() else "cpu"
179
 
 
 
 
 
180
  self.model = AutoModelForCausalLM.from_pretrained(
181
- str(self._repo_path),
182
  torch_dtype=torch.bfloat16,
183
  device_map="auto",
184
  trust_remote_code=True,
185
  )
186
  self.tokenizer = AutoTokenizer.from_pretrained(
187
- str(self._repo_path),
188
  trust_remote_code=True,
189
  )
190
 
191
- snac_path = self._repo_path / "snac_model"
192
- if snac_path.exists():
193
- self.snac_model = SNAC.from_pretrained(str(snac_path)).eval()
194
- else:
195
- self.snac_model = SNAC.from_pretrained("snac_model").eval()
196
  if torch.cuda.is_available():
197
  self.snac_model = self.snac_model.to("cuda")
198
 
199
  def warmup(self) -> None:
200
  _ = self.generate_wav(
201
- instruction="| gender: male | pitch: mid | speed: normal | age_group: adult | emotion: calm | tone: formal | accent: us",
 
 
 
202
  text="This is a warmup utterance for the voice engine.",
203
  )
204
 
205
  def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
206
- description = format_description(instruction)
207
- prompt = build_prompt(self.tokenizer, description, text)
208
 
209
  inputs = self.tokenizer(prompt, return_tensors="pt")
210
  if torch.cuda.is_available():
 
4
 
5
  import numpy as np
6
  import torch
7
+ import yaml
8
+ from safetensors.torch import load_file
9
  from snac import SNAC
10
  from transformers import AutoModelForCausalLM, AutoTokenizer
11
 
 
23
  TEXT_EOT_ID = 128009
24
 
25
 
26
+ def build_prompt(tokenizer, instruction: str, text: str) -> str:
27
+ """Build Maya1 prompt: control tokens + verbatim instruction/text."""
28
  soh_token = tokenizer.decode([SOH_ID])
29
  eoh_token = tokenizer.decode([EOH_ID])
30
  soa_token = tokenizer.decode([SOA_ID])
 
32
  eot_token = tokenizer.decode([TEXT_EOT_ID])
33
  bos_token = tokenizer.bos_token
34
 
35
+ formatted_text = f'<description="{instruction}"> {text}'
36
 
37
  prompt = (
38
  soh_token + bos_token + formatted_text + eot_token +
 
87
  return [l1, l2, l3]
88
 
89
 
90
+ def _load_snac(repo_path: Path) -> SNAC:
91
+ """Load SNAC decoder weights from repo-local safetensors (no .bin)."""
92
+ snac_dir = repo_path / "snac_model"
93
+ weights_path = snac_dir / "model.safetensors"
94
+ config_path = snac_dir / "config.json"
95
+ if not weights_path.is_file() or not config_path.is_file():
96
+ raise FileNotFoundError(
97
+ f"SNAC assets missing under {snac_dir}: need config.json and model.safetensors"
98
+ )
99
 
100
+ model = SNAC.from_config(str(config_path))
101
+ model.load_state_dict(load_file(weights_path, device="cpu"))
102
+ return model.eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  class Miner:
 
109
  self._repo_path = Path(path_hf_repo).resolve()
110
  self._device = "cuda" if torch.cuda.is_available() else "cpu"
111
 
112
+ with (self._repo_path / "vocence_config.yaml").open() as f:
113
+ config = yaml.safe_load(f) or {}
114
+ model_name = config["model_name"]
115
+
116
  self.model = AutoModelForCausalLM.from_pretrained(
117
+ model_name,
118
  torch_dtype=torch.bfloat16,
119
  device_map="auto",
120
  trust_remote_code=True,
121
  )
122
  self.tokenizer = AutoTokenizer.from_pretrained(
123
+ model_name,
124
  trust_remote_code=True,
125
  )
126
 
127
+ self.snac_model = _load_snac(self._repo_path)
 
 
 
 
128
  if torch.cuda.is_available():
129
  self.snac_model = self.snac_model.to("cuda")
130
 
131
  def warmup(self) -> None:
132
  _ = self.generate_wav(
133
+ instruction=(
134
+ "A calm adult male speaker with an American accent, mid-pitched voice, "
135
+ "normal speaking pace, and a formal tone."
136
+ ),
137
  text="This is a warmup utterance for the voice engine.",
138
  )
139
 
140
  def generate_wav(self, instruction: str, text: str) -> tuple[np.ndarray, int]:
141
+ prompt = build_prompt(self.tokenizer, instruction, text)
 
142
 
143
  inputs = self.tokenizer(prompt, return_tensors="pt")
144
  if torch.cuda.is_available():
snac_model/{pytorch_model.bin → model.safetensors} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b8164cc6606bfa627f1a784734c1e539891518f1191ed9194fe1e3b9b4bff40
3
- size 79488254
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:248cee7e77b5b8f7968515517371408963c26ad074800742531ca1faae5857a8
3
+ size 79404024
vocence_config.yaml CHANGED
@@ -1,4 +1,5 @@
1
- # Optional PromptTTS settings read by your miner.py. Example values.
 
2
 
3
  runtime:
4
  adapter: "example"
 
1
+ # Required: must match the model_name committed on chain.
2
+ model_name: "ranupthestairs/vocence-tts"
3
 
4
  runtime:
5
  adapter: "example"