Instructions to use Praha-Labs/PrahaTTS-ML with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Chatterbox
How to use Praha-Labs/PrahaTTS-ML with Chatterbox:
```python
# pip install chatterbox-tts
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS

model = ChatterboxTTS.from_pretrained(device="cuda")

text = "Ezreal and Jinx teamed up with Ahri, Yasuo, and Teemo to take down the enemy's Nexus in an epic late-game pentakill."
wav = model.generate(text)
ta.save("test-1.wav", wav, model.sr)

# If you want to synthesize with a different voice, specify the audio prompt
AUDIO_PROMPT_PATH="YOUR_FILE.wav"
wav = model.generate(text, audio_prompt_path=AUDIO_PROMPT_PATH)
ta.save("test-2.wav", wav, model.sr)
```
- Notebooks
- Google Colab
- Kaggle
Upload folder using huggingface_hub
Browse files
- README.md +28 -0
- adapter_config.json +49 -0
- adapter_model.safetensors +3 -0
- config_indic.py +65 -0
- tokenizer_indic.json +0 -0
- tokenizer_indic.json.manifest.json +128 -0
README.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- ml
|
| 5 |
+
base_model: ResembleAI/chatterbox
|
| 6 |
+
tags:
|
| 7 |
+
- text-to-speech
|
| 8 |
+
- tts
|
| 9 |
+
- malayalam
|
| 10 |
+
- chatterbox
|
| 11 |
+
- lora
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# PrahaTTS-ML
|
| 15 |
+
|
| 16 |
+
Malayalam LoRA adapter for ResembleAI Chatterbox non-turbo TTS.
|
| 17 |
+
|
| 18 |
+
This repository contains the selected 17k-step adapter checkpoint, chosen by listening quality rather than lowest training loss.
|
| 19 |
+
|
| 20 |
+
## Contents
|
| 21 |
+
|
| 22 |
+
- `adapter_config.json`
|
| 23 |
+
- `adapter_model.safetensors`
|
| 24 |
+
- `tokenizer_indic.json`
|
| 25 |
+
- `tokenizer_indic.json.manifest.json`
|
| 26 |
+
- `config_indic.py`
|
| 27 |
+
|
| 28 |
+
This is not a merged full model. Use it with the base Chatterbox non-turbo model and the included Indic tokenizer.
|
adapter_config.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": {
|
| 4 |
+
"base_model_class": "T3",
|
| 5 |
+
"parent_library": "src.chatterbox_.models.t3.t3"
|
| 6 |
+
},
|
| 7 |
+
"base_model_name_or_path": null,
|
| 8 |
+
"bias": "none",
|
| 9 |
+
"corda_config": null,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 256,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": [
|
| 25 |
+
"text_emb",
|
| 26 |
+
"text_head"
|
| 27 |
+
],
|
| 28 |
+
"peft_type": "LORA",
|
| 29 |
+
"qalora_group_size": 16,
|
| 30 |
+
"r": 128,
|
| 31 |
+
"rank_pattern": {},
|
| 32 |
+
"revision": null,
|
| 33 |
+
"target_modules": [
|
| 34 |
+
"o_proj",
|
| 35 |
+
"down_proj",
|
| 36 |
+
"v_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"q_proj",
|
| 39 |
+
"gate_proj",
|
| 40 |
+
"up_proj",
|
| 41 |
+
"spkr_enc"
|
| 42 |
+
],
|
| 43 |
+
"target_parameters": null,
|
| 44 |
+
"task_type": null,
|
| 45 |
+
"trainable_token_indices": null,
|
| 46 |
+
"use_dora": false,
|
| 47 |
+
"use_qalora": false,
|
| 48 |
+
"use_rslora": false
|
| 49 |
+
}
|
adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48e25755212b840d2ba40a187126a7fbd49fd02f0b7c9de2a58b4e1b33bde1d8
|
| 3 |
+
size 383549136
|
config_indic.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class IndicTrainConfig:
    """Default settings for Indic (Malayalam by default) Chatterbox fine-tuning.

    Every field has a default, so ``IndicTrainConfig()`` yields the published
    configuration; override individual values via keyword arguments.

    All filesystem paths are relative and are meant to be resolved from the
    root of the training checkout (the directory that contains
    ``IndicFinetuning/``).  List-valued fields use ``default_factory`` so each
    instance owns its own list.
    """

    # Base Chatterbox model files downloaded by setup.py.
    model_dir: str = "./pretrained_models"

    # Dataset layout. Mixed-language metadata should include a language column:
    # filename|raw_text|normalized_text|language_id
    csv_path: str = "./IndicFinetuning/datasets/MalayalamDataset/metadata.csv"
    metadata_path: str = "./IndicFinetuning/datasets/metadata.json"
    wav_dir: str = "./IndicFinetuning/datasets/MalayalamDataset/wavs"
    preprocessed_dir: str = "./IndicFinetuning/datasets/MalayalamDataset/preprocess"
    output_dir: str = "./IndicFinetuning/outputs"
    tokenizer_path: str = "./IndicFinetuning/tokenizer/tokenizer_indic.json"

    # Model selection.
    is_turbo: bool = False
    is_lora: bool = True

    # Toggle languages here. For single-language Malayalam training, keep ["ml"].
    target_languages: List[str] = field(default_factory=lambda: ["ml"])
    default_language: str = "ml"
    # Zero-based index 3 matches the ``language_id`` position in the metadata
    # layout documented above; presumably ``None`` disables the lookup --
    # TODO confirm against the dataset loader.
    metadata_language_column: Optional[int] = 3
    add_language_tag: bool = True
    normalize_unicode: str = "NFC"

    # Dataset format.
    ljspeech: bool = True
    json_format: bool = False
    preprocess: bool = True

    # Inference smoke test.
    is_inference: bool = False
    inference_language: str = "ml"
    # Kept relative like every other path in this config (it lives under
    # ``output_dir``) so the default is not tied to one machine's /workspace.
    inference_prompt_path: str = "./IndicFinetuning/outputs/reference_trimmed.wav"
    inference_test_text: str = "പ്രണവേ എനിക്ക് നിന്നെ കാണാൻ really തോന്നുന്നു ഇന്ന് whole day mind full of thoughts ആയിരുന്നു നീ എവിടെയാ, എന്താ doing എന്ന് constantly ഓർമ്മ വരുന്നു just come back once, എനിക്ക് സംസാരിക്കണം നിന്നോട്"

    # Vocabulary. Update after building the Indic tokenizer.
    new_vocab_size: int = 2573

    # LoRA.
    lora_r: int = 128
    lora_alpha: int = 256
    lora_target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "spkr_enc",
        ]
    )
    turbo_lora_target_modules: List[str] = field(
        default_factory=lambda: ["c_attn", "c_proj", "c_fc", "spkr_enc"]
    )
    lora_modules_to_save: List[str] = field(
        default_factory=lambda: ["text_emb", "text_head"]
    )

    # Training.
    batch_size: int = 16
    grad_accum: int = 1
    learning_rate: float = 1e-4
    num_epochs: int = 10
    save_steps: int = 500
    save_total_limit: int = 5
    dataloader_num_workers: int = 8

    # Sequence constraints.
    start_text_token: int = 255
    stop_text_token: int = 0
    max_text_len: int = 256
    max_speech_len: int = 850
    prompt_duration: float = 3.0
|
tokenizer_indic.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_indic.json.manifest.json
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"languages": [
|
| 3 |
+
"ml"
|
| 4 |
+
],
|
| 5 |
+
"added_token_count": 119,
|
| 6 |
+
"final_vocab_size": 2573,
|
| 7 |
+
"added_tokens": [
|
| 8 |
+
"[ml]",
|
| 9 |
+
"ഀ",
|
| 10 |
+
"ഁ",
|
| 11 |
+
"ം",
|
| 12 |
+
"ഃ",
|
| 13 |
+
"ഄ",
|
| 14 |
+
"അ",
|
| 15 |
+
"ആ",
|
| 16 |
+
"ഇ",
|
| 17 |
+
"ഈ",
|
| 18 |
+
"ഉ",
|
| 19 |
+
"ഊ",
|
| 20 |
+
"ഋ",
|
| 21 |
+
"ഌ",
|
| 22 |
+
"എ",
|
| 23 |
+
"ഏ",
|
| 24 |
+
"ഐ",
|
| 25 |
+
"ഒ",
|
| 26 |
+
"ഓ",
|
| 27 |
+
"ഔ",
|
| 28 |
+
"ക",
|
| 29 |
+
"ഖ",
|
| 30 |
+
"ഗ",
|
| 31 |
+
"ഘ",
|
| 32 |
+
"ങ",
|
| 33 |
+
"ച",
|
| 34 |
+
"ഛ",
|
| 35 |
+
"ജ",
|
| 36 |
+
"ഝ",
|
| 37 |
+
"ഞ",
|
| 38 |
+
"ട",
|
| 39 |
+
"ഠ",
|
| 40 |
+
"ഡ",
|
| 41 |
+
"ഢ",
|
| 42 |
+
"ണ",
|
| 43 |
+
"ത",
|
| 44 |
+
"ഥ",
|
| 45 |
+
"ദ",
|
| 46 |
+
"ധ",
|
| 47 |
+
"ന",
|
| 48 |
+
"ഩ",
|
| 49 |
+
"പ",
|
| 50 |
+
"ഫ",
|
| 51 |
+
"ബ",
|
| 52 |
+
"ഭ",
|
| 53 |
+
"മ",
|
| 54 |
+
"യ",
|
| 55 |
+
"ര",
|
| 56 |
+
"റ",
|
| 57 |
+
"ല",
|
| 58 |
+
"ള",
|
| 59 |
+
"ഴ",
|
| 60 |
+
"വ",
|
| 61 |
+
"ശ",
|
| 62 |
+
"ഷ",
|
| 63 |
+
"സ",
|
| 64 |
+
"ഹ",
|
| 65 |
+
"ഺ",
|
| 66 |
+
"഻",
|
| 67 |
+
"഼",
|
| 68 |
+
"ഽ",
|
| 69 |
+
"ാ",
|
| 70 |
+
"ി",
|
| 71 |
+
"ീ",
|
| 72 |
+
"ു",
|
| 73 |
+
"ൂ",
|
| 74 |
+
"ൃ",
|
| 75 |
+
"ൄ",
|
| 76 |
+
"െ",
|
| 77 |
+
"േ",
|
| 78 |
+
"ൈ",
|
| 79 |
+
"ൊ",
|
| 80 |
+
"ോ",
|
| 81 |
+
"ൌ",
|
| 82 |
+
"്",
|
| 83 |
+
"ൎ",
|
| 84 |
+
"൏",
|
| 85 |
+
"ൔ",
|
| 86 |
+
"ൕ",
|
| 87 |
+
"ൖ",
|
| 88 |
+
"ൗ",
|
| 89 |
+
"൘",
|
| 90 |
+
"൙",
|
| 91 |
+
"൚",
|
| 92 |
+
"൛",
|
| 93 |
+
"൜",
|
| 94 |
+
"൝",
|
| 95 |
+
"൞",
|
| 96 |
+
"ൟ",
|
| 97 |
+
"ൠ",
|
| 98 |
+
"ൡ",
|
| 99 |
+
"ൢ",
|
| 100 |
+
"ൣ",
|
| 101 |
+
"൦",
|
| 102 |
+
"൧",
|
| 103 |
+
"൨",
|
| 104 |
+
"൩",
|
| 105 |
+
"൪",
|
| 106 |
+
"൫",
|
| 107 |
+
"൬",
|
| 108 |
+
"൭",
|
| 109 |
+
"൮",
|
| 110 |
+
"൯",
|
| 111 |
+
"൰",
|
| 112 |
+
"൱",
|
| 113 |
+
"൲",
|
| 114 |
+
"൳",
|
| 115 |
+
"൴",
|
| 116 |
+
"൵",
|
| 117 |
+
"൶",
|
| 118 |
+
"൷",
|
| 119 |
+
"൸",
|
| 120 |
+
"൹",
|
| 121 |
+
"ൺ",
|
| 122 |
+
"ൻ",
|
| 123 |
+
"ർ",
|
| 124 |
+
"ൽ",
|
| 125 |
+
"ൾ",
|
| 126 |
+
"ൿ"
|
| 127 |
+
]
|
| 128 |
+
}
|