Umbaji001 committed on
Commit
ae2f129
·
verified ·
1 Parent(s): 570d526

Upload best MMS-TTS fine-tune for Adja (Eyaa-Tom)

Browse files
Files changed (6) hide show
  1. README.md +73 -0
  2. added_tokens.json +3 -0
  3. config.json +83 -0
  4. model.safetensors +3 -0
  5. tokenizer_config.json +33 -0
  6. vocab.json +48 -0
README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - adj
4
+ tags:
5
+ - text-to-speech
6
+ - mms-tts
7
+ - vits
8
+ - adj
9
+ - west-africa
10
+ - eyaa-tom
11
+ - finetuned
12
+ license: cc-by-nc-4.0
13
+ base_model: facebook/mms-tts-adj
14
+ pipeline_tag: text-to-speech
15
+ ---
16
+
17
+ # MMS-TTS Adja — Eyaa-Tom Fine-tuned
18
+
19
+ Fine-tuned version of [facebook/mms-tts-adj](https://huggingface.co/facebook/mms-tts-adj)
20
+ on the **Eyaa-Tom** dataset for **Adja** (`adj`).
21
+
22
+ > Adja (Aja-Gbe), ISO 639-3 `ajg`. Fine-tuned from facebook/mms-tts-adj, the closest available MMS checkpoint for this language.
23
+
24
+ ## Language Details
25
+ | Field | Value |
26
+ |-------|-------|
27
+ | Language | Adja |
28
+ | ISO 639-3 (MMS) | `adj` |
29
+ | ISO 639-3 (dataset) | `ajg` |
30
+ | Region | Togo/Benin |
31
+ | Family | Gbe (Niger-Congo) |
32
+ | Base model | [facebook/mms-tts-adj](https://huggingface.co/facebook/mms-tts-adj) |
33
+
34
+ ## Training Statistics
35
+ | Metric | Value |
36
+ |--------|-------|
37
+ | Training samples | 5 |
38
+ | Validation samples | 1 |
39
+ | Best validation mel-L1 | 3.3801 |
40
+ | Uploaded variant | `best` |
41
+
42
+ ## Usage
43
+
44
+ ```python
45
+ from transformers import VitsModel, VitsTokenizer
46
+ import torch, torchaudio
47
+
48
+ model = VitsModel.from_pretrained("Umbaji001/eyaa-tom-mms-tts-adj")
49
+ tokenizer = VitsTokenizer.from_pretrained("Umbaji001/eyaa-tom-mms-tts-adj")
50
+
51
+ inputs = tokenizer("your text here", return_tensors="pt")
52
+ with torch.no_grad():
53
+     waveform = model(**inputs).waveform[0]
54
+
55
+ torchaudio.save("output.wav", waveform.unsqueeze(0), model.config.sampling_rate)
56
+ ```
57
+
58
+ ## Training Details
59
+ - **Loss**: Mel-spectrogram L1 (a custom loss, used because the `transformers` VITS implementation does not support training out of the box)
60
+ - **Optimizer**: AdamW — lr=2e-4, betas=(0.8, 0.99)
61
+ - **Scheduler**: ExponentialLR γ=0.999
62
+ - **Epochs**: 6 | **Batch size**: 4 (effective 16 with gradient accumulation)
63
+
64
+ ## Citation
65
+ ```bibtex
66
+ @article{pratap2023mms,
67
+ title={Scaling Speech Technology to 1,000+ Languages},
68
+ author={Pratap, Vineel and others},
69
+ journal={arXiv preprint arXiv:2305.13516},
70
+ year={2023}
71
+ }
72
+ ```
73
+ *Fine-tuned: 2026-02-25 — Eyaa-Tom project*
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<unk>": 46
3
+ }
config.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "architectures": [
4
+ "VitsModel"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "depth_separable_channels": 2,
8
+ "depth_separable_num_layers": 3,
9
+ "dtype": "float32",
10
+ "duration_predictor_dropout": 0.5,
11
+ "duration_predictor_filter_channels": 256,
12
+ "duration_predictor_flow_bins": 10,
13
+ "duration_predictor_kernel_size": 3,
14
+ "duration_predictor_num_flows": 4,
15
+ "duration_predictor_tail_bound": 5.0,
16
+ "ffn_dim": 768,
17
+ "ffn_kernel_size": 3,
18
+ "flow_size": 192,
19
+ "hidden_act": "relu",
20
+ "hidden_dropout": 0.1,
21
+ "hidden_size": 192,
22
+ "initializer_range": 0.02,
23
+ "layer_norm_eps": 1e-05,
24
+ "layerdrop": 0.1,
25
+ "leaky_relu_slope": 0.1,
26
+ "model_type": "vits",
27
+ "noise_scale": 0.667,
28
+ "noise_scale_duration": 0.8,
29
+ "num_attention_heads": 2,
30
+ "num_hidden_layers": 6,
31
+ "num_speakers": 1,
32
+ "pad_token_id": null,
33
+ "posterior_encoder_num_wavenet_layers": 16,
34
+ "prior_encoder_num_flows": 4,
35
+ "prior_encoder_num_wavenet_layers": 4,
36
+ "resblock_dilation_sizes": [
37
+ [
38
+ 1,
39
+ 3,
40
+ 5
41
+ ],
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ],
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ]
52
+ ],
53
+ "resblock_kernel_sizes": [
54
+ 3,
55
+ 7,
56
+ 11
57
+ ],
58
+ "sampling_rate": 16000,
59
+ "speaker_embedding_size": 0,
60
+ "speaking_rate": 1.0,
61
+ "spectrogram_bins": 513,
62
+ "transformers_version": "5.0.0",
63
+ "upsample_initial_channel": 512,
64
+ "upsample_kernel_sizes": [
65
+ 16,
66
+ 16,
67
+ 4,
68
+ 4
69
+ ],
70
+ "upsample_rates": [
71
+ 8,
72
+ 8,
73
+ 2,
74
+ 2
75
+ ],
76
+ "use_bias": true,
77
+ "use_stochastic_duration_prediction": true,
78
+ "vocab_size": 46,
79
+ "wavenet_dilation_rate": 1,
80
+ "wavenet_dropout": 0.0,
81
+ "wavenet_kernel_size": 5,
82
+ "window_size": 4
83
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d9e521bee58d031589349c67dcb39b9793f787ba966e08504a204a8e8965c2d
3
+ size 145233656
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_blank": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "|",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "46": {
13
+ "content": "<unk>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "additional_special_tokens": null,
22
+ "backend": "custom",
23
+ "clean_up_tokenization_spaces": true,
24
+ "is_local": false,
25
+ "is_uroman": false,
26
+ "language": "adj",
27
+ "model_max_length": 1000000000000000019884624838656,
28
+ "normalize": true,
29
+ "pad_token": "|",
30
+ "phonemize": false,
31
+ "tokenizer_class": "VitsTokenizer",
32
+ "unk_token": "<unk>"
33
+ }
vocab.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ": 45,
3
+ "'": 10,
4
+ "-": 26,
5
+ "a": 1,
6
+ "b": 12,
7
+ "c": 20,
8
+ "d": 22,
9
+ "e": 2,
10
+ "f": 24,
11
+ "g": 19,
12
+ "h": 32,
13
+ "i": 8,
14
+ "j": 23,
15
+ "k": 7,
16
+ "l": 6,
17
+ "m": 5,
18
+ "n": 4,
19
+ "o": 13,
20
+ "p": 25,
21
+ "r": 18,
22
+ "s": 11,
23
+ "t": 21,
24
+ "u": 17,
25
+ "v": 39,
26
+ "w": 15,
27
+ "y": 14,
28
+ "z": 43,
29
+ "|": 0,
30
+ "à": 37,
31
+ "á": 31,
32
+ "è": 42,
33
+ "é": 33,
34
+ "ê": 38,
35
+ "ì": 34,
36
+ "í": 29,
37
+ "ò": 35,
38
+ "ó": 27,
39
+ "ô": 44,
40
+ "ù": 40,
41
+ "ú": 36,
42
+ "ŋ": 9,
43
+ "ɔ": 16,
44
+ "ɛ": 3,
45
+ "̀": 30,
46
+ "́": 28,
47
+ "̂": 41
48
+ }