ESPnet
TensorBoard
audio
language-identification
qingzhengwang committed on
Commit
dccc7f3
·
1 Parent(s): a82a034

Update model

Browse files
README.md ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - language-identification
6
+ -
7
+ language: multilingual
8
+ datasets:
9
+ - voxlingua107
10
+ license: cc-by-4.0
11
+ ---
12
+
13
+ ## ESPnet2 model
14
+
15
+ ### ``
16
+
17
+ This model was trained using the `egs2/voxlingua107/lid1` recipe in [espnet](https://github.com/espnet/espnet/).
18
+
19
+ ### Demo: How to use in ESPnet2
20
+
21
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
22
+ if you haven't done that already.
23
+
24
+ ```bash
25
+ cd espnet
26
+
27
+ pip install -e .
28
+ cd egs2/voxlingua107/lid1
29
+ ./run.sh --skip_data_prep false --skip_train true --download_model <hf_model_repo_id>
30
+ ```
31
+
32
+
33
+
34
+ ## config
35
+
36
+ <details><summary>expand</summary>
37
+
38
+ ```
39
+ config: /work/nvme/bbjs/qwang20/espnet/egs2/lid_delta/lid1/conf/mms_1b_ecapa/mms_ecapa_bs3min_baseline.yaml
40
+ print_config: false
41
+ log_level: INFO
42
+ drop_last_iter: false
43
+ dry_run: false
44
+ iterator_type: category
45
+ valid_iterator_type: category
46
+ output_dir: exp_voxlingua107_raw/lid_mms_ecapa_bs3min_baseline_delta_raw
47
+ ngpu: 1
48
+ seed: 3702
49
+ num_workers: 8
50
+ num_att_plot: 0
51
+ dist_backend: nccl
52
+ dist_init_method: env://
53
+ dist_world_size: null
54
+ dist_rank: null
55
+ local_rank: 0
56
+ dist_master_addr: null
57
+ dist_master_port: null
58
+ dist_launcher: null
59
+ multiprocessing_distributed: false
60
+ unused_parameters: true
61
+ sharded_ddp: false
62
+ use_deepspeed: false
63
+ deepspeed_config: null
64
+ gradient_as_bucket_view: true
65
+ ddp_comm_hook: null
66
+ cudnn_enabled: true
67
+ cudnn_benchmark: true
68
+ cudnn_deterministic: false
69
+ use_tf32: false
70
+ collect_stats: false
71
+ write_collected_feats: false
72
+ max_epoch: 30
73
+ patience: null
74
+ val_scheduler_criterion:
75
+ - valid
76
+ - loss
77
+ early_stopping_criterion:
78
+ - valid
79
+ - loss
80
+ - min
81
+ best_model_criterion:
82
+ - - valid
83
+ - accuracy
84
+ - max
85
+ keep_nbest_models: 2
86
+ nbest_averaging_interval: 0
87
+ grad_clip: 9999
88
+ grad_clip_type: 2.0
89
+ grad_noise: false
90
+ accum_grad: 2
91
+ no_forward_run: false
92
+ resume: true
93
+ train_dtype: float32
94
+ use_amp: true
95
+ log_interval: 100
96
+ use_matplotlib: true
97
+ use_tensorboard: true
98
+ create_graph_in_tensorboard: false
99
+ use_wandb: true
100
+ wandb_project: lid
101
+ wandb_id: null
102
+ wandb_entity: qingzhew-carnegie-mellon-university
103
+ wandb_name: null
104
+ wandb_model_log_interval: -1
105
+ detect_anomaly: false
106
+ use_adapter: false
107
+ adapter: lora
108
+ save_strategy: all
109
+ adapter_conf: {}
110
+ pretrain_path: null
111
+ init_param: []
112
+ ignore_init_mismatch: false
113
+ freeze_param: []
114
+ num_iters_per_epoch: 1000
115
+ batch_size: 20
116
+ valid_batch_size: null
117
+ batch_bins: 2880000
118
+ valid_batch_bins: null
119
+ category_sample_size: 10
120
+ train_shape_file:
121
+ - exp_voxlingua107_raw/lid_stats_16k/train/speech_shape
122
+ valid_shape_file:
123
+ - exp_voxlingua107_raw/lid_stats_16k/valid/speech_shape
124
+ batch_type: catpow
125
+ upsampling_factor: 0.5
126
+ language_upsampling_factor: 0.5
127
+ dataset_upsampling_factor: 0.5
128
+ dataset_scaling_factor: 1.2
129
+ max_batch_size: 16
130
+ valid_batch_type: null
131
+ fold_length:
132
+ - 120000
133
+ sort_in_batch: descending
134
+ shuffle_within_batch: false
135
+ sort_batch: descending
136
+ multiple_iterator: false
137
+ chunk_length: 500
138
+ chunk_shift_ratio: 0.5
139
+ num_cache_chunks: 1024
140
+ chunk_excluded_key_prefixes: []
141
+ chunk_default_fs: null
142
+ chunk_max_abs_length: null
143
+ chunk_discard_short_samples: true
144
+ train_data_path_and_name_and_type:
145
+ - - dump/raw/train_voxlingua107/wav.scp
146
+ - speech
147
+ - sound
148
+ - - dump/raw/train_voxlingua107/utt2lang
149
+ - lid_labels
150
+ - text
151
+ valid_data_path_and_name_and_type:
152
+ - - dump/raw/dev_voxlingua107/wav.scp
153
+ - speech
154
+ - sound
155
+ - - dump/raw/dev_voxlingua107/utt2lang
156
+ - lid_labels
157
+ - text
158
+ multi_task_dataset: false
159
+ allow_variable_data_keys: false
160
+ max_cache_size: 0.0
161
+ max_cache_fd: 32
162
+ allow_multi_rates: false
163
+ valid_max_cache_size: null
164
+ exclude_weight_decay: false
165
+ exclude_weight_decay_conf: {}
166
+ optim: adam
167
+ optim_conf:
168
+ lr: 5.0e-06
169
+ betas:
170
+ - 0.9
171
+ - 0.98
172
+ scheduler: tristagelr
173
+ scheduler_conf:
174
+ max_steps: 30000
175
+ warmup_ratio: 0.3
176
+ hold_ratio: 0.2
177
+ decay_ratio: 0.5
178
+ init_lr_scale: 0.6
179
+ final_lr_scale: 0.1
180
+ init: null
181
+ use_preprocessor: true
182
+ input_size: null
183
+ target_duration: 3.0
184
+ lang2utt: dump/raw/train_voxlingua107/lang2utt
185
+ lang_num: 107
186
+ sample_rate: 16000
187
+ num_eval: 10
188
+ rir_scp: ''
189
+ model: espnet
190
+ model_conf:
191
+ extract_feats_in_collect_stats: false
192
+ frontend: s3prl
193
+ frontend_conf:
194
+ frontend_conf:
195
+ upstream: hf_wav2vec2_custom
196
+ path_or_url: facebook/mms-1b
197
+ download_dir: ./hub
198
+ multilayer_feature: true
199
+ specaug: null
200
+ specaug_conf: {}
201
+ normalize: utterance_mvn
202
+ normalize_conf:
203
+ norm_vars: false
204
+ encoder: ecapa_tdnn
205
+ encoder_conf:
206
+ model_scale: 8
207
+ ndim: 512
208
+ output_size: 1536
209
+ pooling: chn_attn_stat
210
+ pooling_conf: {}
211
+ projector: rawnet3
212
+ projector_conf:
213
+ output_size: 192
214
+ encoder_condition: rawnet3
215
+ encoder_condition_conf: {}
216
+ pooling_condition: chn_attn_stat
217
+ pooling_condition_conf: {}
218
+ projector_condition: rawnet3
219
+ projector_condition_conf: {}
220
+ preprocessor: lid
221
+ preprocessor_conf:
222
+ fix_duration: false
223
+ sample_rate: 16000
224
+ noise_apply_prob: 0.0
225
+ noise_info:
226
+ - - 1.0
227
+ - dump/raw/musan_speech.scp
228
+ - - 4
229
+ - 7
230
+ - - 13
231
+ - 20
232
+ - - 1.0
233
+ - dump/raw/musan_noise.scp
234
+ - - 1
235
+ - 1
236
+ - - 0
237
+ - 15
238
+ - - 1.0
239
+ - dump/raw/musan_music.scp
240
+ - - 1
241
+ - 1
242
+ - - 5
243
+ - 15
244
+ rir_apply_prob: 0.0
245
+ rir_scp: dump/raw/rirs.scp
246
+ loss: aamsoftmax_sc_topk
247
+ loss_conf:
248
+ margin: 0.5
249
+ scale: 30
250
+ K: 3
251
+ mp: 0.06
252
+ k_top: 5
253
+ required:
254
+ - output_dir
255
+ version: '202412'
256
+ distributed: false
257
+ ```
258
+
259
+ </details>
260
+
261
+
262
+
263
+ ### Citing ESPnet
264
+
265
+ ```bibtex
266
+ @inproceedings{watanabe2018espnet,
267
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
268
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
269
+ year={2018},
270
+ booktitle={Proceedings of Interspeech},
271
+ pages={2207--2211},
272
+ doi={10.21437/Interspeech.2018-1456},
273
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
274
+ }
275
+
276
+
277
+
278
+
279
+
280
+
281
+ ```
282
+
283
+ or arXiv:
284
+
285
+ ```bibtex
286
+ @misc{watanabe2018espnet,
287
+ title={ESPnet: End-to-End Speech Processing Toolkit},
288
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
289
+ year={2018},
290
+ eprint={1804.00015},
291
+ archivePrefix={arXiv},
292
+ primaryClass={cs.CL}
293
+ }
294
+ ```
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/config.yaml ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: /work/nvme/bbjs/qwang20/espnet/egs2/lid_delta/lid1/conf/mms_1b_ecapa/mms_ecapa_bs3min_baseline.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: category
7
+ valid_iterator_type: category
8
+ output_dir: exp_voxlingua107_raw/lid_mms_ecapa_bs3min_baseline_delta_raw
9
+ ngpu: 1
10
+ seed: 3702
11
+ num_workers: 8
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: true
30
+ cudnn_deterministic: false
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 30
35
+ patience: null
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - valid
45
+ - accuracy
46
+ - max
47
+ keep_nbest_models: 2
48
+ nbest_averaging_interval: 0
49
+ grad_clip: 9999
50
+ grad_clip_type: 2.0
51
+ grad_noise: false
52
+ accum_grad: 2
53
+ no_forward_run: false
54
+ resume: true
55
+ train_dtype: float32
56
+ use_amp: true
57
+ log_interval: 100
58
+ use_matplotlib: true
59
+ use_tensorboard: true
60
+ create_graph_in_tensorboard: false
61
+ use_wandb: true
62
+ wandb_project: lid
63
+ wandb_id: null
64
+ wandb_entity: qingzhew-carnegie-mellon-university
65
+ wandb_name: null
66
+ wandb_model_log_interval: -1
67
+ detect_anomaly: false
68
+ use_adapter: false
69
+ adapter: lora
70
+ save_strategy: all
71
+ adapter_conf: {}
72
+ pretrain_path: null
73
+ init_param: []
74
+ ignore_init_mismatch: false
75
+ freeze_param: []
76
+ num_iters_per_epoch: 1000
77
+ batch_size: 20
78
+ valid_batch_size: null
79
+ batch_bins: 2880000
80
+ valid_batch_bins: null
81
+ category_sample_size: 10
82
+ train_shape_file:
83
+ - exp_voxlingua107_raw/lid_stats_16k/train/speech_shape
84
+ valid_shape_file:
85
+ - exp_voxlingua107_raw/lid_stats_16k/valid/speech_shape
86
+ batch_type: catpow
87
+ upsampling_factor: 0.5
88
+ language_upsampling_factor: 0.5
89
+ dataset_upsampling_factor: 0.5
90
+ dataset_scaling_factor: 1.2
91
+ max_batch_size: 16
92
+ valid_batch_type: null
93
+ fold_length:
94
+ - 120000
95
+ sort_in_batch: descending
96
+ shuffle_within_batch: false
97
+ sort_batch: descending
98
+ multiple_iterator: false
99
+ chunk_length: 500
100
+ chunk_shift_ratio: 0.5
101
+ num_cache_chunks: 1024
102
+ chunk_excluded_key_prefixes: []
103
+ chunk_default_fs: null
104
+ chunk_max_abs_length: null
105
+ chunk_discard_short_samples: true
106
+ train_data_path_and_name_and_type:
107
+ - - dump/raw/train_voxlingua107/wav.scp
108
+ - speech
109
+ - sound
110
+ - - dump/raw/train_voxlingua107/utt2lang
111
+ - lid_labels
112
+ - text
113
+ valid_data_path_and_name_and_type:
114
+ - - dump/raw/dev_voxlingua107/wav.scp
115
+ - speech
116
+ - sound
117
+ - - dump/raw/dev_voxlingua107/utt2lang
118
+ - lid_labels
119
+ - text
120
+ multi_task_dataset: false
121
+ allow_variable_data_keys: false
122
+ max_cache_size: 0.0
123
+ max_cache_fd: 32
124
+ allow_multi_rates: false
125
+ valid_max_cache_size: null
126
+ exclude_weight_decay: false
127
+ exclude_weight_decay_conf: {}
128
+ optim: adam
129
+ optim_conf:
130
+ lr: 5.0e-06
131
+ betas:
132
+ - 0.9
133
+ - 0.98
134
+ scheduler: tristagelr
135
+ scheduler_conf:
136
+ max_steps: 30000
137
+ warmup_ratio: 0.3
138
+ hold_ratio: 0.2
139
+ decay_ratio: 0.5
140
+ init_lr_scale: 0.6
141
+ final_lr_scale: 0.1
142
+ init: null
143
+ use_preprocessor: true
144
+ input_size: null
145
+ target_duration: 3.0
146
+ lang2utt: dump/raw/train_voxlingua107/lang2utt
147
+ lang_num: 107
148
+ sample_rate: 16000
149
+ num_eval: 10
150
+ rir_scp: ''
151
+ model: espnet
152
+ model_conf:
153
+ extract_feats_in_collect_stats: false
154
+ frontend: s3prl
155
+ frontend_conf:
156
+ frontend_conf:
157
+ upstream: hf_wav2vec2_custom
158
+ path_or_url: facebook/mms-1b
159
+ download_dir: ./hub
160
+ multilayer_feature: true
161
+ specaug: null
162
+ specaug_conf: {}
163
+ normalize: utterance_mvn
164
+ normalize_conf:
165
+ norm_vars: false
166
+ encoder: ecapa_tdnn
167
+ encoder_conf:
168
+ model_scale: 8
169
+ ndim: 512
170
+ output_size: 1536
171
+ pooling: chn_attn_stat
172
+ pooling_conf: {}
173
+ projector: rawnet3
174
+ projector_conf:
175
+ output_size: 192
176
+ encoder_condition: rawnet3
177
+ encoder_condition_conf: {}
178
+ pooling_condition: chn_attn_stat
179
+ pooling_condition_conf: {}
180
+ projector_condition: rawnet3
181
+ projector_condition_conf: {}
182
+ preprocessor: lid
183
+ preprocessor_conf:
184
+ fix_duration: false
185
+ sample_rate: 16000
186
+ noise_apply_prob: 0.0
187
+ noise_info:
188
+ - - 1.0
189
+ - dump/raw/musan_speech.scp
190
+ - - 4
191
+ - 7
192
+ - - 13
193
+ - 20
194
+ - - 1.0
195
+ - dump/raw/musan_noise.scp
196
+ - - 1
197
+ - 1
198
+ - - 0
199
+ - 15
200
+ - - 1.0
201
+ - dump/raw/musan_music.scp
202
+ - - 1
203
+ - 1
204
+ - - 5
205
+ - 15
206
+ rir_apply_prob: 0.0
207
+ rir_scp: dump/raw/rirs.scp
208
+ loss: aamsoftmax_sc_topk
209
+ loss_conf:
210
+ margin: 0.5
211
+ scale: 30
212
+ K: 3
213
+ mp: 0.06
214
+ k_top: 5
215
+ required:
216
+ - output_dir
217
+ version: '202412'
218
+ distributed: false
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/accuracy.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/backward_time.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/class_loss.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/clip.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/forward_time.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/gpu_max_cached_mem_GB.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/grad_norm.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/iter_time.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/loss.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/loss_scale.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/optim0_lr0.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/optim_step_time.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/images/train_time.png ADDED
exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/valid.accuracy.ave.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78c98a730eb82b01208cf443d64ddb229a48318bfd8f0f77131d7e0bb9c49aaa
3
+ size 3887810526
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202503'
2
+ files:
3
+ model_file: exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/valid.accuracy.ave.pth
4
+ python: 3.11.13 (main, Jun 5 2025, 13:12:00) [GCC 11.2.0]
5
+ timestamp: 1750910893.727296
6
+ torch: 2.4.0+cu118
7
+ yaml_files:
8
+ train_config: exp_voxlingua107_raw/lid_mms_ecapa_baseline_raw/config.yaml