nithinraok eustlb HF Staff committed on
Commit
593ce35
·
1 Parent(s): 6d590f7

trfms-integration (#39)

Browse files

- Upload processor (3a5abaf60ef0a67fae59b9e02dd3789290f08ce9)
- Upload ParakeetForTDT (c8f5c22ac90fb7d088227e148faa093e435e37c9)


Co-authored-by: Eustache Le Bihan <eustlb@users.noreply.huggingface.co>

README.md CHANGED
@@ -27,13 +27,11 @@ language:
27
  - sv
28
  - ru
29
  - uk
30
-
31
  pipeline_tag: automatic-speech-recognition
32
  library_name: nemo
33
  datasets:
34
  - nvidia/Granary
35
  - nemo/asr-set-3.0
36
- thumbnail: null
37
  tags:
38
  - automatic-speech-recognition
39
  - speech
@@ -50,12 +48,14 @@ widget:
50
  src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
51
  - example_title: Librispeech sample 2
52
  src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
 
 
53
  model-index:
54
  - name: parakeet-tdt-0.6b-v3
55
  results:
56
  - task:
57
- name: Automatic Speech Recognition
58
  type: automatic-speech-recognition
 
59
  dataset:
60
  name: AMI (Meetings test)
61
  type: edinburghcstr/ami
@@ -64,12 +64,12 @@ model-index:
64
  args:
65
  language: en
66
  metrics:
67
- - name: Test WER
68
- type: wer
69
  value: 11.31
 
70
  - task:
71
- name: Automatic Speech Recognition
72
  type: automatic-speech-recognition
 
73
  dataset:
74
  name: Earnings-22
75
  type: revdotcom/earnings22
@@ -77,12 +77,12 @@ model-index:
77
  args:
78
  language: en
79
  metrics:
80
- - name: Test WER
81
- type: wer
82
  value: 11.42
 
83
  - task:
84
- name: Automatic Speech Recognition
85
  type: automatic-speech-recognition
 
86
  dataset:
87
  name: GigaSpeech
88
  type: speechcolab/gigaspeech
@@ -90,12 +90,12 @@ model-index:
90
  args:
91
  language: en
92
  metrics:
93
- - name: Test WER
94
- type: wer
95
  value: 9.59
 
96
  - task:
97
- name: Automatic Speech Recognition
98
  type: automatic-speech-recognition
 
99
  dataset:
100
  name: LibriSpeech (clean)
101
  type: librispeech_asr
@@ -104,23 +104,12 @@ model-index:
104
  args:
105
  language: en
106
  metrics:
107
- - name: Test WER
108
- type: wer
109
  value: 1.93
110
- - task:
111
- name: Automatic Speech Recognition
112
- type: automatic-speech-recognition
113
- dataset:
114
- name: LibriSpeech (other)
115
- type: librispeech_asr
116
- config: other
117
- split: test
118
- args:
119
- language: en
120
- metrics:
121
- - name: Test WER
122
- type: wer
123
  value: 3.59
 
124
  - task:
125
  type: Automatic Speech Recognition
126
  name: automatic-speech-recognition
@@ -132,9 +121,9 @@ model-index:
132
  args:
133
  language: en
134
  metrics:
135
- - name: Test WER
136
- type: wer
137
  value: 3.97
 
138
  - task:
139
  type: Automatic Speech Recognition
140
  name: automatic-speech-recognition
@@ -146,12 +135,12 @@ model-index:
146
  args:
147
  language: en
148
  metrics:
149
- - name: Test WER
150
- type: wer
151
  value: 2.75
 
152
  - task:
153
- name: Automatic Speech Recognition
154
  type: automatic-speech-recognition
 
155
  dataset:
156
  name: Vox Populi
157
  type: facebook/voxpopuli
@@ -160,9 +149,9 @@ model-index:
160
  args:
161
  language: en
162
  metrics:
163
- - name: Test WER
164
- type: wer
165
  value: 6.14
 
166
  - task:
167
  type: Automatic Speech Recognition
168
  name: automatic-speech-recognition
@@ -174,9 +163,9 @@ model-index:
174
  args:
175
  language: bg
176
  metrics:
177
- - name: Test WER (Bg)
178
- type: wer
179
  value: 12.64
 
180
  - task:
181
  type: Automatic Speech Recognition
182
  name: automatic-speech-recognition
@@ -188,9 +177,9 @@ model-index:
188
  args:
189
  language: cs
190
  metrics:
191
- - name: Test WER (Cs)
192
- type: wer
193
  value: 11.01
 
194
  - task:
195
  type: Automatic Speech Recognition
196
  name: automatic-speech-recognition
@@ -202,9 +191,9 @@ model-index:
202
  args:
203
  language: da
204
  metrics:
205
- - name: Test WER (Da)
206
- type: wer
207
  value: 18.41
 
208
  - task:
209
  type: Automatic Speech Recognition
210
  name: automatic-speech-recognition
@@ -216,9 +205,9 @@ model-index:
216
  args:
217
  language: de
218
  metrics:
219
- - name: Test WER (De)
220
- type: wer
221
  value: 5.04
 
222
  - task:
223
  type: Automatic Speech Recognition
224
  name: automatic-speech-recognition
@@ -230,9 +219,9 @@ model-index:
230
  args:
231
  language: el
232
  metrics:
233
- - name: Test WER (El)
234
- type: wer
235
- value: 20.70
236
  - task:
237
  type: Automatic Speech Recognition
238
  name: automatic-speech-recognition
@@ -244,9 +233,9 @@ model-index:
244
  args:
245
  language: en
246
  metrics:
247
- - name: Test WER (En)
248
- type: wer
249
  value: 4.85
 
250
  - task:
251
  type: Automatic Speech Recognition
252
  name: automatic-speech-recognition
@@ -258,9 +247,9 @@ model-index:
258
  args:
259
  language: es
260
  metrics:
261
- - name: Test WER (Es)
262
- type: wer
263
  value: 3.45
 
264
  - task:
265
  type: Automatic Speech Recognition
266
  name: automatic-speech-recognition
@@ -272,9 +261,9 @@ model-index:
272
  args:
273
  language: et
274
  metrics:
275
- - name: Test WER (Et)
276
- type: wer
277
  value: 17.73
 
278
  - task:
279
  type: Automatic Speech Recognition
280
  name: automatic-speech-recognition
@@ -286,9 +275,9 @@ model-index:
286
  args:
287
  language: fi
288
  metrics:
289
- - name: Test WER (Fi)
290
- type: wer
291
  value: 13.21
 
292
  - task:
293
  type: Automatic Speech Recognition
294
  name: automatic-speech-recognition
@@ -300,9 +289,9 @@ model-index:
300
  args:
301
  language: fr
302
  metrics:
303
- - name: Test WER (Fr)
304
- type: wer
305
  value: 5.15
 
306
  - task:
307
  type: Automatic Speech Recognition
308
  name: automatic-speech-recognition
@@ -314,9 +303,9 @@ model-index:
314
  args:
315
  language: hr
316
  metrics:
317
- - name: Test WER (Hr)
318
- type: wer
319
  value: 12.46
 
320
  - task:
321
  type: Automatic Speech Recognition
322
  name: automatic-speech-recognition
@@ -328,9 +317,9 @@ model-index:
328
  args:
329
  language: hu
330
  metrics:
331
- - name: Test WER (Hu)
332
- type: wer
333
  value: 15.72
 
334
  - task:
335
  type: Automatic Speech Recognition
336
  name: automatic-speech-recognition
@@ -342,9 +331,9 @@ model-index:
342
  args:
343
  language: it
344
  metrics:
345
- - name: Test WER (It)
346
- type: wer
347
- value: 3.00
348
  - task:
349
  type: Automatic Speech Recognition
350
  name: automatic-speech-recognition
@@ -356,9 +345,9 @@ model-index:
356
  args:
357
  language: lt
358
  metrics:
359
- - name: Test WER (Lt)
360
- type: wer
361
  value: 20.35
 
362
  - task:
363
  type: Automatic Speech Recognition
364
  name: automatic-speech-recognition
@@ -370,9 +359,9 @@ model-index:
370
  args:
371
  language: lv
372
  metrics:
373
- - name: Test WER (Lv)
374
- type: wer
375
  value: 22.84
 
376
  - task:
377
  type: Automatic Speech Recognition
378
  name: automatic-speech-recognition
@@ -384,9 +373,9 @@ model-index:
384
  args:
385
  language: mt
386
  metrics:
387
- - name: Test WER (Mt)
388
- type: wer
389
  value: 20.46
 
390
  - task:
391
  type: Automatic Speech Recognition
392
  name: automatic-speech-recognition
@@ -398,9 +387,9 @@ model-index:
398
  args:
399
  language: nl
400
  metrics:
401
- - name: Test WER (Nl)
402
- type: wer
403
  value: 7.48
 
404
  - task:
405
  type: Automatic Speech Recognition
406
  name: automatic-speech-recognition
@@ -412,9 +401,9 @@ model-index:
412
  args:
413
  language: pl
414
  metrics:
415
- - name: Test WER (Pl)
416
- type: wer
417
  value: 7.31
 
418
  - task:
419
  type: Automatic Speech Recognition
420
  name: automatic-speech-recognition
@@ -426,9 +415,9 @@ model-index:
426
  args:
427
  language: pt
428
  metrics:
429
- - name: Test WER (Pt)
430
- type: wer
431
  value: 4.76
 
432
  - task:
433
  type: Automatic Speech Recognition
434
  name: automatic-speech-recognition
@@ -440,9 +429,9 @@ model-index:
440
  args:
441
  language: ro
442
  metrics:
443
- - name: Test WER (Ro)
444
- type: wer
445
  value: 12.44
 
446
  - task:
447
  type: Automatic Speech Recognition
448
  name: automatic-speech-recognition
@@ -454,9 +443,9 @@ model-index:
454
  args:
455
  language: ru
456
  metrics:
457
- - name: Test WER (Ru)
458
- type: wer
459
  value: 5.51
 
460
  - task:
461
  type: Automatic Speech Recognition
462
  name: automatic-speech-recognition
@@ -468,9 +457,9 @@ model-index:
468
  args:
469
  language: sk
470
  metrics:
471
- - name: Test WER (Sk)
472
- type: wer
473
  value: 8.82
 
474
  - task:
475
  type: Automatic Speech Recognition
476
  name: automatic-speech-recognition
@@ -482,9 +471,9 @@ model-index:
482
  args:
483
  language: sl
484
  metrics:
485
- - name: Test WER (Sl)
486
- type: wer
487
  value: 24.03
 
488
  - task:
489
  type: Automatic Speech Recognition
490
  name: automatic-speech-recognition
@@ -496,9 +485,9 @@ model-index:
496
  args:
497
  language: sv
498
  metrics:
499
- - name: Test WER (Sv)
500
- type: wer
501
  value: 15.08
 
502
  - task:
503
  type: Automatic Speech Recognition
504
  name: automatic-speech-recognition
@@ -510,10 +499,9 @@ model-index:
510
  args:
511
  language: uk
512
  metrics:
513
- - name: Test WER (Uk)
514
- type: wer
515
  value: 6.79
516
- # Multilingual LibriSpeech ASR Results
517
  - task:
518
  type: Automatic Speech Recognition
519
  name: automatic-speech-recognition
@@ -525,9 +513,9 @@ model-index:
525
  args:
526
  language: es
527
  metrics:
528
- - name: Test WER (Es)
529
- type: wer
530
  value: 4.39
 
531
  - task:
532
  type: Automatic Speech Recognition
533
  name: automatic-speech-recognition
@@ -539,9 +527,9 @@ model-index:
539
  args:
540
  language: fr
541
  metrics:
542
- - name: Test WER (Fr)
543
- type: wer
544
  value: 4.97
 
545
  - task:
546
  type: Automatic Speech Recognition
547
  name: automatic-speech-recognition
@@ -553,9 +541,9 @@ model-index:
553
  args:
554
  language: it
555
  metrics:
556
- - name: Test WER (It)
557
- type: wer
558
  value: 10.08
 
559
  - task:
560
  type: Automatic Speech Recognition
561
  name: automatic-speech-recognition
@@ -567,9 +555,9 @@ model-index:
567
  args:
568
  language: nl
569
  metrics:
570
- - name: Test WER (Nl)
571
- type: wer
572
  value: 12.78
 
573
  - task:
574
  type: Automatic Speech Recognition
575
  name: automatic-speech-recognition
@@ -581,9 +569,9 @@ model-index:
581
  args:
582
  language: pl
583
  metrics:
584
- - name: Test WER (Pl)
585
- type: wer
586
  value: 7.28
 
587
  - task:
588
  type: Automatic Speech Recognition
589
  name: automatic-speech-recognition
@@ -595,10 +583,9 @@ model-index:
595
  args:
596
  language: pt
597
  metrics:
598
- - name: Test WER (Pt)
599
- type: wer
600
- value: 7.50
601
- # CoVoST2 ASR Results
602
  - task:
603
  type: Automatic Speech Recognition
604
  name: automatic-speech-recognition
@@ -610,9 +597,9 @@ model-index:
610
  args:
611
  language: de
612
  metrics:
613
- - name: Test WER (De)
614
- type: wer
615
  value: 4.84
 
616
  - task:
617
  type: Automatic Speech Recognition
618
  name: automatic-speech-recognition
@@ -624,9 +611,9 @@ model-index:
624
  args:
625
  language: en
626
  metrics:
627
- - name: Test WER (En)
628
- type: wer
629
- value: 6.80
630
  - task:
631
  type: Automatic Speech Recognition
632
  name: automatic-speech-recognition
@@ -638,9 +625,9 @@ model-index:
638
  args:
639
  language: es
640
  metrics:
641
- - name: Test WER (Es)
642
- type: wer
643
  value: 3.41
 
644
  - task:
645
  type: Automatic Speech Recognition
646
  name: automatic-speech-recognition
@@ -652,9 +639,9 @@ model-index:
652
  args:
653
  language: et
654
  metrics:
655
- - name: Test WER (Et)
656
- type: wer
657
  value: 22.04
 
658
  - task:
659
  type: Automatic Speech Recognition
660
  name: automatic-speech-recognition
@@ -666,9 +653,9 @@ model-index:
666
  args:
667
  language: fr
668
  metrics:
669
- - name: Test WER (Fr)
670
- type: wer
671
  value: 6.05
 
672
  - task:
673
  type: Automatic Speech Recognition
674
  name: automatic-speech-recognition
@@ -680,9 +667,9 @@ model-index:
680
  args:
681
  language: it
682
  metrics:
683
- - name: Test WER (It)
684
- type: wer
685
  value: 3.69
 
686
  - task:
687
  type: Automatic Speech Recognition
688
  name: automatic-speech-recognition
@@ -694,9 +681,9 @@ model-index:
694
  args:
695
  language: lv
696
  metrics:
697
- - name: Test WER (Lv)
698
- type: wer
699
  value: 38.36
 
700
  - task:
701
  type: Automatic Speech Recognition
702
  name: automatic-speech-recognition
@@ -708,9 +695,9 @@ model-index:
708
  args:
709
  language: nl
710
  metrics:
711
- - name: Test WER (Nl)
712
- type: wer
713
- value: 6.50
714
  - task:
715
  type: Automatic Speech Recognition
716
  name: automatic-speech-recognition
@@ -722,9 +709,9 @@ model-index:
722
  args:
723
  language: pt
724
  metrics:
725
- - name: Test WER (Pt)
726
- type: wer
727
  value: 3.96
 
728
  - task:
729
  type: Automatic Speech Recognition
730
  name: automatic-speech-recognition
@@ -736,9 +723,9 @@ model-index:
736
  args:
737
  language: ru
738
  metrics:
739
- - name: Test WER (Ru)
740
- type: wer
741
- value: 3.00
742
  - task:
743
  type: Automatic Speech Recognition
744
  name: automatic-speech-recognition
@@ -750,9 +737,9 @@ model-index:
750
  args:
751
  language: sl
752
  metrics:
753
- - name: Test WER (Sl)
754
- type: wer
755
- value: 31.80
756
  - task:
757
  type: Automatic Speech Recognition
758
  name: automatic-speech-recognition
@@ -764,9 +751,9 @@ model-index:
764
  args:
765
  language: sv
766
  metrics:
767
- - name: Test WER (Sv)
768
- type: wer
769
  value: 20.16
 
770
  - task:
771
  type: Automatic Speech Recognition
772
  name: automatic-speech-recognition
@@ -778,11 +765,9 @@ model-index:
778
  args:
779
  language: uk
780
  metrics:
781
- - name: Test WER (Uk)
782
- type: wer
783
- value: 5.10
784
- metrics:
785
- - wer
786
  ---
787
 
788
  # **<span style="color:#76b900;">🦜 parakeet-tdt-0.6b-v3: Multilingual Speech-to-Text Model</span>**
 
27
  - sv
28
  - ru
29
  - uk
 
30
  pipeline_tag: automatic-speech-recognition
31
  library_name: nemo
32
  datasets:
33
  - nvidia/Granary
34
  - nemo/asr-set-3.0
 
35
  tags:
36
  - automatic-speech-recognition
37
  - speech
 
48
  src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
49
  - example_title: Librispeech sample 2
50
  src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
51
+ metrics:
52
+ - wer
53
  model-index:
54
  - name: parakeet-tdt-0.6b-v3
55
  results:
56
  - task:
 
57
  type: automatic-speech-recognition
58
+ name: Automatic Speech Recognition
59
  dataset:
60
  name: AMI (Meetings test)
61
  type: edinburghcstr/ami
 
64
  args:
65
  language: en
66
  metrics:
67
+ - type: wer
 
68
  value: 11.31
69
+ name: Test WER
70
  - task:
 
71
  type: automatic-speech-recognition
72
+ name: Automatic Speech Recognition
73
  dataset:
74
  name: Earnings-22
75
  type: revdotcom/earnings22
 
77
  args:
78
  language: en
79
  metrics:
80
+ - type: wer
 
81
  value: 11.42
82
+ name: Test WER
83
  - task:
 
84
  type: automatic-speech-recognition
85
+ name: Automatic Speech Recognition
86
  dataset:
87
  name: GigaSpeech
88
  type: speechcolab/gigaspeech
 
90
  args:
91
  language: en
92
  metrics:
93
+ - type: wer
 
94
  value: 9.59
95
+ name: Test WER
96
  - task:
 
97
  type: automatic-speech-recognition
98
+ name: Automatic Speech Recognition
99
  dataset:
100
  name: LibriSpeech (clean)
101
  type: librispeech_asr
 
104
  args:
105
  language: en
106
  metrics:
107
+ - type: wer
 
108
  value: 1.93
109
+ name: Test WER
110
+ - type: wer
 
 
 
 
 
 
 
 
 
 
 
111
  value: 3.59
112
+ name: Test WER
113
  - task:
114
  type: Automatic Speech Recognition
115
  name: automatic-speech-recognition
 
121
  args:
122
  language: en
123
  metrics:
124
+ - type: wer
 
125
  value: 3.97
126
+ name: Test WER
127
  - task:
128
  type: Automatic Speech Recognition
129
  name: automatic-speech-recognition
 
135
  args:
136
  language: en
137
  metrics:
138
+ - type: wer
 
139
  value: 2.75
140
+ name: Test WER
141
  - task:
 
142
  type: automatic-speech-recognition
143
+ name: Automatic Speech Recognition
144
  dataset:
145
  name: Vox Populi
146
  type: facebook/voxpopuli
 
149
  args:
150
  language: en
151
  metrics:
152
+ - type: wer
 
153
  value: 6.14
154
+ name: Test WER
155
  - task:
156
  type: Automatic Speech Recognition
157
  name: automatic-speech-recognition
 
163
  args:
164
  language: bg
165
  metrics:
166
+ - type: wer
 
167
  value: 12.64
168
+ name: Test WER (Bg)
169
  - task:
170
  type: Automatic Speech Recognition
171
  name: automatic-speech-recognition
 
177
  args:
178
  language: cs
179
  metrics:
180
+ - type: wer
 
181
  value: 11.01
182
+ name: Test WER (Cs)
183
  - task:
184
  type: Automatic Speech Recognition
185
  name: automatic-speech-recognition
 
191
  args:
192
  language: da
193
  metrics:
194
+ - type: wer
 
195
  value: 18.41
196
+ name: Test WER (Da)
197
  - task:
198
  type: Automatic Speech Recognition
199
  name: automatic-speech-recognition
 
205
  args:
206
  language: de
207
  metrics:
208
+ - type: wer
 
209
  value: 5.04
210
+ name: Test WER (De)
211
  - task:
212
  type: Automatic Speech Recognition
213
  name: automatic-speech-recognition
 
219
  args:
220
  language: el
221
  metrics:
222
+ - type: wer
223
+ value: 20.7
224
+ name: Test WER (El)
225
  - task:
226
  type: Automatic Speech Recognition
227
  name: automatic-speech-recognition
 
233
  args:
234
  language: en
235
  metrics:
236
+ - type: wer
 
237
  value: 4.85
238
+ name: Test WER (En)
239
  - task:
240
  type: Automatic Speech Recognition
241
  name: automatic-speech-recognition
 
247
  args:
248
  language: es
249
  metrics:
250
+ - type: wer
 
251
  value: 3.45
252
+ name: Test WER (Es)
253
  - task:
254
  type: Automatic Speech Recognition
255
  name: automatic-speech-recognition
 
261
  args:
262
  language: et
263
  metrics:
264
+ - type: wer
 
265
  value: 17.73
266
+ name: Test WER (Et)
267
  - task:
268
  type: Automatic Speech Recognition
269
  name: automatic-speech-recognition
 
275
  args:
276
  language: fi
277
  metrics:
278
+ - type: wer
 
279
  value: 13.21
280
+ name: Test WER (Fi)
281
  - task:
282
  type: Automatic Speech Recognition
283
  name: automatic-speech-recognition
 
289
  args:
290
  language: fr
291
  metrics:
292
+ - type: wer
 
293
  value: 5.15
294
+ name: Test WER (Fr)
295
  - task:
296
  type: Automatic Speech Recognition
297
  name: automatic-speech-recognition
 
303
  args:
304
  language: hr
305
  metrics:
306
+ - type: wer
 
307
  value: 12.46
308
+ name: Test WER (Hr)
309
  - task:
310
  type: Automatic Speech Recognition
311
  name: automatic-speech-recognition
 
317
  args:
318
  language: hu
319
  metrics:
320
+ - type: wer
 
321
  value: 15.72
322
+ name: Test WER (Hu)
323
  - task:
324
  type: Automatic Speech Recognition
325
  name: automatic-speech-recognition
 
331
  args:
332
  language: it
333
  metrics:
334
+ - type: wer
335
+ value: 3.0
336
+ name: Test WER (It)
337
  - task:
338
  type: Automatic Speech Recognition
339
  name: automatic-speech-recognition
 
345
  args:
346
  language: lt
347
  metrics:
348
+ - type: wer
 
349
  value: 20.35
350
+ name: Test WER (Lt)
351
  - task:
352
  type: Automatic Speech Recognition
353
  name: automatic-speech-recognition
 
359
  args:
360
  language: lv
361
  metrics:
362
+ - type: wer
 
363
  value: 22.84
364
+ name: Test WER (Lv)
365
  - task:
366
  type: Automatic Speech Recognition
367
  name: automatic-speech-recognition
 
373
  args:
374
  language: mt
375
  metrics:
376
+ - type: wer
 
377
  value: 20.46
378
+ name: Test WER (Mt)
379
  - task:
380
  type: Automatic Speech Recognition
381
  name: automatic-speech-recognition
 
387
  args:
388
  language: nl
389
  metrics:
390
+ - type: wer
 
391
  value: 7.48
392
+ name: Test WER (Nl)
393
  - task:
394
  type: Automatic Speech Recognition
395
  name: automatic-speech-recognition
 
401
  args:
402
  language: pl
403
  metrics:
404
+ - type: wer
 
405
  value: 7.31
406
+ name: Test WER (Pl)
407
  - task:
408
  type: Automatic Speech Recognition
409
  name: automatic-speech-recognition
 
415
  args:
416
  language: pt
417
  metrics:
418
+ - type: wer
 
419
  value: 4.76
420
+ name: Test WER (Pt)
421
  - task:
422
  type: Automatic Speech Recognition
423
  name: automatic-speech-recognition
 
429
  args:
430
  language: ro
431
  metrics:
432
+ - type: wer
 
433
  value: 12.44
434
+ name: Test WER (Ro)
435
  - task:
436
  type: Automatic Speech Recognition
437
  name: automatic-speech-recognition
 
443
  args:
444
  language: ru
445
  metrics:
446
+ - type: wer
 
447
  value: 5.51
448
+ name: Test WER (Ru)
449
  - task:
450
  type: Automatic Speech Recognition
451
  name: automatic-speech-recognition
 
457
  args:
458
  language: sk
459
  metrics:
460
+ - type: wer
 
461
  value: 8.82
462
+ name: Test WER (Sk)
463
  - task:
464
  type: Automatic Speech Recognition
465
  name: automatic-speech-recognition
 
471
  args:
472
  language: sl
473
  metrics:
474
+ - type: wer
 
475
  value: 24.03
476
+ name: Test WER (Sl)
477
  - task:
478
  type: Automatic Speech Recognition
479
  name: automatic-speech-recognition
 
485
  args:
486
  language: sv
487
  metrics:
488
+ - type: wer
 
489
  value: 15.08
490
+ name: Test WER (Sv)
491
  - task:
492
  type: Automatic Speech Recognition
493
  name: automatic-speech-recognition
 
499
  args:
500
  language: uk
501
  metrics:
502
+ - type: wer
 
503
  value: 6.79
504
+ name: Test WER (Uk)
505
  - task:
506
  type: Automatic Speech Recognition
507
  name: automatic-speech-recognition
 
513
  args:
514
  language: es
515
  metrics:
516
+ - type: wer
 
517
  value: 4.39
518
+ name: Test WER (Es)
519
  - task:
520
  type: Automatic Speech Recognition
521
  name: automatic-speech-recognition
 
527
  args:
528
  language: fr
529
  metrics:
530
+ - type: wer
 
531
  value: 4.97
532
+ name: Test WER (Fr)
533
  - task:
534
  type: Automatic Speech Recognition
535
  name: automatic-speech-recognition
 
541
  args:
542
  language: it
543
  metrics:
544
+ - type: wer
 
545
  value: 10.08
546
+ name: Test WER (It)
547
  - task:
548
  type: Automatic Speech Recognition
549
  name: automatic-speech-recognition
 
555
  args:
556
  language: nl
557
  metrics:
558
+ - type: wer
 
559
  value: 12.78
560
+ name: Test WER (Nl)
561
  - task:
562
  type: Automatic Speech Recognition
563
  name: automatic-speech-recognition
 
569
  args:
570
  language: pl
571
  metrics:
572
+ - type: wer
 
573
  value: 7.28
574
+ name: Test WER (Pl)
575
  - task:
576
  type: Automatic Speech Recognition
577
  name: automatic-speech-recognition
 
583
  args:
584
  language: pt
585
  metrics:
586
+ - type: wer
587
+ value: 7.5
588
+ name: Test WER (Pt)
 
589
  - task:
590
  type: Automatic Speech Recognition
591
  name: automatic-speech-recognition
 
597
  args:
598
  language: de
599
  metrics:
600
+ - type: wer
 
601
  value: 4.84
602
+ name: Test WER (De)
603
  - task:
604
  type: Automatic Speech Recognition
605
  name: automatic-speech-recognition
 
611
  args:
612
  language: en
613
  metrics:
614
+ - type: wer
615
+ value: 6.8
616
+ name: Test WER (En)
617
  - task:
618
  type: Automatic Speech Recognition
619
  name: automatic-speech-recognition
 
625
  args:
626
  language: es
627
  metrics:
628
+ - type: wer
 
629
  value: 3.41
630
+ name: Test WER (Es)
631
  - task:
632
  type: Automatic Speech Recognition
633
  name: automatic-speech-recognition
 
639
  args:
640
  language: et
641
  metrics:
642
+ - type: wer
 
643
  value: 22.04
644
+ name: Test WER (Et)
645
  - task:
646
  type: Automatic Speech Recognition
647
  name: automatic-speech-recognition
 
653
  args:
654
  language: fr
655
  metrics:
656
+ - type: wer
 
657
  value: 6.05
658
+ name: Test WER (Fr)
659
  - task:
660
  type: Automatic Speech Recognition
661
  name: automatic-speech-recognition
 
667
  args:
668
  language: it
669
  metrics:
670
+ - type: wer
 
671
  value: 3.69
672
+ name: Test WER (It)
673
  - task:
674
  type: Automatic Speech Recognition
675
  name: automatic-speech-recognition
 
681
  args:
682
  language: lv
683
  metrics:
684
+ - type: wer
 
685
  value: 38.36
686
+ name: Test WER (Lv)
687
  - task:
688
  type: Automatic Speech Recognition
689
  name: automatic-speech-recognition
 
695
  args:
696
  language: nl
697
  metrics:
698
+ - type: wer
699
+ value: 6.5
700
+ name: Test WER (Nl)
701
  - task:
702
  type: Automatic Speech Recognition
703
  name: automatic-speech-recognition
 
709
  args:
710
  language: pt
711
  metrics:
712
+ - type: wer
 
713
  value: 3.96
714
+ name: Test WER (Pt)
715
  - task:
716
  type: Automatic Speech Recognition
717
  name: automatic-speech-recognition
 
723
  args:
724
  language: ru
725
  metrics:
726
+ - type: wer
727
+ value: 3.0
728
+ name: Test WER (Ru)
729
  - task:
730
  type: Automatic Speech Recognition
731
  name: automatic-speech-recognition
 
737
  args:
738
  language: sl
739
  metrics:
740
+ - type: wer
741
+ value: 31.8
742
+ name: Test WER (Sl)
743
  - task:
744
  type: Automatic Speech Recognition
745
  name: automatic-speech-recognition
 
751
  args:
752
  language: sv
753
  metrics:
754
+ - type: wer
 
755
  value: 20.16
756
+ name: Test WER (Sv)
757
  - task:
758
  type: Automatic Speech Recognition
759
  name: automatic-speech-recognition
 
765
  args:
766
  language: uk
767
  metrics:
768
+ - type: wer
769
+ value: 5.1
770
+ name: Test WER (Uk)
 
 
771
  ---
772
 
773
  # **<span style="color:#76b900;">🦜 parakeet-tdt-0.6b-v3: Multilingual Speech-to-Text Model</span>**
config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ParakeetForTDT"
4
+ ],
5
+ "blank_token_id": 8192,
6
+ "decoder_hidden_size": 640,
7
+ "dtype": "float32",
8
+ "durations": [
9
+ 0,
10
+ 1,
11
+ 2,
12
+ 3,
13
+ 4
14
+ ],
15
+ "encoder_config": {
16
+ "activation_dropout": 0.1,
17
+ "attention_bias": false,
18
+ "attention_dropout": 0.1,
19
+ "conv_kernel_size": 9,
20
+ "convolution_bias": false,
21
+ "dropout": 0.1,
22
+ "dropout_positions": 0.0,
23
+ "hidden_act": "silu",
24
+ "hidden_size": 1024,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 4096,
27
+ "layerdrop": 0.1,
28
+ "max_position_embeddings": 5000,
29
+ "model_type": "parakeet_encoder",
30
+ "num_attention_heads": 8,
31
+ "num_hidden_layers": 24,
32
+ "num_key_value_heads": 8,
33
+ "num_mel_bins": 128,
34
+ "scale_input": false,
35
+ "subsampling_conv_channels": 256,
36
+ "subsampling_conv_kernel_size": 3,
37
+ "subsampling_conv_stride": 2,
38
+ "subsampling_factor": 8
39
+ },
40
+ "hidden_act": "relu",
41
+ "initializer_range": 0.02,
42
+ "is_encoder_decoder": true,
43
+ "max_symbols_per_step": 10,
44
+ "model_type": "parakeet_tdt",
45
+ "num_decoder_layers": 2,
46
+ "pad_token_id": 2,
47
+ "transformers_version": "5.6.0.dev0",
48
+ "vocab_size": 8193
49
+ }
generation_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 8192,
4
+ "output_attentions": false,
5
+ "output_hidden_states": false,
6
+ "pad_token_id": 2,
7
+ "suppress_tokens": [
8
+ 8193,
9
+ 8194,
10
+ 8195,
11
+ 8196,
12
+ 8197
13
+ ],
14
+ "transformers_version": "5.6.0.dev0"
15
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a2026366188c8c68598edbbff92f8d11590a08e0ae2e6775544e7b07d6a5e11
3
+ size 2508311120
processor_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "blank_token": "<blank>",
3
+ "feature_extractor": {
4
+ "feature_extractor_type": "ParakeetFeatureExtractor",
5
+ "feature_size": 128,
6
+ "hop_length": 160,
7
+ "n_fft": 512,
8
+ "padding_side": "right",
9
+ "padding_value": 0.0,
10
+ "preemphasis": 0.97,
11
+ "return_attention_mask": true,
12
+ "sampling_rate": 16000,
13
+ "win_length": 400
14
+ },
15
+ "processor_class": "ParakeetProcessor"
16
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "clean_up_tokenization_spaces": false,
4
+ "model_max_length": 1000000000000000019884624838656,
5
+ "pad_token": "<pad>",
6
+ "processor_class": "ParakeetProcessor",
7
+ "tokenizer_class": "ParakeetTokenizer",
8
+ "unk_token": "<unk>"
9
+ }