shadow RASMUS committed on
Commit
52c6032
·
0 Parent(s):

Duplicate from RASMUS/Whisper-youtube-crosslingual-subtitles

Browse files

Co-authored-by: TOIVANEN <RASMUS@users.noreply.huggingface.co>

Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +16 -0
  3. app.py +604 -0
  4. requirements.txt +17 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Whisper Youtube Crosslingual Subtitles
3
+ emoji: 🦀
4
+ colorFrom: purple
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.14.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ tags:
12
+ - whisper-event
13
+ duplicated_from: RASMUS/Whisper-youtube-crosslingual-subtitles
14
+ ---
15
+
16
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import json
4
+ import base64
5
+
6
# Bootstrap whisper.cpp at start-up: fetch the sources, build them, then
# download each ggml model. The commands run in the same order as before.
os.system('git clone https://github.com/ggerganov/whisper.cpp.git')
os.system('make -C ./whisper.cpp')
for _ggml_model in ('small', 'base', 'medium', 'large', 'base.en'):
    os.system(f'bash ./whisper.cpp/models/download-ggml-model.sh {_ggml_model}')
13
+
14
+
15
+ import gradio as gr
16
+ from pathlib import Path
17
+ import pysrt
18
+ import pandas as pd
19
+ import re
20
+ import time
21
+
22
+ from pytube import YouTube
23
+
24
+ headers = {'Authorization': os.environ['DeepL_API_KEY']}
25
+
26
+
27
+ import torch
28
+
29
+ whisper_models = ["base", "small", "medium", "large", "base.en"]
30
+
31
+ custom_models = ["belarus-small"]
32
+
33
+ combined_models = []
34
+ combined_models.extend(whisper_models)
35
+ combined_models.extend(custom_models)
36
+
37
# Query the DeepL usage counter once at start-up so the UI can display it.
# This is best-effort: if DeepL is unreachable or the reply is malformed the
# app must still start, so the counter falls back to None (the layout code
# already renders an empty string for a None value).
try:
    usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers, timeout=10)
    usage = json.loads(usage.text)
    deepL_character_usage = str(usage['character_count'])
except Exception as e:
    print('Error with DeepL API requesting usage count')
    print(e)
    deepL_character_usage = None
# Log the actual value (the original printed the variable's name as a literal).
print(deepL_character_usage)
41
+
42
+
43
+
44
+ LANGUAGES = {
45
+ "en": "English",
46
+ "zh": "Chinese",
47
+ "de": "German",
48
+ "es": "Spanish",
49
+ "ru": "Russian",
50
+ "ko": "Korean",
51
+ "fr": "French",
52
+ "ja": "Japanese",
53
+ "pt": "Portuguese",
54
+ "tr": "Turkish",
55
+ "pl": "Polish",
56
+ "ca": "Catalan",
57
+ "nl": "Dutch",
58
+ "ar": "Arabic",
59
+ "sv": "Swedish",
60
+ "it": "Italian",
61
+ "id": "Indonesian",
62
+ "hi": "Hindi",
63
+ "fi": "Finnish",
64
+ "vi": "Vietnamese",
65
+ "he": "Hebrew",
66
+ "uk": "Ukrainian",
67
+ "el": "Greek",
68
+ "ms": "Malay",
69
+ "cs": "Czech",
70
+ "ro": "Romanian",
71
+ "da": "Danish",
72
+ "hu": "Hungarian",
73
+ "ta": "Tamil",
74
+ "no": "Norwegian",
75
+ "th": "Thai",
76
+ "ur": "Urdu",
77
+ "hr": "Croatian",
78
+ "bg": "Bulgarian",
79
+ "lt": "Lithuanian",
80
+ "la": "Latin",
81
+ "mi": "Maori",
82
+ "ml": "Malayalam",
83
+ "cy": "Welsh",
84
+ "sk": "Slovak",
85
+ "te": "Telugu",
86
+ "fa": "Persian",
87
+ "lv": "Latvian",
88
+ "bn": "Bengali",
89
+ "sr": "Serbian",
90
+ "az": "Azerbaijani",
91
+ "sl": "Slovenian",
92
+ "kn": "Kannada",
93
+ "et": "Estonian",
94
+ "mk": "Macedonian",
95
+ "br": "Breton",
96
+ "eu": "Basque",
97
+ "is": "Icelandic",
98
+ "hy": "Armenian",
99
+ "ne": "Nepali",
100
+ "mn": "Mongolian",
101
+ "bs": "Bosnian",
102
+ "kk": "Kazakh",
103
+ "sq": "Albanian",
104
+ "sw": "Swahili",
105
+ "gl": "Galician",
106
+ "mr": "Marathi",
107
+ "pa": "Punjabi",
108
+ "si": "Sinhala",
109
+ "km": "Khmer",
110
+ "sn": "Shona",
111
+ "yo": "Yoruba",
112
+ "so": "Somali",
113
+ "af": "Afrikaans",
114
+ "oc": "Occitan",
115
+ "ka": "Georgian",
116
+ "be": "Belarusian",
117
+ "tg": "Tajik",
118
+ "sd": "Sindhi",
119
+ "gu": "Gujarati",
120
+ "am": "Amharic",
121
+ "yi": "Yiddish",
122
+ "lo": "Lao",
123
+ "uz": "Uzbek",
124
+ "fo": "Faroese",
125
+ "ht": "Haitian creole",
126
+ "ps": "Pashto",
127
+ "tk": "Turkmen",
128
+ "nn": "Nynorsk",
129
+ "mt": "Maltese",
130
+ "sa": "Sanskrit",
131
+ "lb": "Luxembourgish",
132
+ "my": "Myanmar",
133
+ "bo": "Tibetan",
134
+ "tl": "Tagalog",
135
+ "mg": "Malagasy",
136
+ "as": "Assamese",
137
+ "tt": "Tatar",
138
+ "haw": "Hawaiian",
139
+ "ln": "Lingala",
140
+ "ha": "Hausa",
141
+ "ba": "Bashkir",
142
+ "jw": "Javanese",
143
+ "su": "Sundanese",
144
+ }
145
+
146
+ # language code lookup by name, with a few language aliases
147
+ source_languages = {
148
+ **{language: code for code, language in LANGUAGES.items()},
149
+ "Burmese": "my",
150
+ "Valencian": "ca",
151
+ "Flemish": "nl",
152
+ "Haitian": "ht",
153
+ "Letzeburgesch": "lb",
154
+ "Pushto": "ps",
155
+ "Panjabi": "pa",
156
+ "Moldavian": "ro",
157
+ "Moldovan": "ro",
158
+ "Sinhalese": "si",
159
+ "Castilian": "es",
160
+ "Let the model analyze": "Let the model analyze"
161
+ }
162
+
163
+ DeepL_language_codes_for_translation = {
164
+ "Bulgarian": "BG",
165
+ "Czech": "CS",
166
+ "Danish": "DA",
167
+ "German": "DE",
168
+ "Greek": "EL",
169
+ "English": "EN",
170
+ "Spanish": "ES",
171
+ "Estonian": "ET",
172
+ "Finnish": "FI",
173
+ "French": "FR",
174
+ "Hungarian": "HU",
175
+ "Indonesian": "ID",
176
+ "Italian": "IT",
177
+ "Japanese": "JA",
178
+ "Lithuanian": "LT",
179
+ "Latvian": "LV",
180
+ "Dutch": "NL",
181
+ "Polish": "PL",
182
+ "Portuguese": "PT",
183
+ "Romanian": "RO",
184
+ "Russian": "RU",
185
+ "Slovak": "SK",
186
+ "Slovenian": "SL",
187
+ "Swedish": "SV",
188
+ "Turkish": "TR",
189
+ "Ukrainian": "UK",
190
+ "Chinese": "ZH"
191
+ }
192
+
193
+
194
+ transcribe_options = dict(beam_size=3, best_of=3, without_timestamps=False)
195
+
196
+
197
# Dropdown choices: the display names are the keys of the two lookup tables,
# kept in insertion order.
source_language_list = list(source_languages)
translation_models_list = list(DeepL_language_codes_for_translation)
199
+
200
+
201
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
202
+ print("DEVICE IS: ")
203
+ print(device)
204
+
205
+ videos_out_path = Path("./videos_out")
206
+ videos_out_path.mkdir(parents=True, exist_ok=True)
207
+
208
+
209
def get_youtube(video_url):
    """Download a YouTube video as MP4 and return the path of the saved file.

    Selects the highest-resolution stream among those pytube reports as
    progressive MP4 and downloads it using pytube's default output location.

    Args:
        video_url: URL of the YouTube video to fetch.

    Returns:
        The file path returned by pytube's ``download()``.

    Raises:
        ValueError: If no URL was supplied, or the video offers no
            progressive MP4 stream.
    """
    if not video_url:
        raise ValueError("Error no youtube url input")

    yt = YouTube(video_url)
    stream = (
        yt.streams.filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    # first() yields None when the filter matches nothing; fail with a clear
    # message instead of an AttributeError on None.download().
    if stream is None:
        raise ValueError(f"No progressive mp4 stream available for {video_url}")

    abs_video_path = stream.download()
    # Log line is Finnish, roughly "downloaded to path".
    print("LADATATTU POLKUUN")
    print(abs_video_path)
    return abs_video_path
217
+
218
def _format_timestamp(moment):
    """Render a pysrt time object as zero-padded ``HH:MM:SS.mmm``."""
    return (
        f"{int(moment.hours):02d}:{int(moment.minutes):02d}:"
        f"{int(moment.seconds):02d}.{int(moment.milliseconds):03d}"
    )


def speech_to_text(video_file_path, selected_source_lang, whisper_model):
    """Transcribe a video file with whisper.cpp and return the cues as a DataFrame.

    Steps:
      1. Convert the input to a 16 kHz, mono, 16-bit PCM WAV next to the
         original file using ffmpeg.
      2. Run ``./whisper.cpp/main`` on the WAV with ``-osrt``; the result is
         read back from ``<wav path>.srt``.
      3. Parse the SRT into rows of ``start`` / ``end`` / ``text``, where the
         timestamps are formatted ``HH:MM:SS.mmm``.
      4. Log the current DeepL character usage (informational only).

    Args:
        video_file_path: Path of the video (or audio) file to transcribe.
        selected_source_lang: Display name of the spoken language, or
            ``"Let the model analyze"`` to let whisper.cpp auto-detect it.
            Names missing from ``source_languages`` also fall back to
            auto-detection.
        whisper_model: Model name; names listed in ``custom_models`` are
            loaded from ``./converted_models``, all others from
            ``./whisper.cpp/models``.

    Returns:
        pandas.DataFrame with columns ``start``, ``end`` and ``text``. It is
        empty if the SRT output could not be parsed.

    Raises:
        ValueError: If ``video_file_path`` is None.
        RuntimeError: If ffmpeg or whisper.cpp cannot be started or exits
            with a non-zero status.
    """
    # Local import: subprocess is not among the file's top-level imports.
    import subprocess

    if video_file_path is None:
        raise ValueError("Error no video input")
    print(video_file_path)

    # Derive the sibling paths with splitext rather than str.replace, which
    # would rewrite every occurrence of the extension (and mangle the whole
    # path when there is no extension at all).
    video_file_path = str(video_file_path)
    base_path, file_ending = os.path.splitext(video_file_path)
    wav_path = base_path + ".wav"
    srt_path = wav_path + ".srt"
    print(f'file enging is {file_ending}')

    # Arguments are passed as a list (no shell) because the file name comes
    # from the video title and may contain quotes or shell metacharacters.
    try:
        print("starting conversion to wav")
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", video_file_path,
                "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
                wav_path,
            ],
            check=True,
        )
        print("conversion to wav ready")
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError("Error Running inference with local model", e) from e

    if selected_source_lang == "Let the model analyze":
        language_code = "auto"
    else:
        language_code = source_languages.get(selected_source_lang)
        if language_code is None:
            print(f"Unknown source language {selected_source_lang!r}, using auto detection")
            language_code = "auto"

    if whisper_model in custom_models:
        model_dir = "./converted_models"
    else:
        model_dir = "./whisper.cpp/models"
    model_path = f"{model_dir}/ggml-{whisper_model}.bin"

    try:
        print("starting whisper c++")
        # Drop any transcript left over from an earlier run on the same file
        # so a failed run can never be mistaken for a fresh result.
        try:
            os.remove(srt_path)
        except FileNotFoundError:
            pass
        subprocess.run(
            [
                "./whisper.cpp/main", wav_path,
                "-t", "4", "-l", language_code,
                "-m", model_path, "-osrt",
            ],
            check=True,
        )
        print("starting whisper done with whisper")
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError("Error running Whisper cpp model") from e

    # Parsing stays best-effort: on failure the caller gets an empty frame.
    try:
        rows = [
            (_format_timestamp(sub.start), _format_timestamp(sub.end), sub.text)
            for sub in pysrt.open(srt_path)
        ]
        df = pd.DataFrame(rows, columns=['start', 'end', 'text'])
    except Exception as e:
        print("Error creating srt df")
        print(e)
        df = pd.DataFrame(columns=['start', 'end', 'text'])

    # Informational only: report how much of the DeepL quota is used.
    try:
        usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers, timeout=10)
        usage = json.loads(usage.text)

        print('Usage is at: ' + str(usage['character_count']) + ' characters')

        if usage['character_count'] >= 490000:
            print("USAGE CLOSE TO LIMIT")

    except Exception as e:
        print('Error with DeepL API requesting usage count')
        print(e)

    return df
317
+
318
+
319
+
320
+
321
def _request_deepl_translations(sentences, target_language_name):
    """Translate *sentences* with DeepL; return one string per sentence, or None if the quota is used up."""
    usage = requests.get('https://api-free.deepl.com/v2/usage', headers=headers, timeout=10)
    usage = json.loads(usage.text)
    deepL_character_usage = str(usage['character_count'])
    print('Usage is at: ' + deepL_character_usage + 'characters')

    if int(deepL_character_usage) > 490000:
        return None

    print("STILL CHARACTERS LEFT")
    data = {
        'text': '\n'.join(sentences),
        # NOTE(review): 'tag_spitting' is not a parameter name found in the
        # DeepL docs -- probably a typo. Left untouched because the intended
        # option is unclear; confirm and fix or remove.
        'tag_spitting': 'xml',
        'target_lang': DeepL_language_codes_for_translation.get(target_language_name),
    }
    response = requests.post('https://api-free.deepl.com/v2/translate', headers=headers, data=data, timeout=60)
    translated = json.loads(response.text)['translations'][0]['text'].split('\n')
    # All sentences travel as one newline-joined text, so the reply must
    # split back into exactly one line per sentence to stay aligned.
    if len(translated) != len(sentences):
        raise ValueError(
            f"DeepL returned {len(translated)} lines for {len(sentences)} sentences"
        )
    return translated


def _write_subtitle_file(path, cues, header='', decimal_mark='.'):
    """Write numbered ``start --> end`` cue blocks to *path*, separated by blank lines."""
    blocks = [
        f"{number}\n{start.replace('.', decimal_mark)} --> "
        f"{end.replace('.', decimal_mark)}\n{text}"
        for number, (start, end, text) in enumerate(cues, start=1)
    ]
    with open(path, 'w', encoding="utf-8") as file:
        file.write(header)
        file.write('\n\n'.join(blocks))
        if blocks:
            file.write('\n')


def translate_transcriptions(df, selected_translation_lang_2):
    """Translate transcribed cues with DeepL and write .vtt / .srt subtitle files.

    The texts are translated in a single DeepL request. If the usage counter
    is above 490000 characters, the request fails, or the reply cannot be
    aligned with the input rows, the original text is used as the
    translation instead.

    Side effects: writes ``subtitles.vtt`` and ``subtitles.srt`` into the
    current working directory.

    Args:
        df: DataFrame with ``start``, ``end`` and ``text`` columns; the
            timestamps are expected as ``HH:MM:SS.mmm`` strings. The frame
            passed in is not modified.
        selected_translation_lang_2: Display name of the target language
            (a key of ``DeepL_language_codes_for_translation``); None means
            ``'English'``.

    Returns:
        Tuple ``(df, subtitle_files)``: a copy of the input with a
        ``translation`` column, and the list
        ``['subtitles.vtt', 'subtitles.srt']``.
    """
    if selected_translation_lang_2 is None:
        selected_translation_lang_2 = 'English'
    # Work on a re-indexed copy; drop=True keeps stray 'index' / 'level_0'
    # columns out of the frame shown to the user.
    df = df.reset_index(drop=True)

    print("start_translation")
    sentences = [str(sentence) for sentence in df['text']]

    translated_sentences = None
    if sentences:
        try:
            translated_sentences = _request_deepl_translations(sentences, selected_translation_lang_2)
        except Exception as e:
            print("EXCEPTION WITH DEEPL API")
            print(e)

    if translated_sentences is None:
        df['translation'] = df['text']
    else:
        df['translation'] = translated_sentences

    print("translations done")
    print(df.head())

    cues = [
        (str(start).strip(), str(end).strip(), str(text))
        for start, end, text in zip(df['start'], df['end'], df['translation'])
    ]

    print("Starting WEBVTT-file creation")
    # WebVTT needs a blank line between the header and the first cue;
    # without it the first cue is parsed as part of the header.
    _write_subtitle_file('subtitles.vtt', cues, header='WEBVTT\n\n')
    print("WEBVTT DONE")

    print("Starting SRT-file creation")
    # SubRip separates seconds and milliseconds with a comma.
    _write_subtitle_file('subtitles.srt', cues, decimal_mark=',')
    print("SRT DONE")

    subtitle_files = ['subtitles.vtt', 'subtitles.srt']

    return df, subtitle_files
427
+
428
+ # def burn_srt_to_video(srt_file, video_in):
429
+
430
+ # print("Starting creation of video wit srt")
431
+
432
+ # try:
433
+ # video_out = video_in.replace('.mp4', '_out.mp4')
434
+ # print(os.system('ls -lrth'))
435
+ # print(video_in)
436
+ # print(video_out)
437
+ # command = 'ffmpeg -i "{}" -y -vf subtitles=./subtitles.srt "{}"'.format(video_in, video_out)
438
+ # os.system(command)
439
+
440
+ # return video_out
441
+
442
+ # except Exception as e:
443
+ # print(e)
444
+ # return video_out
445
+
446
def create_video_player(subtitle_files, video_in):
    """Build an HTML ``<video>`` element with the video and subtitles inlined.

    Both the video and the WebVTT track are embedded as base64 ``data:``
    URIs, so the returned markup needs no separately served files.

    Args:
        subtitle_files: The generated subtitle files; entries may be path
            strings or objects exposing the path as ``.name``. The first
            existing ``.vtt`` entry is used; when there is none the function
            falls back to ``./subtitles.vtt``.
        video_in: Path of the video file to embed.

    Returns:
        The HTML markup as a string.

    Raises:
        ValueError: If ``video_in`` is None.
    """
    if video_in is None:
        raise ValueError("Error no video input")

    if subtitle_files is None:
        candidates = []
    elif isinstance(subtitle_files, (list, tuple)):
        candidates = subtitle_files
    else:
        candidates = [subtitle_files]

    # Prefer the .vtt handed over by the caller; the shared ./subtitles.vtt
    # may have been rewritten by a different request in the meantime.
    vtt_path = './subtitles.vtt'
    for entry in candidates:
        path = entry if isinstance(entry, str) else getattr(entry, 'name', None)
        if isinstance(path, str) and path.lower().endswith('.vtt') and os.path.isfile(path):
            vtt_path = path
            break

    with open(video_in, "rb") as file:
        video_base64 = base64.b64encode(file.read()).decode("ascii")
    with open(vtt_path, "rb") as file:
        subtitle_base64 = base64.b64encode(file.read()).decode("ascii")

    video_player = f'''<video id="video" controls preload="metadata">
    <source src="data:video/mp4;base64,{video_base64}" type="video/mp4" />
    <track
    label="English"
    kind="subtitles"
    srclang="en"
    src="data:text/vtt;base64,{subtitle_base64}"
    default />
    </video>
    '''
    return video_player
465
+
466
+
467
+
468
+
469
+ # ---- Gradio Layout -----
470
+ video_in = gr.Video(label="Video file", mirror_webcam=False)
471
+ youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
472
+ video_out = gr.Video(label="Video Out", mirror_webcam=False)
473
+
474
+
475
+
476
+ df_init = pd.DataFrame(columns=['start','end','text', 'translation'])
477
+
478
+ selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="Let the model analyze", label="Spoken language in video", interactive=True)
479
+ selected_translation_lang_2 = gr.Dropdown(choices=translation_models_list, type="value", value="English", label="In which language you want the transcriptions?", interactive=True)
480
+ selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="base", label="Selected Whisper model", interactive=True)
481
+
482
+ transcription_df = gr.DataFrame(value=df_init,label="Transcription dataframe", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
483
+ transcription_and_translation_df = gr.DataFrame(value=df_init,label="Transcription and translation dataframe", max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
484
+
485
+ subtitle_files = gr.File(
486
+ label="Download srt-file",
487
+ file_count="multiple",
488
+ type="file",
489
+ interactive=False,
490
+ )
491
+
492
+ video_player = gr.HTML('<p>video will be played here after you press the button at step 4')
493
+
494
+
495
+ demo = gr.Blocks(css='''
496
+ #cut_btn, #reset_btn { align-self:stretch; }
497
+ #\\31 3 { max-width: 540px; }
498
+ .output-markdown {max-width: 65ch !important;}
499
+ ''')
500
+ demo.encrypt = False
501
+
502
+
503
+
504
+
505
+ with demo:
506
+ transcription_var = gr.Variable()
507
+
508
+ with gr.Row():
509
+ with gr.Column():
510
+ gr.Markdown('''
511
+ ### This space allows you to:
512
+ 1. Download youtube video with a given url
513
+ 2. Watch it in the first video component
514
+ 3. Run automatic speech recognition on the video using fast Whisper models
515
+ 4. Translate the recognized transcriptions to 26 languages supported by deepL
516
+ 5. Download generated subtitles in .vtt and .srt formats
517
+ 6. Watch the the original video with generated subtitles
518
+ ''')
519
+
520
+ with gr.Column():
521
+ gr.Markdown('''
522
+ ### 1. Copy any non-private Youtube video URL to box below or click one of the examples.
523
+ (But please **consider using short videos** so others won't get queued) <br>
524
+ Then press button "1. Download Youtube video"-button:
525
+ ''')
526
+ examples = gr.Examples(examples=
527
+ [ "https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24",
528
+ "https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren",
529
+ "https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision"],
530
+ label="Examples", inputs=[youtube_url_in])
531
+ # Inspiration from https://huggingface.co/spaces/vumichien/whisper-speaker-diarization
532
+
533
+ with gr.Row():
534
+ with gr.Column():
535
+ youtube_url_in.render()
536
+ download_youtube_btn = gr.Button("Step 1. Download Youtube video")
537
+ download_youtube_btn.click(get_youtube, [youtube_url_in], [
538
+ video_in])
539
+ print(video_in)
540
+
541
+
542
+ with gr.Row():
543
+ with gr.Column():
544
+ video_in.render()
545
+ with gr.Column():
546
+ gr.Markdown('''
547
+ ##### Here you can start the transcription and translation process.
548
+ Be aware that processing will last some time. With base model it is around 3x speed
549
+ **Please select source language** for better transcriptions. Using 'Let the model analyze' makes mistakes sometimes and may lead to bad transcriptions
550
+ ''')
551
+ selected_source_lang.render()
552
+ selected_whisper_model.render()
553
+ transcribe_btn = gr.Button("Step 2. Transcribe audio")
554
+ transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model], [transcription_df])
555
+
556
+
557
+ with gr.Row():
558
+ gr.Markdown('''
559
+ ##### Here you will get transcription output
560
+ ##### ''')
561
+
562
+ with gr.Row():
563
+ with gr.Column():
564
+ transcription_df.render()
565
+
566
+ with gr.Row():
567
+ with gr.Column():
568
+ gr.Markdown('''
569
+ ### PLEASE READ BELOW
570
+ ### ALL FREE TRANSLATION CREDITS USED FOR THIS MONTH
571
+ Here you will can translate transcriptions to 26 languages.
572
+ If spoken language is not in the list, translation might not work. In this case original transcriptions are used.
573
+ ''')
574
+ gr.Markdown(f'''
575
+ DeepL API character usage:
576
+ {deepL_character_usage if deepL_character_usage is not None else ''}/500 000 characters
577
+ If usage is over 490 000 characters original transcriptions will be used for subtitles.
578
+ API usage resets on 5th of every month.
579
+ ''')
580
+ selected_translation_lang_2.render()
581
+ translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
582
+ translate_transcriptions_button.click(translate_transcriptions, [transcription_df, selected_translation_lang_2], [transcription_and_translation_df, subtitle_files])
583
+ transcription_and_translation_df.render()
584
+
585
+ with gr.Row():
586
+ with gr.Column():
587
+ gr.Markdown('''##### From here you can download subtitles in .srt or .vtt format''')
588
+ subtitle_files.render()
589
+
590
+ with gr.Row():
591
+ with gr.Column():
592
+ gr.Markdown('''
593
+ ##### Now press the Step 4. Button to create output video with translated transcriptions
594
+ ##### ''')
595
+ create_video_button = gr.Button("Step 4. Create and add subtitles to video")
596
+ print(video_in)
597
+ create_video_button.click(create_video_player, [subtitle_files,video_in], [
598
+ video_player])
599
+ video_player.render()
600
+
601
+
602
+
603
+
604
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==3.12
2
+ ffmpeg-python
3
+ pandas==1.5.0
4
+ pytube==12.1.0
5
+ sacremoses
6
+ sentencepiece
7
+ tokenizers
8
+ torch
9
+ torchaudio
10
+ tqdm==4.64.1
11
+ EasyNMT==2.0.2
12
+ tqdm
13
+ nltk
14
+ transformers
15
+ pysrt
16
+ psutil==5.9.2
17
+ requests