import os import re import subprocess import gradio as gr import torch from scipy.io.wavfile import write from transformers import pipeline hf_token = os.environ.get("HF_TOKEN") MODEL_NAME = "openai/whisper-large-v3-turbo" BATCH_SIZE = 8 device = 0 if torch.cuda.is_available() else "cpu" pipe = pipeline( task="automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=30, device=device, token=hf_token, ) def split_process(audio, chosen_out_track, progress=gr.Progress(track_tqdm=True)): if audio is None: raise gr.Error( "No audio file submitted! Please upload or record an audio file before submitting your request." ) os.makedirs("out", exist_ok=True) progress(0.02, desc="Preparing audio...") write("test.wav", audio[0], audio[1]) progress(0.05, desc="Starting vocal separation...") cmd = [ "python3", "-m", "demucs.separate", "-n", "mdx_extra_q", "-j", "4", "test.wav", "-o", "out", ] process = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1, ) percent_re = re.compile(r"(\d{1,3})%") completed_bars = 0 last_percent = 0 max_bars = 4 logs = [] for line in process.stdout: logs.append(line) match = percent_re.search(line) if not match: continue percent = int(match.group(1)) if percent < last_percent and last_percent >= 90: completed_bars += 1 last_percent = percent demucs_ratio = min( (completed_bars + percent / 100) / max_bars, 1.0, ) overall_progress = 0.05 + 0.85 * demucs_ratio progress( overall_progress, desc=f"Separating vocals with Demucs... {percent}%", ) returncode = process.wait() if returncode != 0: raise gr.Error( "Demucs failed while separating the audio.\n\n" f"{''.join(logs)[-2000:]}" ) if chosen_out_track == "vocals": output_path = "./out/mdx_extra_q/test/vocals.wav" elif chosen_out_track == "bass": output_path = "./out/mdx_extra_q/test/bass.wav" elif chosen_out_track == "drums": output_path = "./out/mdx_extra_q/test/drums.wav" elif chosen_out_track == "other": output_path = "./out/mdx_extra_q/test/other.wav" elif chosen_out_track == "all-in": output_path = "test.wav" else: raise gr.Error(f"Unknown output track: {chosen_out_track}") if not os.path.exists(output_path): raise gr.Error(f"Expected output file was not created: {output_path}") progress(0.90, desc="Vocal separation complete.") return output_path def transcribe(inputs, task): if inputs is None: raise gr.Error( "No audio file submitted! Please upload or record an audio file before submitting your request." ) result = pipe( inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True, ) return result["text"] def format_lyrics(text): if not text: return "" # Remove unwanted subtitle artifacts text = re.sub( r"Sous-?titrage Société Radio-Canada", "", text, flags=re.IGNORECASE, ) # Remove repeated newlines text = re.sub(r"\n+", "\n", text).strip() # Insert a newline before capital letters, like in the original app formatted_text = re.sub(r"(?

Song To Lyrics

Send the audio file of your favorite song, and get the lyrics!
Under the hood, we split and get the vocals track from the audio file, then send the vocals to Whisper.

""" ) song_in = gr.Audio( label="Song input", type="numpy", sources="upload", ) getlyrics_btn = gr.Button("Get Lyrics!") vocals_out = gr.Audio(label="Vocals Only") lyrics_res = gr.Textbox(label="Lyrics") getlyrics_btn.click( fn=infer, inputs=[song_in], outputs=[vocals_out, lyrics_res], ) demo.queue().launch(css=css, ssr_mode=False)