import os
import re
import subprocess
import sys

import gradio as gr
import torch
from scipy.io.wavfile import write
from transformers import pipeline


# Hugging Face access token taken from the environment; None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")

# Checkpoint loaded into the speech-recognition pipeline, and the batch size
# used for each transcription call.
MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8

# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared speech-recognition pipeline, built once at import time. Audio is
# processed in 30-second chunks.
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    token=hf_token,
    device=device,
    chunk_length_s=30,
)


# Stems that Demucs writes as "<stem>.wav" inside its output folder.
_DEMUCS_STEMS = ("vocals", "bass", "drums", "other")

# Matches the percentage printed in a progress line, e.g. " 42%".
_DEMUCS_PERCENT_RE = re.compile(r"(\d{1,3})%")

# Number of consecutive progress bars one Demucs run is expected to print.
# NOTE(review): presumably one bar per model in the "mdx_extra_q" bag - confirm.
_DEMUCS_PROGRESS_BARS = 4


def _run_demucs(cmd, progress):
    """Run the Demucs command and mirror its progress bars into ``progress``.

    Returns a ``(returncode, output)`` tuple where ``output`` is the combined
    stdout/stderr text of the child process.
    """
    completed_bars = 0
    last_percent = 0
    logs = []

    # The context manager closes the stdout pipe and reaps the child process.
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    ) as process:
        try:
            for line in process.stdout:
                logs.append(line)

                match = _DEMUCS_PERCENT_RE.search(line)
                if not match:
                    continue

                percent = int(match.group(1))

                # A drop from >= 90% means the previous bar finished and a
                # new one has started.
                if percent < last_percent and last_percent >= 90:
                    completed_bars += 1
                last_percent = percent

                demucs_ratio = min(
                    (completed_bars + percent / 100) / _DEMUCS_PROGRESS_BARS,
                    1.0,
                )

                # Demucs owns the 5%..90% slice of the overall progress bar.
                progress(
                    0.05 + 0.85 * demucs_ratio,
                    desc=f"Separating vocals with Demucs... {percent}%",
                )
        except BaseException:
            # Leaving the ``with`` block waits for the child, so stop it first
            # instead of blocking until the whole separation has finished.
            process.kill()
            raise

    return process.returncode, "".join(logs)


def split_process(audio, chosen_out_track, progress=gr.Progress(track_tqdm=True)):
    """Write the submitted audio to disk and return the path of one track.

    Args:
        audio: Pair whose first item is the sample rate and whose second item
            is the sample array (both are handed to ``scipy.io.wavfile.write``).
        chosen_out_track: One of "vocals", "bass", "drums", "other" for a
            separated stem, or "all-in" for the unseparated input file.
        progress: Gradio progress tracker updated while the work runs.

    Returns:
        Path of the requested WAV file.

    Raises:
        gr.Error: If no audio was submitted, the track name is unknown,
            Demucs exits with a non-zero status, or the expected output file
            is missing.
    """
    if audio is None:
        raise gr.Error(
            "No audio file submitted! Please upload or record an audio file before submitting your request."
        )

    # Resolve (and validate) the requested track before doing any expensive
    # work, so a bad track name fails immediately rather than after Demucs.
    if chosen_out_track in _DEMUCS_STEMS:
        output_path = f"./out/mdx_extra_q/test/{chosen_out_track}.wav"
    elif chosen_out_track == "all-in":
        output_path = "test.wav"
    else:
        raise gr.Error(f"Unknown output track: {chosen_out_track}")

    os.makedirs("out", exist_ok=True)

    progress(0.02, desc="Preparing audio...")
    write("test.wav", audio[0], audio[1])

    # "all-in" returns the input file untouched, so separation is only needed
    # when an actual stem was requested.
    if chosen_out_track != "all-in":
        # Drop any stem left over from a previous run so the existence check
        # below can only be satisfied by this run's output.
        try:
            os.remove(output_path)
        except FileNotFoundError:
            pass

        progress(0.05, desc="Starting vocal separation...")

        cmd = [
            # Run Demucs with the interpreter executing this app so the same
            # environment is used; fall back to "python3" if it is unknown.
            sys.executable or "python3",
            "-m",
            "demucs.separate",
            "-n",
            "mdx_extra_q",
            "-j",
            "4",
            "test.wav",
            "-o",
            "out",
        ]

        returncode, output = _run_demucs(cmd, progress)

        if returncode != 0:
            # Only the tail of the log is shown to keep the message readable.
            raise gr.Error(
                "Demucs failed while separating the audio.\n\n" + output[-2000:]
            )

    if not os.path.exists(output_path):
        raise gr.Error(f"Expected output file was not created: {output_path}")

    progress(0.90, desc="Vocal separation complete.")

    return output_path


def transcribe(inputs, task):
    """Run the shared speech-recognition pipeline and return the recognised text.

    Args:
        inputs: Audio handed straight to ``pipe``.
        task: Value forwarded to generation as the ``"task"`` keyword.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is None.
    """
    if inputs is not None:
        generation_options = {"task": task}
        prediction = pipe(
            inputs,
            return_timestamps=True,
            generate_kwargs=generation_options,
            batch_size=BATCH_SIZE,
        )
        return prediction["text"]

    raise gr.Error(
        "No audio file submitted! Please upload or record an audio file before submitting your request."
    )


# Subtitle credit that sometimes appears in transcriptions; removed from lyrics.
_SUBTITLE_ARTIFACT_RE = re.compile(
    r"Sous-?titrage Société Radio-Canada",
    re.IGNORECASE,
)

# Punctuation that may sit in front of the first letter of a word.
_OPENING_PUNCTUATION = "\"'([{«“‘¿¡"


def _starts_with_capital(word):
    """Return True if ``word`` begins with an uppercase letter.

    Opening quotes and brackets in front of the letter are ignored.
    """
    core = word.lstrip(_OPENING_PUNCTUATION)
    return bool(core) and core[0].isupper()


def format_lyrics(text):
    """Turn a raw transcription into one lyric line per capitalised phrase.

    A new line is started at every word that begins with a capital letter.
    Breaks are made only between words, so acronyms and mixed-case names
    ("USA", "McCartney") stay intact, and any opening quote or bracket moves
    to the new line together with its word. Accented capitals count as
    capitals too.

    Args:
        text: Raw transcription text; may be None or empty.

    Returns:
        The formatted lyrics with no blank lines and no leading or trailing
        whitespace on any line, or "" when ``text`` is empty.
    """
    if not text:
        return ""

    # Remove unwanted subtitle artifacts
    text = _SUBTITLE_ARTIFACT_RE.sub("", text)

    lines = []
    for raw_line in text.splitlines():
        current = []
        # split() also collapses whitespace runs, including the gap left
        # behind when an artifact is removed from the middle of a line.
        for word in raw_line.split():
            if current and _starts_with_capital(word):
                lines.append(" ".join(current))
                current = []
            current.append(word)
        # Empty source lines contribute nothing, so no blank lines are emitted.
        if current:
            lines.append(" ".join(current))

    return "\n".join(lines)


def infer(audio_input, progress=gr.Progress(track_tqdm=True)):
    """Run the full pipeline: isolate the vocals, transcribe them, format the text.

    Args:
        audio_input: Audio value passed on to ``split_process``.
        progress: Gradio progress tracker updated at each stage.

    Returns:
        A ``(vocals_path, lyrics)`` pair: the value returned by
        ``split_process`` for the "vocals" track and the formatted lyrics.
    """
    progress(0.0, desc="Starting...")

    # Stage 1: extract the vocal track from the submitted audio.
    vocals_path = split_process(audio_input, "vocals", progress=progress)
    print(vocals_path)

    # Stage 2: transcribe the isolated vocals.
    progress(0.92, desc="Transcribing vocals with Whisper...")
    raw_text = transcribe(vocals_path, "transcribe")
    print(raw_text)

    # Stage 3: lay the transcription out as lyrics.
    progress(0.98, desc="Formatting lyrics...")
    lyrics = format_lyrics(raw_text)
    print(lyrics)

    progress(1.0, desc="Done.")
    return vocals_path, lyrics


# Custom stylesheet handed to launch(): caps the width of the element with id
# "col-container" (the main column of the UI) and centres it horizontally.
css = """
#col-container {
    max-width: 510px;
    margin-left: auto;
    margin-right: auto;
}
"""


# Build the UI: a single column holding the header, the song upload, the
# trigger button and the two outputs. Components are declared in the order
# they should appear.
with gr.Blocks() as demo:
    # elem_id ties this column to the "#col-container" rule in the css string.
    with gr.Column(elem_id="col-container"):
        # Static header: app title and a short description of the pipeline.
        gr.HTML(
            """
            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
                <div
                    style="
                        display: inline-flex;
                        align-items: center;
                        gap: 0.8rem;
                        font-size: 1.75rem;
                    "
                >
                    <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                        Song To Lyrics
                    </h1>
                </div>
                <p style="margin-bottom: 10px; font-size: 94%">
                    Send the audio file of your favorite song, and get the lyrics! <br />
                    Under the hood, we split and get the vocals track from the audio file, then send the vocals to Whisper.
                </p>
            </div>
            """
        )

        # Song input, restricted to file upload. The value is requested in
        # "numpy" form, which split_process indexes as audio[0] / audio[1].
        song_in = gr.Audio(
            label="Song input",
            type="numpy",
            sources="upload",
        )

        getlyrics_btn = gr.Button("Get Lyrics!")

        # Outputs filled by infer(): the isolated vocals and the lyrics text.
        vocals_out = gr.Audio(label="Vocals Only")
        lyrics_res = gr.Textbox(label="Lyrics")

    # Clicking the button runs infer() on the uploaded song; its two return
    # values map to the outputs in the order listed here.
    getlyrics_btn.click(
        fn=infer,
        inputs=[song_in],
        outputs=[vocals_out, lyrics_res],
    )


# Enable the request queue and start the app with the custom CSS applied and
# ssr_mode turned off.
demo.queue().launch(css=css, ssr_mode=False)