import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import onnxruntime as ort
import soundfile as sf
from huggingface_hub import hf_hub_download

# Fetch the FlashSR ONNX weights from the Hugging Face Hub; the returned
# local file path is what the inference session is built from below.
print("Downloading FlashSR ONNX weights...")
model_path = hf_hub_download(
    repo_id="YatharthS/FlashSR",
    subfolder="onnx",
    filename="model.onnx",
)

# Build one shared ONNX Runtime session, restricted to the CPU provider.
print("Initializing ONNX Runtime Session...")
ort_session = ort.InferenceSession(
    model_path,
    providers=["CPUExecutionProvider"],
)

# Read the tensor names of the graph's first input and first output so the
# inference call does not have to hard-code them.
input_name = ort_session.get_inputs()[0].name
output_name = ort_session.get_outputs()[0].name

def super_resolve_onnx(audio_path):
    """Run the FlashSR ONNX model on an audio file and save the result as WAV.

    Args:
        audio_path: Filesystem path of the input audio, or ``None`` when no
            audio was provided.

    Returns:
        Path of a newly created WAV file written with a 48 kHz sample rate,
        or ``None`` when ``audio_path`` is ``None``.
    """
    if audio_path is None:
        return None

    # 1. Load audio and force resample to 16kHz. The returned sample rate is
    #    always the requested one, so it is discarded.
    samples, _ = librosa.load(audio_path, sr=16000)

    # 2. Format input array to match the ONNX expected shape: [Batch, Samples]
    lowres_wav = samples[np.newaxis, :].astype(np.float32)

    # 3. Execute ONNX inference; run() returns one array per requested output.
    print("Processing audio via ONNX...")
    onnx_output = ort_session.run([output_name], {input_name: lowres_wav})[0]

    # 4. Drop the singleton (batch) dimensions to get the raw audio array
    new_wav = onnx_output.squeeze()

    # 5. Save the output to a unique temporary file. A fixed filename in the
    #    working directory would be overwritten by every request, so
    #    overlapping requests could return each other's audio.
    fd, output_path = tempfile.mkstemp(prefix="flashsr_", suffix=".wav")
    # Close the descriptor immediately: soundfile opens the path itself.
    os.close(fd)
    sf.write(output_path, new_wav, 48000)

    return output_path

# Gradio UI definition
title = "⚡ FlashSR ONNX: Real-Time Audio Super-Resolution"
description = "This version runs entirely on **ONNX Runtime (CPU Optimized)**."

# One audio input and one audio output, both exchanged with the handler as
# file paths; flagging is turned off.
demo = gr.Interface(
    title=title,
    description=description,
    fn=super_resolve_onnx,
    inputs=gr.Audio(label="Input Audio (VOD Clip)", type="filepath"),
    outputs=gr.Audio(label="ONNX Enhanced Output (48kHz)", type="filepath"),
    flagging_mode="never",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()