import gradio as gr
import torch
import torchaudio as ta
from huggingface_hub import hf_hub_download
from peft import PeftModel
import tempfile
import sys
import os

# chatterbox-tts is a hard dependency of this app (on HF Spaces it is normally
# installed through requirements.txt). Without it there is nothing to serve,
# so report the problem and stop the process with a non-zero exit status.
try:
    from chatterbox.tts import ChatterboxTTS
except ImportError:
    print("chatterbox-tts is not installed. Please add it to your requirements.txt.")
    raise SystemExit(1)

# Hugging Face repository that the tokenizer file and the LoRA adapter are
# downloaded from.
repo_id = "Praha-Labs/PrahaTTS-ML"

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def _apply_indic_tokenizer(tts):
    """Download ``tokenizer_indic.json`` and try to load it into ``tts.tokenizer``.

    This step is best-effort: any exception (download or load) is printed as a
    warning and swallowed, so the caller always continues with ``tts``.

    NOTE(review): the file is only applied when ``tts.tokenizer`` exposes a
    ``load_from_file`` method. If it does not, only a warning is printed and
    the default tokenizer stays in place -- confirm against the installed
    chatterbox version that the custom tokenizer is really being used.
    """
    print("Downloading and applying custom Indic tokenizer...")
    try:
        # The tokenizer file lives in the same repository as the adapter.
        indic_vocab_file = hf_hub_download(repo_id=repo_id, filename="tokenizer_indic.json")

        if hasattr(tts, 'tokenizer'):
            if not hasattr(tts.tokenizer, 'load_from_file'):
                print("Warning: Custom tokenizer replacement might require library-specific logic.")
            else:
                tts.tokenizer.load_from_file(indic_vocab_file)
    except Exception as err:
        print(f"Warning during tokenizer load: {err}")


def _attach_lora_adapter(tts):
    """Wrap the model with the PEFT adapter from ``repo_id`` and return it.

    When ``tts`` has a ``t3`` attribute, that attribute is replaced by the
    PEFT-wrapped version; otherwise ``tts`` itself is wrapped and the wrapper
    is returned. On any failure the error is printed and whatever model object
    exists at that point is returned, so the app may end up serving the model
    without the adapter.
    """
    print("Loading LoRA adapter weights...")
    try:
        if not hasattr(tts, 't3'):
            # Fallback for a model object that does not expose `t3`.
            tts = PeftModel.from_pretrained(tts, repo_id)
        else:
            # Per the original author's note, `t3` is the text-to-speech token
            # generator backbone that the adapter targets.
            tts.t3 = PeftModel.from_pretrained(tts.t3, repo_id)
        print("LoRA adapter loaded successfully.")
    except Exception as err:
        print(f"Failed to load PEFT adapter: {err}")
    return tts


def load_model():
    """Build the TTS model: base Chatterbox, then Indic tokenizer, then LoRA adapter.

    Returns:
        The Chatterbox model loaded on ``device``. The tokenizer and adapter
        steps are both best-effort, so the returned model may lack either of
        them if the corresponding step printed a warning/failure message.
    """
    print(f"Loading base Chatterbox model on {device}...")
    base = ChatterboxTTS.from_pretrained(device=device)

    _apply_indic_tokenizer(base)
    return _attach_lora_adapter(base)

# Build the model once, at import time, and keep it in a module-level global.
# synthesize_audio() reads this object on every request instead of reloading.
tts_model = load_model()

def synthesize_audio(text, ref_audio, exaggeration, cfg):
    """Synthesize speech for ``text`` and return ``(wav_path, status_message)``.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input is
            rejected with a prompt message instead of being sent to the model.
        ref_audio: File path of a reference recording used as the audio
            prompt, or a falsy value (``None`` / ``""``) for no prompt.
        exaggeration: Forwarded to ``tts_model.generate`` as ``exaggeration``.
        cfg: Forwarded to ``tts_model.generate`` as ``cfg_weight``.

    Returns:
        A 2-tuple. On success: the path of a newly written ``.wav`` file and
        a success message. On rejected input or any generation/saving error:
        ``None`` and a human-readable message. Exceptions are never raised to
        the caller; they are reported through the status string.
    """
    # Guard against None as well as blank strings; calling .strip() on None
    # would raise outside the try block below.
    if not text or not text.strip():
        return None, "Please enter some text."

    audio_prompt_path = ref_audio if ref_audio else None

    try:
        # ChatterboxTTS.generate names the guidance parameter `cfg_weight`;
        # passing it as `cfg` is rejected as an unexpected keyword argument.
        wav = tts_model.generate(
            text,
            audio_prompt_path=audio_prompt_path,
            exaggeration=exaggeration,
            cfg_weight=cfg,
        )

        # Reserve a unique output path, then close the handle before writing:
        # this avoids leaking a file descriptor per request and lets the audio
        # writer reopen the path on platforms that forbid a second open.
        # delete=False keeps the file on disk so Gradio can serve it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_out:
            output_path = temp_out.name
        ta.save(output_path, wav.cpu(), tts_model.sr)
        return output_path, "Generation successful!"

    except Exception as e:
        return None, f"Generation Error: {str(e)}"

# Define the Gradio Interface.
# Layout: one row with two columns -- inputs (text, optional reference audio,
# advanced sliders, button) on the left, outputs (audio player, status text)
# on the right.
with gr.Blocks(title="PrahaTTS-ML: Malayalam TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🗣️ PrahaTTS-ML: Malayalam LoRA Adapter for Chatterbox TTS")
    gr.Markdown(
        "This Space runs the [Praha-Labs/PrahaTTS-ML](https://huggingface.co/Praha-Labs/PrahaTTS-ML) model. "
        "It is a Malayalam LoRA adapter built on top of ResembleAI's Chatterbox non-turbo TTS model. \n\n"
        "**Note**: Provide up to 5-10 seconds of clear reference audio for voice cloning capabilities."
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text (Malayalam/English)",
                lines=4,
                # Example is written entirely in Malayalam script ("Hello, how
                # are you?"); the previous example mixed in Telugu script,
                # which is misleading for a Malayalam model.
                placeholder="നമസ്കാരം, നിങ്ങൾക്ക് സുഖമാണോ?"
            )
            # type="filepath" makes Gradio hand the callback a path string,
            # which synthesize_audio forwards as the audio prompt path.
            ref_audio_input = gr.Audio(
                label="Reference Voice Audio (Optional, for Voice Cloning)",
                type="filepath"
            )

            with gr.Accordion("Advanced Voice Controls", open=False):
                exaggeration_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.5, step=0.05,
                    label="Emotion Exaggeration",
                    info="Lower for monotone, higher for dramatic/expressive"
                )
                cfg_slider = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.5, step=0.05,
                    label="CFG Weight",
                    info="Lower if speech is too fast, higher to strictly mimic the reference voice"
                )

            generate_btn = gr.Button("Synthesize Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Output", interactive=False)
            status_output = gr.Textbox(label="Status Logging", interactive=False)

    # Connect logic to UI: the input order must match synthesize_audio's
    # positional parameters (text, ref_audio, exaggeration, cfg), and the two
    # outputs receive its (wav_path, status_message) return tuple.
    generate_btn.click(
        fn=synthesize_audio,
        inputs=[text_input, ref_audio_input, exaggeration_slider, cfg_slider],
        outputs=[audio_output, status_output]
    )

# Start the Gradio server only when this file is executed as a script, so
# importing the module does not launch the server.
if __name__ == "__main__":
    demo.launch()