"""
BharatGen AyurParam — Swastik.fit AI Vaidya
Hosted on HuggingFace Spaces with ZeroGPU (free, no credit card needed)

Model: bharatgenai/AyurParam (2.9B params, trained on 1,000+ Ayurvedic texts)
License: CC-BY-4.0 (commercial OK)
Prompt format: <user> {question} <assistant>

This Space is called by the Swastik Cloud Function (ayurParamProxy).
The /run/predict endpoint receives: { data: ["<user> ... <assistant>"] }
Returns: { data: ["response text"] }
"""

import spaces
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub repository id; passed to both from_pretrained() calls
# in load_model().
MODEL_ID = "bharatgenai/AyurParam"

# Module-level singletons. Both start as None and are assigned by
# load_model(); generate() reads them as globals.
tokenizer = None
model = None


def load_model():
    """Populate the module-level ``tokenizer`` and ``model`` globals.

    The call is a no-op when ``model`` has already been set, so it is safe
    to invoke more than once. Progress is reported on stdout.

    Note that the tokenizer is loaded with ``trust_remote_code=False`` while
    the model is loaded with ``trust_remote_code=True``; the two flags are
    intentionally kept exactly as they are.
    """
    global tokenizer, model

    if model is None:
        print("[AyurParam] Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)

        print("[AyurParam] Loading model...")
        model_kwargs = {
            "trust_remote_code": True,
            "torch_dtype": torch.bfloat16,
            "device_map": "auto",
        }
        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
        # Switch to evaluation mode; this Space only performs inference.
        model.eval()

        print("[AyurParam] Model ready.")


# Load eagerly at import time so the `tokenizer` and `model` globals are
# populated before any request reaches generate().
load_model()


# Markers after which any generated text is discarded.
_STOP_MARKERS = ("<user>", "<context>", "</s>")


def _format_prompt(prompt: str) -> str:
    """Return *prompt* in the ``<user> ... <assistant>`` form sent to the model."""
    text = prompt.strip()
    if "<user>" not in text:
        # Plain question: wrap it in the full template.
        return f"<user> {text} <assistant>"
    if text.endswith("<assistant>"):
        return text
    # Pre-formatted prompt that is missing the trailing assistant tag.
    return f"{text} <assistant>"


def _strip_stop_markers(response: str) -> str:
    """Cut *response* at the first occurrence of any stop marker and trim it."""
    for marker in _STOP_MARKERS:
        # partition() yields the whole string when the marker is absent.
        response = response.partition(marker)[0].strip()
    return response


@spaces.GPU
def _run_inference(formatted: str) -> str:
    """Generate a completion for an already formatted prompt.

    Only this function carries the ``spaces.GPU`` decorator, so the ZeroGPU
    wrapper is involved solely for actual model work and not for input
    validation or string clean-up.

    Returns the decoded newly generated tokens (prompt excluded), stripped
    of surrounding whitespace.
    """
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.6,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # The output sequence starts with the prompt tokens; keep only what
    # follows them so the prompt is not echoed back.
    completion = output[0][prompt_len:]
    return tokenizer.decode(completion, skip_special_tokens=True).strip()


def generate(prompt: str) -> str:
    """Answer a question with the AyurParam model.

    Accepts either:
      - a raw prompt already formatted as ``<user> ... <assistant>`` (a
        missing trailing ``<assistant>`` is appended), or
      - a plain-text question, which is wrapped in that template.

    Args:
        prompt: The question or pre-formatted prompt.

    Returns:
        The assistant response only (no prompt echo), truncated at the
        first ``<user>``, ``<context>`` or ``</s>`` marker. For an empty or
        whitespace-only prompt the fixed message
        ``"Please provide a question."`` is returned without invoking the
        GPU-decorated inference helper.
    """
    # Reject empty input before touching the GPU-decorated helper.
    if not prompt or not prompt.strip():
        return "Please provide a question."

    raw_response = _run_inference(_format_prompt(prompt))
    return _strip_stop_markers(raw_response)


# Gradio interface wrapping generate(): one text input, one text output.
# The Swastik Cloud Function calls the /run/predict endpoint directly; the
# web UI built here is mainly for manual testing.
demo = gr.Interface(
    fn=generate,
    inputs=gr.Textbox(
        label="Prompt",
        placeholder="<user> What foods should I eat for better digestion? <assistant>",
        lines=3,
    ),
    outputs=gr.Textbox(label="AyurParam Response", lines=8),
    title="BharatGen AyurParam — Ayurveda AI",
    description=(
        "**AyurParam** is India's first AI trained on 1,000+ Ayurvedic texts (54.5M words). "
        "2.9B parameter model fine-tuned on classical Ayurveda knowledge.\n\n"
        "Prompt format: `<user> your question <assistant>`\n\n"
        "This Space powers the AI Vaidya at [swastik.fit](https://swastik.fit)."
    ),
    # Each example is a one-element list because the interface has a single input.
    examples=[
        ["<user> What foods should I eat to improve digestion according to Ayurveda? <assistant>"],
        ["<user> I have vata imbalance — what daily routine do you recommend? <assistant>"],
        ["<user> What are the benefits of turmeric in Ayurvedic medicine? <assistant>"],
        ["<user> namaste <assistant>"],  # short prompt intended as a warmup ping
    ],
    cache_examples=False,
    api_name="predict",  # endpoint name; the module docstring refers to it as /run/predict
)

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()