import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import torch
from threading import Thread

# Hugging Face Hub repository IDs of every checkpoint the demo can serve.
# The first entry doubles as the default model.
MODEL_CHOICES = [
    "Polygl0t/Tucano2-qwen-0.5B-Instruct",
    "Polygl0t/Tucano2-qwen-0.5B-Think",
    "Polygl0t/Tucano2-qwen-1.5B-Instruct",
    "Polygl0t/Tucano2-qwen-1.5B-Think",
    "Polygl0t/Tucano2-qwen-3.7B-Instruct",
    "Polygl0t/Tucano2-qwen-3.7B-Think",
]

# Load every tokenizer/model pair eagerly at import time, keyed by repository
# ID, so that a request only needs a dictionary lookup.
models, tokenizers = {}, {}
for repo_id in MODEL_CHOICES:
    tokenizers[repo_id] = AutoTokenizer.from_pretrained(repo_id)
    # Weights are loaded in float16 with device placement left to device_map="auto".
    models[repo_id] = AutoModelForCausalLM.from_pretrained(
        repo_id, dtype=torch.float16, device_map="auto"
    )

# System prompt applied whenever the caller does not supply one
DEFAULT_SYSTEM_PROMPT = (
    "Você é Tucano, um assistente de IA útil e amigável. "
    "Responda às perguntas de forma clara e concisa, fornecendo informações relevantes e precisas. "
    "Seja educado e respeitoso em suas respostas."
)

def _content_to_text(content):
    """Flatten a chat message's ``content`` into plain text.

    Strings are returned unchanged. A list/tuple of parts is reduced to the
    newline-joined text of its string items and of its dict items that carry
    a string ``"text"`` value; other parts are dropped. Any other type is
    returned unchanged.

    NOTE(review): some Gradio versions appear to deliver history content as a
    list of typed parts (e.g. ``{"type": "text", "text": ...}``) rather than a
    string -- confirm against the installed Gradio version.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        texts = []
        for part in content:
            if isinstance(part, str):
                texts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                texts.append(part["text"])
        return "\n".join(texts)
    return content


# Text generation function
@spaces.GPU
def predict(message, history, system_prompt=None, model_name=None):
    """Stream a chat completion for ``message`` given the prior ``history``.

    Args:
        message: The latest user message.
        history: Prior conversation turns, each a mapping with ``"role"`` and
            ``"content"`` keys.
        system_prompt: System prompt to use; falls back to
            ``DEFAULT_SYSTEM_PROMPT`` when empty or ``None``.
        model_name: Key into ``models``/``tokenizers``; falls back to
            ``MODEL_CHOICES[0]`` when empty or ``None``.

    Yields:
        The reply accumulated so far, once per chunk emitted by the streamer.

    Raises:
        KeyError: If ``model_name`` is not a loaded model.
    """
    # Upper bound on generated tokens; also reserved out of the context window.
    max_new_tokens = 1024

    model_name = model_name or MODEL_CHOICES[0]
    model = models[model_name]
    tokenizer = tokenizers[model_name]

    class StopOnTokens(StoppingCriteria):
        """Stop on EOS, or as soon as the consumer of the stream has gone away."""

        # Flipped by the consuming side to ask the generation thread to stop.
        cancelled = False

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
            if self.cancelled:
                return True
            return bool(input_ids[0][-1] == tokenizer.eos_token_id)

    stop = StopOnTokens()

    # The system prompt is always present and never removed during truncation
    system_message = {"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT}

    # Build the message list from conversation history
    messages = [
        {"role": msg["role"], "content": _content_to_text(msg["content"])}
        for msg in history
    ]
    messages.append({"role": "user", "content": message})

    # Leave room for the reply: a prompt that fills the whole window would
    # leave no space for the tokens we are about to generate. If the window
    # is not larger than the reply budget, fall back to the full window.
    context_length = tokenizer.model_max_length
    if context_length > max_new_tokens:
        prompt_budget = context_length - max_new_tokens
    else:
        prompt_budget = context_length

    # Pop oldest conversation messages (keeping the system prompt and the
    # latest user message) until the prompt fits within the prompt budget
    while True:
        input_text = tokenizer.apply_chat_template(
            [system_message] + messages, tokenize=False, add_generation_prompt=True
        )
        token_count = len(tokenizer.encode(input_text))
        if token_count <= prompt_budget or len(messages) <= 1:
            break
        messages.pop(0)

    model_inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        repetition_penalty=1.2,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=.9,
        top_k=50,
        temperature=0.2,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([stop])
    )
    # generate() blocks until done, so it runs in a worker thread while this
    # generator drains the streamer.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    partial_message = ""
    try:
        for new_token in streamer:
            partial_message += new_token
            # Safety net in case the EOS text ever reaches the decoded stream.
            if tokenizer.eos_token and tokenizer.eos_token in partial_message:
                break
            yield partial_message
    finally:
        # Runs on normal completion, on early exit, and when the generator is
        # closed by the caller: ask generate() to stop at its next step and
        # wait (bounded) for the worker so it does not keep running unattended.
        stop.cancelled = True
        t.join(timeout=10.)

# Markdown snippet (heading plus a fenced BibTeX entry) that is passed to
# gr.Markdown in the UI definition.
# This is a regular (non-raw) string, so the doubled backslash in
# `Corr{\\^e}a` produces a single backslash in the resulting text, i.e. the
# BibTeX accent command `{\^e}`.
CITE_AS = """
## Cite as

```bibtex
@misc{correa2026tucano2cool,
      title={{Tucano 2 Cool: Better Open Source LLMs for Portuguese}},
      author={Nicholas Kluge Corr{\\^e}a and Aniket Sen and Shiza Fatimah and Sophia Falk and Lennard Landgraf and Julia Kastner and Lucie Flek},
      year={2026},
      eprint={2603.03543},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2603.03543},
}
```
"""

with gr.Blocks() as demo:
    # Extra controls forwarded to predict() after (message, history), in this
    # order: system prompt, then model name. Both are built with render=False
    # and handed to the ChatInterface through additional_inputs.
    system_prompt_box = gr.Textbox(
        value=DEFAULT_SYSTEM_PROMPT,
        label="System Prompt",
        lines=3,
        render=False,
    )
    model_dropdown = gr.Dropdown(
        choices=MODEL_CHOICES,
        value=MODEL_CHOICES[0],
        label="Model",
        render=False,
    )

    # One-element rows: only the chat message is pre-filled by an example.
    example_prompts = [
        ["Como eu posso resolver o seguinte problema: 2x + 3 = 11?"],
        ["Qual é a capital do Brasil?"],
        ["Explique a teoria da relatividade de forma simples."],
    ]

    gr.ChatInterface(
        fn=predict,
        additional_inputs=[system_prompt_box, model_dropdown],
        title="Tucano 2 Cool 🦜",
        description="Meet Tucano 2 — a family of open-source language models (0.5B to 3.7B parameters) built from the ground up for Portuguese. As of their release, Tucano 2 outperforms most prior Portuguese models of similar size.",
        examples=example_prompts,
        cache_examples=False,
    )

    # Citation block, added to the layout after the chat interface.
    gr.Markdown(CITE_AS)

# Start the app only when executed as a script.
if __name__ == "__main__":
    demo.launch()