"""Gradio chat demo for the Tucano 2 family of Portuguese language models."""

# NOTE(review): `spaces` is kept as the first import, ahead of torch and
# transformers, exactly as in the original ordering — confirm before reordering.
import spaces

from threading import Event, Thread
from typing import Any, Dict, Iterator, List, Optional

import gradio as gr
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)

# Checkpoints offered in the UI; the first entry is the default selection.
MODEL_CHOICES = [
    "Polygl0t/Tucano2-qwen-0.5B-Instruct",
    "Polygl0t/Tucano2-qwen-0.5B-Think",
    "Polygl0t/Tucano2-qwen-1.5B-Instruct",
    "Polygl0t/Tucano2-qwen-1.5B-Think",
    "Polygl0t/Tucano2-qwen-3.7B-Instruct",
    "Polygl0t/Tucano2-qwen-3.7B-Think",
]

# Seconds the streamer waits for the next chunk, and the upper bound on how
# long we wait for the generation thread to wind down afterwards.
STREAM_TIMEOUT_SECONDS = 10.0

# Download the models and tokenizers from the Hugging Face platform.
# Every checkpoint is loaded eagerly at import time, keyed by its model id.
models = {}
tokenizers = {}
for mid in MODEL_CHOICES:
    tokenizers[mid] = AutoTokenizer.from_pretrained(mid)
    models[mid] = AutoModelForCausalLM.from_pretrained(
        mid,
        dtype=torch.float16,
        device_map="auto",
    )

# Default system prompt used when none is provided
DEFAULT_SYSTEM_PROMPT = "Você é Tucano, um assistente de IA útil e amigável. Responda às perguntas de forma clara e concisa, fornecendo informações relevantes e precisas. Seja educado e respeitoso em suas respostas."


class _StopOnEosOrCancel(StoppingCriteria):
    """Stop generation on the EOS token or when the consumer has gone away."""

    def __init__(self, eos_token_id: Optional[int], cancelled: Event) -> None:
        super().__init__()
        self._eos_token_id = eos_token_id
        self._cancelled = cancelled

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        if self._cancelled.is_set():
            return True
        # Only the first sequence is inspected; predict() submits a single prompt.
        return bool(input_ids[0][-1] == self._eos_token_id)


def _content_to_text(content: Any) -> str:
    """Flatten one history ``content`` value into plain text.

    Plain strings are returned unchanged. A dict contributes its ``"text"``
    value when that is a string; a list/tuple is treated as a sequence of
    parts whose non-empty texts are joined with newlines.
    """
    # NOTE(review): newer Gradio releases appear to pass content as a list of
    # {"type": "text", "text": ...} parts — confirm against the installed version.
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        parts = (_content_to_text(part) for part in content)
        return "\n".join(part for part in parts if part)
    return str(content)


def _history_to_messages(history: Any) -> List[Dict[str, str]]:
    """Convert the chat history into a list of ``{"role", "content"}`` dicts.

    Accepts dict entries carrying ``role``/``content`` as well as legacy
    two-item ``[user, assistant]`` pairs. Entries of any other shape, and
    turns without a role or with ``None`` content, are skipped.
    """
    messages: List[Dict[str, str]] = []
    for entry in history or []:
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            turns = [("user", entry[0]), ("assistant", entry[1])]
        else:
            continue
        for role, content in turns:
            if role is None or content is None:
                continue
            messages.append({"role": role, "content": _content_to_text(content)})
    return messages


def _build_prompt(tokenizer: Any, system_message: Dict[str, str], messages: List[Dict[str, str]]) -> str:
    """Render the chat template, dropping the oldest turns until it fits.

    The system message is always kept and the last entry of ``messages`` is
    never dropped, so the returned prompt may still exceed
    ``tokenizer.model_max_length`` when those two alone are too long.
    """
    max_length = tokenizer.model_max_length
    first_kept = 0
    while True:
        prompt = tokenizer.apply_chat_template(
            [system_message] + messages[first_kept:],
            tokenize=False,
            add_generation_prompt=True,
        )
        turns_left = len(messages) - first_kept
        if turns_left <= 1 or len(tokenizer.encode(prompt)) <= max_length:
            return prompt
        first_kept += 1


# Text generation function
@spaces.GPU
def predict(message, history, system_prompt: Optional[str] = None, model_name: Optional[str] = None) -> Iterator[str]:
    """Stream a reply to ``message`` from the selected model.

    Args:
        message: The latest user message.
        history: Previous turns of the conversation, as passed by
            ``gr.ChatInterface``.
        system_prompt: System prompt to use; ``DEFAULT_SYSTEM_PROMPT`` when
            empty or ``None``.
        model_name: Key into ``models``/``tokenizers``; the first entry of
            ``MODEL_CHOICES`` when empty or ``None``.

    Yields:
        The reply accumulated so far; each value extends the previous one.

    Raises:
        KeyError: If ``model_name`` is not one of the loaded models.
        RuntimeError: If ``model.generate`` fails in the worker thread.
    """
    model_name = model_name or MODEL_CHOICES[0]
    model = models[model_name]
    tokenizer = tokenizers[model_name]

    # The system prompt is always present and never removed during truncation
    system_message = {"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT}

    # Conversation so far, followed by the new user message.
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})
    input_text = _build_prompt(tokenizer, system_message, messages)

    model_inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=STREAM_TIMEOUT_SECONDS,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    cancelled = Event()
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        repetition_penalty=1.2,
        max_new_tokens=1024,
        do_sample=True,
        top_p=.9,
        top_k=50,
        temperature=0.2,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList(
            [_StopOnEosOrCancel(tokenizer.eos_token_id, cancelled)]
        ),
    )

    failures: List[BaseException] = []

    def _run_generation() -> None:
        # Runs in the worker thread. On failure, record the error and close
        # the stream so the consumer stops right away instead of waiting for
        # the streamer timeout.
        try:
            model.generate(**generate_kwargs)
        except Exception as err:
            failures.append(err)
            streamer.end()

    worker = Thread(target=_run_generation)
    worker.start()

    eos_token = tokenizer.eos_token
    partial_message = ""
    shown = ""
    try:
        for chunk in streamer:
            partial_message += chunk
            if eos_token and eos_token in partial_message:
                # Keep whatever text arrived before the EOS marker in this chunk.
                head = partial_message.partition(eos_token)[0]
                if head != shown:
                    yield head
                break
            shown = partial_message
            yield partial_message
    finally:
        # Runs on normal completion, on error, and when the consumer closes
        # the generator early: ask the worker to stop and wait briefly for it.
        cancelled.set()
        worker.join(timeout=STREAM_TIMEOUT_SECONDS)

    if failures:
        raise RuntimeError("Text generation failed") from failures[0]


CITE_AS = """
## Cite as

```bibtex
@misc{correa2026tucano2cool,
    title={{Tucano 2 Cool: Better Open Source LLMs for Portuguese}},
    author={Nicholas Kluge Corr{\\^e}a and Aniket Sen and Shiza Fatimah and Sophia Falk and Lennard Landgraf and Julia Kastner and Lucie Flek},
    year={2026},
    eprint={2603.03543},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2603.03543},
}
```
"""

# Chat UI: the system prompt box and the model selector are passed to
# predict() as its two extra arguments, in this order.
with gr.Blocks() as demo:
    system_prompt_box = gr.Textbox(
        value=DEFAULT_SYSTEM_PROMPT,
        label="System Prompt",
        lines=3,
        render=False,
    )
    model_selector = gr.Dropdown(
        choices=MODEL_CHOICES,
        value=MODEL_CHOICES[0],
        label="Model",
        render=False,
    )
    gr.ChatInterface(
        fn=predict,
        additional_inputs=[system_prompt_box, model_selector],
        title="Tucano 2 Cool 🦜",
        description="Meet Tucano 2 — a family of open-source language models (0.5B to 3.7B parameters) built from the ground up for Portuguese. As of their release, Tucano 2 outperforms most prior Portuguese models of similar size.",
        examples=[
            ["Como eu posso resolver o seguinte problema: 2x + 3 = 11?"],
            ["Qual é a capital do Brasil?"],
            ["Explique a teoria da relatividade de forma simples."],
        ],
        cache_examples=False,
    )
    gr.Markdown(CITE_AS)

if __name__ == "__main__":
    demo.launch()