"""Gradio chat demo for Qomhrá, a bilingual Irish-English LLM."""

import concurrent.futures
import re
import threading

import gradio as gr
import torch
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer

# Model configuration
MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"

# Matches one <think>...</think> block (non-greedy, may span lines) together
# with the whitespace that follows it. The explicit tag delimiters matter:
# without them the pattern degenerates to "any whitespace run" and
# substitution would strip every space and newline from the text.
THINK_TAG_PATTERN = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)


class ChatBot:
    """Chat backend that loads the model in the background and answers turns.

    Attributes are plain instance fields:
      * ``model`` / ``tokenizer`` -- ``None`` until loading succeeds.
      * ``loading`` -- ``True`` while the background load is in progress.
    """

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loading = True

        # Load in a separate thread so constructing the bot returns
        # immediately; chat() reports "loading" until the thread finishes.
        thread = threading.Thread(target=self.load_model)
        thread.start()

    def load_model(self):
        """Load tokenizer and model concurrently and store them on ``self``.

        Any exception is reported on stdout and leaves ``self.model`` as
        ``None``; ``self.loading`` is always cleared when this returns.
        """

        def load_tokenizer():
            print("Loading tokenizer...")
            return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

        def load_weights():
            print("Loading model...")
            # Only real loading options are passed here: unknown keyword
            # arguments are forwarded to the model constructor and make the
            # load fail.
            return SparseAutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                device_map="auto",
                torch_dtype="auto",
            )

        try:
            # Two independent tasks, so two workers are sufficient.
            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                tokenizer_future = executor.submit(load_tokenizer)
                model_future = executor.submit(load_weights)

                self.tokenizer = tokenizer_future.result()
                print("Tokenizer loaded!")
                self.model = model_future.result()
                print("Model loaded!")
        except Exception as e:
            # Top-level boundary of the loader thread: report and let chat()
            # surface the failure to the user.
            print(f"Error loading: {e}")
        finally:
            self.loading = False

    def chat(self, message, history):
        """Answer ``message`` and return the updated chat history.

        Args:
            message: The new user message.
            history: Previous turns as ``(user_msg, bot_msg)`` pairs; ``None``
                is treated as an empty history.

        Returns:
            ``history`` extended with one ``(message, reply)`` pair. The reply
            is a status string while the model is loading or if it failed to
            load.
        """
        history = history or []

        if self.loading:
            return history + [(message, "Model is loading, please wait...")]
        if self.model is None or self.tokenizer is None:
            return history + [(message, "Model failed to load")]

        # Build messages
        messages = []
        for user_msg, bot_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message})

        # Apply chat template and strip thinking
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        prompt = THINK_TAG_PATTERN.sub("", prompt)

        # Generate
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Extract response: drop the prompt tokens, keep only what was generated.
        prompt_length = len(inputs.input_ids[0])
        response = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        response = THINK_TAG_PATTERN.sub("", response).strip()

        return history + [(message, response)]


# Initialize chatbot (starts loading the model in the background)
bot = ChatBot()

# Create interface
with gr.Blocks() as demo:
    gr.HTML("<h1>Qomhrá: A Bilingual Irish-English LLM</h1>")
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(placeholder="Type your message...", show_label=False)
    # After the reply is written to the chatbot, clear the input box.
    msg.submit(bot.chat, [msg, chatbot], [chatbot]).then(lambda: "", outputs=msg)

if __name__ == "__main__":
    demo.launch()