import gradio as gr
import torch
import re
import threading
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer
# Model configuration
MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"

# Matches one Qwen3-style reasoning block (<think> ... </think>) together with
# any whitespace that follows it. DOTALL lets the block span several lines and
# the lazy quantifier stops at the first closing tag, so consecutive blocks are
# removed one at a time. Without the literal tags the pattern would match every
# whitespace run and strip all spacing from the text it is applied to.
THINK_TAG_PATTERN = re.compile(r'<think>.*?</think>\s*', flags=re.DOTALL)
class ChatBot:
    """Chat backend that loads its model in the background and answers chat turns.

    ``model`` and ``tokenizer`` are ``None`` until ``load_model`` has stored
    them. ``loading`` is ``True`` from construction until ``load_model`` has
    finished, whether it succeeded or failed.
    """

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loading = True
        # Load model in separate thread so construction returns immediately
        thread = threading.Thread(target=self.load_model)
        thread.start()

    def load_model(self):
        """Load the tokenizer and the model concurrently and store them on ``self``.

        Any exception raised while loading is printed and otherwise swallowed,
        which leaves ``model`` as ``None``. ``loading`` is cleared in every
        case so ``chat`` can tell "still loading" apart from "failed to load".
        """
        import concurrent.futures

        def fetch_tokenizer():
            print("Loading tokenizer...")
            return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

        def fetch_model():
            print("Loading model...")
            # ``max_workers`` is deliberately not passed here: it is not a
            # ``from_pretrained`` option, and unrecognised keyword arguments
            # are forwarded to the config / model constructor, where they can
            # make the load fail.
            return SparseAutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                device_map="auto",
                torch_dtype="auto",
            )

        try:
            # Two tasks, so two workers are enough to run them side by side
            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                tokenizer_future = executor.submit(fetch_tokenizer)
                model_future = executor.submit(fetch_model)
                # ``result()`` re-raises whatever the worker raised
                self.tokenizer = tokenizer_future.result()
                print("Tokenizer loaded!")
                self.model = model_future.result()
                print("Model loaded!")
        except Exception as e:
            print(f"Error loading: {e}")
        finally:
            self.loading = False

    def chat(self, message, history):
        """Answer ``message`` and return the chat history extended by one turn.

        Args:
            message: The new user message.
            history: Earlier turns as ``(user_msg, bot_msg)`` pairs. ``None``
                is treated as an empty history. The list passed in is not
                modified.

        Returns:
            A new list holding the earlier turns followed by
            ``(message, reply)``. While the model is still loading, or if it
            failed to load, ``reply`` is a fixed status message instead of
            generated text.
        """
        history = history or []

        if self.loading:
            return history + [(message, "Model is loading, please wait...")]
        # Explicit None checks: do not depend on the truthiness of the model
        # object, and make sure the tokenizer is available as well.
        if self.model is None or self.tokenizer is None:
            return history + [(message, "Model failed to load")]

        # Build messages
        messages = []
        for user_msg, bot_msg in history:
            # Skip missing halves of a turn so the template never sees None
            # as message content.
            if user_msg is not None:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg is not None:
                messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message})

        # Apply chat template and strip thinking
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        prompt = THINK_TAG_PATTERN.sub("", prompt)

        # Generate
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        prompt_length = inputs.input_ids.shape[-1]
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Extract response: the output starts with the prompt tokens, so decode
        # only what comes after them.
        response = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        response = THINK_TAG_PATTERN.sub("", response).strip()
        return history + [(message, response)]
# Initialize chatbot (the constructor starts loading the model in a background thread)
bot = ChatBot()

# Create interface
with gr.Blocks() as demo:
    # Page heading, kept on one line so the string literal is valid Python.
    # NOTE(review): the original markup around the title was lost; <h1> is an
    # assumption, so confirm the intended tag and styling.
    gr.HTML("<h1>Qomhrá: A Bilingual Irish-English LLM</h1>")
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(placeholder="Type your message...", show_label=False)
    # On submit: run the chat turn, then clear the input box
    msg.submit(bot.chat, [msg, chatbot], [chatbot]).then(lambda: "", outputs=msg)

if __name__ == "__main__":
    demo.launch()