Spaces:

jmcinern
/

Qomhra

Sleeping

App Files Files Community

jmcinern committed on Sep 23, 2025

Commit

2499997

verified ·

1 Parent(s): 5deeb8d

Create app.py

Browse files

had to restart

Files changed (1) hide show

app.py +109 -0

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import gradio as gr
+import torch
+import re
+import threading
+from llmcompressor.transformers import SparseAutoModelForCausalLM
+from transformers import AutoTokenizer
+# Hugging Face Hub identifier of the checkpoint served by this app.
+MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
+# Matches a complete <think>...</think> span plus any whitespace after it.
+# DOTALL lets "." cross newlines; the lazy quantifier stops at the first closing tag.
+THINK_TAG_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
+class ChatBot:
+    """Chat backend: loads the model in a background thread and answers chat turns.
+    While the model is loading, or if loading failed, chat() answers with a
+    fixed status message instead of a generated reply.
+    """
+    # Status replies returned instead of a model answer. Kept as class
+    # constants so chat() can recognise them in the history and avoid
+    # replaying them to the model as if the assistant had said them.
+    LOADING_REPLY = "Model is loading, please wait..."
+    FAILED_REPLY = "Model failed to load"
+    def __init__(self):
+        """Initialise empty state and start loading the model in a new thread."""
+        # Both stay None until load_model() has loaded them successfully.
+        self.model = None
+        self.tokenizer = None
+        # True until load_model() finishes, whether it succeeded or not.
+        self.loading = True
+        # Load in a separate thread so constructing the bot returns immediately.
+        thread = threading.Thread(target=self.load_model)
+        thread.start()
+    def load_model(self):
+        """Load the tokenizer and the model concurrently and store them on self.
+        Any exception raised while loading is printed and swallowed; in that
+        case ``self.model`` and ``self.tokenizer`` stay ``None`` so chat()
+        reports the failure. ``self.loading`` is cleared in every case.
+        """
+        import concurrent.futures
+        # The nested loaders are named differently from this method so they
+        # do not shadow it.
+        def fetch_tokenizer():
+            print("Loading tokenizer...")
+            return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+        def fetch_model():
+            print("Loading model...")
+            # NOTE(review): a max_workers=4 keyword used to be passed here. It is
+            # not a documented from_pretrained() argument, so it has been
+            # dropped; confirm against the installed library versions.
+            return SparseAutoModelForCausalLM.from_pretrained(
+                MODEL_NAME,
+                trust_remote_code=True,
+                device_map="auto",
+                torch_dtype="auto",
+            )
+        try:
+            # Two tasks are submitted, so two workers are enough.
+            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+                tokenizer_future = executor.submit(fetch_tokenizer)
+                model_future = executor.submit(fetch_model)
+                tokenizer = tokenizer_future.result()
+                print("Tokenizer loaded!")
+                model = model_future.result()
+                print("Model loaded!")
+            # Publish both only after both loads succeeded, so the instance
+            # never holds a tokenizer without a model.
+            self.tokenizer = tokenizer
+            self.model = model
+        except Exception as e:
+            # Broad on purpose: this runs in a background thread, and the
+            # failure is surfaced to the user through chat().
+            print(f"Error loading: {e}")
+        finally:
+            self.loading = False
+    def chat(self, message, history):
+        """Answer one user message and return the updated chat history.
+        Args:
+            message: Text submitted by the user.
+            history: Earlier turns as (user_message, bot_message) pairs;
+                ``None`` is treated as an empty history.
+        Returns:
+            ``history`` extended by one (message, reply) pair. The reply is a
+            status message while the model is loading or after a failed load.
+            A blank ``message`` returns ``history`` without a new turn.
+        """
+        history = history or []
+        # Ignore empty submissions instead of sending an empty user turn to the model.
+        if not message or not message.strip():
+            return history
+        if self.loading:
+            return history + [(message, self.LOADING_REPLY)]
+        if self.model is None or self.tokenizer is None:
+            return history + [(message, self.FAILED_REPLY)]
+        # Rebuild the conversation in chat-template format. Turns answered
+        # with a status message are skipped: they were never produced by the
+        # model and would otherwise be fed back as assistant turns.
+        status_replies = (self.LOADING_REPLY, self.FAILED_REPLY)
+        messages = []
+        for user_msg, bot_msg in history:
+            if bot_msg in status_replies:
+                continue
+            messages.append({"role": "user", "content": user_msg})
+            messages.append({"role": "assistant", "content": bot_msg})
+        messages.append({"role": "user", "content": message})
+        prompt = self.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
+        )
+        # Remove any <think>...</think> span the template rendered into the prompt.
+        prompt = THINK_TAG_PATTERN.sub("", prompt)
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=512,
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id,
+            )
+        # generate() returns prompt + continuation; decode only the new tokens.
+        prompt_length = len(inputs.input_ids[0])
+        response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
+        # Drop any think span from the generated text as well.
+        response = THINK_TAG_PATTERN.sub("", response).strip()
+        return history + [(message, response)]
+# Create the chat backend; its constructor starts the model load in a background thread.
+bot = ChatBot()
+def _clear_textbox():
+    """Return an empty string, used to reset the input box after a submit."""
+    return ""
+# Assemble the page: title, conversation panel and a one-line input box.
+with gr.Blocks() as demo:
+    gr.HTML("<h1 style='text-align: center;'>Qomhrá: A Bilingual Irish-English LLM</h1>")
+    chatbot = gr.Chatbot(height=500)
+    msg = gr.Textbox(placeholder="Type your message...", show_label=False)
+    # On submit: run the chat handler on (text, history), then empty the textbox.
+    submit_event = msg.submit(fn=bot.chat, inputs=[msg, chatbot], outputs=[chatbot])
+    submit_event.then(fn=_clear_textbox, outputs=msg)
+if __name__ == "__main__":
+    demo.launch()