Spaces:

jmcinern
/

Qomhra

Sleeping

App Files Community

jmcinern committed on Sep 23, 2025

Commit

5deeb8d

verified ·

1 Parent(s): 0671b54

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -109

app.py DELETED Viewed

@@ -1,109 +0,0 @@
-import gradio as gr
-import torch
-import re
-import threading
-from llmcompressor.transformers import SparseAutoModelForCausalLM
-from transformers import AutoTokenizer
-# Model configuration
-MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
-THINK_TAG_PATTERN = re.compile(r'<think>.*?</think>\s*', flags=re.DOTALL)
-class ChatBot:
-    def __init__(self):
-        self.model = None
-        self.tokenizer = None
-        self.loading = True
-        # Load model in separate thread
-        thread = threading.Thread(target=self.load_model)
-        thread.start()
-    def load_model(self):
-        """Load model and tokenizer with concurrent loading"""
-        import concurrent.futures
-        def load_tokenizer():
-            print("Loading tokenizer...")
-            return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
-        def load_model():
-            print("Loading model...")
-            return SparseAutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                trust_remote_code=True,
-                device_map="auto",
-                torch_dtype="auto",
-                max_workers=4  # Use 4 threads for model loading
-            )
-        try:
-            # Load tokenizer and model concurrently
-            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
-                tokenizer_future = executor.submit(load_tokenizer)
-                model_future = executor.submit(load_model)
-                # Get results
-                self.tokenizer = tokenizer_future.result()
-                print("Tokenizer loaded!")
-                self.model = model_future.result()
-                print("Model loaded!")
-        except Exception as e:
-            print(f"Error loading: {e}")
-        finally:
-            self.loading = False
-    def chat(self, message, history):
-        if self.loading:
-            return history + [(message, "Model is loading, please wait...")]
-        if not self.model:
-            return history + [(message, "Model failed to load")]
-        # Build messages
-        messages = []
-        for user_msg, bot_msg in history:
-            messages.append({"role": "user", "content": user_msg})
-            messages.append({"role": "assistant", "content": bot_msg})
-        messages.append({"role": "user", "content": message})
-        # Apply chat template and strip thinking
-        prompt = self.tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
-        )
-        prompt = THINK_TAG_PATTERN.sub("", prompt)
-        # Generate
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=512,
-                temperature=0.7,
-                do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-        # Extract response
-        response = self.tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
-        response = THINK_TAG_PATTERN.sub("", response).strip()
-        return history + [(message, response)]
-# Initialize chatbot
-bot = ChatBot()
-# Create interface
-with gr.Blocks() as demo:
-    gr.HTML("<h1 style='text-align: center;'>Qomhrá: A Bilingual Irish-English LLM</h1>")
-    chatbot = gr.Chatbot(height=500)
-    msg = gr.Textbox(placeholder="Type your message...", show_label=False)
-    msg.submit(bot.chat, [msg, chatbot], [chatbot]).then(lambda: "", outputs=msg)
-if __name__ == "__main__":
-    demo.launch()