Spaces:

jmcinern
/

Qomhra

Sleeping

App Files Files Community

jmcinern committed on Sep 23, 2025

Commit

0671b54

verified ·

1 Parent(s): 555ebaa

Update app.py

Browse files

cleaner code, manual think strip, concurrent CPU use

Files changed (1) hide show

app.py +80 -359

app.py CHANGED Viewed

@@ -1,388 +1,109 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-import json
 import re
-from typing import List, Tuple, Optional
-import time
-# Thinking tag regex pattern for hard stripping
-THINK_TAG_PATTERN = re.compile(r'<think>.*?</think>\s*', flags=re.DOTALL)
 # Model configuration
 MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
-class IrishEnglishChatbot:
     def __init__(self):
         self.model = None
         self.tokenizer = None
-        self.load_model()
     def load_model(self):
-        """Load the quantized model and tokenizer"""
-        print(f"Loading model: {MODEL_NAME}")
-        try:
             print("Loading tokenizer...")
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                MODEL_NAME,
-                trust_remote_code=True
-            )
-            print("Loading model with optimized settings...")
-            # Try different loading strategies in order of preference
-            # Strategy 1: Try with llm-compressor (modern approach)
-            try:
-                from llmcompressor.transformers import SparseAutoModelForCausalLM
-                print("Attempting to load with llm-compressor...")
-                self.model = SparseAutoModelForCausalLM.from_pretrained(
-                    MODEL_NAME,
-                    trust_remote_code=True,
-                    device_map="auto",
-                    torch_dtype="auto",
-                    low_cpu_mem_usage=True
-                )
-                print("✅ Loaded with llm-compressor")
-                return
-            except ImportError:
-                print("llm-compressor not available, trying AutoAWQ...")
-            except Exception as e:
-                print(f"llm-compressor failed: {e}, trying AutoAWQ...")
-            # Strategy 2: Try with AutoAWQ (suppress deprecation warning)
-            try:
-                import warnings
-                warnings.filterwarnings("ignore", category=DeprecationWarning)
-                from awq import AutoAWQForCausalLM
-                print("Attempting to load with AutoAWQ...")
-                self.model = AutoAWQForCausalLM.from_quantized(
-                    MODEL_NAME,
-                    trust_remote_code=True,
-                    device_map="auto",
-                    low_cpu_mem_usage=True
-                )
-                print("✅ Loaded with AutoAWQ")
-                return
-            except Exception as e:
-                print(f"AutoAWQ failed: {e}, falling back to transformers...")
-            # Strategy 3: Fall back to standard transformers
-            print("Attempting to load with standard transformers...")
-            self.model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
                 trust_remote_code=True,
                 device_map="auto",
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-                use_safetensors=True
             )
-            print("✅ Loaded with transformers (fallback)")
-        except Exception as e:
-            print(f"❌ Error loading model: {e}")
-            # Show user-friendly error
-            self.model = None
-            self.tokenizer = None
-            raise RuntimeError(f"Failed to load model. This might be due to insufficient GPU memory or network issues. Error: {str(e)}")
-    def format_chat_prompt(self, messages: List[dict], add_generation_prompt: bool = True) -> str:
-        """Format messages using the custom Qwen3 chat template"""
         try:
-            formatted = self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=add_generation_prompt,
-                enable_thinking=False  # Disable thinking mode as per your training
-            )
-            return formatted
         except Exception as e:
-            print(f"Template error: {e}")
-            # Fallback manual formatting
-            formatted = ""
-            for msg in messages:
-                role = msg["role"]
-                content = msg["content"]
-                formatted += f"<|im_start|>{role}\n{content}<|im_end|>\n"
-            if add_generation_prompt:
-                formatted += "<|im_start|>assistant\n"
-            return formatted
-    def generate_response(
-        self,
-        message: str,
-        history: List[Tuple[str, str]],
-        temperature: float = 0.7,
-        max_tokens: int = 512,
-        top_p: float = 0.9
-    ) -> Tuple[str, List[Tuple[str, str]]]:
-        """Generate response from the model"""
-        if self.model is None:
-            return "❌ Model not loaded. Please refresh the page.", history + [(message, "❌ Model not loaded. Please refresh the page.")]
-        try:
-            # Build conversation history
-            messages = []
-            # Add conversation history
-            for user_msg, assistant_msg in history:
-                messages.append({"role": "user", "content": user_msg})
-                messages.append({"role": "assistant", "content": assistant_msg})
-            # Add current message
-            messages.append({"role": "user", "content": message})
-            # Format prompt
-            formatted_prompt = self.format_chat_prompt(messages, add_generation_prompt=True)
-            # Tokenize with length limits
-            inputs = self.tokenizer(
-                formatted_prompt,
-                return_tensors="pt",
-                truncation=True,
-                max_length=3072  # Leave room for response
-            ).to(self.model.device)
-            # Generate with timeout protection
-            with torch.no_grad():
-                outputs = self.model.generate(
-                    **inputs,
-                    max_new_tokens=max_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    do_sample=temperature > 0,
-                    pad_token_id=self.tokenizer.eos_token_id,
-                    eos_token_id=self.tokenizer.eos_token_id,
-                    repetition_penalty=1.1,
-                    use_cache=True
-                )
-            # Decode response
-            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            # Hard strip thinking tags (safety measure) - do this FIRST
-            full_response = THINK_TAG_PATTERN.sub('', full_response)
-            # Extract just the assistant's response
-            if "<|im_start|>assistant" in full_response:
-                response = full_response.split("<|im_start|>assistant")[-1]
-                response = response.replace("<|im_end|>", "").strip()
-            else:
-                # Fallback - take everything after the input
-                input_length = len(self.tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True))
-                response = full_response[input_length:].strip()
-            # Hard strip thinking tags (safety measure)
-            response = THINK_TAG_PATTERN.sub('', response)
-            # Clean up other chat tokens
-            response = re.sub(r'<\|im_start\|>.*?<\|im_end\|>', '', response, flags=re.DOTALL)
-            response = response.strip()
-            # Final safety check - remove any remaining thinking artifacts
-            response = re.sub(r'</?think[^>]*>', '', response)
-            response = response.strip()
-            # Handle empty responses
-            if not response:
-                response = "I apologize, but I couldn't generate a proper response. Please try again."
-            # Update history
-            new_history = history + [(message, response)]
-            return response, new_history
-        except Exception as e:
-            error_msg = f"❌ Generation error: {str(e)}"
-            print(f"Generation error: {e}")
-            new_history = history + [(message, error_msg)]
-            return error_msg, new_history
-# Initialize chatbot with error handling
-print("Initializing chatbot...")
-try:
-    chatbot = IrishEnglishChatbot()
-    print("✅ Chatbot initialized successfully!")
-except Exception as e:
-    print(f"❌ Failed to initialize chatbot: {e}")
-    chatbot = None
-# Gradio interface functions
-def chat_fn(message, history, temperature, max_tokens, top_p):
-    """Main chat function for Gradio"""
-    if not message.strip():
-        return history, history, ""
-    if chatbot is None:
-        error_msg = "❌ Model not available. Please contact the space owner."
-        new_history = history + [(message, error_msg)]
-        return new_history, new_history, ""
-    try:
-        response, new_history = chatbot.generate_response(
-            message=message,
-            history=history,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=top_p
         )
-        return new_history, new_history, ""
-    except Exception as e:
-        error_msg = f"❌ Error: {str(e)}"
-        new_history = history + [(message, error_msg)]
-        return new_history, new_history, ""
-def clear_chat():
-    """Clear chat history"""
-    return [], []
-# Example prompts for different languages
-example_prompts = [
-    "Conas atá tú inniu?",  # Irish: How are you today?
-    "What is the capital of Ireland?",
-    "Inis dom faoi stair na hÉireann",  # Irish: Tell me about Irish history
-    "Translate 'hello' to Irish",
-    "Cad iad na príomhchathracha in Éirinn?",  # Irish: What are the main cities in Ireland?
-    "Explain machine learning in simple terms"
-]
-# Custom CSS
-custom_css = """
-.gradio-container {
-    font-family: 'Arial', sans-serif;
-}
-.chat-message {
-    padding: 10px;
-    margin: 5px 0;
-    border-radius: 8px;
-}
-.user-message {
-    background-color: #e3f2fd;
-    margin-left: 20%;
-}
-.bot-message {
-    background-color: #f5f5f5;
-    margin-right: 20%;
-}
-#title {
-    text-align: center;
-    color: #1976d2;
-    font-size: 2em;
-    margin-bottom: 1em;
-}
-"""
-# Create Gradio interface
-with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
-    gr.HTML("<h1 id='title'>🇮🇪 Irish-English Qwen3 Chatbot 🤖</h1>")
-    gr.Markdown("""
-    ## Fáilte! Welcome!
-    This is an Irish-English bilingual AI assistant based on Qwen3-8B, fine-tuned for both Irish (Gaeilge) and English.
-    You can chat with me in either language!
-    **Features:**
-    - 🇮🇪 Native Irish language support
-    - 🇬🇧 English language support
-    - ⚡ AWQ quantized for fast inference
-    - 💬 Conversational chat interface
-    """)
-    with gr.Row():
-        with gr.Column(scale=4):
-            chatbot_interface = gr.Chatbot(
-                label="Chat History",
-                height=500,
-                show_label=True,
-                bubble_full_width=False
-            )
-            msg_box = gr.Textbox(
-                label="Your message",
-                placeholder="Type your message in Irish or English...",
-                lines=2,
-                max_lines=4
-            )
-            with gr.Row():
-                submit_btn = gr.Button("Send", variant="primary", size="sm")
-                clear_btn = gr.Button("Clear Chat", variant="secondary", size="sm")
-        with gr.Column(scale=1):
-            gr.Markdown("### Settings")
-            temperature = gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.7,
-                step=0.1,
-                label="Temperature",
-                info="Higher = more creative"
-            )
-            max_tokens = gr.Slider(
-                minimum=50,
-                maximum=1024,
-                value=512,
-                step=50,
-                label="Max Tokens",
-                info="Maximum response length"
-            )
-            top_p = gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.9,
-                step=0.1,
-                label="Top P",
-                info="Nucleus sampling"
             )
-            gr.Markdown("### Example Prompts")
-            for prompt in example_prompts:
-                gr.Button(
-                    prompt,
-                    size="sm",
-                    variant="outline"
-                ).click(
-                    fn=lambda x=prompt: x,
-                    outputs=msg_box
-                )
-    # Event handlers
-    submit_btn.click(
-        fn=chat_fn,
-        inputs=[msg_box, chatbot_interface, temperature, max_tokens, top_p],
-        outputs=[chatbot_interface, chatbot_interface, msg_box]
-    )
-    msg_box.submit(
-        fn=chat_fn,
-        inputs=[msg_box, chatbot_interface, temperature, max_tokens, top_p],
-        outputs=[chatbot_interface, chatbot_interface, msg_box]
-    )
-    clear_btn.click(
-        fn=clear_chat,
-        outputs=[chatbot_interface, chatbot_interface]
-    )
-    # Footer
-    gr.HTML("""
-    <div style="text-align: center; margin-top: 2em; color: #666;">
-        <p>Model: <a href="https://huggingface.co/jmcinern/qwen3-8B-cpt-sft-awq" target="_blank">jmcinern/qwen3-8B-cpt-sft-awq</a></p>
-        <p>Based on Qwen3-8B | AWQ Quantized | Irish-English Bilingual</p>
-    </div>
-    """)
-# Launch configuration
 if __name__ == "__main__":
-    demo.launch(
-        share=False,
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_error=True
-    )

 import gradio as gr
 import torch
 import re
+import threading
+from llmcompressor.transformers import SparseAutoModelForCausalLM
+from transformers import AutoTokenizer
 # Model configuration
 MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
# Matches a complete <think>...</think> block plus any whitespace after it.
THINK_TAG_PATTERN = re.compile(r'<think>.*?</think>\s*', flags=re.DOTALL)


class ChatBot:
    """Chat wrapper around the model named by ``MODEL_NAME``.

    Construction returns immediately: the tokenizer and model are loaded on a
    background thread, and ``chat`` answers with a canned placeholder until
    loading has finished (or failed).
    """

    LOADING_REPLY = "Model is loading, please wait..."
    FAILED_REPLY = "Model failed to load"
    EMPTY_REPLY = "I apologize, but I couldn't generate a proper response. Please try again."
    # Replies produced by this class rather than by the model. They end up in
    # the visible history, so ``chat`` drops them again when building a prompt.
    _CANNED_REPLIES = (LOADING_REPLY, FAILED_REPLY, EMPTY_REPLY)

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loading = True
        # Load in the background so the UI can start right away. The thread is
        # a daemon so a stalled download cannot block interpreter shutdown.
        thread = threading.Thread(target=self.load_model, daemon=True)
        thread.start()

    def load_model(self):
        """Load the tokenizer and the model concurrently.

        On success ``self.tokenizer`` and ``self.model`` are set. Any exception
        is printed and both attributes keep their previous value (``None`` on a
        fresh instance). ``self.loading`` is cleared in every case.
        """
        import concurrent.futures

        def fetch_tokenizer():
            print("Loading tokenizer...")
            return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

        def fetch_model():
            print("Loading model...")
            # Only loader arguments are passed here. The previous extra
            # ``max_workers=4`` keyword is not a documented ``from_pretrained``
            # parameter and would be forwarded to the model constructor.
            return SparseAutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                device_map="auto",
                torch_dtype="auto",
            )

        try:
            # Two tasks, so two workers are enough.
            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                tokenizer_future = executor.submit(fetch_tokenizer)
                model_future = executor.submit(fetch_model)
                tokenizer = tokenizer_future.result()
                print("Tokenizer loaded!")
                model = model_future.result()
                print("Model loaded!")
            # Publish only once both are available; the model goes last
            # because ``chat`` treats a missing model as "not ready".
            self.tokenizer = tokenizer
            self.model = model
        except Exception as e:  # thread boundary: report instead of dying silently
            print(f"Error loading: {e}")
        finally:
            self.loading = False

    @staticmethod
    def _strip_thinking(text):
        """Return ``text`` without reasoning markup, stripped of outer whitespace.

        Removes complete ``<think>...</think>`` blocks, everything up to a
        closing tag that has no opener, and everything from an opener that was
        never closed (e.g. generation stopped at the token limit).
        """
        text = THINK_TAG_PATTERN.sub("", text)
        _, closing_tag, tail = text.rpartition("</think>")
        if closing_tag:
            text = tail
        text = text.partition("<think>")[0]
        return text.strip()

    def chat(self, message, history):
        """Answer ``message`` and return the updated chat history.

        Args:
            message: The user's new message.
            history: Sequence of ``(user_msg, bot_msg)`` pairs; ``None`` is
                treated as an empty history.

        Returns:
            A new list: ``history`` plus one ``(message, reply)`` tuple. A
            blank ``message`` returns the history unchanged.
        """
        history = list(history or [])
        if not (message or "").strip():
            return history
        if self.loading:
            return history + [(message, self.LOADING_REPLY)]
        if self.model is None or self.tokenizer is None:
            return history + [(message, self.FAILED_REPLY)]

        # Build messages, skipping turns that never got a real model answer so
        # placeholders are not replayed to the model as assistant output.
        messages = []
        for user_msg, bot_msg in history:
            if not user_msg or not bot_msg or bot_msg in self._CANNED_REPLIES:
                continue
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message})

        # Apply chat template and strip thinking.
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        # NOTE(review): presumably this removes a think block the template
        # inserts itself -- confirm against the model's chat template.
        prompt = THINK_TAG_PATTERN.sub("", prompt)

        # Generate
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens (everything after the prompt).
        prompt_len = inputs.input_ids.shape[-1]
        response = self._strip_thinking(
            self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        )
        if not response:
            response = self.EMPTY_REPLY
        return history + [(message, response)]
# Create the shared bot instance used by the UI callbacks.
bot = ChatBot()

# Assemble the page: a title, the conversation view and a single input box.
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>Qomhrá: A Bilingual Irish-English LLM</h1>")
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(placeholder="Type your message...", show_label=False)

    # On Enter: run the bot on (message, history) and show the new history,
    # then blank the input box once that step has completed.
    reply_event = msg.submit(fn=bot.chat, inputs=[msg, chatbot], outputs=[chatbot])
    reply_event.then(fn=lambda: "", outputs=msg)

if __name__ == "__main__":
    demo.launch()