jmcinern committed on
Commit
2499997
·
verified ·
1 Parent(s): 5deeb8d

Create app.py

Browse files

had to restart

Files changed (1) hide show
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import re
4
+ import threading
5
+ from llmcompressor.transformers import SparseAutoModelForCausalLM
6
+ from transformers import AutoTokenizer
7
+
8
# Hugging Face Hub repository id passed to from_pretrained() for both the
# tokenizer and the model weights.
MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"

# Matches one complete <think>...</think> section (non-greedy, spanning
# newlines) together with any whitespace that follows it.
THINK_TAG_PATTERN = re.compile(r'<think>.*?</think>\s*', re.S)
11
+
12
class ChatBot:
    """Chat backend: loads the model in a background thread and answers messages.

    ``model`` and ``tokenizer`` start as ``None`` and are filled in by
    :meth:`load_model`. While ``loading`` is True, :meth:`chat` answers with a
    status message instead of generating text.
    """

    # Status replies returned instead of a generated answer. They end up in the
    # chat history, so they are filtered out again when the prompt is built.
    LOADING_REPLY = "Model is loading, please wait..."
    FAILED_REPLY = "Model failed to load"

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.loading = True

        # Load in a separate thread so constructing the bot does not block.
        thread = threading.Thread(target=self.load_model)
        thread.start()

    def load_model(self):
        """Load tokenizer and model concurrently, then clear ``loading``.

        Any exception raised while loading is printed, not propagated;
        ``model`` then stays ``None`` and :meth:`chat` reports the failure.
        """
        import concurrent.futures

        def fetch_tokenizer():
            print("Loading tokenizer...")
            return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

        def fetch_model():
            print("Loading model...")
            return SparseAutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                device_map="auto",
                torch_dtype="auto",
                # NOTE(review): `max_workers` does not look like a documented
                # from_pretrained() argument; if loading fails with a
                # TypeError, remove it. Kept as in the original -- confirm.
                max_workers=4,
            )

        try:
            # Two independent tasks, so two workers are enough.
            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                tokenizer_future = executor.submit(fetch_tokenizer)
                model_future = executor.submit(fetch_model)

                self.tokenizer = tokenizer_future.result()
                print("Tokenizer loaded!")

                self.model = model_future.result()
                print("Model loaded!")

        except Exception as e:
            print(f"Error loading: {e}")
        finally:
            # Always leave the loading state, even on failure, so chat() can
            # report "failed" instead of "loading" forever.
            self.loading = False

    def _build_messages(self, history, message):
        """Turn (user, assistant) history pairs plus ``message`` into role dicts."""
        status_replies = (self.LOADING_REPLY, self.FAILED_REPLY)
        messages = []
        for user_msg, bot_msg in history:
            # Status replies were produced by this class, not by the model;
            # drop the whole exchange so it is not replayed as a model turn.
            if bot_msg in status_replies:
                continue
            if user_msg is not None:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg is not None:
                messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message})
        return messages

    def chat(self, message, history):
        """Answer ``message`` given the previous ``history``.

        Args:
            message: The new user message.
            history: Sequence of ``(user_message, bot_message)`` pairs, or
                ``None``/empty when there is no previous conversation.

        Returns:
            A new list: ``history`` followed by ``(message, reply)``. The
            reply is a status string while the model is loading or after it
            failed to load. A blank message returns the history unchanged.
        """
        history = list(history) if history else []

        if self.loading:
            return history + [(message, self.LOADING_REPLY)]

        if self.model is None or self.tokenizer is None:
            return history + [(message, self.FAILED_REPLY)]

        # Nothing to answer; avoid spending a generation on an empty prompt.
        if not message or not message.strip():
            return history

        messages = self._build_messages(history, message)

        # Apply the chat template, then remove any <think>...</think> section
        # present in the rendered prompt.
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        prompt = THINK_TAG_PATTERN.sub("", prompt)

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # generate() returns prompt + continuation; decode only the new tokens.
        prompt_length = len(inputs.input_ids[0])
        response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        response = THINK_TAG_PATTERN.sub("", response).strip()

        return history + [(message, response)]
95
+
96
# Single shared chatbot instance; constructing it starts model loading in a
# background thread.
bot = ChatBot()


def _clear_textbox():
    """Return an empty string, used to reset the input box after a submit."""
    return ""


# Page layout: a title, the conversation view and a one-line input box.
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>Qomhrá: A Bilingual Irish-English LLM</h1>")

    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(placeholder="Type your message...", show_label=False)

    # On submit: run the chat handler to update the conversation, then empty
    # the textbox.
    submit_event = msg.submit(fn=bot.chat, inputs=[msg, chatbot], outputs=[chatbot])
    submit_event.then(fn=_clear_textbox, outputs=msg)

if __name__ == "__main__":
    demo.launch()