jmcinern committed on
Commit
5deeb8d
·
verified ·
1 Parent(s): 0671b54

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -109
app.py DELETED
@@ -1,109 +0,0 @@
1
- import gradio as gr
2
- import torch
3
- import re
4
- import threading
5
- from llmcompressor.transformers import SparseAutoModelForCausalLM
6
- from transformers import AutoTokenizer
7
-
8
- # Model configuration
9
- MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
10
- THINK_TAG_PATTERN = re.compile(r'<think>.*?</think>\s*', flags=re.DOTALL)
11
-
12
- class ChatBot:
13
- def __init__(self):
14
- self.model = None
15
- self.tokenizer = None
16
- self.loading = True
17
-
18
- # Load model in separate thread
19
- thread = threading.Thread(target=self.load_model)
20
- thread.start()
21
-
22
- def load_model(self):
23
- """Load model and tokenizer with concurrent loading"""
24
- import concurrent.futures
25
-
26
- def load_tokenizer():
27
- print("Loading tokenizer...")
28
- return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
29
-
30
- def load_model():
31
- print("Loading model...")
32
- return SparseAutoModelForCausalLM.from_pretrained(
33
- MODEL_NAME,
34
- trust_remote_code=True,
35
- device_map="auto",
36
- torch_dtype="auto",
37
- max_workers=4 # Use 4 threads for model loading
38
- )
39
-
40
- try:
41
- # Load tokenizer and model concurrently
42
- with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
43
- tokenizer_future = executor.submit(load_tokenizer)
44
- model_future = executor.submit(load_model)
45
-
46
- # Get results
47
- self.tokenizer = tokenizer_future.result()
48
- print("Tokenizer loaded!")
49
-
50
- self.model = model_future.result()
51
- print("Model loaded!")
52
-
53
- except Exception as e:
54
- print(f"Error loading: {e}")
55
- finally:
56
- self.loading = False
57
-
58
- def chat(self, message, history):
59
- if self.loading:
60
- return history + [(message, "Model is loading, please wait...")]
61
-
62
- if not self.model:
63
- return history + [(message, "Model failed to load")]
64
-
65
- # Build messages
66
- messages = []
67
- for user_msg, bot_msg in history:
68
- messages.append({"role": "user", "content": user_msg})
69
- messages.append({"role": "assistant", "content": bot_msg})
70
- messages.append({"role": "user", "content": message})
71
-
72
- # Apply chat template and strip thinking
73
- prompt = self.tokenizer.apply_chat_template(
74
- messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
75
- )
76
- prompt = THINK_TAG_PATTERN.sub("", prompt)
77
-
78
- # Generate
79
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
80
-
81
- with torch.no_grad():
82
- outputs = self.model.generate(
83
- **inputs,
84
- max_new_tokens=512,
85
- temperature=0.7,
86
- do_sample=True,
87
- pad_token_id=self.tokenizer.eos_token_id
88
- )
89
-
90
- # Extract response
91
- response = self.tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
92
- response = THINK_TAG_PATTERN.sub("", response).strip()
93
-
94
- return history + [(message, response)]
95
-
96
- # Initialize chatbot
97
- bot = ChatBot()
98
-
99
- # Create interface
100
- with gr.Blocks() as demo:
101
- gr.HTML("<h1 style='text-align: center;'>Qomhrá: A Bilingual Irish-English LLM</h1>")
102
-
103
- chatbot = gr.Chatbot(height=500)
104
- msg = gr.Textbox(placeholder="Type your message...", show_label=False)
105
-
106
- msg.submit(bot.chat, [msg, chatbot], [chatbot]).then(lambda: "", outputs=msg)
107
-
108
- if __name__ == "__main__":
109
- demo.launch()