Spaces:
Sleeping
Sleeping
File size: 3,503 Bytes
2499997 43c9011 c4b05c8 2499997 c4b05c8 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 c4b05c8 2499997 452fd6f 2499997 c4b05c8 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 78a0e5d 2499997 7705b58 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 452fd6f 2499997 3b1fdb8 78a0e5d 2499997 452fd6f 2499997 452fd6f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | import gradio as gr
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
# Hub identifiers handed to the ``from_pretrained`` loaders: the base
# checkpoint first, then the adapter that is layered on top of it.
MODEL_NAME = "jmcinern/qwen3-8B-cpt-sft-awq"
DPO_ADAPTER = "jmcinern/qomhra-8B-awq-dpo-beta-0.5-checkpoint-checkpoint-100"

# Matches one complete <think>...</think> span (non-greedy, may cross line
# breaks because of DOTALL) together with any whitespace that follows it.
THINK_TAG_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
class ChatBot:
    """Chat wrapper around the base causal LM with the DPO adapter applied.

    Attributes:
        model: The adapter-wrapped model, or ``None`` until loading succeeds.
        tokenizer: The tokenizer loaded from ``MODEL_NAME``, or ``None``.
        loading: ``True`` while ``load_model`` is running, ``False`` afterwards
            (whether loading succeeded or not).
    """

    # Text that signals the model has started writing a new chat turn.
    # Generation is cut off as soon as this exact string appears.
    STOP_SEQUENCE = "assistant\n"

    def __init__(self):
        """Create the bot and load the tokenizer and model immediately."""
        self.model = None
        self.tokenizer = None
        self.loading = True
        self.load_model()

    def load_model(self):
        """Load the tokenizer, then the base model, then attach the adapter.

        Any exception is reported with ``print`` and swallowed so that the
        caller can still start; in that case ``self.model`` keeps its previous
        value (``None`` on first load) and ``chat`` answers with an error
        message. ``self.loading`` is always reset to ``False`` at the end.
        """
        try:
            print("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_NAME, trust_remote_code=True
            )
            print("Tokenizer loaded!")

            print("Loading model...")
            base_model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                trust_remote_code=True,
                device_map="auto",
                torch_dtype="auto",
                low_cpu_mem_usage=True,
            )
            self.model = PeftModel.from_pretrained(base_model, DPO_ADAPTER)
            print("Model loaded!")
        except Exception as e:
            print(f"Error loading model: {e}")
        finally:
            self.loading = False

    @staticmethod
    def _build_messages(history, message):
        """Turn (user, assistant) history pairs plus the new message into chat dicts."""
        messages = []
        for user_msg, bot_msg in history:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message})
        return messages

    @staticmethod
    def _truncate_at_stop(text, stop):
        """Return ``text`` up to (not including) the first occurrence of ``stop``."""
        return text.partition(stop)[0]

    def chat(self, message, history):
        """Generate a reply to ``message`` and return the extended history.

        Args:
            message: The new user message.
            history: Sequence of ``(user_msg, bot_msg)`` pairs from earlier turns.

        Returns:
            ``history`` with one ``(message, reply)`` tuple appended. While the
            model is loading, or if it failed to load, the reply is a fixed
            status string instead of generated text.
        """
        if self.loading:
            return history + [(message, "Model is loading, please wait...")]
        if self.model is None:
            return history + [(message, "Model failed to load")]

        messages = self._build_messages(history, message)

        # Apply chat template and strip thinking tags
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        prompt = THINK_TAG_PATTERN.sub("", prompt)

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        prompt_length = inputs.input_ids.shape[1]

        # Stop on the full "assistant\n" string rather than on its individual
        # token ids: adding every token of the encoded string to eos_token_id
        # would end generation at any one of them (e.g. a lone newline) and
        # truncate multi-line replies. `stop_strings` needs the tokenizer.
        # NOTE(review): requires a transformers release whose `generate`
        # supports `stop_strings` -- confirm against the pinned version.
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=2048,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                stop_strings=[self.STOP_SEQUENCE],
                tokenizer=self.tokenizer,
            )

        # Decode only the newly generated tokens, then drop the stop text
        # (and anything after it) and any thinking block.
        response = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        response = self._truncate_at_stop(response, self.STOP_SEQUENCE)
        response = THINK_TAG_PATTERN.sub("", response).strip()
        return history + [(message, response)]
# Single shared bot instance; constructing it loads the tokenizer and model.
bot = ChatBot()

# UI layout: a title, the conversation view, and a one-line input box.
with gr.Blocks() as demo:
    gr.HTML(
        '<h1 style="margin:0;">A Bilingual Irish-English LLM — Developed by Abair.ie</h1>'
    )
    chatbot = gr.Chatbot(height=400)
    msg = gr.Textbox(show_label=False, placeholder="Type your message...")

    # On submit: run the chat handler to update the conversation, then
    # reset the input box to an empty string.
    (
        msg.submit(fn=bot.chat, inputs=[msg, chatbot], outputs=[chatbot])
        .then(fn=lambda: "", outputs=msg)
    )

# Only start the server when executed as a script.
if __name__ == "__main__":
    demo.launch()
|