Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| from threading import Thread | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig | |
# Hub repository id used for both the tokenizer and the model weights.
MODEL_ID = "huihui-ai/Huihui-Qwen3.5-35B-A3B-abliterated"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print("Loading model (4-bit quantized)...")
# bitsandbytes settings: 4-bit NF4 weights with double quantization enabled,
# using bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# device_map="auto" leaves device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",
    dtype=torch.bfloat16,
)
print("Model loaded!")
def _content_to_text(content):
    """Return the plain-text portion of a history ``content`` value.

    A string is returned unchanged. A list of parts contributes the ``"text"``
    field of every dict part that has one (non-text parts are ignored). Any
    other value yields an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def _history_to_messages(history):
    """Convert Gradio chat history into a list of chat-template messages.

    Two history shapes are accepted:

    * pair style: ``[user_msg, bot_msg]`` per turn;
    * messages style: dicts carrying ``"role"`` and ``"content"`` keys.

    Unpacking a messages-style dict as a pair would silently bind the *key
    names* instead of the message text (or raise when extra keys are present),
    so the two shapes are told apart explicitly.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            text = _content_to_text(entry.get("content"))
            # Only conversational turns with actual text are forwarded.
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
            continue
        user_msg, bot_msg = entry
        messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    return messages


def chat(message, history):
    """Stream the assistant's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message.
        history: Previous turns, either as ``[user, assistant]`` pairs or as
            ``{"role": ..., "content": ...}`` dicts.

    Yields:
        The reply text accumulated so far; each yielded string extends the
        previous one with the newly generated chunk.
    """
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # enable_thinking is forwarded to the chat template.
    # NOTE(review): presumably this turns off the model's reasoning section -
    # confirm against the model's chat template.
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
    )
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # The streamer is configured to skip the prompt and special tokens, so only
    # newly generated text is iterated below.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until completion, so it runs on a worker thread while
    # this generator consumes the streamer.
    thread = Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            max_new_tokens=2048,
            temperature=0.7,
            top_k=20,
            top_p=0.95,
            do_sample=True,
            streamer=streamer,
        ),
    )
    thread.start()

    partial = ""
    for token in streamer:
        partial += token
        yield partial
# Chat UI backed by the streaming `chat` generator defined in this file.
demo = gr.ChatInterface(
    fn=chat,
    title="Huihui-Qwen3.5-35B-A3B Abliterated",
    description="Chat with the abliterated Qwen3.5-35B-A3B model (4-bit quantized, uncensored)",
)

# Listen on every network interface, port 7860.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
)