import json
import logging
import os
import time
from threading import Thread

import gradio as gr
import spaces
import torch
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ======================================================
# Load Configuration
# ======================================================
def _default_config():
    """Return a fresh copy of the built-in fallback configuration."""
    # NOTE(review): this fallback model id differs from the default used for
    # MODEL_ID below ("anaspro/Lahja-iraqi-4B") -- confirm which one is intended.
    return {
        "model": {"model_id": "unsloth/gpt-oss-20b-GGUF"},
        "generation": {
            "max_new_tokens": 1024,
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "do_sample": True,
            "repetition_penalty": 1.1,
            "timeout_seconds": 60,
        },
        "interface": {"max_context_length": 4096},
    }


def load_config():
    """Load configuration from ``config.json`` in the working directory.

    Returns:
        dict: The parsed JSON object. The built-in defaults are returned
        instead when the file is missing, cannot be read or decoded, or its
        top-level value is not a JSON object; each of those cases is logged.
    """
    try:
        with open("config.json", "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        logger.warning("config.json not found, using default settings")
        return _default_config()
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; a broken config file should not crash the app at import.
        logger.error("config.json could not be loaded (%s), using default settings", exc)
        return _default_config()

    if not isinstance(loaded, dict):
        # Every consumer calls .get() on the config, so anything but an
        # object at the top level is unusable.
        logger.error("config.json must contain a JSON object, using default settings")
        return _default_config()
    return loaded


config = load_config()

# ======================================================
# Settings
# ======================================================
# Tolerate a config.json without a usable "model" section, in line with the
# config.get(section, {}) lookups used for the other sections.
_model_section = config.get("model")
if not isinstance(_model_section, dict):
    _model_section = {}
MODEL_ID = _model_section.get("model_id", "anaspro/Lahja-iraqi-4B")

# Load system prompt from external file
try:
    with open("system_prompt.txt", "r", encoding="utf-8") as f:
        SYSTEM_PROMPT = f.read()
except FileNotFoundError:
    logger.warning("system_prompt.txt not found, using default prompt")
    SYSTEM_PROMPT = "أنت مساعد ذكي مفيد. تحدث بالعربية وساعد المستخدم في استفساراته."

# Names needed only by the cooperative-stop helper in this section.
from threading import Event

from transformers import StoppingCriteria, StoppingCriteriaList

# Login to Hugging Face
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
    logger.info("🔐 Logged in to Hugging Face")

# Global model variables
model = None
tokenizer = None
model_lock = False  # simple re-entrancy flag set while load_model() is running

# Messages containing any of these substrings are treated as on-topic.
_INTERNET_KEYWORDS = (
    "نت", "انترنت", "مودم", "wifi", "باقة", "سرعة", "كابل", "راوتر",
    "فايبر", "اتصال", "شبكة", "تحميل", "رفع", "ميجا", "جيجا",
)

# Markers suggesting the model started writing a new dialogue turn by itself.
_DIALOGUE_INDICATORS = (
    "👤", "🤖", "العميل:", "الزبون:", "المساعد:", "العضو:", "السؤال:", "الجواب:",
)

# Indicators found within the first characters of a reply are ignored.
_DIALOGUE_SCAN_START = 50

# Messages shorter than this are accepted regardless of keywords.
_SHORT_MESSAGE_LIMIT = 50


# ======================================================
# Model loading function
# ======================================================
def load_model():
    """Load the tokenizer and model named by ``MODEL_ID`` into the globals.

    Returns:
        bool: True when both objects were loaded. False when loading failed
        (the error is logged) or when another load is already in progress.
    """
    global model, tokenizer, model_lock

    if model_lock:
        logger.info("Model loading already in progress...")
        return False

    model_lock = True
    try:
        logger.info("🔄 Loading model...")
        model_cfg = config.get("model") or {}
        # Use the same trust setting for tokenizer and model.
        trust_remote_code = model_cfg.get("trust_remote_code", True)

        # Load tokenizer first
        new_tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            trust_remote_code=trust_remote_code,
            use_fast=True,
        )

        # Add padding token if missing
        if new_tokenizer.pad_token is None:
            new_tokenizer.pad_token = new_tokenizer.eos_token

        # Configure 4-bit quantization
        quantization_config = None
        if model_cfg.get("load_in_4bit", False):
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )

        # Load model with optimized settings
        new_model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=model_cfg.get("torch_dtype", "auto"),
            device_map=model_cfg.get("device_map", "auto"),
            trust_remote_code=trust_remote_code,
            low_cpu_mem_usage=model_cfg.get("low_cpu_mem_usage", True),
            quantization_config=quantization_config,
        )
        new_model.eval()

        # Publish both globals together so a failed load never leaves a
        # tokenizer without its model.
        tokenizer = new_tokenizer
        model = new_model

        # Clear cache to free memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        logger.info("✅ Model loaded successfully!")
        return True
    except Exception as e:
        logger.exception("❌ Error loading model: %s", e)
        return False
    finally:
        model_lock = False


class _StopOnEvent(StoppingCriteria):
    """Stopping criterion that ends generation once an ``Event`` is set."""

    def __init__(self, event):
        self._event = event

    def __call__(self, input_ids, scores, **kwargs):
        # One flag per sequence in the batch, all equal to the event state.
        return torch.full(
            (input_ids.shape[0],),
            self._event.is_set(),
            dtype=torch.bool,
            device=input_ids.device,
        )


def _history_to_messages(history):
    """Convert the chat history into role/content dicts, system prompt first."""
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for exchange in history or []:
        if isinstance(exchange, dict):
            # Message-dict format: keep only user and assistant turns.
            role = exchange.get("role")
            if role in ("user", "assistant"):
                messages.append({"role": role, "content": exchange.get("content", "")})
        elif isinstance(exchange, (list, tuple)) and len(exchange) >= 2:
            # [user_msg, assistant_msg] pair format; skip empty halves.
            if exchange[0]:
                messages.append({"role": "user", "content": str(exchange[0])})
            if exchange[1]:
                messages.append({"role": "assistant", "content": str(exchange[1])})
    return messages


def _is_on_topic(text):
    """Return True for short messages or ones containing an internet keyword."""
    if len(text) < _SHORT_MESSAGE_LIMIT:  # short questions are always allowed
        return True
    lowered = text.lower()
    return any(keyword in lowered for keyword in _INTERNET_KEYWORDS)


def _dialogue_cut_index(text):
    """Return the earliest dialogue-indicator position past the scan start, or -1."""
    hits = [
        pos
        for pos in (text.find(ind, _DIALOGUE_SCAN_START) for ind in _DIALOGUE_INDICATORS)
        if pos != -1
    ]
    return min(hits, default=-1)


# ======================================================
# Chat function (ZeroGPU)
# ======================================================
@spaces.GPU(duration=120)
def chat(message, history):
    """Stream a reply to ``message`` given the previous ``history``.

    This is a generator. Every user-visible result, including error and
    refusal messages, is produced with ``yield``: a value passed to
    ``return`` inside a generator is never delivered to a consumer that
    simply iterates over it.

    Args:
        message: The new user message.
        history: Previous turns, either role/content dicts or
            ``[user, assistant]`` pairs.

    Yields:
        str: The reply generated so far, or a single error/refusal message.
    """
    # Load model if not already loaded
    if model is None or tokenizer is None:
        if not load_model():
            yield "❌ عذراً، حدث خطأ في تحميل النموذج. يرجى المحاولة مرة أخرى."
            return

    try:
        # ======================================================
        # Build conversation properly
        # ======================================================
        if not (message and message.strip()):
            yield "يرجى كتابة رسالة صحيحة."
            return

        user_text = message.strip()
        # Filter to make sure the topic is internet related.
        if not _is_on_topic(user_text):
            yield "آسف، انا هنا حتى اساعدك بمشاكل النت والباقات بس. شنو مشكلتك بالإنترنت؟"
            return

        messages = _history_to_messages(history)
        messages.append({"role": "user", "content": user_text})

        # ======================================================
        # Tokenize input with error handling
        # ======================================================
        try:
            max_length = config.get("interface", {}).get("max_context_length", 4096)
            input_ids = tokenizer.apply_chat_template(
                messages,
                return_tensors="pt",
                add_generation_prompt=True,
                truncation=True,
                max_length=max_length,
            ).to(model.device)
        except Exception as e:
            logger.error(f"Tokenization error: {e}")
            yield "❌ خطأ في معالجة الرسالة. يرجى المحاولة مرة أخرى."
            return

        # ======================================================
        # Setup text streamer
        # ======================================================
        generation_config = config.get("generation", {})
        timeout = generation_config.get("timeout_seconds", 60)

        # The queue timeout makes iteration raise instead of blocking forever
        # if the generation thread dies without signalling the end of stream.
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            timeout=timeout,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        # Lets this function ask the background generate() call to stop.
        stop_event = Event()

        generation_kwargs = {
            "input_ids": input_ids,
            "streamer": streamer,
            # Kept low to limit hallucination.
            "max_new_tokens": generation_config.get("max_new_tokens", 800),
            "min_new_tokens": 15,  # reasonable lower bound
            # Lower randomness.
            "temperature": generation_config.get("temperature", 0.6),
            # Less diversity, for more control.
            "top_p": generation_config.get("top_p", 0.85),
            # Tighter candidate set.
            "top_k": generation_config.get("top_k", 30),
            "do_sample": generation_config.get("do_sample", True),
            # Stronger repetition penalty.
            "repetition_penalty": generation_config.get("repetition_penalty", 1.15),
            "no_repeat_ngram_size": 4,  # prevent repeating longer phrases
            # NOTE(review): early_stopping is documented as a beam-search
            # option -- confirm it has any effect with sampling.
            "early_stopping": True,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "use_cache": True,
            "stopping_criteria": StoppingCriteriaList([_StopOnEvent(stop_event)]),
        }

        # ======================================================
        # Generate output in a separate thread with timeout
        # ======================================================
        worker = Thread(target=model.generate, kwargs=generation_kwargs)
        worker.daemon = True
        worker.start()

        partial_text = ""
        start_time = time.monotonic()

        try:
            for new_text in streamer:
                if time.monotonic() - start_time > timeout:
                    logger.warning("Generation timeout reached")
                    break

                partial_text += new_text

                # Stop if the model starts writing a dialogue on its own.
                cut = _dialogue_cut_index(partial_text)
                if cut != -1:
                    logger.info("Stopping generation - dialogue detected")
                    yield partial_text[:cut].strip()
                    break

                yield partial_text

        except Exception as e:
            logger.error(f"Generation error: {e}")
            yield "❌ حدث خطأ أثناء توليد الإجابة. يرجى المحاولة مرة أخرى."
        finally:
            # Runs on every exit path, including the consumer closing the
            # generator: ask generate() to stop, then wait briefly for it.
            stop_event.set()
            worker.join(timeout=5)  # Give thread 5 seconds to finish

            # Clear GPU cache after generation
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    except Exception as e:
        logger.error(f"Chat function error: {e}")
        yield f"❌ حدث خطأ غير متوقع: {str(e)}"


# ======================================================
# Gradio Interface with enhanced styling
# ======================================================
def create_interface():
    """Build and return the ``gr.ChatInterface`` wired to ``chat``."""
    # Custom CSS for better styling
    custom_css = """
    .gradio-container {
        max-width: 1000px !important;
        margin: auto !important;
    }
    .chat-message {
        padding: 10px !important;
        margin: 5px 0 !important;
        border-radius: 10px !important;
    }
    .message {
        font-size: 16px !important;
        line-height: 1.5 !important;
    }
    """

    # Create a simpler interface for better compatibility
    demo = gr.ChatInterface(
        fn=chat,
        type="messages",
        title="📞 دعم فني - NB TEL مساعد عراقي",
        description="**مساعد ذكي للدعم الفني بشبكة النور - NB TEL**\n\nاحجي معاه كأنك زبون: اشرح مشكلتك، اسأل عن الباقات، او اطلب تذكرة دعم.",
        examples=[
            ["النت عندي ما يشتغل من الصبح، شنو السبب؟"],
            ["كم سعر باقة 60 ميجا؟"],
            ["الواي فاي ما يظهر عندي، شنو الحل؟"],
            ["اريد اشتراك جديد، شنو المطلوب؟"],
            ["شلون اغير كلمة مرور الواي فاي؟"],
        ],
        cache_examples=False,
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="gray",
            neutral_hue="slate",
        ),
        css=custom_css,
    )

    return demo


# Create the interface
demo = create_interface()

if __name__ == "__main__":
    demo.launch()