import os import json import time import psutil import threading import logging import pytz from datetime import datetime from typing import List, Dict, Optional, Generator import gradio as gr from huggingface_hub import HfApi, hf_hub_download # --- KERNEL INITIALIZATION --- try: from llama_cpp import Llama except ImportError: try: from llama_cpp_pydist import Llama except ImportError: class Llama: def __init__(self, *args, **kwargs): raise ImportError("Kernel Binary Missing. Ensure llama-cpp-python is installed.") # --- CONFIGURATION --- HF_TOKEN = os.environ.get("HF_TOKEN") SPACE_ID = os.environ.get("SPACE_ID") LOG_FILE = "engine_telemetry.json" RAM_LIMIT_PCT = 0.85 # Increased from 0.50 to prevent false rejections SYSTEM_RESERVE_MB = 500 # Increased reserve DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF" DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf" logging.basicConfig(level=logging.INFO, format='%(asctime)s - ZEROENGINE - %(message)s') logger = logging.getLogger(__name__) # --- TELEMETRY MODULE --- class TelemetryManager: def __init__(self, api: HfApi): self.api = api self.stats = self._load_initial_stats() def _load_initial_stats(self) -> Dict: if os.path.exists(LOG_FILE): try: with open(LOG_FILE, "r", encoding="utf-8") as f: return json.load(f) except Exception: pass return { "session_start": str(datetime.now(pytz.utc)), "load_count": {}, "total_tokens_generated": 0, "popular_repos": [] } def track_load(self, repo: str, filename: str): key = f"{repo}/{filename}" self.stats["load_count"][key] = self.stats["load_count"].get(key, 0) + 1 self._sync_to_cloud() def track_generation(self, tokens: int): self.stats["total_tokens_generated"] += tokens def _sync_to_cloud(self): if not HF_TOKEN or not SPACE_ID: return try: with open(LOG_FILE, "w", encoding="utf-8") as f: json.dump(self.stats, f, indent=4) self.api.upload_file( path_or_fileobj=LOG_FILE, path_in_repo=LOG_FILE, repo_id=SPACE_ID, repo_type="space" ) except Exception as e: logger.error(f"Sync Failure: {e}") # --- RESOURCE MONITOR --- class ResourceMonitor: @staticmethod def get_metrics() -> Dict: vm = psutil.virtual_memory() return { "ram_used_gb": round(vm.used / (1024**3), 2), "ram_avail_gb": round(vm.available / (1024**3), 2), "ram_total_gb": round(vm.total / (1024**3), 2), "ram_pct": vm.percent, "cpu_usage_pct": psutil.cpu_percent(interval=None), "load_avg": os.getloadavg()[0] if hasattr(os, 'getloadavg') else 0 } @staticmethod def validate_deployment(file_path: str) -> (bool, str): try: vm = psutil.virtual_memory() file_size_mb = os.path.getsize(file_path) / (1024**2) total_ram_mb = vm.total / (1024**2) avail_ram_mb = vm.available / (1024**2) logger.info(f"Validation - Model: {file_size_mb:.1f}MB | Available RAM: {avail_ram_mb:.1f}MB | Total: {total_ram_mb:.1f}MB") if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT): return False, f"Model size ({file_size_mb:.1f}MB) exceeds safety limit ({total_ram_mb * RAM_LIMIT_PCT:.1f}MB)." if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb: return False, f"Insufficient RAM. Need {file_size_mb+SYSTEM_RESERVE_MB:.1f}MB, have {avail_ram_mb:.1f}MB available." return True, "Validation Passed." except Exception as e: logger.error(f"Validation error: {e}") return False, f"Validation error: {str(e)}" # --- ENGINE CORE --- class ZeroEngine: def __init__(self): self.api = HfApi(token=HF_TOKEN) self.telemetry = TelemetryManager(self.api) self.llm: Optional[Llama] = None self.active_model_info = {"repo": "", "file": ""} self.kernel_lock = threading.Lock() self.is_prefilling = False def list_ggufs(self, repo_id: str) -> List[str]: try: files = self.api.list_repo_files(repo_id=repo_id) ggufs = [f for f in files if f.endswith(".gguf")] logger.info(f"Found {len(ggufs)} GGUF files in {repo_id}") return ggufs except Exception as e: logger.error(f"Scan error: {e}") return [] def boot_kernel(self, repo: str, filename: str) -> str: """Boot kernel with proper error handling to prevent space crashes""" try: if not repo or not filename: return "🔴 ERROR: Repository or filename missing" logger.info(f"[BOOT] Starting download: {filename} from {repo}") # Download with timeout protection try: path = hf_hub_download( repo_id=repo, filename=filename, token=HF_TOKEN, local_files_only=False ) logger.info(f"[BOOT] Download complete: {path}") except Exception as e: logger.error(f"[BOOT] Download failed: {e}") return f"🔴 DOWNLOAD FAILED: {str(e)}" # Validate before loading valid, msg = ResourceMonitor.validate_deployment(path) if not valid: logger.warning(f"[BOOT] Validation failed: {msg}") return f"🔴 VALIDATION FAILED: {msg}" logger.info("[BOOT] Validation passed, initializing model...") # Load model with proper cleanup with self.kernel_lock: # Clear previous model if self.llm: logger.info("[BOOT] Clearing previous model...") try: del self.llm self.llm = None except Exception as e: logger.warning(f"[BOOT] Cleanup warning: {e}") # Initialize new model with conservative settings try: logger.info("[BOOT] Loading model into memory...") self.llm = Llama( model_path=path, n_ctx=2048, n_threads=2, use_mmap=True, # Critical: memory map to reduce RAM usage n_batch=256, # Reduced from 512 to be safer n_gpu_layers=0, # Force CPU only verbose=False ) self.active_model_info = {"repo": repo, "file": filename} self.telemetry.track_load(repo, filename) logger.info("[BOOT] Model loaded successfully!") return f"🟢 KERNEL ONLINE: {filename}" except Exception as e: logger.error(f"[BOOT] Model loading failed: {e}") self.llm = None return f"🔴 LOAD FAILED: {str(e)}" except Exception as e: logger.error(f"[BOOT] Unexpected error: {e}") return f"🔴 BOOT FAILURE: {str(e)}" def stitch_cache(self, ghost_text: str) -> str: if not self.llm or not ghost_text or self.is_prefilling: return "Kernel Idle/Busy" def _bg_eval(): self.is_prefilling = True try: tokens = self.llm.tokenize(ghost_text.encode("utf-8")) self.llm.eval(tokens) logger.info(f"Ghost cache primed: {len(tokens)} tokens") except Exception as e: logger.error(f"KV Cache priming failed: {e}") finally: self.is_prefilling = False threading.Thread(target=_bg_eval, daemon=True).start() return "⚡ Ghost Cache Primed" def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str) -> Generator: if not self.llm: history.append({"role": "assistant", "content": "⚠️ Engine offline. BOOT a kernel first."}) yield history return # Prepare input full_input = f"{ghost_context}\n{prompt}" if ghost_context else prompt formatted_prompt = f"User: {full_input}\nAssistant: " # Add User Message & Empty Assistant Message for Streaming history.append({"role": "user", "content": prompt}) history.append({"role": "assistant", "content": "..."}) yield history response_text = "" start_time = time.time() tokens_count = 0 try: stream = self.llm( formatted_prompt, max_tokens=1024, stop=["User:", "<|eot_id|>", "\n\n"], stream=True ) for chunk in stream: token = chunk["choices"][0]["text"] response_text += token tokens_count += 1 elapsed = time.time() - start_time tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0 # Update history with streaming content history[-1]["content"] = f"{response_text}\n\n`[{tps} t/s]`" yield history self.telemetry.track_generation(tokens_count) except Exception as e: logger.error(f"Inference error: {e}") history[-1]["content"] = f"🔴 Runtime Error: {str(e)}" yield history # --- CUSTOM CSS --- CUSTOM_CSS = """ @import url('https://fonts.cdnfonts.com/css/consolas'); * { font-family: 'Consolas', 'Courier New', monospace !important; } /* Global smooth rounded corners */ .gradio-container { border-radius: 24px !important; } /* All buttons */ button { border-radius: 16px !important; transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; font-family: 'Consolas', monospace !important; } button:hover { transform: translateY(-2px); box-shadow: 0 8px 16px rgba(0,0,0,0.2) !important; } /* Input fields */ input, textarea, .gr-textbox, .gr-dropdown { border-radius: 12px !important; font-family: 'Consolas', monospace !important; } /* Chat messages */ .message { border-radius: 16px !important; font-family: 'Consolas', monospace !important; } /* Code blocks */ .gr-code { border-radius: 12px !important; font-family: 'Consolas', monospace !important; } /* Labels */ .gr-label { border-radius: 12px !important; font-family: 'Consolas', monospace !important; } /* Sidebar */ .gr-sidebar { border-radius: 20px !important; background: linear-gradient(135deg, rgba(20,20,40,0.95), rgba(10,10,20,0.98)) !important; backdrop-filter: blur(10px) !important; } /* Markdown sections */ .gr-markdown { font-family: 'Consolas', monospace !important; } /* Chatbot container */ .chatbot { border-radius: 20px !important; font-family: 'Consolas', monospace !important; } /* Dropdown menus */ .gr-dropdown-menu { border-radius: 12px !important; font-family: 'Consolas', monospace !important; } /* Column containers */ .gr-column { border-radius: 16px !important; } /* Row containers */ .gr-row { border-radius: 12px !important; } /* Smooth animations for all interactive elements */ * { transition: all 0.2s ease !important; } /* Header styling */ h1, h2, h3, h4, h5, h6 { font-family: 'Consolas', monospace !important; } """ # --- UI INTERFACE --- kernel = ZeroEngine() with gr.Blocks(title="ZeroEngine Kernel 6.5", css=CUSTOM_CSS) as demo: gr.HTML("""
Gradio 6.5.0 Production Build | Smooth Rounded UI