"""Gradio chat UI for a Gemma GGUF model running CPU-only through llama.cpp.

The settings below target a host with 2 CPU threads and 16 GB of RAM.
Importing this module downloads (if needed) and loads the model, then builds
the ``demo`` Blocks app; running it as a script also launches the server.
"""

import os

# Server-side rendering makes Gradio start a Node.js process, which is slow on
# a 2-core CPU.  Gradio reads GRADIO_SSR_MODE (a plain "GRADIO_SSR" variable is
# not consulted), so set that before gradio is imported.
os.environ["GRADIO_SSR_MODE"] = "false"

import gradio as gr
import llama_cpp
from llama_cpp import Llama

# ─── Extreme 2-Thread / 16GB RAM Optimization ────────────────────────────────
# RAM Budget (16 GB Total):
#   Model (Q4_K_M)          ≈ 7.4 GB
#   KV Cache (q8_0, 4096)   ≈ 1.3 GB
#   OS / Gradio / Python    ≈ 3.0 GB
#   Safety Headroom         ≈ 4.3 GB
# CPU Budget (2 Threads):
#   Locked to 2 threads to avoid oversubscribing the two available cores.
# ──────────────────────────────────────────────────────────────────────────────
N_THREADS = 2
N_CTX = 4096

print("Optimizing for 16GB RAM / 2 CPU Cores...")

llm = Llama.from_pretrained(
    repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
    filename="gemma4-v2-Q4_K_M.gguf",
    # ── Context & Memory ───────────────────────────────────────────────
    n_ctx=N_CTX,                         # Fits comfortably in 16GB RAM
    # KV-cache quantisation is selected with type_k / type_v (ggml type
    # enums).  A "cache_type" keyword is not a Llama argument and would be
    # swallowed by **kwargs, leaving the cache at f16.
    type_k=llama_cpp.GGML_TYPE_Q8_0,     # ~half the f16 K-cache size
    type_v=llama_cpp.GGML_TYPE_Q8_0,     # ~half the f16 V-cache size
    flash_attn=True,                     # llama.cpp requires this for a quantised V cache
    use_mlock=True,                      # Lock weights in physical RAM
    use_mmap=True,                       # Memory-map the model file
    # ── Extreme CPU Tuning ─────────────────────────────────────────────
    n_gpu_layers=0,                      # CPU only
    n_threads=N_THREADS,                 # One generation thread per core
    n_threads_batch=N_THREADS,           # Same thread count for prompt batches
    n_batch=512,                         # Logical batch size for prompt processing
    n_ubatch=32,                         # Small physical micro-batch
    verbose=False,
    chat_format="gemma",
)

print(f"Model loaded & locked | {N_THREADS} Threads | {N_CTX} ctx | q8_0 KV Cache")


# ─── Chat Function (Gradio 6 Dict Format) ────────────────────────────────────
def _content_to_text(content) -> str:
    """Flatten a Gradio message ``content`` value into plain text.

    Accepts a plain string, a single content block (a dict carrying a
    ``"text"`` entry), or a list/tuple of either.  Blocks without text
    (e.g. file attachments) contribute nothing.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        pieces = (_content_to_text(part) for part in content)
        return "\n".join(piece for piece in pieces if piece)
    return str(content)


def _build_messages(message: str, history: list, system_prompt: str) -> list[dict]:
    """Build the plain ``role``/``content`` message list sent to llama.cpp.

    History entries are reduced to user/assistant turns with string content,
    dropping any extra keys the UI attaches.  The system prompt is prepended
    to the first user turn instead of being sent as a ``system`` message,
    because the ``"gemma"`` chat format does not emit system turns.
    """
    turns = []
    for entry in history or []:
        role = entry.get("role")
        text = _content_to_text(entry.get("content"))
        if role in ("user", "assistant") and text:
            turns.append({"role": role, "content": text})
    turns.append({"role": "user", "content": message})

    system_text = (system_prompt or "").strip()
    if system_text:
        # There is always at least one user turn: the one appended above.
        first_user = next(turn for turn in turns if turn["role"] == "user")
        first_user["content"] = f"{system_text}\n\n{first_user['content']}"
    return turns


def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
    """Stream the assistant's reply to ``message`` into the chatbot.

    Args:
        message: Text the user just submitted.
        history: Existing chatbot value, a list of ``role``/``content`` dicts.
        system_prompt: Optional instructions; blank means none.
        temperature, top_p, top_k, repeat_penalty: Sampling settings passed
            through to ``llm.create_chat_completion``.
        max_tokens: Upper bound on the number of generated tokens.

    Yields:
        The full chatbot value (history + new user turn + partial assistant
        turn) after every streamed chunk.
    """
    history = history or []

    # Nothing to answer: leave the conversation untouched.
    if not message or not message.strip():
        yield history
        return

    # 1. Build messages for llama.cpp
    messages = _build_messages(message, history, system_prompt)
    shown_turns = history + [{"role": "user", "content": message}]

    # 2. Stream tokens
    response = ""
    stream = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=int(max_tokens),  # sliders may deliver floats
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repeat_penalty,
        stream=True,
    )
    for chunk in stream:
        token = chunk["choices"][0].get("delta", {}).get("content")
        if token:
            response += token
        # 3. Yield the message-dict format the Chatbot expects
        yield shown_turns + [{"role": "assistant", "content": response}]


# ─── UI ───────────────────────────────────────────────────────────────────────
CSS = """
.gradio-container { max-width: 1100px !important; }
#chatbot { height: 650px !important; }
"""

with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
    gr.Markdown(
        "# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n"
        "**2 Cores** · **16GB RAM** · **q8_0 KV** · **4096 ctx**"
    )

    with gr.Row(equal_height=False):
        # Left: conversation and input row
        with gr.Column(scale=5):
            chatbot = gr.Chatbot(elem_id="chatbot")
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message…",
                    show_label=False,
                    scale=5,
                    autofocus=True,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

        # Right: generation settings
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful, harmless, and honest assistant.",
                lines=4,
            )
            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
            max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=N_CTX, value=1024, step=64)
            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
            top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
            repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
            clear_btn = gr.Button("🗑 Clear Chat", variant="secondary")

    # Enter in the textbox and the Send button trigger the same pipeline:
    # stream the reply into the chatbot, then empty the input box.
    for evt in [msg_input.submit, send_btn.click]:
        evt(
            respond,
            [msg_input, chatbot, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty],
            chatbot,
        ).then(lambda: "", None, msg_input)

    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,  # explicit, in addition to GRADIO_SSR_MODE above
        theme=gr.themes.Soft(primary_hue="indigo"),
        css=CSS,
    )