import os
# Turn off Gradio's server-side rendering so no Node.js server is started on
# this 2-core machine. Gradio reads the GRADIO_SSR_MODE variable ("True" /
# "False"); it must be set before `gradio` is imported or launched.
os.environ["GRADIO_SSR_MODE"] = "False"

import gradio as gr
from llama_cpp import GGML_TYPE_Q8_0, Llama

# ─── Extreme 2-Thread / 16GB RAM Optimization ────────────────────────────────
# RAM Budget (16 GB Total):
#   Model (Q4_K_M)          ≈  7.4 GB
#   KV Cache (q8_0, 4096)   ≈  1.3 GB
#   OS / Gradio / Python    ≈  3.0 GB
#   Safety Headroom         ≈  4.3 GB
# CPU Budget (2 Threads):
#   Locked to 2 threads to eliminate context-switching overhead.
# ──────────────────────────────────────────────────────────────────────────────

N_THREADS = 2  # inference threads; matches the 2 available CPU cores
N_CTX = 4096   # context window in tokens, shared by the prompt and the reply

print("Optimizing for 16GB RAM / 2 CPU Cores...")

# Download (or reuse the cached copy of) the GGUF file and load it for
# CPU-only inference. Extra keyword arguments are forwarded to `Llama(...)`.
llm = Llama.from_pretrained(
    repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
    filename="gemma4-v2-Q4_K_M.gguf",

    # ── Context & Memory ───────────────────────────────────────────────
    n_ctx=N_CTX,
    # The KV-cache type is selected with the integer ggml type ids `type_k` /
    # `type_v`. `Llama` has no `cache_type` parameter and silently ignores
    # unknown keyword arguments, so that spelling left the cache at its
    # default (unquantized) type.
    type_k=GGML_TYPE_Q8_0,
    type_v=GGML_TYPE_Q8_0,
    flash_attn=True,           # llama.cpp requires flash attention for a quantized V cache
    use_mlock=True,            # Ask the OS to keep the weights resident in RAM
    use_mmap=True,             # Memory-map the model file instead of copying it

    # ── CPU Tuning ─────────────────────────────────────────────────────
    n_gpu_layers=0,            # CPU only
    n_threads=N_THREADS,       # Generation threads
    n_threads_batch=N_THREADS, # Prompt-processing threads
    n_batch=512,               # Max prompt tokens submitted per decode call
    n_ubatch=32,               # Physical micro-batch size (TODO: benchmark this value)

    verbose=False,
    chat_format="gemma",
)

print(f"Model loaded & locked | {N_THREADS} Threads | {N_CTX} ctx | q8_0 KV Cache")


# ─── Chat Function (Gradio 6 Dict Format) ────────────────────────────────────
def _content_to_text(content):
    """Flatten a chat message ``content`` value into plain text.

    Accepts a plain string, a single content block (a dict carrying a
    ``"text"`` entry) or a list of either. Anything without text, such as
    file blocks or ``None``, contributes an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "\n".join(filter(None, map(_content_to_text, content)))
    return ""


def _build_messages(message, history, system_prompt):
    """Build the plain ``role``/``content`` message list sent to the model."""
    turns = []
    for entry in history:
        role = entry.get("role")
        text = _content_to_text(entry.get("content"))
        # Forward only text turns, reduced to the two keys the chat formatter
        # uses; UI-only keys carried by the history entries are dropped.
        if role in ("user", "assistant") and text:
            turns.append({"role": role, "content": text})
    turns.append({"role": "user", "content": message})

    system_text = (system_prompt or "").strip()
    if system_text:
        # The "gemma" chat format has no system role and discards system
        # messages, so the instructions are prepended to the first user turn.
        first_user = next(turn for turn in turns if turn["role"] == "user")
        first_user["content"] = f"{system_text}\n\n{first_user['content']}"
    return turns


def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
    """Stream the assistant's reply to ``message`` as chatbot updates.

    Args:
        message: Text typed by the user.
        history: Previous chat turns as a list of ``{"role", "content"}``
            dicts (may be empty or ``None``).
        system_prompt: Optional instructions for the model; blank disables them.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff.
        repeat_penalty: Repetition penalty.

    Yields:
        The full chat history to display: the incoming history followed by the
        new user turn and, once tokens arrive, the assistant text so far.
    """
    history = list(history or [])
    message = message or ""

    # Nothing to send: leave the conversation untouched.
    if not message.strip():
        yield history
        return

    user_turn = {"role": "user", "content": message}

    # Show the user's message right away; on a 2-thread CPU the first token
    # can take a while, and a reply with no tokens would otherwise show nothing.
    yield history + [user_turn]

    response = ""
    for chunk in llm.create_chat_completion(
        messages=_build_messages(message, history, system_prompt),
        temperature=float(temperature),
        max_tokens=int(max_tokens),  # sliders may deliver floats
        top_p=float(top_p),
        top_k=int(top_k),
        repeat_penalty=float(repeat_penalty),
        stream=True,
    ):
        delta = chunk["choices"][0].get("delta", {})
        token = delta.get("content", "")
        if token:
            response += token
            yield history + [user_turn, {"role": "assistant", "content": response}]


# ─── UI ───────────────────────────────────────────────────────────────────────
# Stylesheet overrides passed to Gradio when the app is launched.
CSS = (
    "\n"
    # Cap the width of the app container.
    ".gradio-container { max-width: 1100px !important; }\n"
    # Fixed height for the element whose id is "chatbot".
    "#chatbot { height: 650px !important; }\n"
)

# Two-column layout: chat transcript and input on the left, system prompt and
# sampling controls on the right.
with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
    gr.Markdown("# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n**2 Cores** · **16GB RAM** · **q8_0 KV** · **4096 ctx**")

    with gr.Row(equal_height=False):
        with gr.Column(scale=5):
            chatbot = gr.Chatbot(elem_id="chatbot")
            with gr.Row():
                msg_input = gr.Textbox(placeholder="Type your message…", show_label=False, scale=5, autofocus=True)
                send_btn = gr.Button("Send", variant="primary", scale=1)

        with gr.Column(scale=2):
            system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful, harmless, and honest assistant.", lines=4)
            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
            max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64)
            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
            top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
            repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
            clear_btn = gr.Button("🗑 Clear Chat", variant="secondary")

    # Inputs are listed in the positional order of respond()'s parameters.
    chat_inputs = [msg_input, chatbot, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty]

    # Pressing Enter and clicking Send are separate listeners with separate
    # queues. Both drive the single shared `llm`, so they share one concurrency
    # group limited to one running generation at a time.
    for trigger in (msg_input.submit, send_btn.click):
        trigger(
            respond,
            chat_inputs,
            chatbot,
            concurrency_id="llm",
            concurrency_limit=1,
        ).then(lambda: "", None, msg_input)  # empty the textbox once the reply is complete

    # Reset the transcript immediately, without going through the queue.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    # Serve on every network interface at port 7860, applying the Soft theme
    # (indigo primary hue) and the custom stylesheet.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        theme=gr.themes.Soft(primary_hue="indigo"),
        css=CSS,
    )
    demo.launch(**launch_options)