Spaces:

aexyb
/

Gomes

Running

App Files Community

aexyb committed about 12 hours ago

Commit

a7e488b

verified ·

1 Parent(s): 423ca86

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -160

app.py CHANGED Viewed

@@ -1,48 +1,60 @@
 import os
-# Force Gradio to skip the slow Node.js SSR build
 os.environ["GRADIO_SSR"] = "0"
 import gradio as gr
 from llama_cpp import Llama
-# ─── Auto-Detect CPU & Optimize ───────────────────────────────────────────────
-N_THREADS = os.cpu_count() or 4
-print(f"Optimizing for {N_THREADS} CPU Threads...")
 llm = Llama.from_pretrained(
     repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
     filename="gemma4-v2-Q4_K_M.gguf",
     # ── Context & Memory ───────────────────────────────────────────────
-    n_ctx=4096,
-    cache_type="q8_0",         # Halves KV cache RAM (zero quality loss)
     use_mlock=True,            # Lock weights in physical RAM
     use_mmap=True,             # Efficient memory mapping
-    # ── Auto-Scale CPU Tuning ──────────────────────────────────────────
-    n_gpu_layers=0,
-    n_threads=N_THREADS,       # Auto-detected max threads
-    n_threads_batch=N_THREADS,
-    n_batch=1024,              # Fast prompt processing, safe for L2 cache
-    n_ubatch=64,               # Efficient micro-batching
     verbose=False,
     chat_format="gemma",
 )
-print(f"Model loaded | {N_THREADS} Threads | 4096 ctx | q8_0 KV Cache")
-# ─── Chat Function ────────────────────────────────────────────────────────────
 def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
     messages = []
     if system_prompt.strip():
         messages.append({"role": "system", "content": system_prompt.strip()})
     messages.extend(history)
     messages.append({"role": "user", "content": message})
     response = ""
     for chunk in llm.create_chat_completion(
         messages=messages,
@@ -57,166 +69,46 @@ def respond(message, history, system_prompt, temperature, max_tokens, top_p, top
         token = delta.get("content", "")
         if token:
             response += token
             yield history + [
                 {"role": "user", "content": message},
                 {"role": "assistant", "content": response}
             ]
-# ─── Custom Stylish CSS ───────────────────────────────────────────────────────
 CSS = """
-/* Main Container */
-.gradio-container {
-    max-width: 950px !important;
-    padding: 20px !important;
-}
-/* Chat Window */
-#chatbot {
-    height: 70vh !important;
-    border-radius: 16px !important;
-    border: 1px solid rgba(255,255,255,0.1) !important;
-    background: #1e1e24 !important;
-    box-shadow: 0 8px 32px 0 rgba(0,0,0,0.4) !important;
-}
-/* Chat Bubbles */
-#chatbot .bot-message, #chatbot .user-message {
-    padding: 12px 18px !important;
-    border-radius: 16px !important;
-    font-size: 15px !important;
-    line-height: 1.6 !important;
-}
-#chatbot .user-message {
-    background: linear-gradient(135deg, #6d28d9, #4f46e5) !important;
-    color: white !important;
-    border: none !important;
-}
-#chatbot .bot-message {
-    background: #2a2a35 !important;
-    color: #e2e8f0 !important;
-    border: none !important;
-}
-/* Input Box */
-.input-area textarea {
-    border-radius: 12px !important;
-    border: 1px solid rgba(255,255,255,0.1) !important;
-    background: #2a2a35 !important;
-    color: white !important;
-    font-size: 15px !important;
-    padding: 14px !important;
-}
-.input-area textarea:focus {
-    border-color: #7c3aed !important;
-    box-shadow: 0 0 0 2px rgba(124, 58, 237, 0.2) !important;
-}
-/* Buttons */
-.send-btn, .stop-btn {
-    border-radius: 12px !important;
-    font-weight: 600 !important;
-    transition: all 0.2s ease !important;
-}
-.send-btn {
-    background: linear-gradient(135deg, #7c3aed, #4f46e5) !important;
-    border: none !important;
-}
-.send-btn:hover {
-    transform: translateY(-1px);
-    box-shadow: 0 4px 12px rgba(124, 58, 237, 0.4) !important;
-}
-/* Accordion / Settings Panel */
-.accordion {
-    background: #1e1e24 !important;
-    border: 1px solid rgba(255,255,255,0.05) !important;
-    border-radius: 12px !important;
-}
-/* Sliders */
-input[type="range"] {
-    accent-color: #7c3aed !important;
-}
-/* System Prompt Textbox */
-.system-prompt textarea {
-    background: #2a2a35 !important;
-    border-radius: 8px !important;
-    border: 1px solid rgba(255,255,255,0.1) !important;
-}
 """
-# ─── Build UI ─────────────────────────────────────────────────────────────────
-with gr.Blocks(title="Gemma 4 | 12B CPU Engine", css=CSS) as demo:
-    # Sleek Header
-    gr.HTML("""
-        <div style="text-align: center; margin-bottom: 20px;">
-            <h1 style="font-size: 28px; font-weight: 800; background: linear-gradient(90deg, #a78bfa, #818cf8, #6366f1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin-bottom: 6px;">
-                Gemma-4 12B
-            </h1>
-            <p style="color: #94a3b8; font-size: 14px; margin: 0;">
-                Q4_K_M Quantized · Auto-Threaded CPU · Locked in RAM
-            </p>
-        </div>
-    """)
-    # Main Chat Column
-    with gr.Column():
-        chatbot = gr.Chatbot(elem_id="chatbot", placeholder="Ask me anything...")
-        with gr.Row(equal_height=True):
-            msg_input = gr.Textbox(
-                placeholder="Type your message...",
-                show_label=False,
-                scale=8,
-                autofocus=True,
-                elem_classes="input-area"
-            )
-            send_btn = gr.Button("Send", variant="primary", scale=1, elem_classes="send-btn")
-        # Collapsible Advanced Settings
-        with gr.Accordion("⚙️ Advanced Settings", open=False):
-            with gr.Column():
-                system_prompt = gr.Textbox(
-                    label="System Prompt",
-                    value="You are a helpful, harmless, and honest assistant.",
-                    lines=3,
-                    elem_classes="system-prompt"
-                )
-                with gr.Row():
-                    temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
-                    top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
-                with gr.Row():
-                    top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
-                    repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
-                max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64)
-    # Event Wiring
     for evt in [msg_input.submit, send_btn.click]:
-        evt(
-            respond,
-            [msg_input, chatbot, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty],
-            chatbot
-        ).then(lambda: "", None, msg_input)
-# ─── Launch with Theme ────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    # Applying a sleek dark base theme via launch()
-    custom_theme = gr.themes.Base(
-        primary_hue="purple",
-        secondary_hue="slate",
-        neutral_hue="slate",
-        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]
-    )
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        theme=custom_theme,
         css=CSS
     )

 import os
+# Force Gradio to skip the slow Node.js frontend build on 2-core CPU
 os.environ["GRADIO_SSR"] = "0"
 import gradio as gr
 from llama_cpp import Llama
+# ─── Extreme 2-Thread / 16GB RAM Optimization ────────────────────────────────
+# RAM Budget (16 GB Total):
+#   Model (Q4_K_M)          ≈  7.4 GB
+#   KV Cache (q8_0, 4096)   ≈  1.3 GB
+#   OS / Gradio / Python    ≈  3.0 GB
+#   Safety Headroom         ≈  4.3 GB
+# CPU Budget (2 Threads):
+#   Pinned to 2 threads (one per core) to avoid oversubscription and context-switching overhead.
+# ──────────────────────────────────────────────────────────────────────────────
+N_THREADS = 2
+print("Optimizing for 16GB RAM / 2 CPU Cores...")
 llm = Llama.from_pretrained(
     repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
     filename="gemma4-v2-Q4_K_M.gguf",
     # ── Context & Memory ───────────────────────────────────────────────
+    n_ctx=4096,                # Fits comfortably in 16GB RAM
+    cache_type="q8_0",         # Halves KV cache RAM with zero quality loss
     use_mlock=True,            # Lock weights in physical RAM
     use_mmap=True,             # Efficient memory mapping
+    # ── Extreme CPU Tuning ─────────────────────────────────────────────
+    n_gpu_layers=0,            # CPU only
+    n_threads=N_THREADS,       # Exact core count (no context switching)
+    n_threads_batch=N_THREADS, # Match batch threads to core count
+    n_batch=512,               # Sweet spot for L1/L2 cache on 2 cores
+    n_ubatch=32,               # Ultra-small micro-batch for zero overhead
     verbose=False,
     chat_format="gemma",
 )
+print("Model loaded & locked | 2 Threads | 4096 ctx | q8_0 KV Cache")
+# ─── Chat Function (Gradio 6 Dict Format) ────────────────────────────────────
 def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
+    # 1. Build messages for llama.cpp
     messages = []
     if system_prompt.strip():
         messages.append({"role": "system", "content": system_prompt.strip()})
+    # Gradio 6 passes history as a list of dicts: [{"role": "user", "content": "..."}]
     messages.extend(history)
     messages.append({"role": "user", "content": message})
+    # 2. Stream tokens
     response = ""
     for chunk in llm.create_chat_completion(
         messages=messages,
         token = delta.get("content", "")
         if token:
             response += token
+            # 3. Yield the EXACT format Gradio 6 requires
             yield history + [
                 {"role": "user", "content": message},
                 {"role": "assistant", "content": response}
             ]
+# ─── UI ───────────────────────────────────────────────────────────────────────
 CSS = """
+.gradio-container { max-width: 1100px !important; }
+#chatbot { height: 650px !important; }
 """
+with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
+    gr.Markdown("# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n**2 Cores** · **16GB RAM** · **q8_0 KV** · **4096 ctx**")
+    with gr.Row(equal_height=False):
+        with gr.Column(scale=5):
+            chatbot = gr.Chatbot(elem_id="chatbot")
+            with gr.Row():
+                msg_input = gr.Textbox(placeholder="Type your message…", show_label=False, scale=5, autofocus=True)
+                send_btn = gr.Button("Send", variant="primary", scale=1)
+        with gr.Column(scale=2):
+            system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful, harmless, and honest assistant.", lines=4)
+            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
+            max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64)
+            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
+            top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
+            repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
+            clear_btn = gr.Button("🗑 Clear Chat", variant="secondary")
     for evt in [msg_input.submit, send_btn.click]:
+        evt(respond, [msg_input, chatbot, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty], chatbot).then(lambda: "", None, msg_input)
+    clear_btn.click(lambda: None, None, chatbot, queue=False)
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        theme=gr.themes.Soft(primary_hue="indigo"),
         css=CSS
     )