aexyb committed on
Commit
a7e488b
·
verified ·
1 Parent(s): 423ca86

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -160
app.py CHANGED
@@ -1,48 +1,60 @@
1
  import os
2
- # Force Gradio to skip the slow Node.js SSR build
3
  os.environ["GRADIO_SSR"] = "0"
4
 
5
  import gradio as gr
6
  from llama_cpp import Llama
7
 
8
- # ─── Auto-Detect CPU & Optimize ───────────────────────────────────────────────
9
- N_THREADS = os.cpu_count() or 4
 
 
 
 
 
 
 
10
 
11
- print(f"Optimizing for {N_THREADS} CPU Threads...")
 
 
12
 
13
  llm = Llama.from_pretrained(
14
  repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
15
  filename="gemma4-v2-Q4_K_M.gguf",
16
 
17
  # ── Context & Memory ───────────────────────────────────────────────
18
- n_ctx=4096,
19
- cache_type="q8_0", # Halves KV cache RAM (zero quality loss)
20
  use_mlock=True, # Lock weights in physical RAM
21
  use_mmap=True, # Efficient memory mapping
22
 
23
- # ── Auto-Scale CPU Tuning ──────────────────────────────────────────
24
- n_gpu_layers=0,
25
- n_threads=N_THREADS, # Auto-detected max threads
26
- n_threads_batch=N_THREADS,
27
- n_batch=1024, # Fast prompt processing, safe for L2 cache
28
- n_ubatch=64, # Efficient micro-batching
29
 
30
  verbose=False,
31
  chat_format="gemma",
32
  )
33
 
34
- print(f"Model loaded | {N_THREADS} Threads | 4096 ctx | q8_0 KV Cache")
35
 
36
 
37
- # ─── Chat Function ────────────────────────────────────────────────────────────
38
  def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
 
39
  messages = []
40
  if system_prompt.strip():
41
  messages.append({"role": "system", "content": system_prompt.strip()})
42
 
 
43
  messages.extend(history)
44
  messages.append({"role": "user", "content": message})
45
 
 
46
  response = ""
47
  for chunk in llm.create_chat_completion(
48
  messages=messages,
@@ -57,166 +69,46 @@ def respond(message, history, system_prompt, temperature, max_tokens, top_p, top
57
  token = delta.get("content", "")
58
  if token:
59
  response += token
 
60
  yield history + [
61
  {"role": "user", "content": message},
62
  {"role": "assistant", "content": response}
63
  ]
64
 
65
 
66
- # ─── Custom Stylish CSS ───────────────────────────────────────────────────────
67
  CSS = """
68
- /* Main Container */
69
- .gradio-container {
70
- max-width: 950px !important;
71
- padding: 20px !important;
72
- }
73
-
74
- /* Chat Window */
75
- #chatbot {
76
- height: 70vh !important;
77
- border-radius: 16px !important;
78
- border: 1px solid rgba(255,255,255,0.1) !important;
79
- background: #1e1e24 !important;
80
- box-shadow: 0 8px 32px 0 rgba(0,0,0,0.4) !important;
81
- }
82
-
83
- /* Chat Bubbles */
84
- #chatbot .bot-message, #chatbot .user-message {
85
- padding: 12px 18px !important;
86
- border-radius: 16px !important;
87
- font-size: 15px !important;
88
- line-height: 1.6 !important;
89
- }
90
- #chatbot .user-message {
91
- background: linear-gradient(135deg, #6d28d9, #4f46e5) !important;
92
- color: white !important;
93
- border: none !important;
94
- }
95
- #chatbot .bot-message {
96
- background: #2a2a35 !important;
97
- color: #e2e8f0 !important;
98
- border: none !important;
99
- }
100
-
101
- /* Input Box */
102
- .input-area textarea {
103
- border-radius: 12px !important;
104
- border: 1px solid rgba(255,255,255,0.1) !important;
105
- background: #2a2a35 !important;
106
- color: white !important;
107
- font-size: 15px !important;
108
- padding: 14px !important;
109
- }
110
- .input-area textarea:focus {
111
- border-color: #7c3aed !important;
112
- box-shadow: 0 0 0 2px rgba(124, 58, 237, 0.2) !important;
113
- }
114
-
115
- /* Buttons */
116
- .send-btn, .stop-btn {
117
- border-radius: 12px !important;
118
- font-weight: 600 !important;
119
- transition: all 0.2s ease !important;
120
- }
121
- .send-btn {
122
- background: linear-gradient(135deg, #7c3aed, #4f46e5) !important;
123
- border: none !important;
124
- }
125
- .send-btn:hover {
126
- transform: translateY(-1px);
127
- box-shadow: 0 4px 12px rgba(124, 58, 237, 0.4) !important;
128
- }
129
-
130
- /* Accordion / Settings Panel */
131
- .accordion {
132
- background: #1e1e24 !important;
133
- border: 1px solid rgba(255,255,255,0.05) !important;
134
- border-radius: 12px !important;
135
- }
136
-
137
- /* Sliders */
138
- input[type="range"] {
139
- accent-color: #7c3aed !important;
140
- }
141
-
142
- /* System Prompt Textbox */
143
- .system-prompt textarea {
144
- background: #2a2a35 !important;
145
- border-radius: 8px !important;
146
- border: 1px solid rgba(255,255,255,0.1) !important;
147
- }
148
  """
149
 
150
- # ─── Build UI ─────────────────────────────────────────────────────────────────
151
- with gr.Blocks(title="Gemma 4 | 12B CPU Engine", css=CSS) as demo:
152
-
153
- # Sleek Header
154
- gr.HTML("""
155
- <div style="text-align: center; margin-bottom: 20px;">
156
- <h1 style="font-size: 28px; font-weight: 800; background: linear-gradient(90deg, #a78bfa, #818cf8, #6366f1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin-bottom: 6px;">
157
- Gemma-4 12B
158
- </h1>
159
- <p style="color: #94a3b8; font-size: 14px; margin: 0;">
160
- Q4_K_M Quantized · Auto-Threaded CPU · Locked in RAM
161
- </p>
162
- </div>
163
- """)
164
-
165
- # Main Chat Column
166
- with gr.Column():
167
- chatbot = gr.Chatbot(elem_id="chatbot", placeholder="Ask me anything...")
168
-
169
- with gr.Row(equal_height=True):
170
- msg_input = gr.Textbox(
171
- placeholder="Type your message...",
172
- show_label=False,
173
- scale=8,
174
- autofocus=True,
175
- elem_classes="input-area"
176
- )
177
- send_btn = gr.Button("Send", variant="primary", scale=1, elem_classes="send-btn")
178
-
179
- # Collapsible Advanced Settings
180
- with gr.Accordion("⚙️ Advanced Settings", open=False):
181
- with gr.Column():
182
- system_prompt = gr.Textbox(
183
- label="System Prompt",
184
- value="You are a helpful, harmless, and honest assistant.",
185
- lines=3,
186
- elem_classes="system-prompt"
187
- )
188
-
189
- with gr.Row():
190
- temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
191
- top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
192
-
193
- with gr.Row():
194
- top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
195
- repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
196
-
197
- max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64)
198
-
199
- # Event Wiring
200
  for evt in [msg_input.submit, send_btn.click]:
201
- evt(
202
- respond,
203
- [msg_input, chatbot, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty],
204
- chatbot
205
- ).then(lambda: "", None, msg_input)
206
 
207
- # ─── Launch with Theme ────────────────────────────────────────────────────────
208
  if __name__ == "__main__":
209
- # Applying a sleek dark base theme via launch()
210
- custom_theme = gr.themes.Base(
211
- primary_hue="purple",
212
- secondary_hue="slate",
213
- neutral_hue="slate",
214
- font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"]
215
- )
216
-
217
  demo.launch(
218
  server_name="0.0.0.0",
219
  server_port=7860,
220
- theme=custom_theme,
221
  css=CSS
222
  )
 
1
  import os
2
+ # Force Gradio to skip the slow Node.js frontend build on 2-core CPU
3
  os.environ["GRADIO_SSR"] = "0"
4
 
5
  import gradio as gr
6
  from llama_cpp import Llama
7
 
8
+ # ─── Extreme 2-Thread / 16GB RAM Optimization ────────────────────────────────
9
+ # RAM Budget (16 GB Total):
10
+ # Model (Q4_K_M) ≈ 7.4 GB
11
+ # KV Cache (q8_0, 4096) ≈ 1.3 GB
12
+ # OS / Gradio / Python ≈ 3.0 GB
13
+ # Safety Headroom ≈ 4.3 GB
14
+ # CPU Budget (2 Threads):
15
+ # Locked to 2 threads to match the 2 available cores and avoid thread oversubscription.
16
+ # ──────────────────────────────────────────────────────────────────────────────
17
 
18
+ N_THREADS = 2
19
+
20
+ print("Optimizing for 16GB RAM / 2 CPU Cores...")
21
 
22
  llm = Llama.from_pretrained(
23
  repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
24
  filename="gemma4-v2-Q4_K_M.gguf",
25
 
26
  # ── Context & Memory ───────────────────────────────────────────────
27
+ n_ctx=4096, # Fits comfortably in 16GB RAM
28
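+ cache_type="q8_0", # Intended to roughly halve KV cache RAM vs f16 at a small quality cost. NOTE(review): llama-cpp-python documents type_k/type_v for KV quantization; confirm cache_type is actually honored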
29
  use_mlock=True, # Lock weights in physical RAM
30
  use_mmap=True, # Efficient memory mapping
31
 
32
+ # ── Extreme CPU Tuning ─────────────────────────────────────────────
33
+ n_gpu_layers=0, # CPU only
34
+ n_threads=N_THREADS, # Exact core count (no context switching)
35
+ n_threads_batch=N_THREADS, # Match batch threads to core count
36
+ n_batch=512, # Sweet spot for L1/L2 cache on 2 cores
37
+ n_ubatch=32, # Small physical micro-batch to keep peak compute-buffer memory low (a trade-off, not zero overhead)
38
 
39
  verbose=False,
40
  chat_format="gemma",
41
  )
42
 
43
+ print("Model loaded & locked | 2 Threads | 4096 ctx | q8_0 KV Cache")
44
 
45
 
46
+ # ─── Chat Function (Gradio 6 Dict Format) ────────────────────────────────────
47
  def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
48
+ # 1. Build messages for llama.cpp
49
  messages = []
50
  if system_prompt.strip():
51
  messages.append({"role": "system", "content": system_prompt.strip()})
52
 
53
+ # Gradio 6 passes history as a list of dicts: [{"role": "user", "content": "..."}]
54
  messages.extend(history)
55
  messages.append({"role": "user", "content": message})
56
 
57
+ # 2. Stream tokens
58
  response = ""
59
  for chunk in llm.create_chat_completion(
60
  messages=messages,
 
69
  token = delta.get("content", "")
70
  if token:
71
  response += token
72
+ # 3. Yield the EXACT format Gradio 6 requires
73
  yield history + [
74
  {"role": "user", "content": message},
75
  {"role": "assistant", "content": response}
76
  ]
77
 
78
 
79
+ # ─── UI ───────────────────────────────────────────────────────────────────────
80
  CSS = """
81
+ .gradio-container { max-width: 1100px !important; }
82
+ #chatbot { height: 650px !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  """
84
 
85
+ with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
86
+ gr.Markdown("# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n**2 Cores** · **16GB RAM** · **q8_0 KV** · **4096 ctx**")
87
+
88
+ with gr.Row(equal_height=False):
89
+ with gr.Column(scale=5):
90
+ chatbot = gr.Chatbot(elem_id="chatbot")
91
+ with gr.Row():
92
+ msg_input = gr.Textbox(placeholder="Type your message…", show_label=False, scale=5, autofocus=True)
93
+ send_btn = gr.Button("Send", variant="primary", scale=1)
94
+
95
+ with gr.Column(scale=2):
96
+ system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful, harmless, and honest assistant.", lines=4)
97
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
98
+ max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64)
99
+ top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
100
+ top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
101
+ repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
102
+ clear_btn = gr.Button("🗑 Clear Chat", variant="secondary")
103
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  for evt in [msg_input.submit, send_btn.click]:
105
+ evt(respond, [msg_input, chatbot, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty], chatbot).then(lambda: "", None, msg_input)
106
+ clear_btn.click(lambda: None, None, chatbot, queue=False)
 
 
 
107
 
 
108
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
109
  demo.launch(
110
  server_name="0.0.0.0",
111
  server_port=7860,
112
+ theme=gr.themes.Soft(primary_hue="indigo"),
113
  css=CSS
114
  )