File size: 11,418 Bytes
3068971
 
 
 
 
 
0abb106
3068971
 
 
26fd9b6
 
 
dbe6259
0abb106
 
 
 
 
 
dbe6259
 
 
0abb106
dbe6259
26fd9b6
3068971
26fd9b6
f7e811f
3068971
 
 
 
 
 
 
dbe6259
3068971
 
 
 
 
 
 
 
dbe6259
3068971
dbe6259
 
3068971
0abb106
3068971
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbe6259
3068971
 
 
 
 
 
 
 
dbe6259
3068971
dbe6259
3068971
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbe6259
3068971
dbe6259
 
3068971
dbe6259
26fd9b6
 
 
3068971
 
 
 
 
 
 
26fd9b6
3068971
 
 
dbe6259
3068971
 
 
e77443a
dbe6259
0abb106
dbe6259
3068971
 
 
dbe6259
3068971
 
 
 
 
0abb106
 
3068971
 
 
 
 
 
 
dbe6259
3068971
 
 
 
dbe6259
 
 
3068971
 
 
 
 
dbe6259
 
3068971
 
dbe6259
3068971
 
 
dbe6259
3068971
dbe6259
 
3068971
 
dbe6259
3068971
 
dbe6259
 
 
 
 
 
3068971
 
 
 
 
 
dbe6259
 
 
3068971
e77443a
dbe6259
3068971
 
 
 
dbe6259
3068971
 
 
03e7ef9
dbe6259
 
3068971
 
 
dbe6259
 
3068971
dbe6259
3068971
26fd9b6
03e7ef9
 
dbe6259
 
26fd9b6
3068971
03e7ef9
3068971
dbe6259
 
 
 
3068971
26fd9b6
3068971
dbe6259
 
 
 
 
 
 
 
03e7ef9
dbe6259
 
 
26fd9b6
 
dbe6259
 
 
3068971
 
dbe6259
 
3068971
dbe6259
3068971
 
dbe6259
3068971
dbe6259
 
3068971
 
dbe6259
 
3068971
dbe6259
3068971
dbe6259
 
3068971
dbe6259
3068971
 
 
 
dbe6259
3068971
 
 
dbe6259
 
 
3068971
dbe6259
3068971
daa3fb2
 
 
dbe6259
3068971
dbe6259
 
 
 
 
 
 
 
 
 
 
3068971
 
dbe6259
3068971
03e7ef9
0abb106
dbe6259
03e7ef9
 
0abb106
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
import json
import logging
import os
import threading
import time
from datetime import datetime
from typing import Dict, Generator, List, Optional, Tuple

import gradio as gr
import psutil
import pytz
from huggingface_hub import HfApi, hf_hub_download

# --- KERNEL INITIALIZATION ---
try:
    # Preferred backend: the llama-cpp-python bindings.
    from llama_cpp import Llama
except ImportError:
    try:
        # Second choice when `llama_cpp` is not importable.
        # NOTE(review): assumes llama_cpp_pydist exports a compatible `Llama` — confirm.
        from llama_cpp_pydist import Llama
    except ImportError:
        class Llama:
            """Placeholder defined when neither backend can be imported.

            It keeps this module importable; constructing it always raises
            ImportError.
            """

            def __init__(self, *args, **kwargs):
                """Reject construction regardless of the arguments given."""
                raise ImportError("Kernel Binary Missing. Ensure llama-cpp-python is installed.")

# --- CONFIGURATION ---
# Hub credentials and Space identity come from the environment; each one is
# None when its variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
SPACE_ID = os.getenv("SPACE_ID")

# Local JSON file holding the telemetry counters.
LOG_FILE = "engine_telemetry.json"

# Memory guard rails checked before a model is loaded: the largest share of
# total RAM a model file may occupy, and the extra headroom (in MB) required
# on top of the file size.
RAM_LIMIT_PCT = 0.50
SYSTEM_RESERVE_MB = 250

# Default Hub repository and GGUF filename.
DEFAULT_MODEL = "unsloth/Llama-3.2-1B-Instruct-GGUF"
DEFAULT_QUANT = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

logging.basicConfig(
    format="%(asctime)s - ZEROENGINE - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- TELEMETRY MODULE ---
class TelemetryManager:
    """Keep usage counters in memory and mirror them to the Space repository.

    Counters live in ``self.stats`` (a plain dict). They are seeded from
    ``LOG_FILE`` when that file exists and holds a JSON object. On every
    tracked model load they are written back to ``LOG_FILE`` and uploaded
    through ``api.upload_file``, provided ``HF_TOKEN`` and ``SPACE_ID`` are
    both set.
    """

    def __init__(self, api: HfApi):
        """Store the Hub client and load (or create) the counters.

        Args:
            api: Hub client used by ``_sync_to_cloud`` to upload ``LOG_FILE``.
        """
        self.api = api
        self.stats = self._load_initial_stats()

    @staticmethod
    def _default_stats() -> Dict:
        """Return a fresh counter dict stamped with the current UTC time."""
        return {
            "session_start": str(datetime.now(pytz.utc)),
            "load_count": {},
            "total_tokens_generated": 0,
            "popular_repos": [],
        }

    def _load_initial_stats(self) -> Dict:
        """Load counters from ``LOG_FILE``, falling back to defaults.

        Values found in the file are laid over a complete default dict, so a
        file written by an older schema (or a truncated one) can never leave
        out a key that ``track_load`` / ``track_generation`` index directly.

        Returns:
            A dict that always contains every default key, with
            ``load_count`` a dict and ``total_tokens_generated`` a number.
        """
        stats = self._default_stats()
        if not os.path.exists(LOG_FILE):
            return stats

        try:
            with open(LOG_FILE, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            logger.warning("Ignoring unreadable telemetry file %s: %s", LOG_FILE, e)
            return stats

        if not isinstance(loaded, dict):
            logger.warning("Ignoring telemetry file %s: top level is not an object", LOG_FILE)
            return stats

        stats.update(loaded)
        # Repair fields whose type would break the tracking methods.
        if not isinstance(stats.get("load_count"), dict):
            stats["load_count"] = {}
        if not isinstance(stats.get("total_tokens_generated"), (int, float)):
            stats["total_tokens_generated"] = 0
        return stats

    def track_load(self, repo: str, filename: str):
        """Count one load of ``repo/filename`` and push the counters out.

        Args:
            repo: Hub repository id of the loaded model.
            filename: Name of the model file inside that repository.
        """
        key = f"{repo}/{filename}"
        self.stats["load_count"][key] = self.stats["load_count"].get(key, 0) + 1
        self._sync_to_cloud()

    def track_generation(self, tokens: int):
        """Add ``tokens`` to the running total (in memory only, no sync)."""
        self.stats["total_tokens_generated"] += tokens

    def _sync_to_cloud(self):
        """Write the counters to ``LOG_FILE`` and upload it to the Space.

        Does nothing when ``HF_TOKEN`` or ``SPACE_ID`` is missing. Failures
        are logged and swallowed on purpose: telemetry is best-effort and
        must not break a model load.
        """
        if not HF_TOKEN or not SPACE_ID:
            return
        try:
            with open(LOG_FILE, "w", encoding="utf-8") as f:
                json.dump(self.stats, f, indent=4)
            self.api.upload_file(
                path_or_fileobj=LOG_FILE,
                path_in_repo=LOG_FILE,
                repo_id=SPACE_ID,
                repo_type="space",
            )
        except Exception as e:
            logger.error("Sync Failure: %s", e)

# --- RESOURCE MONITOR ---
class ResourceMonitor:
    """Static helpers that read host memory/CPU figures through psutil."""

    @staticmethod
    def get_metrics() -> Dict:
        """Return a snapshot of RAM and CPU usage.

        Returns:
            Dict with ``ram_used_gb``, ``ram_avail_gb``, ``ram_total_gb``
            (GiB, rounded to 2 decimals), ``ram_pct``, ``cpu_usage_pct`` and
            ``load_avg`` (first value of ``os.getloadavg()``, or 0 when it is
            missing on this platform or cannot be obtained).
        """
        vm = psutil.virtual_memory()
        gib = 1024 ** 3

        try:
            load_avg = os.getloadavg()[0]
        except (AttributeError, OSError):
            # AttributeError: platform has no getloadavg;
            # OSError: the load average was unobtainable.
            load_avg = 0

        return {
            "ram_used_gb": round(vm.used / gib, 2),
            "ram_avail_gb": round(vm.available / gib, 2),
            "ram_total_gb": round(vm.total / gib, 2),
            "ram_pct": vm.percent,
            "cpu_usage_pct": psutil.cpu_percent(interval=None),
            "load_avg": load_avg,
        }

    @staticmethod
    def validate_deployment(file_path: str) -> Tuple[bool, str]:
        """Check whether the model file at ``file_path`` fits in memory.

        Two rules are applied, in order: the file must not exceed
        ``RAM_LIMIT_PCT`` of total RAM, and the file size plus
        ``SYSTEM_RESERVE_MB`` must fit in currently available RAM.

        Args:
            file_path: Path of the model file on disk.

        Returns:
            ``(True, "Passed.")`` when both rules hold, otherwise
            ``(False, reason)``.

        Raises:
            OSError: If ``file_path`` does not exist or cannot be stat'ed.
        """
        vm = psutil.virtual_memory()
        mib = 1024 ** 2
        file_size_mb = os.path.getsize(file_path) / mib
        total_ram_mb = vm.total / mib
        avail_ram_mb = vm.available / mib

        if file_size_mb > (total_ram_mb * RAM_LIMIT_PCT):
            return False, f"Model size ({file_size_mb:.1f}MB) exceeds safety limit."
        if (file_size_mb + SYSTEM_RESERVE_MB) > avail_ram_mb:
            return False, f"Insufficient headroom for context (Need ~{file_size_mb+SYSTEM_RESERVE_MB:.1f}MB)."
        return True, "Passed."

# --- ENGINE CORE ---
class ZeroEngine:
    """Own the loaded model and expose scan / boot / prime / generate operations.

    ``kernel_lock`` serialises every use of the model object: loading a new
    model, background cache priming and streamed generation never overlap.
    """

    def __init__(self):
        """Create the Hub client and telemetry, starting with no model loaded."""
        self.api = HfApi(token=HF_TOKEN)
        self.telemetry = TelemetryManager(self.api)
        self.llm: Optional[Llama] = None
        self.active_model_info = {"repo": "", "file": ""}
        self.kernel_lock = threading.Lock()
        self.is_prefilling = False

    @staticmethod
    def _context_prefix(ghost_context: str) -> str:
        """Return the leading part of the prompt that precedes the user text.

        Shared by ``stitch_cache`` and ``inference_generator`` so the text
        evaluated while priming is literally the start of the prompt that
        generation sends later.
        """
        return f"User: {ghost_context}\n" if ghost_context else "User: "

    def list_ggufs(self, repo_id: str) -> List[str]:
        """List the ``.gguf`` files of a Hub repository.

        Args:
            repo_id: Repository id; surrounding whitespace is ignored.

        Returns:
            Matching file paths, or ``[]`` when ``repo_id`` is blank or the
            listing fails (the failure is logged).
        """
        if not repo_id or not repo_id.strip():
            return []
        try:
            files = self.api.list_repo_files(repo_id=repo_id.strip())
            return [f for f in files if f.lower().endswith(".gguf")]
        except Exception as e:
            logger.error("Scan error: %s", e)
            return []

    def boot_kernel(self, repo: str, filename: str) -> str:
        """Download ``filename`` from ``repo`` and load it as the active model.

        Args:
            repo: Hub repository id.
            filename: Model file inside the repository.

        Returns:
            A status string: the validation message when the memory check
            fails, ``🟢 KERNEL ONLINE: ...`` on success, or
            ``🔴 BOOT FAILURE: ...`` when anything raises.
        """
        try:
            logger.info("Downloading %s from %s...", filename, repo)
            path = hf_hub_download(repo_id=repo, filename=filename, token=HF_TOKEN)

            valid, msg = ResourceMonitor.validate_deployment(path)
            if not valid:
                return msg

            with self.kernel_lock:
                # Release the previous model before loading the next one.
                # Rebinding to None (instead of `del self.llm`) keeps the
                # attribute defined, so a failed load leaves the engine
                # cleanly "offline" rather than raising AttributeError on the
                # next access.
                self.llm = None
                self.active_model_info = {"repo": "", "file": ""}
                self.llm = Llama(
                    model_path=path,
                    n_ctx=2048,
                    n_threads=2,
                    use_mmap=True,
                    n_batch=512,
                    verbose=False,
                )
                self.active_model_info = {"repo": repo, "file": filename}

            # Telemetry may upload over the network; keep that outside the
            # lock so it cannot stall priming or generation.
            self.telemetry.track_load(repo, filename)

            return f"🟢 KERNEL ONLINE: {filename}"
        except Exception as e:
            return f"🔴 BOOT FAILURE: {str(e)}"

    def stitch_cache(self, ghost_text: str) -> str:
        """Start evaluating ``ghost_text`` in a background thread.

        The text is wrapped with ``_context_prefix`` and evaluated from a
        reset context, so the evaluated tokens line up with the beginning of
        the prompt built by ``inference_generator``.
        NOTE(review): this relies on llama-cpp-python reusing the evaluated
        state for a shared prompt prefix — confirm against the installed
        version.

        Args:
            ghost_text: Context to pre-evaluate.

        Returns:
            ``"Kernel Idle/Busy"`` when no model is loaded, the text is empty
            or priming is already running; otherwise ``"⚡ Ghost Cache Primed"``
            (returned as soon as the thread is started, not when it finishes).
        """
        llm = self.llm
        if not llm or not ghost_text or self.is_prefilling:
            return "Kernel Idle/Busy"

        # Claim the flag before the thread exists so a second click arriving
        # right away is rejected instead of starting a second evaluation.
        self.is_prefilling = True

        def _bg_eval():
            try:
                with self.kernel_lock:
                    if self.llm is not llm:
                        # The model was swapped or unloaded while we waited.
                        return
                    prefix = self._context_prefix(ghost_text)
                    tokens = llm.tokenize(prefix.encode("utf-8"))
                    # Start from an empty context so repeated priming
                    # replaces the previous state instead of piling onto it.
                    llm.reset()
                    llm.eval(tokens)
            except Exception as e:
                logger.error("KV Cache priming failed: %s", e)
            finally:
                self.is_prefilling = False

        try:
            threading.Thread(target=_bg_eval, daemon=True).start()
        except BaseException:
            # The worker never ran, so nobody else will clear the flag.
            self.is_prefilling = False
            raise
        return "⚡ Ghost Cache Primed"

    def inference_generator(self, prompt: str, history: List[Dict], ghost_context: str) -> Generator:
        """Stream a reply to ``prompt``, yielding the updated chat history.

        Args:
            prompt: User message. A blank prompt yields the history unchanged.
            history: Chat messages as ``{"role", "content"}`` dicts; ``None``
                is treated as an empty list. The list is extended in place.
            ghost_context: Optional context placed before the prompt.

        Yields:
            The history list after each update: once with a ``"..."``
            placeholder, then once per streamed chunk with the text so far
            and a tokens-per-second suffix. Errors are reported as the
            assistant message instead of being raised.
        """
        if history is None:
            history = []

        if not self.llm:
            history.append({"role": "assistant", "content": "⚠️ Engine offline. BOOT a kernel first."})
            yield history
            return

        if not prompt or not prompt.strip():
            # Nothing to send; leave the conversation as it is.
            yield history
            return

        formatted_prompt = f"{self._context_prefix(ghost_context)}{prompt}\nAssistant: "

        # Show the user message and a placeholder immediately.
        history.append({"role": "user", "content": prompt})
        history.append({"role": "assistant", "content": "..."})
        yield history

        response_text = ""
        tokens_count = 0

        try:
            # Held for the whole stream so priming or a model swap cannot
            # touch the model mid-generation. If the consumer abandons the
            # generator, closing it unwinds this `with` and frees the lock.
            with self.kernel_lock:
                llm = self.llm
                if llm is None:
                    raise RuntimeError("Kernel was unloaded before generation started.")

                start_time = time.perf_counter()
                stream = llm(
                    formatted_prompt,
                    max_tokens=1024,
                    stop=["User:", "<|eot_id|>", "\n\n"],
                    stream=True,
                )

                for chunk in stream:
                    response_text += chunk["choices"][0]["text"]
                    # Each streamed chunk is counted as one token.
                    tokens_count += 1

                    elapsed = time.perf_counter() - start_time
                    tps = round(tokens_count / elapsed, 1) if elapsed > 0 else 0

                    history[-1]["content"] = f"{response_text}\n\n`[{tps} t/s]`"
                    yield history

            self.telemetry.track_generation(tokens_count)
        except Exception as e:
            history[-1]["content"] = f"🔴 Runtime Error: {str(e)}"
            yield history

# --- UI INTERFACE ---
kernel = ZeroEngine()

# Removed 'theme' from gr.Blocks constructor (Moved to .launch())
with gr.Blocks(title="ZeroEngine Kernel 6.5") as demo:
    gr.HTML("<div style='text-align: center; border-bottom: 2px solid #333; margin-bottom: 20px;'><h1>🛰️ ZEROENGINE V0.1</h1><p>Gradio 6.5.0 Production Build</p></div>")

    with gr.Row():
        with gr.Column(scale=8):
            # FIXED: Removed 'type="messages"' (deprecated/auto-detected in 6.5.0)
            chat_box = gr.Chatbot(
                label="Main Engine Feedback",
                height=650,
                show_label=False,
                autoscroll=True
            )

            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="Input command...",
                    label="Terminal",
                    container=False,
                    scale=9
                )
                send_btn = gr.Button("EXE", variant="primary", scale=1)

        with gr.Sidebar(label="Engine Room", open=True, width=350):
            gr.Markdown("### 🛠️ Hardware Status")
            ram_metric = gr.Label(label="RAM Usage", value="0/0 GB")
            cpu_metric = gr.Label(label="CPU Load", value="0%")

            gr.Markdown("---")
            gr.Markdown("### 📡 Model Control")
            repo_input = gr.Textbox(label="HuggingFace Repo", value=DEFAULT_MODEL)
            quant_dropdown = gr.Dropdown(label="Available Quants", choices=[])

            with gr.Row():
                scan_btn = gr.Button("SCAN", size="sm")
                boot_btn = gr.Button("BOOT", variant="primary", size="sm")

            boot_status = gr.Markdown("Status: `STANDBY`")

            gr.Markdown("---")
            gr.Markdown("### 👻 Ghost Cache")
            ghost_buffer = gr.Textbox(
                label="Background Context",
                placeholder="Queue priming tokens here...",
                lines=3
            )
            stitch_status = gr.Markdown("Cache: `EMPTY`")
            stitch_btn = gr.Button("STITCH", size="sm")

            log_output = gr.Code(label="Kernel Logs", language="shell", value="[INIT] System Ready.")

    # --- UI LOGIC ---
    def update_stats():
        """Return the (RAM, CPU) label texts shown in the sidebar."""
        m = ResourceMonitor.get_metrics()
        return f"{m['ram_used_gb']}/{m['ram_total_gb']} GB", f"{m['cpu_usage_pct']}%"

    def on_scan(repo):
        """Fill the quant dropdown with the repo's GGUF files and report the count."""
        files = kernel.list_ggufs(repo)
        if not files:
            return gr.update(choices=[], value=None), "No GGUFs found in repo."
        return gr.update(choices=files, value=files[0]), f"Found {len(files)} quants."

    def on_boot(repo, file):
        """Boot the selected model, streaming status updates to the UI.

        Yields (boot_status, log_output) pairs; the log is left untouched.
        """
        # This handler is a generator, so every status has to be yielded: a
        # plain `return value` would just end the generator without emitting
        # any update, and the "Selection Missing" notice would never appear.
        if not repo or not file:
            yield "Selection Missing", gr.update()
            return
        yield "System: Booting Kernel...", gr.update()
        yield kernel.boot_kernel(repo, file), gr.update()

    # FIXED: Use gr.Timer for periodic updates (Gradio 6.5.0 compatible)
    timer = gr.Timer(value=2)
    timer.tick(update_stats, None, [ram_metric, cpu_metric])

    scan_btn.click(on_scan, [repo_input], [quant_dropdown, log_output])
    boot_btn.click(on_boot, [repo_input, quant_dropdown], [boot_status, log_output])

    stitch_btn.click(
        lambda x: f"Cache: `{kernel.stitch_cache(x)}`",
        [ghost_buffer],
        [stitch_status]
    )

    inference_args = [user_input, chat_box, ghost_buffer]
    user_input.submit(kernel.inference_generator, inference_args, [chat_box])
    send_btn.click(kernel.inference_generator, inference_args, [chat_box])
    # Clear the textbox after sending, whichever way the message was sent
    # (Enter or the EXE button).
    user_input.submit(lambda: "", None, [user_input])
    send_btn.click(lambda: "", None, [user_input])

# --- LAUNCH ---
if __name__ == "__main__":
    # Enable the request queue (at most 20 waiting events), then serve on all
    # interfaces without a public share link. The theme is supplied at launch
    # time rather than to gr.Blocks(), as the Gradio 6 note on the Blocks
    # constructor in this file describes.
    queued_app = demo.queue(max_size=20)
    app_theme = gr.themes.Monochrome(primary_hue="blue", radius_size="none")
    queued_app.launch(server_name="0.0.0.0", share=False, theme=app_theme)