import os
import re
import json
import time
import uuid
import datetime
import html as _html
from pathlib import Path
# --- Preload CUDA runtime libs before importing llama_cpp ---
# The cu124 llama-cpp-python wheel's libllama.so needs libcudart.so.12 /
# libcublas at import time. On ZeroGPU those aren't on the default loader
# path, so we dlopen the pip-provided nvidia libs (cudart first) globally.
import ctypes
import glob
import site
def _preload_cuda():
    """Best-effort dlopen of the pip-installed NVIDIA shared libraries.

    Every ``nvidia/*/lib/*.so*`` file found under the interpreter's
    site-packages directories (system and, when available, user) is loaded
    with ``RTLD_GLOBAL``. Paths containing ``cuda_runtime`` are loaded first,
    then paths containing ``cublas``, then everything else. A library that
    fails to load is skipped without reporting.
    """
    bases = set(site.getsitepackages())
    try:
        bases.add(site.getusersitepackages())
    except Exception:
        # The user site directory is optional; carry on without it.
        pass

    libs = [
        candidate
        for base in bases
        for candidate in glob.glob(
            os.path.join(base, "nvidia", "*", "lib", "*.so*")
        )
    ]

    # (substring of the path, load rank) -- first matching entry wins.
    load_order = (("cuda_runtime", 0), ("cublas", 1))

    def _rank(path):
        return next(
            (rank for marker, rank in load_order if marker in path),
            2,
        )

    for lib_path in sorted(set(libs), key=_rank):
        try:
            ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
        except OSError:
            # Not every bundled .so is loadable on its own; ignore those.
            pass
# Run the preload now, ahead of the `from llama_cpp import Llama` import
# below, so the CUDA runtime libraries are already loaded when it happens.
_preload_cuda()
import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# ---- feedback logging (JSONL, synced to a private HF dataset) ----
# NB: huggingface_hub.CommitScheduler's background thread breaks under ZeroGPU's
# process forking ("Invalid file descriptor: -1"), so we append locally and push
# the file synchronously from the main process instead.
from huggingface_hub import HfApi
# Hub repo id that receives the feedback log; overridable via the environment.
FEEDBACK_REPO = os.environ.get("FEEDBACK_REPO", "AlexWortega/my-pi-agent-feedback")
# Local directory (relative to the working directory) holding the JSONL log.
_FB_DIR = Path("feedback")
_FB_DIR.mkdir(exist_ok=True)
# A new random file name is generated each time this module is loaded --
# presumably so separate runs never overwrite each other's file in the repo.
_FB_FILE = _FB_DIR / f"log_{uuid.uuid4().hex}.jsonl"
# Destination path of that file inside the repo.
_FB_PATH_IN_REPO = f"data/{_FB_FILE.name}"
# Without a (non-empty) HF_TOKEN no API client is created and `_api` is None.
_HF_TOKEN = os.environ.get("HF_TOKEN")
_api = HfApi(token=_HF_TOKEN) if _HF_TOKEN else None
# Startup banner stating where feedback will end up.
print("feedback ->", FEEDBACK_REPO if _api else "(local only, no HF_TOKEN)", flush=True)
def _log(record):
    """Append one feedback record to the local JSONL log, then sync it.

    The record is written as a single JSON line to ``_FB_FILE`` with a UTC
    ISO-8601 ``"ts"`` field placed first; a ``"ts"`` key already present in
    ``record`` takes precedence over the generated one. When an API client is
    configured, the whole log file is then uploaded to the ``FEEDBACK_REPO``
    dataset.

    Failures of the local write or of the upload are printed, not raised. If
    the local write fails, no upload is attempted.

    Args:
        record: Mapping of JSON-serialisable fields to store.
    """
    stamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    entry = {"ts": stamp, **record}

    try:
        with _FB_FILE.open("a", encoding="utf-8") as sink:
            sink.write(json.dumps(entry, ensure_ascii=False) + "\n")
    except Exception as err:  # noqa: BLE001
        print("local log failed:", repr(err)[:120], flush=True)
        return

    if _api is None:
        # Local-only mode: nothing to push.
        return

    try:
        _api.upload_file(
            path_or_fileobj=str(_FB_FILE),
            path_in_repo=_FB_PATH_IN_REPO,
            repo_id=FEEDBACK_REPO,
            repo_type="dataset",
            commit_message="feedback log",
        )
    except Exception as err:  # noqa: BLE001
        print("dataset upload failed:", repr(err)[:160], flush=True)
# ---- model (GGUF pulled from the Hub at startup, runs on ZeroGPU) ----
# Hub repo and file name of the GGUF weights; both overridable via the environment.
GGUF_REPO = os.environ.get("GGUF_REPO", "AlexWortega/qwen35-4b-soyuz-merged-gguf")
GGUF_FILE = os.environ.get("GGUF_FILE", "qwen35-4b-soyuz-merged.nomtp.Q4_K_M.gguf")
# Passed to Llama as n_ctx. int() raises ValueError here, at import time, if
# the N_CTX environment variable is set to something that is not an integer.
N_CTX = int(os.environ.get("N_CTX", "16384"))
print("Downloading GGUF from the Hub ...", flush=True)
# Runs at import time; MODEL_PATH is later handed to Llama as model_path.
MODEL_PATH = hf_hub_download(GGUF_REPO, GGUF_FILE)
print("GGUF ready at", MODEL_PATH, flush=True)
# Process-wide model instance, created on the first _get_llm() call.
_LLM = None


def _get_llm():
    """Return the shared ``Llama`` instance, constructing it on first use.

    The model is loaded from ``MODEL_PATH`` with ``n_gpu_layers=-1``, a
    context size of ``N_CTX`` and ``verbose=False``. Later calls return the
    same object without reloading.
    """
    global _LLM
    if _LLM is not None:
        return _LLM

    _LLM = Llama(
        model_path=MODEL_PATH,
        n_gpu_layers=-1,
        n_ctx=N_CTX,
        verbose=False,
    )
    return _LLM
# A closed reasoning block: <think> ... </think> (may span several lines).
_THINK = re.compile(r"<think>(.*?)</think>", re.DOTALL)
# A fenced code block: optional language tag, then the body.
_CODE_BLOCK = re.compile(r"```([\w+-]*)\s*\n(.*?)```", re.DOTALL)


def _split(text):
    """Separate the model's reasoning from its visible answer.

    Every closed ``<think>...</think>`` block is removed from the answer and
    collected as reasoning. An opening ``<think>`` left without a closing tag
    (for example while output is still being generated) is handled too:
    everything after it is treated as reasoning and everything before it as
    the answer.

    Args:
        text: Raw model output.

    Returns:
        A ``(clean_answer, thinking)`` tuple of stripped strings. Multiple
        reasoning segments are joined with a blank line; ``thinking`` is
        ``""`` when the text contains no reasoning.
    """
    think_parts = _THINK.findall(text)
    answer = _THINK.sub("", text)

    # Whatever opener survives the substitution has no matching closer.
    visible, opener, dangling = answer.partition("<think>")
    if opener:
        think_parts.append(dangling)
        answer = visible

    thinking = "\n\n".join(part.strip() for part in think_parts).strip()
    return answer.strip(), thinking
def _extract_doc(answer):
"""Assemble a single self-contained HTML document from the answer's
HTML/CSS/JS code blocks, to render in the preview iframe."""
htmls, csss, jss = [], [], []
for lang, body in _CODE_BLOCK.findall(answer):
l = (lang or "").lower().strip()
b = body.strip()
if not b:
continue
low = b.lower()
if l in ("html", "htm") or "" in b:
htmls.append(b)
doc = htmls[0] if htmls else ""
if not doc and (csss or jss):
doc = ""
if not doc:
return ""
if "\n"
+ doc
+ "\n"
)
if csss and "