import os
import re
import json
import time
import uuid
import datetime
import html as _html
from pathlib import Path

# --- Preload CUDA runtime libs before importing llama_cpp ---
# The cu124 llama-cpp-python wheel's libllama.so needs libcudart.so.12 /
# libcublas at import time. On ZeroGPU those aren't on the default loader
# path, so we dlopen the pip-provided nvidia libs (cudart first) globally.
import ctypes
import glob
import site


def _preload_cuda():
    """dlopen every pip-installed ``nvidia/*/lib/*.so*`` with RTLD_GLOBAL.

    Searches all site-packages directories (plus the user site, when it can
    be determined) and loads the shared objects found there. Paths containing
    ``cuda_runtime`` are loaded first, then ``cublas``, then everything else.

    Loading is best-effort: a library that cannot be opened is skipped, never
    raised. Returns ``None``.
    """
    bases = set(site.getsitepackages())
    try:
        bases.add(site.getusersitepackages())
    except Exception:  # noqa: BLE001 - user site is optional; ignore if unavailable
        pass

    found = set()
    for base in bases:
        found.update(glob.glob(os.path.join(base, "nvidia", "*", "lib", "*.so*")))

    priority = {"cuda_runtime": 0, "cublas": 1}

    def _rank(path):
        # First matching substring wins; anything unlisted loads last.
        for name, rank in priority.items():
            if name in path:
                return rank
        return 2

    # Tie-break on the path itself: iterating a set of strings is not stable
    # between interpreter runs, which made the load order (and therefore which
    # libraries happened to load) vary from one start to the next.
    pending = sorted(found, key=lambda p: (_rank(p), p))

    # A library can fail to open only because one of its dependencies has not
    # been loaded yet, so retry the failures until a whole pass makes no
    # progress. `pending` strictly shrinks on every retry, so this terminates.
    while pending:
        failed = []
        for so in pending:
            try:
                ctypes.CDLL(so, mode=ctypes.RTLD_GLOBAL)
            except OSError:
                failed.append(so)
        if len(failed) == len(pending):
            break
        pending = failed


_preload_cuda()

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# ---- feedback logging (JSONL, synced to a private HF dataset) ----
# NB: huggingface_hub.CommitScheduler's background thread breaks under ZeroGPU's
# process forking ("Invalid file descriptor: -1"), so we append locally and push
# the file synchronously from the main process instead.
from huggingface_hub import HfApi

FEEDBACK_REPO = os.environ.get("FEEDBACK_REPO", "AlexWortega/my-pi-agent-feedback")
_FB_DIR = Path("feedback")
_FB_DIR.mkdir(exist_ok=True)
_FB_FILE = _FB_DIR / f"log_{uuid.uuid4().hex}.jsonl"
_FB_PATH_IN_REPO = f"data/{_FB_FILE.name}"
_HF_TOKEN = os.environ.get("HF_TOKEN")
_api = HfApi(token=_HF_TOKEN) if _HF_TOKEN else None
print("feedback ->", FEEDBACK_REPO if _api else "(local only, no HF_TOKEN)", flush=True)


def _log(record):
    """Append *record* to the local JSONL log and push the file to the dataset.

    A UTC ``ts`` field is added first; a ``ts`` key already present in
    *record* overrides it. Both the local write and the upload are
    best-effort: failures are printed and swallowed. If the local write
    fails, no upload is attempted. The upload only happens when ``HF_TOKEN``
    was set at import time.
    """
    record = {
        "ts": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        **record,
    }
    try:
        with _FB_FILE.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    except Exception as e:  # noqa: BLE001
        print("local log failed:", repr(e)[:120], flush=True)
        return
    if _api is not None:
        try:
            # Re-uploads the whole file each time; it is one file per process.
            _api.upload_file(
                path_or_fileobj=str(_FB_FILE),
                path_in_repo=_FB_PATH_IN_REPO,
                repo_id=FEEDBACK_REPO,
                repo_type="dataset",
                commit_message="feedback log",
            )
        except Exception as e:  # noqa: BLE001
            print("dataset upload failed:", repr(e)[:160], flush=True)


# ---- model (GGUF pulled from the Hub at startup, runs on ZeroGPU) ----
GGUF_REPO = os.environ.get("GGUF_REPO", "AlexWortega/qwen35-4b-soyuz-merged-gguf")
GGUF_FILE = os.environ.get("GGUF_FILE", "qwen35-4b-soyuz-merged.nomtp.Q4_K_M.gguf")
N_CTX = int(os.environ.get("N_CTX", "16384"))

print("Downloading GGUF from the Hub ...", flush=True)
MODEL_PATH = hf_hub_download(GGUF_REPO, GGUF_FILE)
print("GGUF ready at", MODEL_PATH, flush=True)

_LLM = None


def _get_llm():
    """Return the process-wide ``Llama`` instance, creating it on first use."""
    global _LLM
    if _LLM is None:
        _LLM = Llama(
            model_path=MODEL_PATH,
            n_gpu_layers=-1,
            n_ctx=N_CTX,
            verbose=False,
        )
    return _LLM


# The tag literals below had been stripped from the file; without them the
# pattern matched the empty string and reasoning was never separated out.
_THINK_OPEN = "<think>"
_THINK_CLOSE = "</think>"
_THINK = re.compile(r"<think>(.*?)</think>", re.DOTALL)
_CODE_BLOCK = re.compile(r"```([\w+-]*)\s*\n(.*?)```", re.DOTALL)
# Something that looks like an HTML tag, e.g. "<div>", "</p>", "<!doctype html>".
_MARKUP = re.compile(r"<[a-zA-Z!/][^>]*>")


def _split(text):
    """Return (clean_answer, thinking). Handles an unterminated <think>.

    Every completed ``<think>...</think>`` span is removed from the answer and
    collected as reasoning. If an opening tag is left over afterwards (the
    model is still mid-thought while streaming), everything after it is
    treated as reasoning and everything before it as the answer.
    """
    think_parts = _THINK.findall(text)
    answer = _THINK.sub("", text)
    # Whatever opening tag survives the substitution has no closing partner.
    head, opener, tail = answer.partition(_THINK_OPEN)
    if opener:
        think_parts.append(tail)
        answer = head
    thinking = "\n\n".join(p.strip() for p in think_parts).strip()
    return answer.strip(), thinking


def _extract_doc(answer):
    """Assemble a single self-contained HTML document from the answer's
    HTML/CSS/JS code blocks, to render in the preview iframe.

    Returns ``""`` when the answer contains no usable code block.

    NOTE(review): the markup literals and part of the classification logic in
    this function were lost from the file; the conditions and the wrapper
    skeleton below are a reconstruction - confirm against the original.
    """
    htmls, csss, jss = [], [], []
    for lang, body in _CODE_BLOCK.findall(answer):
        tag = (lang or "").lower().strip()
        block = body.strip()
        if not block:
            continue
        low = block.lower()
        if tag in ("html", "htm") or "<!doctype html" in low or "<html" in low:
            htmls.append(block)
        elif tag == "css":
            csss.append(block)
        elif tag in ("js", "javascript"):
            jss.append(block)
        elif not tag and _MARKUP.search(block):
            # Unlabelled fence that nevertheless contains tags.
            htmls.append(block)

    doc = htmls[0] if htmls else ""
    if not doc and (csss or jss):
        # CSS/JS only: give the script something to mount into.
        doc = '<div id="app"></div>'
    if not doc:
        return ""

    if "<html" not in doc.lower():
        doc = (
            '<!doctype html>\n<html>\n<head><meta charset="utf-8"></head>\n<body>\n'
            + doc
            + "\n</body>\n</html>"
        )
    if csss and "<style" not in doc.lower():
        style = "<style>\n" + "\n".join(csss) + "\n</style>"
        doc = doc.replace("</head>", style + "</head>", 1) if "</head>" in doc else style + doc
    if jss and "<script" not in doc.lower():
        script = "<script>\n" + "\n".join(jss) + "\n</script>"
        doc = doc.replace("</body>", script + "</body>", 1) if "</body>" in doc else doc + script
    return doc


# NOTE(review): wrapper markup/styling reconstructed; only the sentence survived.
_EMPTY_PREVIEW = (
    "<div style='padding:2rem;text-align:center;color:#888;'>"
    "The preview appears here once the model returns HTML/CSS/JS in a code block."
    "</div>"
)
_NO_REASONING = "(no reasoning in this turn)"


def _iframe(doc):
    """Return HTML embedding *doc* in an iframe, or the placeholder if empty.

    The document is HTML-escaped and passed through ``srcdoc``. The frame is
    sandboxed with scripts allowed but without same-origin access, because the
    document is model output.

    NOTE(review): iframe attributes/styling reconstructed - confirm.
    """
    if not doc or not doc.strip():
        return _EMPTY_PREVIEW
    esc = _html.escape(doc, quote=True)
    return (
        f'<iframe srcdoc="{esc}" sandbox="allow-scripts" '
        'style="width:100%;height:560px;border:1px solid #ddd;'
        'border-radius:8px;background:#fff;"></iframe>'
    )


DEFAULT_SYS = (
    "You are Soyuz, a helpful coding assistant. Reason briefly inside "
    "<think> ... </think>, then answer. When the user asks for a web app, page, "
    "game or visual UI, output ONE complete, self-contained HTML file with inline "
    "CSS and JavaScript inside a single ```html code block (no external files or "
    "CDNs unless necessary). Keep the thinking short so you have room for the code."
)


@spaces.GPU(duration=120)
def _stream(message, history, system_prompt, temperature, max_tokens):
    """Stream a chat completion, yielding ``(answer, raw, thinking)`` per chunk.

    *history* is a list of ``{"role", "content"}`` dicts; only non-empty user
    and assistant entries are forwarded. ``raw`` is the accumulated model
    output, ``answer``/``thinking`` are its split form. While the visible
    answer is still empty, an ellipsis is yielded in its place.
    """
    llm = _get_llm()
    msgs = []
    if system_prompt and system_prompt.strip():
        msgs.append({"role": "system", "content": system_prompt.strip()})
    for m in history:
        if m.get("role") in ("user", "assistant") and m.get("content"):
            msgs.append({"role": m["role"], "content": m["content"]})
    msgs.append({"role": "user", "content": message})

    raw = ""
    for chunk in llm.create_chat_completion(
        messages=msgs,
        max_tokens=int(max_tokens),
        temperature=float(temperature),
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"].get("content", "")
        if not delta:
            continue
        raw += delta
        answer, thinking = _split(raw)
        yield (answer if answer else "…"), raw, thinking


def respond(message, history, system_prompt, temperature, max_tokens, meta):
    """Gradio handler: stream one assistant turn and update every output.

    Yields 7-tuples matching ``outputs`` in the UI wiring: chat history,
    cleared message box, raw output, reasoning, code, preview HTML, and the
    per-turn metadata list. The finished turn is logged after the last yield.
    """
    meta = meta or []
    if not message or not message.strip():
        yield history or [], "", "", _NO_REASONING, "", _EMPTY_PREVIEW, meta
        return

    history = (history or []) + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": ""},
    ]
    prior = history[:-2]
    answer = raw = thinking = ""
    for answer, raw, thinking in _stream(
        message, prior, system_prompt, temperature, max_tokens
    ):
        history[-1]["content"] = answer
        code = _extract_doc(answer)
        # live-update chat / raw / reasoning / code; keep preview steady until done
        yield history, "", raw, (thinking or _NO_REASONING), code, _EMPTY_PREVIEW, meta

    doc = _extract_doc(answer)
    history[-1]["content"] = answer
    turn_id = uuid.uuid4().hex
    record = {
        "turn_id": turn_id,
        "event": "generation",
        "user": message,
        "answer": answer,
        "reasoning": thinking,
        "code": doc,
        "reaction": None,
    }
    meta = meta + [record]
    yield history, "", raw, (thinking or _NO_REASONING), doc, _iframe(doc), meta
    _log(record)  # after yield: upload doesn't block the visible response


def on_like(meta, evt: gr.LikeData):
    """Log a like/dislike for the turn the reacted-to message belongs to.

    A plain integer index is a message position (two messages per turn). A
    ``(row, col)`` index is treated as already being a turn number. An index
    with no matching entry in *meta* is still logged, with empty fields.
    Returns the status line shown under the chat.
    """
    reaction = "like" if evt.liked else "dislike"
    if isinstance(evt.index, int):
        turn = evt.index // 2
    elif isinstance(evt.index, (list, tuple)) and evt.index:
        turn = evt.index[0]
    else:
        turn = 0
    base = dict(meta[turn]) if (meta and 0 <= turn < len(meta)) else {}
    _log(
        {
            "turn_id": base.get("turn_id"),
            "event": "feedback",
            "reaction": reaction,
            "user": base.get("user"),
            "answer": base.get("answer"),
            "reasoning": base.get("reasoning"),
            "code": base.get("code"),
        }
    )
    emoji = "👍" if evt.liked else "👎"
    return f"Saved feedback {emoji} (turn {turn + 1})"


# NOTE(review): the card's markup was lost; text is original, layout/styling is
# reconstructed. "base model" used to be a link whose target is unknown - restore it.
_GGUF_URL = _html.escape("https://huggingface.co/" + GGUF_REPO, quote=True)
CARD = f"""
<div style="text-align:center;padding:0.75rem 0;">
  <div style="font-size:2.2rem;line-height:1;">🦀</div>
  <div style="font-size:1.4rem;font-weight:700;">My Pi Agent — Soyuz</div>
  <div><code>qwen35-4b-soyuz-merged</code> · Qwen3.5-4B (hybrid linear-attention) · GGUF Q4_K_M · runs on ZeroGPU via llama-cpp-python</div>
  <div>💬 chat &nbsp;·&nbsp; 🧠 shows the model's reasoning &nbsp;·&nbsp; 🖥 live HTML/JS preview (Artifacts-style) &nbsp;·&nbsp; 📝 text-only (no image input)</div>
  <div>base model · <a href="{_GGUF_URL}" target="_blank" rel="noopener">GGUF</a></div>
</div>
"""

with gr.Blocks(title="My Pi Agent — Soyuz", fill_height=True) as demo:
    gr.HTML(CARD)
    gr.Markdown(
        "Ask for a web app, page or game and the model writes a complete HTML file — "
        "the **🖥 Preview** tab runs it live. Tips: bump *Max tokens* for bigger apps."
    )
    meta_state = gr.State([])

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(height=480, label="Chat (use 👍 / 👎 on replies)")
            fb_status = gr.Markdown("", elem_id="fb_status")
            msg = gr.Textbox(
                placeholder="e.g. \"build a pomodoro timer in HTML/JS\" (Enter to send)",
                label="Message",
                autofocus=True,
            )
            with gr.Row():
                send = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=3):
            with gr.Tab("🖥 Preview"):
                preview = gr.HTML(_EMPTY_PREVIEW)
            with gr.Tab("🧩 Code"):
                code_box = gr.Code(label="Artifact (HTML)", language="html")
            with gr.Tab("🧠 Reasoning"):
                think_box = gr.Textbox(label="What the model was thinking", lines=18)
            with gr.Tab("📝 Raw"):
                raw_box = gr.Textbox(
                    label="Full raw output (with <think> tags)", lines=18
                )

    with gr.Accordion("⚙️ Settings", open=False):
        system_prompt = gr.Textbox(
            value=DEFAULT_SYS, label="System prompt", lines=4
        )
        temperature = gr.Slider(
            0.0, 1.5, value=0.6, step=0.05, label="Temperature"
        )
        max_tokens = gr.Slider(
            256, 8192, value=4096, step=128, label="Max tokens"
        )

    inputs = [msg, chatbot, system_prompt, temperature, max_tokens, meta_state]
    outputs = [chatbot, msg, raw_box, think_box, code_box, preview, meta_state]
    send.click(respond, inputs, outputs)
    msg.submit(respond, inputs, outputs)
    # Reset every output, including the per-turn metadata used for feedback.
    clear.click(
        lambda: ([], "", "", "", "", _EMPTY_PREVIEW, []), None, outputs
    )
    chatbot.like(on_like, [meta_state], [fb_status])

demo.queue().launch()