Spaces:

prism-ml
/

Bonsai-Image-Demo

Running on L40S

App Files Files

pashak committed on 20 days ago

Commit

17895f4

0 Parent(s):

init

Browse files

Files changed (9) hide show

.gitattributes +35 -0
Dockerfile +61 -0
README.md +33 -0
space/__init__.py +0 -0
space/app.py +483 -0
space/dashboard.html +594 -0
space/entrypoint.sh +323 -0
space/metrics_pusher.py +599 -0
space/nginx.conf +117 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,61 @@

+# syntax=docker/dockerfile:1.6
+# CUDA 12.8 runtime — gemlite/Triton kernels JIT against the runtime ptxas
+# that comes with this image; no need for the larger -devel variant.
+FROM nvidia/cuda:12.8.0-runtime-ubuntu22.04
+# ── system deps ──────────────────────────────────────────────────────────────
+# build-essential is needed because some sdists (gemlite among them) compile C
+# extensions at install time. python3 is the host interpreter that bootstraps
+# uv; uv then provisions its own pinned interpreter for the venv. nginx fronts
+# everything on :7860 (frontend, backend API, dashboard).
+# The chown uses the numeric uid/gid because the `user` account is only
+# created in the next step; nginx later runs as that uid and must be able to
+# write its state, log and pid directories.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates curl git build-essential python3 python3-venv \
+        libgomp1 libssl3 nginx openssl \
+    && rm -rf /var/lib/apt/lists/* \
+    && chown -R 1000:1000 /var/lib/nginx /var/log/nginx /run
+# Non-root user: HF Spaces convention is uid 1000.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user PATH="/home/user/.local/bin:$PATH"
+# Run every following RUN under bash with pipefail. With the default
+# `/bin/sh -c`, a pipeline's exit status is that of its LAST command, so a
+# failed `curl` would feed an empty script to `sh`, exit 0, and the build
+# would carry on without uv installed.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+# uv (Python venv + package manager). The demo's setup.sh assumes it's on PATH.
+RUN curl -fsSL https://astral.sh/uv/install.sh | sh
+WORKDIR /home/user/app
+# ── clone + run setup.sh in one RUN so GH_TOKEN never lands in a layer ───────
+# GH_TOKEN is supplied by `--mount=type=secret`; the secret file is only
+# visible during this single RUN and is not stored in the image.
+# The credential helper stores the *command* that reads the secret file, not
+# the token itself, and is unset again before the layer is committed.
+# SKIP_DOWNLOAD=1 keeps setup.sh from pulling the 3.5 GB model at build time
+# — entrypoint.sh handles that at boot so a Space restart doesn't have to
+# rebuild the image.
+RUN --mount=type=secret,id=GH_TOKEN,uid=1000,required=true \
+    git config --global credential.helper '!f() { echo "username=oauth2"; echo "password=$(cat /run/secrets/GH_TOKEN)"; }; f' \
+    && git clone https://github.com/PrismML-Eng/Bonsai-image-demo.git . \
+    && SKIP_DOWNLOAD=1 BONSAI_PACKAGE_MIN_AGE_DAYS=0 ./setup.sh \
+    && git config --global --unset credential.helper
+# ── pre-build the Next.js frontend ───────────────────────────────────────────
+# Baking the build into the image so the first browser visit doesn't pay
+# `npm install + next build` (~2 min) on top of model load. NEXT_PUBLIC_*
+# vars are inlined at build time and don't change at runtime, so the
+# backend URL (always loopback inside this container) is baked too.
+# PATH and NEXT_PUBLIC_BACKEND_URL are set inline (per command) rather than
+# via ENV so they apply to these two build commands only.
+RUN cd vendor/image-studio/frontend \
+    && PATH="$HOME/app/.venv/bin:$PATH" \
+       NEXT_PUBLIC_BACKEND_URL=http://127.0.0.1:8000 \
+       npm install --no-audit --no-fund \
+    && PATH="$HOME/app/.venv/bin:$PATH" \
+       NEXT_PUBLIC_BACKEND_URL=http://127.0.0.1:8000 \
+       npm run build
+# ── Space-local files ────────────────────────────────────────────────────────
+# All Space-specific code lives under space/ (Python package + sidecar +
+# dashboard + nginx config + entrypoint). The demo's own code stays at the
+# repo root (cloned earlier) so the two namespaces don't collide.
+COPY --chown=user space/ /home/user/app/space/
+RUN chmod +x /home/user/app/space/entrypoint.sh
+EXPOSE 7860
+CMD ["/home/user/app/space/entrypoint.sh"]

README.md ADDED Viewed

	@@ -0,0 +1,33 @@

+---
+title: Bonsai Image (1-bit + 1.58-bit) GPU
+emoji: 🎨
+colorFrom: green
+colorTo: blue
+sdk: docker
+app_port: 7860
+suggested_hardware: l40sx1
+pinned: true
+short_description: Run 1-bit and 1.58-bit Bonsai-Image-4B on GPU
+models:
+  - prism-ml/bonsai-image-ternary-4B-mlx-2bit
+  - prism-ml/bonsai-image-ternary-4B-gemlite-2bit
+  - prism-ml/bonsai-image-ternary-4B-unpacked
+  - prism-ml/bonsai-image-binary-4B-mlx-1bit
+  - prism-ml/bonsai-image-binary-4B-gemlite-1bit
+  - prism-ml/bonsai-image-binary-4B-unpacked
+---
+# Bonsai Image Demo
+- Ternary (1.58-bit)
+- Binary (1-bit)
+## Privacy
+- **We do not log prompts or generated images.** Generation runs in-process and outputs are streamed back over HTTPS.
+- The studio UI keeps your prompt history **in your browser's local storage only**. Clearing your browser cache erases it.
+- Please do not submit sensitive, private, or confidential content in your prompts.
+## Fair Use
+Single-GPU demo, shared across all visitors. Heavy load may queue requests. Please avoid bursts of automated traffic so everyone can try it.

space/__init__.py ADDED Viewed

File without changes

space/app.py ADDED Viewed

	@@ -0,0 +1,483 @@

+"""HF Space wrapper around scripts.local_backend.
+Adds a metrics middleware that:
+  - tracks total / success / error counters (cumulative since first launch)
+  - per-shape latency histogram (rolling)
+  - rolling 1000-request log with hashed-IP for unique-user count
+  - per-day buckets (UTC date) for the daily archives the metrics_pusher
+    sidecar writes under $BONSAI_STATE_DIR/daily/YYYY-MM-DD.json
+State loaded at boot from $BONSAI_STATE_DIR/state.json so counters survive
+Space restarts (assuming a persistent storage bucket is mounted; entrypoint
+falls back to ephemeral disk otherwise).
+Run with: uvicorn space.app:app
+"""
+from __future__ import annotations
+import asyncio
+import hashlib
+import json
+import os
+import time
+from collections import defaultdict, deque
+from datetime import datetime, timezone
+from threading import Lock
+from fastapi import Request
+# Re-export the real backend's app object so /generate, /backends, /docs
+# are served untouched.
+from scripts.local_backend import app  # noqa: F401  (re-exported)
+# ── in-memory state ──────────────────────────────────────────────────────────
+_lock = Lock()
+_started_at = time.monotonic()
+_total = {"requests": 0, "success": 0, "errors": 0}
+_by_shape: dict[str, dict] = defaultdict(
+    lambda: {"count": 0, "duration_ms_total": 0, "durations": deque(maxlen=200)}
+)
+# Cumulative by-variant counter. The `variant` key is "ternary", "binary",
+# or "unknown" (parsed from the request's `backend` field — see middleware).
+# Mirrors by_shape's shape so the dashboard can show "ternary: X · binary: Y"
+# across all time without re-summing the by_day history.
+_by_variant: dict[str, dict] = defaultdict(
+    lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}
+)
+_recent: deque = deque(maxlen=1000)
+# Per-day buckets keyed by UTC YYYY-MM-DD. Last 30 days kept in memory;
+# older days remain on disk (metrics_pusher writes one file per day under
+# $BONSAI_STATE_DIR/daily/).
+_MAX_DAYS_IN_MEMORY = 30
+_by_day: dict[str, dict] = {}
+# UTC bucketing. (We tried Pacific Time, but `zoneinfo.ZoneInfo` needs
+# /usr/share/zoneinfo/ which our CUDA Ubuntu base image strips with
+# --no-install-recommends. To re-enable PT, install `tzdata` in the
+# Dockerfile and swap these back to ZoneInfo("America/Los_Angeles").)
+def _today() -> str:
+    """Return the current UTC date as a ``YYYY-MM-DD`` string (the per-day bucket key)."""
+    return datetime.now(timezone.utc).strftime("%Y-%m-%d")
+def _now_hour() -> int:
+    """Return the current UTC hour (0-23), used as the index into a day's ``by_hour`` list."""
+    return datetime.now(timezone.utc).hour
+def _empty_day() -> dict:
+    """Build a zeroed per-day bucket.
+
+    The nested ``by_shape`` / ``by_gpu`` / ``by_variant`` maps are
+    ``defaultdict``s, so a first access for a new key creates a zeroed
+    ``{"count", "duration_ms_total", "queue_ms_total"}`` entry. ``unique_ips``
+    is a ``set`` of hashed identities and ``by_hour`` has one slot per hour.
+    """
+    return {
+        "requests": 0,
+        "success": 0,
+        "errors": 0,
+        # queue_ms_total at three levels: day-total + per_shape + per_gpu.
+        # Day-total powers the dashboard's "today avg queue" tile; the
+        # per-shape and per-gpu views surface where queueing pressure is
+        # actually landing (e.g. is the slow GPU starving on small shapes?).
+        "by_shape": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
+        "by_hour": [0] * 24,
+        "unique_ips": set(),
+        # Per-GPU attribution for this day. Persisted to state.json +
+        # written into daily/YYYY-MM-DD.json so historical days retain
+        # their original GPU split even after a tier swap.
+        "by_gpu": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
+        # Per-variant attribution (ternary/binary/unknown). Tells you which
+        # arm took the traffic on this day independent of which GPU served
+        # it — useful for "did users actually click binary today, or are
+        # they all defaulting to ternary?" analysis.
+        "by_variant": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
+        "queue_ms_total": 0,
+    }
+def _bump_day(date: str, ok: bool, shape: str, dt_ms: int, queue_ms: int, hour: int, ip_hash: str, variant: str) -> None:
+    """Increment today's bucket. Caller must hold _lock.
+
+    Args:
+        date: bucket key (``YYYY-MM-DD``); the bucket is created on first use.
+        ok: True counts the request as a success, False as an error.
+        shape: resolution label used as the ``by_shape`` key.
+        dt_ms: request duration in milliseconds.
+        queue_ms: time spent waiting before the request started, in milliseconds.
+        hour: index into the bucket's ``by_hour`` list.
+        ip_hash: hashed caller identity added to ``unique_ips``.
+        variant: label used as the ``by_variant`` key.
+
+    The per-GPU entry is always keyed by this process's ``_GPU_NAME``.
+    """
+    if date not in _by_day:
+        _by_day[date] = _empty_day()
+    d = _by_day[date]
+    d["requests"] += 1
+    if ok:
+        d["success"] += 1
+    else:
+        d["errors"] += 1
+    d["by_shape"][shape]["count"] += 1
+    d["by_shape"][shape]["duration_ms_total"] += dt_ms
+    d["by_shape"][shape]["queue_ms_total"] += queue_ms
+    d["by_hour"][hour] += 1
+    d["unique_ips"].add(ip_hash)
+    d["by_gpu"][_GPU_NAME]["count"] += 1
+    d["by_gpu"][_GPU_NAME]["duration_ms_total"] += dt_ms
+    d["by_gpu"][_GPU_NAME]["queue_ms_total"] += queue_ms
+    d["by_variant"][variant]["count"] += 1
+    d["by_variant"][variant]["duration_ms_total"] += dt_ms
+    d["by_variant"][variant]["queue_ms_total"] += queue_ms
+    d["queue_ms_total"] += queue_ms
+    # Evict everything but the newest _MAX_DAYS_IN_MEMORY buckets. The keys
+    # are YYYY-MM-DD strings, so a plain string sort is chronological.
+    if len(_by_day) > _MAX_DAYS_IN_MEMORY:
+        for stale in sorted(_by_day)[:-_MAX_DAYS_IN_MEMORY]:
+            del _by_day[stale]
+# ── persisted state ──────────────────────────────────────────────────────────
+# $BONSAI_STATE_DIR is set by entrypoint.sh — /data/state if a persistent
+# storage bucket is mounted, else $APP_DIR/outputs/.state (ephemeral).
+_STATE_DIR = os.environ.get("BONSAI_STATE_DIR", "/tmp")
+_STATE_PATH = os.path.join(_STATE_DIR, "state.json")
+# entrypoint.sh sets this to "1" when /data is mounted + writable, else "0".
+# Surfaced to the dashboard so it can show a "counters won't persist" warning.
+_PERSISTENT_STORAGE = os.environ.get("BONSAI_PERSISTENT_STORAGE", "0") == "1"
+def _load_state() -> dict:
+    """Return a dict with all persisted fields, or fresh defaults on miss / parse error."""
+    fresh = {
+        "pepper": os.urandom(16).hex().encode(),
+        "totals": {"requests": 0, "success": 0, "errors": 0},
+        "by_shape": {},
+        "by_variant": {},  # parallel to by_shape; new in this build, may be missing in old state files
+        "recent": [],
+        "by_day": {},
+    }
+    try:
+        with open(_STATE_PATH) as f:
+            data = json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError, OSError) as exc:
+        print(f"[space.app] no prior state ({type(exc).__name__}: {exc}); starting fresh", flush=True)
+        return fresh
+    try:
+        fresh["pepper"] = (data.get("ip_pepper") or fresh["pepper"].decode()).encode()
+        fresh["totals"] = {
+            "requests": int(data.get("total_requests", 0)),
+            "success": int(data.get("success", 0)),
+            "errors": int(data.get("errors", 0)),
+        }
+        by_shape_raw = data.get("by_shape", {}) or {}
+        by_shape_loaded = {}
+        for shape, b in by_shape_raw.items():
+            by_shape_loaded[shape] = {
+                "count": int(b.get("count", 0)),
+                "duration_ms_total": int(b.get("duration_ms_total", 0)),
+                "durations": deque(maxlen=200),  # p50/p95 starts fresh after a boot
+            }
+        fresh["by_shape"] = by_shape_loaded
+        # by_variant: parallel to by_shape, no `durations` deque (no need
+        # for p50/p95 yet, just cumulative count + duration + queue).
+        by_variant_raw = data.get("by_variant", {}) or {}
+        by_variant_loaded = {}
+        for variant, b in by_variant_raw.items():
+            by_variant_loaded[variant] = {
+                "count": int(b.get("count", 0)),
+                "duration_ms_total": int(b.get("duration_ms_total", 0)),
+                "queue_ms_total": int(b.get("queue_ms_total", 0)),
+            }
+        fresh["by_variant"] = by_variant_loaded
+        fresh["recent"] = data.get("recent", []) or []
+        # Per-day
+        by_day_raw = data.get("by_day", {}) or {}
+        by_day_loaded: dict[str, dict] = {}
+        for date, d in by_day_raw.items():
+            bd = _empty_day()
+            bd["requests"] = int(d.get("requests", 0))
+            bd["success"] = int(d.get("success", 0))
+            bd["errors"] = int(d.get("errors", 0))
+            # queue_ms_total fields default to 0 for state files persisted
+            # before this feature shipped — keeps reload graceful.
+            bd["queue_ms_total"] = int(d.get("queue_ms_total", 0))
+            for shape, s in (d.get("by_shape", {}) or {}).items():
+                bd["by_shape"][shape] = {
+                    "count": int(s.get("count", 0)),
+                    "duration_ms_total": int(s.get("duration_ms_total", 0)),
+                    "queue_ms_total": int(s.get("queue_ms_total", 0)),
+                }
+            bh = d.get("by_hour") or [0] * 24
+            bd["by_hour"] = list(bh) + [0] * max(0, 24 - len(bh))
+            bd["unique_ips"] = set(d.get("unique_ips", []) or [])
+            for gpu_name, g in (d.get("by_gpu", {}) or {}).items():
+                bd["by_gpu"][gpu_name] = {
+                    "count": int(g.get("count", 0)),
+                    "duration_ms_total": int(g.get("duration_ms_total", 0)),
+                    "queue_ms_total": int(g.get("queue_ms_total", 0)),
+                }
+            for variant_name, v in (d.get("by_variant", {}) or {}).items():
+                bd["by_variant"][variant_name] = {
+                    "count": int(v.get("count", 0)),
+                    "duration_ms_total": int(v.get("duration_ms_total", 0)),
+                    "queue_ms_total": int(v.get("queue_ms_total", 0)),
+                }
+            by_day_loaded[date] = bd
+        fresh["by_day"] = by_day_loaded
+    except Exception as exc:
+        print(f"[space.app] state file partially malformed ({exc}); using what we could parse", flush=True)
+    return fresh
+# ── replica gating for multi-GPU deploys ─────────────────────────────────────
+# Each uvicorn process (one per GPU) sets BONSAI_REPLICA_INDEX via entrypoint.
+# Only replica 0 seeds its in-memory counters from state.json — other
+# replicas start at zero. metrics_pusher polls every replica and sums them,
+# so this avoids N-way inflation of cumulative counts. Pepper comes from
+# the env (set by entrypoint), shared across all replicas so unique-user
+# hashing is consistent.
+_REPLICA_INDEX = int(os.environ.get("BONSAI_REPLICA_INDEX", "0"))
+# Name of the GPU this replica is pinned to (entrypoint sets it from
+# `nvidia-smi --query-gpu=name`). Exposed in /metrics so the pusher can
+# aggregate per-GPU averages on the dashboard.
+# Defaults to "NVIDIA L40S" (not "unknown") if entrypoint didn't supply a
+# name — that's the tier we ran on for most of the demo's history, so
+# unattributed counters get folded into the L40S bucket rather than a
+# misleading "unknown".
+_GPU_NAME = os.environ.get("BONSAI_GPU_NAME", "").strip() or "NVIDIA L40S"
+_loaded = _load_state()
+if _REPLICA_INDEX == 0:
+    _total.update(_loaded["totals"])
+    for _s, _b in _loaded["by_shape"].items():
+        _by_shape[_s] = _b
+    for _v, _b in _loaded["by_variant"].items():
+        _by_variant[_v] = _b
+    for _r in _loaded["recent"][-1000:]:
+        _recent.append(_r)
+    _by_day.update(_loaded["by_day"])
+    print(
+        f"[space.app] replica 0: seeded counters from {_STATE_PATH} "
+        f"(requests={_total['requests']} days={len(_by_day)} "
+        f"persistent_storage={_PERSISTENT_STORAGE})",
+        flush=True,
+    )
+else:
+    print(
+        f"[space.app] replica {_REPLICA_INDEX}: starting counters at 0 "
+        f"(replica 0 owns cumulative state)",
+        flush=True,
+    )
+# Pepper: prefer env (entrypoint exports a single value for all replicas).
+# Fall back to whatever _load_state surfaced (typically random on first
+# launch) — fine for single-replica or testing.
+_IP_PEPPER = os.environ.get("BONSAI_IP_PEPPER", _loaded["pepper"].decode()).encode()
+def _hash_ip(ip: str) -> str:
+    """Return the first 12 hex characters of SHA-256(``_IP_PEPPER`` + ``ip``).
+
+    Despite the parameter name, the middleware passes a prefixed identity
+    string (``"hf:<token>"`` or ``"ip:<address>"``), not a bare IP address.
+    """
+    return hashlib.sha256(_IP_PEPPER + ip.encode()).hexdigest()[:12]
+# Concurrency cap per replica. Image-gen is compute-bound; two concurrent
+# requests at one GPU just contend for the same SMs and serialize at the
+# kernel-launch level, wasting time. With Semaphore(1), additional requests
+# queue at the asyncio level, and nginx's least_conn sees them as "this
+# replica is busy" → routes to a free GPU when one's available.
+_GENERATE_CONCURRENCY = int(os.environ.get("BONSAI_GENERATE_CONCURRENCY", "1"))
+_generate_sem = asyncio.Semaphore(_GENERATE_CONCURRENCY)
+# In-flight gauge. Incremented when a /generate request enters the middleware
+# (before semaphore acquire — so queued requests count), decremented in
+# finally. metrics_pusher sums across replicas and derives queue depth as
+# max(0, total_inflight - total_concurrency).
+_inflight = 0
+_inflight_lock = Lock()
+# ── middleware ───────────────────────────────────────────────────────────────
+@app.middleware("http")
+async def _track_generate(request: Request, call_next):
+    if request.url.path != "/generate" or request.method != "POST":
+        return await call_next(request)
+    # Read + replay the body so the downstream handler still sees it.
+    body = await request.body()
+    async def _receive() -> dict:
+        return {"type": "http.request", "body": body, "more_body": False}
+    request._receive = _receive  # type: ignore[attr-defined]
+    shape = "unknown"
+    # variant: "ternary" / "binary" / "unknown". Parsed from the request's
+    # `backend` field — values look like "bonsai-ternary-gemlite" or
+    # "bonsai-binary-mlx". If the client omits backend, FastAPI's default
+    # picks the resident pipeline arm (set by MFLUX_STUDIO_GPU_DEFAULT_BACKEND
+    # in entrypoint.sh — currently bonsai-ternary-gemlite) so we mirror that
+    # default here for fair attribution.
+    variant = "ternary"
+    try:
+        payload = json.loads(body or b"{}")
+        w, h = int(payload.get("width", 0)), int(payload.get("height", 0))
+        if w and h:
+            shape = f"{w}x{h}"
+        backend = (payload.get("backend") or "").lower()
+        if "ternary" in backend:
+            variant = "ternary"
+        elif "binary" in backend:
+            variant = "binary"
+        elif backend:
+            variant = "unknown"
+        # else: backend missing → keep the default "ternary" set above
+    except Exception:
+        pass
+    # Identity for unique-user counting. Preference order:
+    #   1. X-IP-Token — set by HF when the visitor is logged into
+    #      huggingface.co and viewing the Space via the embed. Tied to
+    #      their HF session, stable across home↔mobile network changes.
+    #   2. X-Forwarded-For — real client IP, set by nginx (and propagated
+    #      by Next.js's /api/generate route handler).
+    #   3. request.client.host — direct-loopback fallback (mostly never).
+    # The "hf:" / "ip:" prefix keeps the two namespaces from colliding.
+    hf_token = request.headers.get("x-ip-token")
+    if hf_token:
+        identity = f"hf:{hf_token}"
+    else:
+        forwarded = request.headers.get("x-forwarded-for")
+        ip = forwarded.split(",")[0].strip() if forwarded else (request.client.host if request.client else "0.0.0.0")
+        identity = f"ip:{ip}"
+    ip_hash = _hash_ip(identity)
+    date = _today()
+    hour = _now_hour()
+    # Increment in-flight gauge BEFORE the semaphore so queued requests are
+    # visible to the dashboard ("X pending"). Decrement in finally so the
+    # gauge stays accurate even on exceptions.
+    global _inflight
+    t_enqueue = time.monotonic()
+    with _inflight_lock:
+        _inflight += 1
+    try:
+        # Queue at the semaphore so only N requests per replica run on the
+        # GPU at once. The HTTP connection stays open while we wait, which
+        # makes nginx's least_conn see this replica as busy → routes new
+        # arrivals to a free GPU when one's available.
+        async with _generate_sem:
+            t_start = time.monotonic()
+            queue_ms = int((t_start - t_enqueue) * 1000)
+            try:
+                response = await call_next(request)
+            except Exception:
+                dt_ms = int((time.monotonic() - t_start) * 1000)
+                with _lock:
+                    _total["requests"] += 1
+                    _total["errors"] += 1
+                    _by_variant[variant]["count"] += 1
+                    _by_variant[variant]["duration_ms_total"] += dt_ms
+                    _by_variant[variant]["queue_ms_total"] += queue_ms
+                    _recent.append({"ts": int(time.time()), "shape": shape, "duration_ms": dt_ms, "queue_ms": queue_ms, "ip_hash": ip_hash, "gpu": _GPU_NAME, "variant": variant, "ok": False})
+                    _bump_day(date, False, shape, dt_ms, queue_ms, hour, ip_hash, variant)
+                raise
+            dt_ms = int((time.monotonic() - t_start) * 1000)
+            ok = response.status_code < 400
+            with _lock:
+                _total["requests"] += 1
+                if ok:
+                    _total["success"] += 1
+                else:
+                    _total["errors"] += 1
+                bucket = _by_shape[shape]
+                bucket["count"] += 1
+                bucket["duration_ms_total"] += dt_ms
+                bucket["durations"].append(dt_ms)
+                _by_variant[variant]["count"] += 1
+                _by_variant[variant]["duration_ms_total"] += dt_ms
+                _by_variant[variant]["queue_ms_total"] += queue_ms
+                _recent.append({"ts": int(time.time()), "shape": shape, "duration_ms": dt_ms, "queue_ms": queue_ms, "ip_hash": ip_hash, "gpu": _GPU_NAME, "variant": variant, "ok": ok})
+                _bump_day(date, ok, shape, dt_ms, queue_ms, hour, ip_hash, variant)
+        return response
+    finally:
+        with _inflight_lock:
+            _inflight -= 1
+# ── /metrics endpoint (loopback-only via nginx) ──────────────────────────────
+def _percentile(xs: list[int], p: int) -> int | None:
+    """Return the ``p``-th percentile of ``xs``, or ``None`` when ``xs`` is empty.
+
+    No interpolation: the result is always an element of ``xs`` — the value
+    at index ``int(len(xs) * p / 100)`` of the sorted list, clamped to the
+    last element.
+    """
+    if not xs:
+        return None
+    s = sorted(xs)
+    idx = min(int(len(s) * p / 100), len(s) - 1)
+    return s[idx]
+@app.get("/metrics")
+def get_metrics() -> dict:
+    """Scraped by metrics_pusher every few seconds. Returns the full in-memory
+    state so the sidecar can rebuild analytics.json + write daily archives.
+
+    The whole snapshot is built while holding ``_lock`` so the counters are
+    mutually consistent. Non-JSON containers are converted on the way out:
+    ``durations`` deques become p50/p95 values, ``unique_ips`` sets become
+    lists, and the ``recent`` deque becomes a list.
+    """
+    with _lock:
+        by_shape = {}
+        for shape, b in _by_shape.items():
+            durs = list(b["durations"])
+            by_shape[shape] = {
+                "count": b["count"],
+                "duration_ms_total": b["duration_ms_total"],
+                "duration_ms_p50": _percentile(durs, 50),
+                "duration_ms_p95": _percentile(durs, 95),
+            }
+        by_day_out = {}
+        for date, d in _by_day.items():
+            by_day_out[date] = {
+                "requests": d["requests"],
+                "success": d["success"],
+                "errors": d["errors"],
+                # queue_ms_total exposed at all three levels (day + per-shape +
+                # per-gpu) so the pusher can compute today's average queue at
+                # arbitrary slicing without re-summing recent[].
+                "queue_ms_total": d.get("queue_ms_total", 0),
+                "by_shape": {
+                    s: {
+                        "count": b["count"],
+                        "duration_ms_total": b["duration_ms_total"],
+                        "queue_ms_total": b.get("queue_ms_total", 0),
+                    }
+                    for s, b in d["by_shape"].items()
+                },
+                "by_hour": list(d["by_hour"]),
+                "unique_users": len(d["unique_ips"]),
+                "unique_ips": list(d["unique_ips"]),  # for round-trip persistence
+                "by_gpu": {
+                    g: {
+                        "count": v["count"],
+                        "duration_ms_total": v["duration_ms_total"],
+                        "queue_ms_total": v.get("queue_ms_total", 0),
+                    }
+                    for g, v in d["by_gpu"].items()
+                },
+                "by_variant": {
+                    v: {
+                        "count": b["count"],
+                        "duration_ms_total": b["duration_ms_total"],
+                        "queue_ms_total": b.get("queue_ms_total", 0),
+                    }
+                    for v, b in d.get("by_variant", {}).items()
+                },
+            }
+        # The in-flight gauge has its own lock; it is read here while _lock
+        # is still held.
+        with _inflight_lock:
+            inflight = _inflight
+        # Replica's own cumulative duration sum (sum across all shapes).
+        # Used by the pusher to compute per-GPU avg latency without
+        # rebuilding it from `recent` (which would lose history).
+        total_duration_ms = sum(b["duration_ms_total"] for b in _by_shape.values())
+        return {
+            "uptime_s": int(time.monotonic() - _started_at),
+            "replica_index": _REPLICA_INDEX,
+            "gpu_name": _GPU_NAME,
+            "inflight": inflight,
+            "generate_concurrency": _GENERATE_CONCURRENCY,
+            "total_requests": _total["requests"],
+            "success": _total["success"],
+            "errors": _total["errors"],
+            "total_duration_ms": total_duration_ms,
+            "by_shape": by_shape,
+            "by_variant": {
+                v: {
+                    "count": b["count"],
+                    "duration_ms_total": b["duration_ms_total"],
+                    "queue_ms_total": b.get("queue_ms_total", 0),
+                }
+                for v, b in _by_variant.items()
+            },
+            "by_day": by_day_out,
+            "recent": list(_recent),
+            # The pepper is part of the payload so it can be persisted along
+            # with the counters; _load_state reads it back as "ip_pepper".
+            "ip_pepper": _IP_PEPPER.decode(),
+            "persistent_storage": _PERSISTENT_STORAGE,
+            "state_dir": _STATE_DIR,
+        }

space/dashboard.html ADDED Viewed

	@@ -0,0 +1,594 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>Bonsai-Image Dashboard</title>
+<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.4/dist/chart.umd.min.js"></script>
+<style>
+:root {
+  --bg: #0e1116;
+  --panel: #161b22;
+  --panel-border: #1f2630;
+  --text: #d7dde6;
+  --muted: #7d8694;
+  --accent: #4cb583;
+  --warn: #d97757;
+  --grid: #21272f;
+}
+* { box-sizing: border-box; }
+body {
+  margin: 0; padding: 24px;
+  font: 14px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+  background: var(--bg); color: var(--text);
+}
+header { display: flex; align-items: baseline; justify-content: space-between; margin-bottom: 20px; }
+h1 { margin: 0; font-size: 18px; font-weight: 600; }
+.subtitle { color: var(--muted); font-size: 12px; }
+.grid { display: grid; gap: 16px; grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); margin-bottom: 16px; }
+.card { background: var(--panel); border: 1px solid var(--panel-border); border-radius: 8px; padding: 16px; }
+.card h2 { margin: 0 0 12px 0; font-size: 12px; font-weight: 600; color: var(--muted); text-transform: uppercase; letter-spacing: 0.05em; }
+.metric { font-size: 28px; font-weight: 600; line-height: 1; }
+.metric-sub { color: var(--muted); font-size: 12px; margin-top: 6px; }
+.row { display: grid; gap: 16px; grid-template-columns: 2fr 1fr; margin-bottom: 16px; }
+.row.single { grid-template-columns: 1fr; }
+.row.equal { grid-template-columns: 1fr 1fr; }
+.row.three { grid-template-columns: 1fr 1fr 1fr; }
+@media (max-width: 1100px) { .row.three { grid-template-columns: 1fr 1fr; } }
+@media (max-width: 900px) { .row, .row.equal, .row.three { grid-template-columns: 1fr; } }
+/* Replica pills: one chip per active uvicorn worker, color-coded by GPU
+   tier so a glance at the Replicas tile shows mixed vs homogeneous fleets. */
+.replicas { display: flex; flex-wrap: wrap; gap: 6px; margin-top: 8px; }
+.replica-pill { display: inline-flex; align-items: center; gap: 4px; font-size: 11px; padding: 2px 8px; border-radius: 999px; background: var(--grid); }
+.replica-pill .dot { width: 6px; height: 6px; border-radius: 50%; background: var(--accent); }
+.status-warn { color: var(--warn); }
+canvas { max-width: 100%; }
+table { width: 100%; border-collapse: collapse; font-size: 12px; }
+th, td { padding: 6px 8px; text-align: left; border-bottom: 1px solid var(--grid); }
+th { color: var(--muted); font-weight: 500; }
+th.num, td.num { text-align: right; font-variant-numeric: tabular-nums; }
+.status-ok { color: var(--accent); }
+.status-err { color: var(--warn); }
+footer { color: var(--muted); font-size: 11px; margin-top: 20px; text-align: right; }
+.gpu-bar { background: var(--grid); border-radius: 4px; height: 6px; overflow: hidden; margin-top: 4px; }
+.gpu-bar-fill { background: var(--accent); height: 100%; transition: width 0.3s; }
+.banner { display: none; padding: 12px 16px; border-radius: 8px; margin-bottom: 16px; font-size: 13px; line-height: 1.5; border: 1px solid; }
+.banner.warn { background: rgba(217, 119, 87, 0.08); border-color: rgba(217, 119, 87, 0.4); color: #e8a280; }
+.banner.error { background: rgba(217, 119, 87, 0.15); border-color: rgba(217, 119, 87, 0.5); color: #f0a890; }
+</style>
+</head>
+<body>
+<header>
+  <div>
+    <h1>🌿 Bonsai-Image Dashboard</h1>
+    <div class="subtitle" id="updated">loading...</div>
+  </div>
+  <div class="subtitle" id="refresh-label">auto-refresh every 2s</div>
+</header>
+<div id="storage-banner" class="banner warn"></div>
+<div id="stale-banner" class="banner warn"></div>
+<div id="error-banner" class="banner error"></div>
+<div class="grid">
+  <div class="card"><h2>Total images</h2><div class="metric" id="total-requests">—</div><div class="metric-sub" id="total-sub">— ok / — errors</div></div>
+  <div class="card"><h2>Today (UTC)</h2><div class="metric" id="req-today">—</div><div class="metric-sub" id="users-today">— unique users</div></div>
+  <div class="card"><h2>Last 7 days</h2><div class="metric" id="req-7d">—</div><div class="metric-sub" id="users-7d">— unique users</div></div>
+  <div class="card"><h2>Last 30 days</h2><div class="metric" id="req-30d">—</div><div class="metric-sub" id="users-30d">— unique users</div></div>
+  <div class="card"><h2>Pending</h2><div class="metric" id="pending">—</div><div class="metric-sub" id="pending-sub">— running / — capacity</div></div>
+  <div class="card">
+    <h2>Replicas</h2>
+    <div class="metric" id="replicas-metric">—</div>
+    <div class="metric-sub" id="replicas-sub">—</div>
+    <div class="replicas" id="replicas-pills"></div>
+  </div>
+  <div class="card">
+    <h2>By Variant</h2>
+    <div class="metric" id="variant-metric">—</div>
+    <div class="metric-sub" id="variant-sub">all-time · — today</div>
+  </div>
+  <div class="card"><h2>Uptime</h2><div class="metric" id="uptime">—</div><div class="metric-sub">since last restart</div></div>
+</div>
+<!-- Row: both charts side-by-side. Daily covers 30d, hourly covers today. -->
+<div class="row equal">
+  <div class="card">
+    <h2>Requests per day (last 30d)</h2>
+    <canvas id="daily-chart" height="80"></canvas>
+  </div>
+  <div class="card">
+    <h2>Today's requests by hour (UTC)</h2>
+    <canvas id="hourly-chart" height="80"></canvas>
+  </div>
+</div>
+<!-- Row: image-time stats. Three views of latency: rolling 50, all-time
+     per-resolution, today per-resolution. Same column shape so eye can
+     scan left→right and spot drift. -->
+<div class="row three">
+  <div class="card">
+    <h2>Average latency (last 50 requests)</h2>
+    <div style="display: flex; align-items: baseline; gap: 16px; margin-bottom: 12px;">
+      <div class="metric" id="avg-latency">—</div>
+      <div class="metric-sub" id="avg-latency-sub">across last — requests</div>
+    </div>
+    <table>
+      <thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg latency</th><th class="num">Avg queue</th></tr></thead>
+      <tbody id="latency-tbody"></tbody>
+    </table>
+  </div>
+  <div class="card">
+    <h2>By resolution (all-time)</h2>
+    <table>
+      <thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg time</th></tr></thead>
+      <tbody id="shape-tbody"></tbody>
+    </table>
+  </div>
+  <div class="card">
+    <h2>By resolution (today)</h2>
+    <table>
+      <thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg time</th><th class="num">Avg queue</th></tr></thead>
+      <tbody id="shape-today-tbody"></tbody>
+    </table>
+  </div>
+</div>
+<!-- Row: GPU stats. Live nvidia-smi snapshot, today's per-GPU breakdown,
+     all-time per-GPU breakdown. Lets you spot tier mix today vs total. -->
+<div class="row three">
+  <div class="card">
+    <h2>GPUs (live)</h2>
+    <div id="gpus"></div>
+  </div>
+  <div class="card">
+    <h2>By GPU (today)</h2>
+    <table>
+      <thead><tr><th>GPU</th><th class="num">Count</th><th class="num">Avg latency</th><th class="num">Avg queue</th></tr></thead>
+      <tbody id="gpu-today-tbody"></tbody>
+    </table>
+  </div>
+  <div class="card">
+    <h2>By GPU (all-time)</h2>
+    <table>
+      <thead><tr><th>GPU</th><th class="num">Reps</th><th class="num">Count</th><th class="num">Errors</th><th class="num">Avg latency</th></tr></thead>
+      <tbody id="gpu-tbody"></tbody>
+    </table>
+  </div>
+</div>
+<div class="row single">
+  <div class="card">
+    <h2>Recent requests (last 50)</h2>
+    <table>
+      <thead><tr><th>Time</th><th>Shape</th><th>GPU</th><th class="num">Queued</th><th class="num">Duration</th><th>User</th><th>Status</th></tr></thead>
+      <tbody id="recent-tbody"></tbody>
+    </table>
+  </div>
+</div>
+<footer id="footer">—</footer>
+<script>
+// Absolute paths — the dashboard URL has no trailing slash, so relative
+// `analytics.json` would resolve to `/analytics.json` (wrong) rather than
+// `/dash-…/analytics.json`. nginx has explicit location blocks for these.
+// Both URLs are fetched together by refresh() on every polling tick.
+const ANALYTICS_URL = "/dash-10a08e9c1ee4/analytics.json";
+const GPU_URL = "/dash-10a08e9c1ee4/gpu-stats.json";
+function fmtDuration(s) {
+  if (!s) return "—";
+  const days = Math.floor(s / 86400);
+  const hours = Math.floor((s % 86400) / 3600);
+  const mins = Math.floor((s % 3600) / 60);
+  if (days) return `${days}d ${hours}h`;
+  if (hours) return `${hours}h ${mins}m`;
+  return `${mins}m`;
+}
+function fmtTime(ts) {
+  if (!ts) return "—";
+  return new Date(ts * 1000).toLocaleString();
+}
+function fmtRelative(ts) {
+  const dt = Date.now() / 1000 - ts;
+  if (dt < 60) return `${Math.floor(dt)}s ago`;
+  if (dt < 3600) return `${Math.floor(dt / 60)}m ago`;
+  if (dt < 86400) return `${Math.floor(dt / 3600)}h ago`;
+  return `${Math.floor(dt / 86400)}d ago`;
+}
+let hourlyChart, dailyChart;
+function initCharts() {
+  Chart.defaults.color = "#7d8694";
+  Chart.defaults.borderColor = "#21272f";
+  Chart.defaults.font.family = "-apple-system, BlinkMacSystemFont, Segoe UI, Roboto, sans-serif";
+  hourlyChart = new Chart(document.getElementById("hourly-chart"), {
+    type: "line",
+    data: {
+      labels: [],
+      datasets: [{
+        label: "requests",
+        data: [],
+        borderColor: "#4cb583",
+        backgroundColor: "rgba(76, 181, 131, 0.12)",
+        fill: true,
+        tension: 0.25,
+        pointRadius: 3,
+        pointBackgroundColor: "#4cb583",
+        pointHoverRadius: 6,
+      }],
+    },
+    options: {
+      plugins: { legend: { display: false } },
+      scales: { y: { beginAtZero: true, ticks: { precision: 0 } } },
+    },
+  });
+  // Daily chart: single-series line, same style as the hourly chart. We
+  // KEEP the per-GPU breakdown data in requests_by_day[].by_gpu — it's just
+  // not rendered on the chart. Per-GPU averages are surfaced in the By GPU
+  // (today/all-time) tables; this chart sticks to volume-over-time.
+  dailyChart = new Chart(document.getElementById("daily-chart"), {
+    type: "line",
+    data: {
+      labels: [],
+      datasets: [{
+        label: "requests",
+        data: [],
+        borderColor: "#4cb583",
+        backgroundColor: "rgba(76, 181, 131, 0.12)",
+        fill: true,
+        tension: 0.25,
+        pointRadius: 3,
+        pointBackgroundColor: "#4cb583",
+        pointHoverRadius: 6,
+        spanGaps: true,
+      }],
+    },
+    options: {
+      plugins: { legend: { display: false } },
+      scales: { y: { beginAtZero: true, ticks: { precision: 0 } } },
+    },
+  });
+}
+function renderStorageBanner(a) {
+  const banner = document.getElementById("storage-banner");
+  if (!a) { banner.style.display = "none"; return; }
+  if (a.persistent_storage === false) {
+    banner.style.display = "block";
+    banner.textContent = "⚠ Persistent storage bucket not mounted at /data — counters, model weights, and kernel caches reset on every Space restart. Enable a Storage Bucket in Space Settings → Storage.";
+  } else {
+    banner.style.display = "none";
+  }
+}
+function renderErrorBanner(msg) {
+  const banner = document.getElementById("error-banner");
+  if (!msg) { banner.style.display = "none"; return; }
+  banner.style.display = "block";
+  banner.textContent = msg;
+}
+function renderStaleBanner(a) {
+  // analytics.json gets rewritten every metrics_pusher tick (~2s). If the
+  // age creeps past ~10s the pusher is either struggling to reach the
+  // backends (load, restart, /metrics timeouts) or the pusher itself is
+  // wedged. Either way, surface it so the user doesn't mistake stale
+  // numbers for a real lull or zero-out.
+  const banner = document.getElementById("stale-banner");
+  if (!a || !a.updated_at) { banner.style.display = "none"; return; }
+  const ageSec = Math.floor(Date.now() / 1000 - a.updated_at);
+  if (ageSec > 10) {
+    banner.style.display = "block";
+    banner.textContent = `⚠ Metrics are ${ageSec}s stale — the backend likely couldn't answer the last few /metrics polls (often because it's busy with /generate). Numbers shown are the last good scrape.`;
+  } else {
+    banner.style.display = "none";
+  }
+}
+async function refresh() {
+  try {
+    const [aResp, gResp] = await Promise.all([fetch(ANALYTICS_URL, { cache: "no-store" }), fetch(GPU_URL, { cache: "no-store" })]);
+    if (!aResp.ok || !gResp.ok) throw new Error(`http ${aResp.status}/${gResp.status}`);
+    const a = await aResp.json();
+    const g = await gResp.json();
+    renderStorageBanner(a);
+    renderErrorBanner(null);
+    renderStaleBanner(a);
+    renderSummary(a);
+    renderReplicas(a);
+    renderVariant(a);
+    renderHourly(a);
+    renderShapeList(a);
+    renderShapeToday(a);
+    renderGpuToday(a);
+    renderDaily(a);
+    renderRecent(a);
+    renderGPUs(g);
+    renderLatency(a);
+    renderByGPU(a);
+    document.getElementById("updated").textContent = `updated ${fmtRelative(a.updated_at)}`;
+  } catch (e) {
+    renderErrorBanner(`Could not load metrics: ${e.message}. Sidecar may be down, or the bucket isn't ready yet.`);
+    document.getElementById("updated").textContent = `error: ${e.message}`;
+  }
+}
+function fmtMs(ms) {
+  if (ms == null || isNaN(ms)) return "—";
+  if (ms < 1000) return `${ms} ms`;
+  return `${(ms / 1000).toFixed(1)} s`;
+}
+function renderSummary(a) {
+  const t = a.summary_total || { requests: 0, success: 0, errors: 0 };
+  document.getElementById("total-requests").textContent = t.requests.toLocaleString();
+  document.getElementById("total-sub").innerHTML = `<span class="status-ok">${t.success.toLocaleString()} ok</span> · <span class="status-err">${t.errors.toLocaleString()} errors</span>`;
+  const today = a.summary_today || { requests: 0, unique_users: 0 };
+  document.getElementById("req-today").textContent = today.requests.toLocaleString();
+  document.getElementById("users-today").textContent = `${today.unique_users.toLocaleString()} unique users`;
+  const d7 = a.summary_7d || { requests: 0, unique_users: 0 };
+  document.getElementById("req-7d").textContent = d7.requests.toLocaleString();
+  document.getElementById("users-7d").textContent = `${d7.unique_users.toLocaleString()} unique users`;
+  const d30 = a.summary_30d || { requests: 0, unique_users: 0 };
+  document.getElementById("req-30d").textContent = d30.requests.toLocaleString();
+  document.getElementById("users-30d").textContent = `${d30.unique_users.toLocaleString()} unique users`;
+  const queue = a.queue_depth ?? 0;
+  const running = a.running ?? 0;
+  const cap = a.capacity ?? 0;
+  const todayQueueAvg = a.today_avg_queue_ms ?? 0;
+  document.getElementById("pending").textContent = queue.toLocaleString();
+  // Pending subtitle: live cap utilization + today's avg queue. The avg is
+  // computed in metrics_pusher from today_bucket.queue_ms_total / requests,
+  // so it includes successful + errored requests but not currently-queued
+  // ones (those haven't tripped queue_ms yet).
+  const queueAvgPart = today.requests ? ` · today queue avg ${fmtMs(todayQueueAvg)}` : "";
+  document.getElementById("pending-sub").textContent = `${running} running / ${cap} GPU slot${cap === 1 ? "" : "s"}${queueAvgPart}`;
+  document.getElementById("uptime").textContent = fmtDuration(a.uptime_s);
+}
+// Variant breakdown tile — shows the ternary vs binary mix at a glance.
+// Big number: "T:1234 · B:567" (all-time). Subtitle: today's split. Variants
+// keyed by name ("ternary" / "binary" / "unknown") from the request's
+// `backend` field; metrics_pusher exposes them under by_variant + by_variant_today.
+function renderVariant(a) {
+  const fmtMix = (data) => {
+    const t = data?.ternary?.count || 0;
+    const b = data?.binary?.count || 0;
+    const u = data?.unknown?.count || 0;
+    const parts = [`T:${t.toLocaleString()}`, `B:${b.toLocaleString()}`];
+    if (u) parts.push(`?:${u.toLocaleString()}`);
+    return parts.join(" · ");
+  };
+  document.getElementById("variant-metric").textContent = fmtMix(a.by_variant);
+  const todayMix = fmtMix(a.by_variant_today);
+  const todayTotal = Object.values(a.by_variant_today || {}).reduce((s, b) => s + (b.count || 0), 0);
+  document.getElementById("variant-sub").textContent = todayTotal
+    ? `all-time · today: ${todayMix}`
+    : `all-time · no requests yet today`;
+}
+// Multi-GPU health card. Shows replicas_seen/expected up top and a row of
+// pills below — one per active replica, dot color reflects healthy or
+// errored. If the seen count is below expected, "X/Y (1 down)" + warn tint.
+function renderReplicas(a) {
+  const seen = a.replicas_seen ?? 0;
+  const expected = a.replicas_expected ?? seen;
+  const per = a.per_replica || [];
+  const metricEl = document.getElementById("replicas-metric");
+  metricEl.textContent = expected ? `${seen} / ${expected}` : String(seen);
+  metricEl.className = "metric" + (seen < expected ? " status-warn" : "");
+  const down = Math.max(0, expected - seen);
+  const subParts = [];
+  if (per.length) {
+    // Summarize tier mix: count GPUs by name. "L40S × 2" or "L40S + A10G".
+    const tierCounts = new Map();
+    for (const r of per) tierCounts.set(r.gpu_name, (tierCounts.get(r.gpu_name) || 0) + 1);
+    const tierStr = [...tierCounts.entries()]
+      .map(([n, c]) => c > 1 ? `${n.replace(/^(NVIDIA |Tesla )/, "")} × ${c}` : n.replace(/^(NVIDIA |Tesla )/, ""))
+      .join(" + ");
+    subParts.push(tierStr);
+  } else {
+    subParts.push("no replicas responding");
+  }
+  if (down) subParts.push(`${down} down`);
+  document.getElementById("replicas-sub").textContent = subParts.join(" · ");
+  // Per-replica pills: short tier label + current inflight/capacity. Hover
+  // shows the full gpu_name + uptime via title attribute.
+  const pillsEl = document.getElementById("replicas-pills");
+  pillsEl.innerHTML = per.map(r => {
+    const short = (r.gpu_name || "?").replace(/^(NVIDIA |Tesla )/, "");
+    const busy = r.inflight > 0;
+    const dotColor = busy ? "var(--warn)" : "var(--accent)";
+    const title = `${r.gpu_name || "unknown"} · uptime ${fmtDuration(r.uptime_s)} · total ${(r.total_requests ?? 0).toLocaleString()}`;
+    return `<span class="replica-pill" title="${title}"><span class="dot" style="background: ${dotColor}"></span>${short} ${r.inflight}/${r.capacity}</span>`;
+  }).join("");
+}
+function renderDaily(a) {
+  const days = a.requests_by_day || [];
+  dailyChart.data.labels = days.map(d => d.date.slice(5));  // MM-DD
+  dailyChart.data.datasets[0].data = days.map(d => d.count);
+  dailyChart.update("none");
+}
+function renderShapeList(a) {
+  const by = a.by_shape || {};
+  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
+  const tbody = document.getElementById("shape-tbody");
+  if (!entries.length) {
+    tbody.innerHTML = `<tr><td colspan="3" class="metric-sub">no requests yet</td></tr>`;
+    return;
+  }
+  tbody.innerHTML = entries.map(([shape, b]) => `
+    <tr>
+      <td>${shape}</td>
+      <td class="num">${b.count.toLocaleString()}</td>
+      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
+    </tr>
+  `).join("");
+}
+function renderByGPU(a) {
+  const by = a.by_gpu || {};
+  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
+  const tbody = document.getElementById("gpu-tbody");
+  if (!entries.length) {
+    tbody.innerHTML = `<tr><td colspan="5" class="metric-sub">no per-GPU data yet</td></tr>`;
+    return;
+  }
+  // Dropped the explicit Success column from this table when we shrunk it
+  // into a 3-col row — success is implied by count - errors and rarely
+  // useful at a glance. Error count gets the warn color when nonzero.
+  const shortName = (n) => (n || "—").replace(/^(NVIDIA |Tesla )/, "");
+  tbody.innerHTML = entries.map(([name, b]) => `
+    <tr>
+      <td>${shortName(name)}</td>
+      <td class="num">${(b.replicas ?? 0).toLocaleString()}</td>
+      <td class="num">${b.count.toLocaleString()}</td>
+      <td class="num ${(b.errors ?? 0) > 0 ? "status-err" : ""}">${(b.errors ?? 0).toLocaleString()}</td>
+      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
+    </tr>
+  `).join("");
+}
+// Today-scoped mirrors of renderShapeList / renderByGPU. Same shape of input
+// from metrics_pusher (count + duration_ms_avg per key) so the table markup
+// matches; columns are trimmed since today's per-GPU bucket doesn't carry
+// replicas/success/errors splits.
+function renderShapeToday(a) {
+  const by = a.by_shape_today || {};
+  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
+  const tbody = document.getElementById("shape-today-tbody");
+  if (!entries.length) {
+    tbody.innerHTML = `<tr><td colspan="4" class="metric-sub">no requests yet today</td></tr>`;
+    return;
+  }
+  tbody.innerHTML = entries.map(([shape, b]) => `
+    <tr>
+      <td>${shape}</td>
+      <td class="num">${b.count.toLocaleString()}</td>
+      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
+      <td class="num">${fmtMs(b.queue_ms_avg)}</td>
+    </tr>
+  `).join("");
+}
+function renderGpuToday(a) {
+  const by = a.by_gpu_today || {};
+  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
+  const tbody = document.getElementById("gpu-today-tbody");
+  if (!entries.length) {
+    tbody.innerHTML = `<tr><td colspan="4" class="metric-sub">no requests yet today</td></tr>`;
+    return;
+  }
+  // Drop "NVIDIA " / "Tesla " prefix to keep the GPU column narrow in the
+  // 3-column row layout.
+  const shortName = (n) => (n || "—").replace(/^(NVIDIA |Tesla )/, "");
+  tbody.innerHTML = entries.map(([name, b]) => `
+    <tr>
+      <td>${shortName(name)}</td>
+      <td class="num">${b.count.toLocaleString()}</td>
+      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
+      <td class="num">${fmtMs(b.queue_ms_avg)}</td>
+    </tr>
+  `).join("");
+}
+function renderLatency(a) {
+  // Latency uses the last-50 window (recent_by_shape) so the numbers feel
+  // current — long-term shape avg is on the by-resolution table above.
+  document.getElementById("avg-latency").textContent = fmtMs(a.recent_avg_latency_ms);
+  const n = a.recent_count ?? 0;
+  document.getElementById("avg-latency-sub").textContent = `across last ${n.toLocaleString()} requests`;
+  const by = a.recent_by_shape || {};
+  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
+  const tbody = document.getElementById("latency-tbody");
+  if (!entries.length) {
+    tbody.innerHTML = `<tr><td colspan="4" class="metric-sub">no recent requests yet</td></tr>`;
+    return;
+  }
+  tbody.innerHTML = entries.map(([shape, b]) => `
+    <tr>
+      <td>${shape}</td>
+      <td class="num">${b.count.toLocaleString()}</td>
+      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
+      <td class="num">${fmtMs(b.queue_ms_avg)}</td>
+    </tr>
+  `).join("");
+}
+function renderHourly(a) {
+  // Today's by-hour, 24 ints indexed by UTC hour
+  const buckets = a.requests_by_hour || [];
+  hourlyChart.data.labels = buckets.map((_, i) => `${i.toString().padStart(2, "0")}:00`);
+  hourlyChart.data.datasets[0].data = buckets;
+  hourlyChart.update("none");
+}
+function renderRecent(a) {
+  const rows = (a.recent || []).slice().reverse().slice(0, 50);
+  const tbody = document.getElementById("recent-tbody");
+  // GPU shorthand: drop the "NVIDIA " prefix so the column stays narrow
+  // ("L40S" / "A10G" reads cleaner than "NVIDIA L40S"). Older recent entries
+  // (pre-feature) won't have r.gpu — fall back to "—".
+  const shortGpu = (g) => (g || "—").replace(/^(NVIDIA |Tesla )/, "");
+  tbody.innerHTML = rows.map(r => `
+    <tr>
+      <td>${fmtRelative(r.ts)}</td>
+      <td>${r.shape || "—"}</td>
+      <td>${shortGpu(r.gpu)}</td>
+      <td class="num">${r.queue_ms != null ? fmtMs(r.queue_ms) : "—"}</td>
+      <td class="num">${r.duration_ms ? (r.duration_ms / 1000).toFixed(1) + "s" : "—"}</td>
+      <td>${(r.ip_hash || "—").slice(0, 8)}</td>
+      <td class="${r.ok ? "status-ok" : "status-err"}">${r.ok ? "ok" : "err"}</td>
+    </tr>
+  `).join("");
+}
+function renderGPUs(g) {
+  const div = document.getElementById("gpus");
+  const gpus = g.gpus || [];
+  if (!gpus.length) {
+    div.innerHTML = `<div class="metric-sub">${g.error || "no GPU data yet"}</div>`;
+    return;
+  }
+  div.innerHTML = gpus.map(gpu => {
+    const memPct = gpu.memory_total_mb ? Math.round(100 * (gpu.memory_used_mb || 0) / gpu.memory_total_mb) : 0;
+    const util = gpu.util_pct ?? 0;
+    return `
+      <div style="margin-bottom: 12px;">
+        <div style="display: flex; justify-content: space-between;"><span><b>GPU ${gpu.index}</b> ${gpu.name || ""}</span><span class="metric-sub">${gpu.temp_c ?? "—"}°C · ${gpu.power_w ? gpu.power_w.toFixed(0) : "—"}W</span></div>
+        <div class="metric-sub" style="margin-top: 4px;">util ${util}%</div>
+        <div class="gpu-bar"><div class="gpu-bar-fill" style="width: ${util}%"></div></div>
+        <div class="metric-sub" style="margin-top: 4px;">mem ${gpu.memory_used_mb ?? "—"} / ${gpu.memory_total_mb ?? "—"} MB (${memPct}%)</div>
+        <div class="gpu-bar"><div class="gpu-bar-fill" style="width: ${memPct}%; background: #d97757;"></div></div>
+      </div>
+    `;
+  }).join("");
+  document.getElementById("footer").textContent = `GPU sample: ${fmtRelative(g.ts)}`;
+}
+// Refresh cadence: default 2s, override with `#refresh=N` in the URL
+// (where N is seconds, e.g. #refresh=1 for 1s, #refresh=0.5 for 500ms).
+// metrics_pusher writes JSON every 2s by default — polling faster than
+// that just re-reads the same file. Bump METRICS_INTERVAL env on the
+// Space too if you genuinely need sub-2s.
+function readRefreshMs() {
+  const m = (location.hash || "").match(/refresh=([0-9.]+)/);
+  if (m) {
+    const v = parseFloat(m[1]);
+    if (v >= 0.25 && v <= 60) return Math.round(v * 1000);
+  }
+  return 2000;
+}
+// Page bootstrap. Order matters: the interval is resolved once from the URL
+// hash, the charts must exist before the first refresh() (renderHourly /
+// renderDaily write into them), and one immediate refresh() paints the page
+// before the timer starts ticking.
+const REFRESH_MS = readRefreshMs();
+document.getElementById("refresh-label").textContent = `auto-refresh every ${(REFRESH_MS / 1000).toString()}s`;
+initCharts();
+refresh();
+// NOTE(review): setInterval does not wait for the async refresh() to settle,
+// so ticks can overlap if a fetch takes longer than REFRESH_MS.
+setInterval(refresh, REFRESH_MS);
+</script>
+</body>
+</html>

space/entrypoint.sh ADDED Viewed

	@@ -0,0 +1,323 @@

+#!/bin/bash
+# Bonsai-Image HF Space entrypoint.
+#
+# Boot order:
+#   1. Download the ternary + binary gemlite models (~3.5 GB each) — idempotent.
+#   2. Generate /tmp/.htpasswd from $DASHBOARD_KEY for the basic-auth gate.
+#   3. Build /tmp/nginx-upstream.conf from `nvidia-smi -L`. One server line
+#      per GPU. At N=1 the upstream has one entry; at N>1 we prepend
+#      `least_conn;` for variable-duration request routing.
+#   4. Spawn one `uvicorn space.app:app` per GPU on consecutive ports
+#      (CUDA_VISIBLE_DEVICES pinned). Each worker's lifespan warms the
+#      shapes listed in BONSAI_WARMUP_SHAPES.
+#   5. Wait for the first worker to be ready, then `next start` on :3000
+#      (internal — nginx will expose it on :7860).
+#   6. Start metrics_pusher sidecar with a watchdog.
+#   7. Exec nginx on :7860 (the one public port HF sees).
+#
+# Env (HF Space secrets):
+#   HF_TOKEN              model + tokenizer downloads (required — boot aborts without it)
+#   DASHBOARD_KEY         basic-auth password for /dash-<obfuscated>
+#   BONSAI_WARMUP_SHAPES  default "512x512,1024x1024"
+# Strict mode: abort on any failing command (-e), on use of an unset
+# variable (-u), and when any stage of a pipeline fails (pipefail).
+set -euo pipefail
+# App root lives under $HOME, falling back to /home/user when HOME is unset.
+APP_DIR="${HOME:-/home/user}/app"
+cd "$APP_DIR"
+# Put the project virtualenv's executables first on PATH.
+export PATH="$APP_DIR/.venv/bin:$PATH"
+export HF_HUB_ENABLE_HF_TRANSFER=1
+# ── GPU detection (early — needed for cache namespacing + tier-aware warmup) ─
+# nvidia-smi might not return data in some odd container states; treat as
+# "unknown" rather than crashing so the rest of the boot can still run.
+#
+# The trailing `|| true` is what makes that fallback reachable: with
+# `set -e` + `pipefail` active, a missing or failing nvidia-smi makes the
+# whole pipeline (and therefore the assignment) return non-zero, which would
+# otherwise abort the script right here.
+GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | xargs || true)
+GPU_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -1 | tr -d '.' || true)
+# Empty output (no driver, no device) falls back to placeholder values.
+GPU_NAME="${GPU_NAME:-unknown}"
+GPU_CAP="${GPU_CAP:-00}"
+echo "[OK]  GPU: $GPU_NAME (sm_${GPU_CAP})"
+# Slow GPUs (T4 and older Tesla-class cards): warm just the two square
+# presets we benchmark against (512² and 1024²) and give the backend longer
+# to become ready. Skipping warmup altogether would push the multi-minute
+# first-call JIT onto the first user request and corrupt benchmark numbers,
+# so it is paid at boot instead. Both values only apply when not already
+# set — override BONSAI_WARMUP_SHAPES / BACKEND_READY_TIMEOUT via Space
+# Variables for different shapes or a longer/shorter deadline.
+if [[ "$GPU_NAME" =~ T4|P100|V100|K80|M60 ]]; then
+    echo "[WARN] $GPU_NAME is slow — warming only 512x512 + 1024x1024."
+    echo "       Extending readiness timeout to 30 min for the longer JIT."
+    BONSAI_WARMUP_SHAPES="${BONSAI_WARMUP_SHAPES:-512x512,1024x1024}"
+    BACKEND_READY_TIMEOUT="${BACKEND_READY_TIMEOUT:-1800}"
+    export BONSAI_WARMUP_SHAPES BACKEND_READY_TIMEOUT
+fi
+# ── persistent storage detection ─────────────────────────────────────────────
+# Try to use /data (a Storage Bucket if mounted) for the model + kernel
+# caches + stats. Every filesystem op is wrapped so that if anything fails
+# midway — bucket detached mid-build, mkdir denied, symlink races — we
+# silently fall back to ephemeral storage and keep going. The dashboard
+# banner alerts the user via BONSAI_PERSISTENT_STORAGE.
+#
+# _setup_persistent: point $APP_DIR/models and the gemlite/triton cache
+# dirs under $APP_DIR/outputs at /data via symlinks.
+# Returns 0 when every required step succeeded, 1 otherwise.
+# Reads: GPU_CAP, APP_DIR. Sets (not declared local, so they stay visible
+# to the rest of the script): _gemlite_dir, _triton_dir.
+# NOTE(review): steps are not rolled back on failure — if a later step fails
+# after $APP_DIR/models was replaced by the symlink, the function returns 1
+# but the symlink into /data stays in place.
+_setup_persistent() {
+    # Requires an existing, writable /data directory.
+    [ -d /data ] && [ -w /data ] || return 1
+    # Kernel caches namespaced by compute capability so a tier swap (e.g.
+    # L40S sm_89 → T4 sm_75 → back to L40S) doesn't pollute either GPU's
+    # autotune configs / Triton kernels.
+    _gemlite_dir="/data/cache/gemlite-sm${GPU_CAP}"
+    _triton_dir="/data/cache/triton-sm${GPU_CAP}"
+    # One-shot migration: if a non-namespaced cache exists from older
+    # builds, move it under the current GPU's namespace so we don't lose
+    # the pre-existing autotune work.
+    # Migration is best-effort: a failed mv is ignored (|| true).
+    if [ -d /data/cache/gemlite ] && [ ! -e "$_gemlite_dir" ]; then
+        echo "[INFO] migrating /data/cache/gemlite → gemlite-sm${GPU_CAP}"
+        mv /data/cache/gemlite "$_gemlite_dir" 2>/dev/null || true
+    fi
+    if [ -d /data/cache/triton ] && [ ! -e "$_triton_dir" ]; then
+        echo "[INFO] migrating /data/cache/triton → triton-sm${GPU_CAP}"
+        mv /data/cache/triton "$_triton_dir" 2>/dev/null || true
+    fi
+    mkdir -p /data/models "$_gemlite_dir" "$_triton_dir" /data/state /data/state/daily 2>/dev/null || return 1
+    # Replace whatever is at $APP_DIR/models (directory or stale symlink)
+    # with a symlink to the bucket.
+    rm -rf "$APP_DIR/models" 2>/dev/null || return 1
+    ln -s /data/models "$APP_DIR/models" 2>/dev/null || return 1
+    mkdir -p "$APP_DIR/outputs" 2>/dev/null || return 1
+    # Clearing old cache entries is best-effort; the ln -s calls below are
+    # the steps that must succeed.
+    rm -rf "$APP_DIR/outputs/.gemlite_cache" "$APP_DIR/outputs/.triton_cache" 2>/dev/null || true
+    ln -s "$_gemlite_dir" "$APP_DIR/outputs/.gemlite_cache" 2>/dev/null || return 1
+    ln -s "$_triton_dir"  "$APP_DIR/outputs/.triton_cache"  2>/dev/null || return 1
+    return 0
+}
+# Choose where state lives based on whether /data could be wired up, and
+# publish the outcome through BONSAI_STATE_DIR / BONSAI_PERSISTENT_STORAGE.
+if ! _setup_persistent; then
+    # Ephemeral fallback: explain why, then keep state under the app dir.
+    if [ ! -d /data ]; then
+        echo "[WARN] /data not mounted — model, kernel caches, and dashboard"
+        echo "       counters will reset on every Space restart. Enable a"
+        echo "       Storage Bucket in Space Settings → Storage to fix."
+    else
+        echo "[WARN] /data is present but couldn't be set up (read-only? quota?). Falling back to ephemeral."
+    fi
+    export BONSAI_STATE_DIR="$APP_DIR/outputs/.state"
+    export BONSAI_PERSISTENT_STORAGE=0
+    # Best-effort: a failure to create the directory does not stop the boot.
+    mkdir -p "$BONSAI_STATE_DIR/daily" 2>/dev/null || true
+else
+    echo "[OK]  /data Storage Bucket attached — model + caches + counters will persist"
+    export BONSAI_STATE_DIR=/data/state
+    export BONSAI_PERSISTENT_STORAGE=1
+fi
+# ── shared IP-hash pepper across all replicas ────────────────────────────────
+# Every replica must hash IPs with the same pepper so unique-user counts
+# don't double across replicas. Extract from state.json if present (so the
+# pepper survives restarts), else generate a fresh one. Each worker reads
+# this via env, regardless of whether it loads cumulative state.
+if [ -f "$BONSAI_STATE_DIR/state.json" ]; then
+    BONSAI_IP_PEPPER=$(python3 - "$BONSAI_STATE_DIR/state.json" <<'PY' 2>/dev/null || true
+import json, sys
+try:
+    with open(sys.argv[1]) as f:
+        print(json.load(f).get("ip_pepper") or "")
+except Exception:
+    pass
+PY
+)
+fi
+if [ -z "${BONSAI_IP_PEPPER:-}" ]; then
+    BONSAI_IP_PEPPER=$(python3 -c "import secrets; print(secrets.token_hex(16))")
+fi
+export BONSAI_IP_PEPPER
+# Default warmup set: only the two square presets users hit most (512² and
+# 1024²). Any other resolution JITs on its first user request and then
+# joins the on-disk caches (/data/cache/{gemlite,triton}-smXX/) on its own.
+# The warmup-skip sentinel (warmup-done.json, next to the gemlite autotune
+# data) records finished (backend,shape) pairs across boots, so later boots
+# skip even these two once they are cached.
+#
+# Why so few: on multi-GPU boots all N workers compete for /data bandwidth
+# and CPU during the gemlite layer pack, and 4-worker launches have been
+# seen to hang past BACKEND_READY_TIMEOUT. Two shapes cover the common case
+# without inflating cold-boot wall time.
+export BONSAI_WARMUP_SHAPES="${BONSAI_WARMUP_SHAPES:-512x512,1024x1024}"
+# Binary warmup is off unless explicitly requested. With it on, every
+# replica swaps to the binary transformer at the same moment after primary
+# warmup — 4 parallel 3.5 GB state_dict reads from /data plus 4 parallel
+# gemlite layer packs — which has been seen to hang multi-GPU boots
+# indefinitely. The cost of leaving it off: the first binary-arm click pays
+# a one-time JIT (~30s for an unwarmed shape), after which the cache covers
+# it for good.
+#
+# To turn it back on for single-GPU rigs where the collision doesn't apply:
+#   set Space Variable BONSAI_WARMUP_EXTRA_BACKENDS=bonsai-binary-gemlite
+export BONSAI_WARMUP_EXTRA_BACKENDS="${BONSAI_WARMUP_EXTRA_BACKENDS:-}"
+# ── token sanity check ───────────────────────────────────────────────────────
+if [ -z "${HF_TOKEN:-}" ]; then
+    echo "[ERR] HF_TOKEN not set — add it as a Space Secret so the model can download." >&2
+    exit 1
+fi
+export BONSAI_TOKEN="$HF_TOKEN"   # what download_model.sh expects
+# ── model download / sync ────────────────────────────────────────────────────
+# Ship BOTH ternary + binary so the picker's two options actually work. Each
+# repo is ~3.5 GB; first cold boot downloads ~7 GB total, but Storage Bucket
+# (/data/models, symlinked above) keeps them across restarts.
+#
+# We *always* invoke download_model.sh on boot (no file-exists guard). Under
+# the hood it calls huggingface_hub.snapshot_download with `local_dir` set,
+# which HEADs each file in the repo and skips any whose etag matches what's
+# already on disk — so cached boots cost ~10-30s of metadata checks instead
+# of a full redownload. The upside: pushing new weights to HF auto-propagates
+# on the next Space restart without a force flag or manual cache wipe.
+MODEL_DIR="$APP_DIR/models/bonsai-image-4B-ternary-gemlite"
+BINARY_MODEL_DIR="$APP_DIR/models/bonsai-image-4B-binary-gemlite"
+echo "==>  syncing bonsai-image-ternary-4B-gemlite-2bit ..."
+./scripts/download_model.sh --model ternary-gemlite
+echo "==>  syncing bonsai-image-binary-4B-gemlite-1bit ..."
+./scripts/download_model.sh --model binary-gemlite
+# ── htpasswd for the dashboard ───────────────────────────────────────────────
+# DASHBOARD_KEY is a Space Secret; fall back to a sentinel that prints a
+# big warning so missing-secret is obvious in the build log but the Space
+# still comes up (useful while iterating).
+if [ -n "${DASHBOARD_KEY:-}" ]; then
+    # Feed the secret to openssl on stdin (printf is a shell builtin) so it
+    # never appears in a process argument list.
+    HASH=$(printf '%s\n' "$DASHBOARD_KEY" | openssl passwd -apr1 -stdin)
+    printf 'admin:%s\n' "$HASH" > /tmp/.htpasswd
+    echo "[OK]  dashboard: auth enabled (user=admin)"
+else
+    echo "[WARN] DASHBOARD_KEY not set — /dash-... is open with admin:open"
+    # Hash the sentinel password for real. A hand-written placeholder such
+    # as '$apr1$open$open' is not a valid apr1 digest and matches no
+    # password at all, which would lock the dashboard instead of opening it
+    # with admin:open as the warning above states.
+    HASH=$(openssl passwd -apr1 -salt open open)
+    printf 'admin:%s\n' "$HASH" > /tmp/.htpasswd
+fi
+# ── nginx scratch dirs ───────────────────────────────────────────────────────
+mkdir -p /tmp/nginx-{body,proxy,fastcgi,uwsgi,scgi}
+# ── pre-seed dashboard JSON so the page doesn't 502 before first scrape ──────
+# The storage flag is written as a JSON boolean: true only when
+# BONSAI_PERSISTENT_STORAGE is exactly "1".
+_persist_json=false
+if [ "${BONSAI_PERSISTENT_STORAGE:-0}" = "1" ]; then
+    _persist_json=true
+fi
+printf '{"updated_at":null,"persistent_storage":%s,"summary_total":{"requests":0,"success":0,"errors":0},"summary_today":{"requests":0,"unique_users":0},"summary_7d":{"requests":0,"unique_users":0},"by_shape":{},"requests_by_hour":[],"requests_by_day":[],"recent":[]}\n' \
+    "$_persist_json" > /tmp/analytics.json
+printf '%s\n' '{"ts":null,"gpus":[]}' > /tmp/gpu-stats.json
+# ── pin model paths once; shared across all workers ──────────────────────────
+# backend_gpu/pipeline_gpu.py reads SEPARATE env vars per variant
+# (TERNARY_TRANSFORMER_PATH vs BINARY_TRANSFORMER_PATH) and the packed
+# transformer subdir name differs per variant (transformer-gemlite-int2
+# for ternary, transformer-gemlite-int1 for binary). Glob each variant's
+# dir for whichever transformer-gemlite-* it actually ships and assign to
+# the right env var. Without the BINARY env var set, the pipeline falls
+# back to its hardcoded /root/models/bonsai-binary/ default → PermissionError
+# on a non-root container the moment a user picks binary in the UI.
+#
+# Note: text_encoder + vae + tokenizer are the SAME artifacts across both
+# variants (Qwen3-4B-4bit + BFL VAE). Pointing them at the ternary copy
+# is fine; binary's copy of these files sits idle on disk after download.
+# That's a one-time ~1 GB of duplication on disk for the simplicity of
+# letting download_model.sh pull the standard HF layout for each repo.
+export MFLUX_STUDIO_GPU_DEFAULT_BACKEND="bonsai-ternary-gemlite"
+# Print the first transformer-gemlite-* directory found under "$1", or
+# nothing when there is none. Always returns 0, so that under errexit/pipefail
+# a missing directory reaches the explicit [ERR] checks below instead of
+# aborting the script silently inside the command substitution (which is what
+# a failing `ls -d <glob> | head -1` pipeline can do).
+_first_transformer_dir() {
+    local _candidate
+    for _candidate in "$1"/transformer-gemlite-*; do
+        # An unmatched glob stays literal; the -d test filters that out too.
+        if [ -d "$_candidate" ]; then
+            printf '%s\n' "$_candidate"
+            return 0
+        fi
+    done
+    return 0
+}
+_ternary_transformer_dir=$(_first_transformer_dir "$MODEL_DIR")
+if [ -z "$_ternary_transformer_dir" ]; then
+    echo "[ERR] no transformer-gemlite-* subdir under $MODEL_DIR" >&2
+    exit 1
+fi
+_binary_transformer_dir=$(_first_transformer_dir "$BINARY_MODEL_DIR")
+if [ -z "$_binary_transformer_dir" ]; then
+    echo "[ERR] no transformer-gemlite-* subdir under $BINARY_MODEL_DIR" >&2
+    exit 1
+fi
+export MFLUX_STUDIO_GPU_TERNARY_TRANSFORMER_PATH="$_ternary_transformer_dir"
+export MFLUX_STUDIO_GPU_BINARY_TRANSFORMER_PATH="$_binary_transformer_dir"
+export MFLUX_STUDIO_GPU_TEXT_ENCODER_PATH="$MODEL_DIR/text_encoder-hqq-4bit"
+export MFLUX_STUDIO_GPU_VAE_PATH="$MODEL_DIR/vae"
+export MFLUX_STUDIO_GPU_TOKENIZER_PATH="$MODEL_DIR/text_encoder-hqq-4bit/tokenizer"
+# ── detect GPUs + spawn one uvicorn per device ───────────────────────────────
+GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l || echo 1)
+[ "$GPU_COUNT" -lt 1 ] && GPU_COUNT=1
+echo "[OK]  detected $GPU_COUNT GPU(s)"
+# Stagger consecutive worker starts. Without this, all N uvicorns hit the
+# /data bucket simultaneously, contending for ~5 GB state_dict reads + the
+# CPU-bound fp16 cast + gemlite layer conversion. We've seen 4-worker
+# launches blow through BACKEND_READY_TIMEOUT this way. Staggering by ~30s
+# (a hair more than the single-worker transformer-load wall time observed
+# on warm bucket / sm_86) lets each worker get past torch.load + gemlite
+# convert before the next starts touching the same files.
+WORKER_START_STAGGER_SECONDS="${BONSAI_WORKER_START_STAGGER_SECONDS:-30}"
+BACKEND_URLS=""
+UPSTREAM_SERVERS=""
+for i in $(seq 0 $((GPU_COUNT - 1))); do
+    PORT=$((8000 + i))
+    # Per-replica GPU name (mixed-GPU rigs are rare but possible — look it
+    # up by physical index rather than reuse the top-level GPU_NAME).
+    REPLICA_GPU=$(nvidia-smi --query-gpu=name --format=csv,noheader -i "$i" 2>/dev/null | head -1 | xargs)
+    [ -z "$REPLICA_GPU" ] && REPLICA_GPU="$GPU_NAME"
+    echo "==>  starting backend on GPU $i ($REPLICA_GPU) → :$PORT  (warmup: $BONSAI_WARMUP_SHAPES)"
+    # BONSAI_REPLICA_INDEX: only replica 0 seeds counters from state.json;
+    # replicas 1+ start at 0 and report deltas. metrics_pusher sums them →
+    # correct cumulative without N-way inflation.
+    # BONSAI_GPU_NAME: surfaced via /metrics so the pusher can aggregate
+    # request counts/latencies per GPU model for the dashboard.
+    CUDA_VISIBLE_DEVICES=$i BONSAI_REPLICA_INDEX=$i BONSAI_GPU_NAME="$REPLICA_GPU" \
+    uvicorn space.app:app \
+        --host 127.0.0.1 --port "$PORT" \
+        --no-access-log &
+    UPSTREAM_SERVERS="${UPSTREAM_SERVERS}    server 127.0.0.1:$PORT;"$'\n'
+    [ -n "$BACKEND_URLS" ] && BACKEND_URLS="$BACKEND_URLS,"
+    BACKEND_URLS="${BACKEND_URLS}http://127.0.0.1:$PORT"
+    # Sleep between consecutive worker starts (skip after the last one).
+    # Set BONSAI_WORKER_START_STAGGER_SECONDS=0 to disable if cold-boot
+    # wall time matters more than first-boot reliability.
+    if [ "$i" -lt "$((GPU_COUNT - 1))" ] && [ "$WORKER_START_STAGGER_SECONDS" -gt 0 ]; then
+        echo "  ↳ sleeping ${WORKER_START_STAGGER_SECONDS}s before next worker (avoid /data + CPU contention)"
+        sleep "$WORKER_START_STAGGER_SECONDS"
+    fi
+done
+# At N>1 use least_conn (variable-duration requests — see space/nginx.conf).
+if [ "$GPU_COUNT" -gt 1 ]; then
+    LB_DIRECTIVE="    least_conn;"$'\n'
+else
+    LB_DIRECTIVE=""
+fi
+printf 'upstream bonsai_workers {\n%s%s}\n' "$LB_DIRECTIVE" "$UPSTREAM_SERVERS" > /tmp/nginx-upstream.conf
+export BACKEND_URLS
+# ── wait for backend readiness ───────────────────────────────────────────────
+# Workers only answer /backends after lifespan finishes (kernels compiled +
+# warmup shapes JITed). We poll the first one as a proxy for "ready enough."
+_ready_timeout="${BACKEND_READY_TIMEOUT:-600}"
+echo "==>  waiting for backend on :8000 (up to ${_ready_timeout}s) ..."
+for i in $(seq 1 "$_ready_timeout"); do
+    if curl -fsS -m 2 http://127.0.0.1:8000/backends > /dev/null 2>&1; then
+        echo "[OK]  backend ready after ${i}s"
+        break
+    fi
+    sleep 1
+    if [ "$i" -eq "$_ready_timeout" ]; then
+        echo "[ERR] backend did not come up within ${_ready_timeout}s" >&2
+        exit 1
+    fi
+done
+# ── frontend (next start) on internal :3000 ──────────────────────────────────
+echo "==>  starting frontend (next start) on :3000"
+# Run in a background subshell so the `cd` does not leak into this script;
+# `exec` makes npm replace that subshell.
+(
+    cd vendor/image-studio/frontend \
+        && exec npm start -- --port 3000 --hostname 127.0.0.1
+) &
+# ── metrics_pusher sidecar (watchdog restart on crash) ───────────────────────
+# Supervise metrics_pusher.py forever: run it in the foreground, and whenever
+# it exits (whatever the status) wait 5 seconds and launch it again.
+start_metrics_pusher() {
+    until false; do
+        echo "[watchdog] starting metrics_pusher.py"
+        python3 /home/user/app/space/metrics_pusher.py || :
+        echo "[watchdog] metrics_pusher.py exited, restarting in 5s"
+        sleep 5
+    done
+}
+start_metrics_pusher &
+# ── nginx — front everything on :7860 (the HF-exposed port) ──────────────────
+# `exec` replaces this shell with nginx, so nginx is the script's final process.
+echo "==>  nginx on :7860"
+exec nginx -p /home/user/app/ -c /home/user/app/space/nginx.conf

space/metrics_pusher.py ADDED Viewed

	@@ -0,0 +1,599 @@

+"""Sidecar that aggregates backend /metrics + nvidia-smi into JSON files.
+On every tick (METRICS_INTERVAL seconds, default 2) it writes:
+  /tmp/analytics.json            current totals, today + 7d summaries, GPU info flag
+  /tmp/gpu-stats.json            nvidia-smi snapshot
+Every Nth tick (STATE_WRITE_EVERY_N_TICKS, default 12 → ~24 s at the default interval) it also writes:
+  $BONSAI_STATE_DIR/state.json              boot-recovery snapshot
+  $BONSAI_STATE_DIR/daily/YYYY-MM-DD.json   per-UTC-day archive (one file/day)
+Robust to:
+  - missing /data bucket (writes go to ephemeral $BONSAI_STATE_DIR fallback)
+  - missing nvidia-smi
+  - backend not yet up (HTTP errors logged, tick continues)
+  - FUSE-backed mounts that don't support atomic rename (falls back to in-place)
+"""
+from __future__ import annotations
+import json
+import os
+import subprocess
+import time
+import urllib.request
+from collections import defaultdict
+# Day bucketing is in UTC — matches what space.app uses for `_by_day` keys
+# (we tried PT but the CUDA Ubuntu base image strips tzdata).
+# Backend base URLs to scrape, parsed from the comma-separated BACKEND_URLS
+# env var. Entries are whitespace-stripped and empty ones dropped; with the
+# variable unset this is the single URL http://127.0.0.1:8000.
+BACKEND_URLS = [u.strip() for u in os.environ.get("BACKEND_URLS", "http://127.0.0.1:8000").split(",") if u.strip()]
+# Tick interval in seconds (METRICS_INTERVAL); the in-code default is 2.
+INTERVAL = int(os.environ.get("METRICS_INTERVAL", "2"))
+# Dashboard-facing JSON files, rewritten by this process.
+ANALYTICS_PATH = "/tmp/analytics.json"
+GPU_PATH = "/tmp/gpu-stats.json"
+# Persisted state. STATE_DIR is /data/state when a bucket is mounted, else
+# ephemeral under outputs/ (gone on Space restart).
+# NOTE(review): those locations are chosen by whoever sets BONSAI_STATE_DIR
+# (presumably entrypoint.sh — confirm); the fallback coded here, used only
+# when the variable is unset, is /tmp.
+STATE_DIR = os.environ.get("BONSAI_STATE_DIR", "/tmp")
+STATE_PATH = os.path.join(STATE_DIR, "state.json")
+DAILY_DIR = os.path.join(STATE_DIR, "daily")
+# Write durable files (state.json + daily archives) every Nth tick to amortize
+# disk traffic. Losing N*INTERVAL seconds of counter increments on unclean
+# shutdown is acceptable.
+STATE_WRITE_EVERY_N_TICKS = int(os.environ.get("STATE_WRITE_EVERY_N_TICKS", "12"))
+# Surfaces in analytics.json so the dashboard shows a "counters won't persist"
+# banner when a bucket is not mounted. Set by entrypoint.sh.
+# True only when BONSAI_PERSISTENT_STORAGE is exactly "1".
+PERSISTENT_STORAGE = os.environ.get("BONSAI_PERSISTENT_STORAGE", "0") == "1"
+def _fetch_json(url: str, timeout: float = 5.0) -> dict | None:
+    # 5s timeout (was 2s): under 16-concurrent /generate load the uvicorn
+    # event loop can briefly queue /metrics behind in-flight responses.
+    # 5s is still well under the dashboard's polling cadence (so the user
+    # doesn't see a delay) and gives the backend headroom under stress.
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as resp:
+            return json.loads(resp.read())
+    except Exception:
+        return None
+def fetch_backend_metrics() -> dict:
+    """Aggregate /metrics from every backend replica."""
+    agg: dict = {
+        "total_requests": 0,
+        "success": 0,
+        "errors": 0,
+        "uptime_s": 0,
+        "inflight": 0,             # sum across replicas — total in-flight requests
+        "generate_capacity": 0,    # sum of per-replica concurrency caps
+        "replicas_seen": 0,        # how many replicas answered /metrics this tick
+        # Per-replica details — list of {gpu_name, inflight, capacity,
+        # uptime_s, total_requests}. Used to compute accurate queue_depth
+        # (sum of per-replica (inflight - capacity)+ rather than the sum-
+        # then-subtract approximation that hides imbalance) and to render
+        # the multi-GPU health card on the dashboard.
+        "per_replica": [],
+        "by_shape": defaultdict(lambda: {"count": 0, "duration_ms_total": 0}),
+        # Cumulative per-variant counter. Replicas each report their own
+        # _by_variant; we sum them here. Variants are "ternary", "binary",
+        # or "unknown" — parsed from the request's `backend` field.
+        "by_variant": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
+        "by_day": {},  # date -> {requests, success, errors, by_shape, by_hour, unique_ips set, queue_ms_total}
+        # Per-GPU model breakdown — each replica's gpu_name + counts +
+        # duration sum get folded in. Multiple replicas on the same GPU
+        # model (e.g. l40sx4 = 4× "NVIDIA L40S") merge into one bucket.
+        "by_gpu": defaultdict(lambda: {"count": 0, "success": 0, "errors": 0, "duration_ms_total": 0, "replicas": 0}),
+        "recent": [],
+        "ip_pepper": None,
+    }
+    for url in BACKEND_URLS:
+        data = _fetch_json(f"{url}/metrics")
+        if not data:
+            continue
+        agg["replicas_seen"] += 1
+        agg["total_requests"] += data.get("total_requests", 0)
+        agg["success"] += data.get("success", 0)
+        agg["errors"] += data.get("errors", 0)
+        agg["uptime_s"] = max(agg["uptime_s"], data.get("uptime_s", 0))
+        replica_inflight = data.get("inflight", 0)
+        replica_capacity = data.get("generate_concurrency", 1)
+        agg["inflight"] += replica_inflight
+        agg["generate_capacity"] += replica_capacity
+        # Per-GPU rollup — fold this replica's totals into its GPU bucket.
+        # Default to NVIDIA L40S when missing so historical /metrics without
+        # gpu_name (pre-this-feature) don't show up as "unknown".
+        gpu = data.get("gpu_name") or "NVIDIA L40S"
+        # Per-replica record — keep the gpu_name + cap so the dashboard's
+        # multi-GPU health card can render "L40S · 1/1 busy" style rows
+        # and the queue calc can subtract per-replica.
+        agg["per_replica"].append({
+            "url": url,
+            "gpu_name": gpu,
+            "inflight": replica_inflight,
+            "capacity": replica_capacity,
+            "uptime_s": data.get("uptime_s", 0),
+            "total_requests": data.get("total_requests", 0),
+            "replica_index": data.get("replica_index"),
+        })
+        g = agg["by_gpu"][gpu]
+        g["count"] += data.get("total_requests", 0)
+        g["success"] += data.get("success", 0)
+        g["errors"] += data.get("errors", 0)
+        g["duration_ms_total"] += data.get("total_duration_ms", 0)
+        g["replicas"] += 1
+        for shape, b in data.get("by_shape", {}).items():
+            agg["by_shape"][shape]["count"] += b.get("count", 0)
+            agg["by_shape"][shape]["duration_ms_total"] += b.get("duration_ms_total", 0)
+        for v_name, v_data in (data.get("by_variant") or {}).items():
+            agg["by_variant"][v_name]["count"] += v_data.get("count", 0)
+            agg["by_variant"][v_name]["duration_ms_total"] += v_data.get("duration_ms_total", 0)
+            agg["by_variant"][v_name]["queue_ms_total"] += v_data.get("queue_ms_total", 0)
+        # Per-day merge: when we go multi-replica, each replica returns its
+        # own _by_day → we union them here (sum counters, union unique_ips).
+        for date, d in data.get("by_day", {}).items():
+            existing = agg["by_day"].setdefault(date, {
+                "requests": 0, "success": 0, "errors": 0,
+                "by_shape": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
+                "by_hour": [0] * 24,
+                "unique_ips": set(),
+                "by_gpu": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
+                "by_variant": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
+                "queue_ms_total": 0,
+            })
+            existing["requests"] += d.get("requests", 0)
+            existing["success"] += d.get("success", 0)
+            existing["errors"] += d.get("errors", 0)
+            existing["queue_ms_total"] += d.get("queue_ms_total", 0)
+            for shape, b in d.get("by_shape", {}).items():
+                existing["by_shape"][shape]["count"] += b.get("count", 0)
+                existing["by_shape"][shape]["duration_ms_total"] += b.get("duration_ms_total", 0)
+                existing["by_shape"][shape]["queue_ms_total"] += b.get("queue_ms_total", 0)
+            for i, c in enumerate(d.get("by_hour") or [0] * 24):
+                if i < 24:
+                    existing["by_hour"][i] += c
+            for h in d.get("unique_ips", []) or []:
+                existing["unique_ips"].add(h)
+            for g_name, g_data in (d.get("by_gpu") or {}).items():
+                existing["by_gpu"][g_name]["count"] += g_data.get("count", 0)
+                existing["by_gpu"][g_name]["duration_ms_total"] += g_data.get("duration_ms_total", 0)
+                existing["by_gpu"][g_name]["queue_ms_total"] += g_data.get("queue_ms_total", 0)
+            for v_name, v_data in (d.get("by_variant") or {}).items():
+                existing["by_variant"][v_name]["count"] += v_data.get("count", 0)
+                existing["by_variant"][v_name]["duration_ms_total"] += v_data.get("duration_ms_total", 0)
+                existing["by_variant"][v_name]["queue_ms_total"] += v_data.get("queue_ms_total", 0)
+        agg["recent"].extend(data.get("recent", []))
+        agg["ip_pepper"] = agg["ip_pepper"] or data.get("ip_pepper")
+    agg["recent"].sort(key=lambda r: r.get("ts", 0))
+    agg["recent"] = agg["recent"][-2000:]
+    return agg
+def fetch_gpu_stats() -> dict:
+    try:
+        out = subprocess.check_output(
+            [
+                "nvidia-smi",
+                "--query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit",
+                "--format=csv,noheader,nounits",
+            ],
+            timeout=2,
+        ).decode()
+    except Exception as exc:
+        return {"ts": int(time.time()), "gpus": [], "error": str(exc)}
+    def _maybe_int(s: str) -> int | None:
+        s = s.strip()
+        return int(s) if s.isdigit() else None
+    def _maybe_float(s: str) -> float | None:
+        try:
+            return float(s.strip())
+        except ValueError:
+            return None
+    gpus = []
+    for line in out.strip().splitlines():
+        parts = [p.strip() for p in line.split(",")]
+        if len(parts) < 8:
+            continue
+        gpus.append({
+            "index": int(parts[0]),
+            "name": parts[1],
+            "util_pct": _maybe_int(parts[2]),
+            "memory_used_mb": _maybe_int(parts[3]),
+            "memory_total_mb": _maybe_int(parts[4]),
+            "temp_c": _maybe_int(parts[5]),
+            "power_w": _maybe_float(parts[6]),
+            "power_limit_w": _maybe_float(parts[7]),
+        })
+    return {"ts": int(time.time()), "gpus": gpus}
+def build_analytics(backend_data: dict) -> dict:
+    """The JSON the dashboard polls. Derived from /metrics so they stay in sync."""
+    now = int(time.time())
+    today = time.strftime("%Y-%m-%d", time.gmtime(now))
+    by_shape_total = {}
+    for shape, b in backend_data["by_shape"].items():
+        avg = b["duration_ms_total"] // b["count"] if b["count"] else 0
+        by_shape_total[shape] = {"count": b["count"], "duration_ms_avg": avg}
+    # Latency stats derived from the last 50 requests only — same set the
+    # dashboard renders in its Recent Requests table. Keeps the latency
+    # numbers reactive to current load rather than smoothed by old data.
+    recent_window = backend_data["recent"][-50:]
+    recent_by_shape_acc: dict[str, dict] = {}
+    for r in recent_window:
+        s = r.get("shape") or "unknown"
+        d = recent_by_shape_acc.setdefault(s, {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0})
+        d["count"] += 1
+        d["duration_ms_total"] += int(r.get("duration_ms") or 0)
+        d["queue_ms_total"] += int(r.get("queue_ms") or 0)
+    recent_by_shape = {}
+    for s, b in recent_by_shape_acc.items():
+        recent_by_shape[s] = {
+            "count": b["count"],
+            "duration_ms_avg": b["duration_ms_total"] // b["count"] if b["count"] else 0,
+            "queue_ms_avg": b["queue_ms_total"] // b["count"] if b["count"] else 0,
+        }
+    recent_count_total = sum(b["count"] for b in recent_by_shape_acc.values())
+    recent_duration_total = sum(b["duration_ms_total"] for b in recent_by_shape_acc.values())
+    recent_avg_latency_ms = recent_duration_total // recent_count_total if recent_count_total else 0
+    today_bucket = backend_data["by_day"].get(today, {})
+    today_unique_set = today_bucket.get("unique_ips", set())
+    today_unique = len(today_unique_set if isinstance(today_unique_set, set) else list(today_unique_set))
+    # Today-only mirrors of by_shape_total and by_gpu_out. Same shape so the
+    # dashboard can render them with the same table helpers; only the scope
+    # differs (cumulative vs reset-at-UTC-midnight). Useful for spotting
+    # today's tier mix or shape distribution at a glance vs the all-time avg
+    # which smooths over the full history. queue_ms_avg is included so the
+    # tables can show how queueing pressure is distributed.
+    by_shape_today = {}
+    for shape, b in (today_bucket.get("by_shape") or {}).items():
+        c = b.get("count", 0)
+        by_shape_today[shape] = {
+            "count": c,
+            "duration_ms_avg": (b.get("duration_ms_total", 0) // c) if c else 0,
+            "queue_ms_avg": (b.get("queue_ms_total", 0) // c) if c else 0,
+        }
+    by_gpu_today = {}
+    for gpu_name, b in (today_bucket.get("by_gpu") or {}).items():
+        c = b.get("count", 0)
+        by_gpu_today[gpu_name] = {
+            "count": c,
+            "duration_ms_avg": (b.get("duration_ms_total", 0) // c) if c else 0,
+            "queue_ms_avg": (b.get("queue_ms_total", 0) // c) if c else 0,
+        }
+    # by_variant slices: cumulative (across all of by_day history) + today.
+    # Today's view drives the new Variant tile in the dashboard summary row.
+    by_variant_total = {}
+    for v_name, b in backend_data["by_variant"].items():
+        c = b.get("count", 0)
+        by_variant_total[v_name] = {
+            "count": c,
+            "duration_ms_avg": (b.get("duration_ms_total", 0) // c) if c else 0,
+            "queue_ms_avg": (b.get("queue_ms_total", 0) // c) if c else 0,
+        }
+    by_variant_today = {}
+    for v_name, b in (today_bucket.get("by_variant") or {}).items():
+        c = b.get("count", 0)
+        by_variant_today[v_name] = {
+            "count": c,
+            "duration_ms_avg": (b.get("duration_ms_total", 0) // c) if c else 0,
+            "queue_ms_avg": (b.get("queue_ms_total", 0) // c) if c else 0,
+        }
+    # Today's overall avg queue, summed across all shapes/gpus. Surfaced as
+    # a single number in the Pending tile subtitle on the dashboard.
+    today_count = today_bucket.get("requests", 0)
+    today_avg_queue_ms = (today_bucket.get("queue_ms_total", 0) // today_count) if today_count else 0
+    def _summary_for_last(n_days: int) -> dict:
+        days = sorted(backend_data["by_day"].keys())[-n_days:]
+        req = sum(backend_data["by_day"][d].get("requests", 0) for d in days)
+        uniques: set = set()
+        for d in days:
+            ips = backend_data["by_day"][d].get("unique_ips", set())
+            uniques.update(ips if isinstance(ips, set) else list(ips))
+        return {"requests": req, "unique_users": len(uniques)}
+    # Include per-GPU counts on each day so the dashboard can stack the daily
+    # chart by GPU. Each day's by_gpu dict only carries GPUs that actually
+    # served traffic that day, so the dashboard derives the union of all GPU
+    # names client-side and fills missing days with 0. duration_ms_total is
+    # surfaced too so a future "stacked latency view" doesn't need new fields.
+    requests_by_day = [
+        {
+            "date": d,
+            "count": backend_data["by_day"][d].get("requests", 0),
+            "by_gpu": {
+                g_name: {
+                    "count": g.get("count", 0),
+                    "duration_ms_total": g.get("duration_ms_total", 0),
+                }
+                for g_name, g in (backend_data["by_day"][d].get("by_gpu") or {}).items()
+            },
+        }
+        for d in sorted(backend_data["by_day"].keys())[-30:]
+    ]
+    requests_by_hour = list(today_bucket.get("by_hour", [0] * 24))
+    # Overall average latency, derived from by_shape (since duration totals
+    # live there, not in the cumulative counter).
+    total_duration_ms = sum(b["duration_ms_total"] for b in backend_data["by_shape"].values())
+    total_durations_count = sum(b["count"] for b in backend_data["by_shape"].values())
+    avg_latency_ms = total_duration_ms // total_durations_count if total_durations_count else 0
+    # Queue depth = whatever is in-flight beyond GPU-running capacity. Has
+    # to be summed PER REPLICA: if 4 are queued on replica 0 and replica 1
+    # is idle, naive sum(inflight) - sum(capacity) = max(0, 4-2) = 2 hides
+    # the fact that replica 0 has a 3-deep queue while replica 1 idles.
+    # Per-replica max(0, inflight-capacity) correctly attributes the queue.
+    per_replica = backend_data.get("per_replica", [])
+    inflight = sum(r["inflight"] for r in per_replica) if per_replica else backend_data.get("inflight", 0)
+    capacity = sum(r["capacity"] for r in per_replica) if per_replica else (
+        backend_data.get("generate_capacity", 0) or backend_data.get("replicas_seen", 1)
+    )
+    queue_depth = sum(max(0, r["inflight"] - r["capacity"]) for r in per_replica)
+    running = sum(min(r["inflight"], r["capacity"]) for r in per_replica) if per_replica else min(inflight, capacity)
+    # Per-GPU breakdown for the bottom-of-dashboard "By GPU" card. Count,
+    # success/error split, avg latency per GPU model. Useful for spotting
+    # variance between tiers (e.g. L40S vs T4) during benchmarking.
+    by_gpu_out = {}
+    for gpu_name, b in backend_data["by_gpu"].items():
+        c = b["count"]
+        by_gpu_out[gpu_name] = {
+            "count": c,
+            "success": b["success"],
+            "errors": b["errors"],
+            "duration_ms_avg": b["duration_ms_total"] // c if c else 0,
+            "duration_ms_total": b["duration_ms_total"],
+            "replicas": b["replicas"],
+        }
+    return {
+        "updated_at": now,
+        "uptime_s": backend_data.get("uptime_s", 0),
+        "persistent_storage": PERSISTENT_STORAGE,
+        "state_dir": STATE_DIR,
+        "replicas_seen": backend_data.get("replicas_seen", 0),
+        # entrypoint.sh sets BACKEND_URLS once per boot, so this is the
+        # number we *expect* to see — diff against replicas_seen tells the
+        # dashboard "1 replica is unhealthy" vs "2 of 2 happy".
+        "replicas_expected": len(BACKEND_URLS),
+        "per_replica": backend_data.get("per_replica", []),
+        "inflight": inflight,
+        "running": running,
+        "queue_depth": queue_depth,
+        "capacity": capacity,
+        "today_avg_queue_ms": today_avg_queue_ms,
+        "summary_total": {
+            "requests": backend_data["total_requests"],
+            "success": backend_data["success"],
+            "errors": backend_data["errors"],
+        },
+        "summary_today": {
+            "requests": today_bucket.get("requests", 0),
+            "unique_users": today_unique,
+        },
+        "summary_7d": _summary_for_last(7),
+        "summary_30d": _summary_for_last(30),
+        "avg_latency_ms": avg_latency_ms,
+        "by_shape": by_shape_total,
+        "by_shape_today": by_shape_today,
+        "by_gpu": by_gpu_out,
+        "by_gpu_today": by_gpu_today,
+        "by_variant": by_variant_total,
+        "by_variant_today": by_variant_today,
+        "recent_by_shape": recent_by_shape,
+        "recent_avg_latency_ms": recent_avg_latency_ms,
+        "recent_count": recent_count_total,
+        "requests_by_hour": requests_by_hour,
+        "requests_by_day": requests_by_day,
+        "recent": backend_data["recent"][-100:],
+    }
+def _atomic_write(path: str, payload: dict, indent: int | None = None) -> None:
+    """Write JSON atomically. Falls back to direct overwrite if rename fails
+    (some FUSE-backed mounts don't support rename within a dir)."""
+    text = json.dumps(payload, indent=indent, sort_keys=indent is not None)
+    tmp = path + ".tmp"
+    try:
+        with open(tmp, "w") as f:
+            f.write(text)
+        os.replace(tmp, path)
+    except OSError as exc:
+        print(f"[metrics_pusher] atomic rename failed for {path} ({exc}); writing in place", flush=True)
+        try:
+            with open(path, "w") as f:
+                f.write(text)
+        except OSError as exc2:
+            print(f"[metrics_pusher] direct write also failed for {path} ({exc2})", flush=True)
+        finally:
+            try:
+                os.unlink(tmp)
+            except OSError:
+                pass
+def write_state(backend_data: dict) -> None:
+    """Snapshot for boot-recovery. Includes per-day so the app can resume
+    counter buckets for in-flight days."""
+    by_day_out = {}
+    for date, d in backend_data["by_day"].items():
+        ips = d["unique_ips"]
+        by_day_out[date] = {
+            "requests": d["requests"],
+            "success": d["success"],
+            "errors": d["errors"],
+            "queue_ms_total": d.get("queue_ms_total", 0),
+            "by_shape": {
+                s: {
+                    "count": b["count"],
+                    "duration_ms_total": b["duration_ms_total"],
+                    "queue_ms_total": b.get("queue_ms_total", 0),
+                }
+                for s, b in d["by_shape"].items()
+            },
+            "by_hour": list(d["by_hour"]),
+            "unique_ips": sorted(ips) if isinstance(ips, set) else list(ips),
+            "by_gpu": {
+                g: {
+                    "count": v["count"],
+                    "duration_ms_total": v["duration_ms_total"],
+                    "queue_ms_total": v.get("queue_ms_total", 0),
+                }
+                for g, v in (d.get("by_gpu") or {}).items()
+            },
+            "by_variant": {
+                v: {
+                    "count": b["count"],
+                    "duration_ms_total": b["duration_ms_total"],
+                    "queue_ms_total": b.get("queue_ms_total", 0),
+                }
+                for v, b in (d.get("by_variant") or {}).items()
+            },
+        }
+    payload = {
+        "total_requests": backend_data["total_requests"],
+        "success": backend_data["success"],
+        "errors": backend_data["errors"],
+        "by_shape": {
+            shape: {"count": b["count"], "duration_ms_total": b["duration_ms_total"]}
+            for shape, b in backend_data["by_shape"].items()
+        },
+        "by_variant": {
+            v: {
+                "count": b["count"],
+                "duration_ms_total": b["duration_ms_total"],
+                "queue_ms_total": b.get("queue_ms_total", 0),
+            }
+            for v, b in backend_data["by_variant"].items()
+        },
+        "by_day": by_day_out,
+        "recent": backend_data["recent"][-100:],
+        "ip_pepper": backend_data.get("ip_pepper"),
+        "saved_at": int(time.time()),
+    }
+    try:
+        os.makedirs(STATE_DIR, exist_ok=True)
+    except OSError as exc:
+        print(f"[metrics_pusher] mkdir {STATE_DIR} failed ({exc}); skipping state write", flush=True)
+        return
+    _atomic_write(STATE_PATH, payload)
+def write_daily_archives(backend_data: dict) -> None:
+    """One JSON file per UTC date. Today's file gets rewritten each tick; past
+    days only on a restart that reloads their bucket from state.json."""
+    if not backend_data["by_day"]:
+        return
+    try:
+        os.makedirs(DAILY_DIR, exist_ok=True)
+    except OSError as exc:
+        print(f"[metrics_pusher] mkdir {DAILY_DIR} failed ({exc}); skipping daily writes", flush=True)
+        return
+    for date, d in backend_data["by_day"].items():
+        by_shape_out = {}
+        for shape, b in d["by_shape"].items():
+            c = b["count"]
+            by_shape_out[shape] = {
+                "count": c,
+                "duration_ms_total": b["duration_ms_total"],
+                "duration_ms_avg": b["duration_ms_total"] // c if c else 0,
+                "queue_ms_total": b.get("queue_ms_total", 0),
+                "queue_ms_avg": b.get("queue_ms_total", 0) // c if c else 0,
+            }
+        by_gpu_out = {}
+        for g_name, g in (d.get("by_gpu") or {}).items():
+            c = g["count"]
+            by_gpu_out[g_name] = {
+                "count": c,
+                "duration_ms_total": g["duration_ms_total"],
+                "duration_ms_avg": g["duration_ms_total"] // c if c else 0,
+                "queue_ms_total": g.get("queue_ms_total", 0),
+                "queue_ms_avg": g.get("queue_ms_total", 0) // c if c else 0,
+            }
+        by_variant_out = {}
+        for v_name, v in (d.get("by_variant") or {}).items():
+            c = v["count"]
+            by_variant_out[v_name] = {
+                "count": c,
+                "duration_ms_total": v["duration_ms_total"],
+                "duration_ms_avg": v["duration_ms_total"] // c if c else 0,
+                "queue_ms_total": v.get("queue_ms_total", 0),
+                "queue_ms_avg": v.get("queue_ms_total", 0) // c if c else 0,
+            }
+        ips = d["unique_ips"]
+        day_req = d["requests"]
+        day_queue_total = d.get("queue_ms_total", 0)
+        payload = {
+            "date": date,
+            "updated_at": int(time.time()),
+            "requests": day_req,
+            "success": d["success"],
+            "errors": d["errors"],
+            "queue_ms_total": day_queue_total,
+            "queue_ms_avg": day_queue_total // day_req if day_req else 0,
+            "unique_users": len(ips) if isinstance(ips, set) else len(list(ips)),
+            "by_shape": by_shape_out,
+            "by_hour": list(d["by_hour"]),
+            "by_gpu": by_gpu_out,
+            "by_variant": by_variant_out,
+        }
+        _atomic_write(os.path.join(DAILY_DIR, f"{date}.json"), payload, indent=2)
+def main() -> None:
+    print(
+        f"[metrics_pusher] backends={BACKEND_URLS} interval={INTERVAL}s "
+        f"state_dir={STATE_DIR} persistent_storage={PERSISTENT_STORAGE}",
+        flush=True,
+    )
+    tick = 0
+    consecutive_zero = 0
+    while True:
+        try:
+            backend_data = fetch_backend_metrics()
+            gpu_data = fetch_gpu_stats()
+            # nvidia-smi runs locally and is independent of backend health,
+            # so always refresh GPU stats.
+            _atomic_write(GPU_PATH, gpu_data)
+            if backend_data["replicas_seen"] == 0:
+                # NO replicas answered /metrics this tick — usually means
+                # they're all saturated. DON'T overwrite analytics.json
+                # with zero-everywhere defaults; keep the prior file so
+                # the dashboard stays meaningful. Updated_at age will
+                # naturally drift to indicate staleness.
+                consecutive_zero += 1
+                print(
+                    f"[metrics_pusher] tick {tick}: no replicas responded "
+                    f"(consecutive={consecutive_zero}); keeping prior analytics.json",
+                    flush=True,
+                )
+            else:
+                if consecutive_zero > 0:
+                    print(f"[metrics_pusher] backends recovered after {consecutive_zero} miss(es)", flush=True)
+                consecutive_zero = 0
+                analytics = build_analytics(backend_data)
+                _atomic_write(ANALYTICS_PATH, analytics)
+                if tick % STATE_WRITE_EVERY_N_TICKS == 0:
+                    write_state(backend_data)
+                    write_daily_archives(backend_data)
+        except Exception as exc:
+            print(f"[metrics_pusher] tick error: {exc}", flush=True)
+        tick += 1
+        time.sleep(INTERVAL)
+if __name__ == "__main__":
+    main()

space/nginx.conf ADDED Viewed

	@@ -0,0 +1,117 @@

+# nginx config for the Bonsai-Image HF Space.
+# - :7860 is the only public port (HF exposes it).
+# - / and /api/* go to the Next.js frontend on :3000.
+# - /generate, /backends, /docs and /openapi.json go to one (or many) uvicorn
+#   backends via the upstream block, which entrypoint.sh builds from
+#   `nvidia-smi -L`. At N=1 it's just one server line; at N>1 we add least_conn.
+# - /dash-<obfuscated> is the metrics dashboard, basic-auth gated.
+#
+# Run as: nginx -c /home/user/app/space/nginx.conf -p /home/user/app/
+worker_processes 1;
+daemon off;
+pid /tmp/nginx.pid;
+error_log /dev/stderr warn;
+events {
+    worker_connections 256;
+}
+http {
+    default_type application/octet-stream;
+    sendfile on;
+    keepalive_timeout 65;
+    # nginx's stock temp and log locations aren't writable by uid 1000 on the
+    # HF image, so redirect everything into /tmp where we have write access.
+    client_body_temp_path /tmp/nginx-body;
+    proxy_temp_path       /tmp/nginx-proxy;
+    fastcgi_temp_path     /tmp/nginx-fastcgi;
+    uwsgi_temp_path       /tmp/nginx-uwsgi;
+    scgi_temp_path        /tmp/nginx-scgi;
+    access_log            /tmp/nginx-access.log;
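+    # Built at boot by entrypoint.sh from `nvidia-smi -L` — one server line
+    # per GPU. Today: one server at :8000. The included file must define the
+    # `bonsai_workers` upstream that the backend location proxies to.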
+    include /tmp/nginx-upstream.conf;
+    server {
+        listen 7860 default_server;
+        client_max_body_size 16M;
+        # ── frontend ────────────────────────────────────────────────────────
+        location / {
+            proxy_pass http://127.0.0.1:3000;
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection "upgrade";
+            proxy_set_header Host $host;
+            # APPEND $remote_addr to existing X-Forwarded-For (the chain HF's
+            # edge proxy already set with the real visitor IP). Using
+            # $remote_addr alone would overwrite that with the edge proxy's
+            # IP — same for all visitors — collapsing every user to one hash
+            # in the dashboard's unique-user counter.
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            # Generations can run several seconds; Next.js streams the
+            # response back so don't time the connection out.
+            proxy_read_timeout 600s;
+        }
+        # ── backend API surface (called by Next.js api/generate route + curl) ─
+        location ~ ^/(generate|backends|docs|openapi\.json)$ {
+            proxy_pass http://bonsai_workers;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            # APPEND $remote_addr to existing X-Forwarded-For (the chain HF's
+            # edge proxy already set with the real visitor IP). Using
+            # $remote_addr alone would overwrite that with the edge proxy's
+            # IP — same for all visitors — collapsing every user to one hash
+            # in the dashboard's unique-user counter.
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_read_timeout 600s;
+            proxy_buffering off;  # stream PNG bytes back immediately
+        }
+        # ── dashboard ───────────────────────────────────────────────────────
+        # Obfuscated path + basic auth. Path suffix is in source (visible to
+        # anyone with repo read access); auth is the actual gate.
+        # Exact-match (`=`) locations serve only the three paths listed here;
+        # everything else under /dash- falls through to the 404 catchall.
+        location = /dash-10a08e9c1ee4 {
+            auth_basic "Bonsai Dashboard";
+            auth_basic_user_file /tmp/.htpasswd;
+            alias /home/user/app/space/dashboard.html;
+            default_type text/html;
+            add_header Cache-Control "no-store" always;
+        }
+        location = /dash-10a08e9c1ee4/analytics.json {
+            auth_basic "Bonsai Dashboard";
+            auth_basic_user_file /tmp/.htpasswd;
+            alias /tmp/analytics.json;
+            default_type application/json;
+            add_header Cache-Control "no-store" always;
+        }
+        location = /dash-10a08e9c1ee4/gpu-stats.json {
+            auth_basic "Bonsai Dashboard";
+            auth_basic_user_file /tmp/.htpasswd;
+            alias /tmp/gpu-stats.json;
+            default_type application/json;
+            add_header Cache-Control "no-store" always;
+        }
+        # Catchall under the dashboard prefix → 404 (don't reveal what else
+        # might exist there).
+        location ~ ^/dash- {
+            return 404;
+        }
+        # /metrics on the backend is loopback-only and is never exposed here:
+        # answer 404 explicitly instead of letting `location /` pass the
+        # request to the frontend. (metrics_pusher.py scrapes it directly at
+        # 127.0.0.1:8000/metrics.)
+        location = /metrics {
+            return 404;
+        }
+    }
+}