pashak commited on
Commit
17895f4
Β·
0 Parent(s):
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# syntax=docker/dockerfile:1.6
# CUDA 12.8 runtime — gemlite/Triton kernels JIT against the runtime ptxas
# that comes with this image; no need for the larger -devel variant.
FROM nvidia/cuda:12.8.0-runtime-ubuntu22.04

# Run every RUN under bash with pipefail. Without it, a failed download in a
# pipeline (the `curl | sh` uv installer below) is masked by the exit status
# of the last command in the pipe and the build carries on without uv.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# ── system deps ──────────────────────────────────────────────────────────────
# build-essential is needed because some sdists (gemlite among them) compile C
# extensions at install time. python3 is the host interpreter that bootstraps
# uv; uv then provisions its own pinned interpreter for the venv. nginx fronts
# everything on :7860 (frontend, backend API, dashboard).
# The chown lets the non-root user (uid 1000, created below) run nginx.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        libgomp1 \
        libssl3 \
        nginx \
        openssl \
        python3 \
        python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    && chown -R 1000:1000 /var/lib/nginx /var/log/nginx /run

# Non-root user: HF Spaces convention is uid 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user PATH="/home/user/.local/bin:$PATH"

# uv (Python venv + package manager). The demo's setup.sh assumes it's on PATH.
# NOTE(review): the installer script is unpinned; consider pinning a uv
# release URL and verifying its checksum for reproducible builds.
RUN curl -fsSL https://astral.sh/uv/install.sh | sh

WORKDIR /home/user/app

# ── clone + run setup.sh in one RUN so GH_TOKEN never lands in a layer ───────
# GH_TOKEN is supplied by `--mount=type=secret`; the secret file is only
# visible during this single RUN and is not stored in the image. The
# credential helper reads the secret file at call time, so the token itself is
# never written to ~/.gitconfig, and the helper is unset again before the
# layer is committed.
# SKIP_DOWNLOAD=1 keeps setup.sh from pulling the 3.5 GB model at build time
# — entrypoint.sh handles that at boot so a Space restart doesn't have to
# rebuild the image.
RUN --mount=type=secret,id=GH_TOKEN,uid=1000,required=true \
    git config --global credential.helper '!f() { echo "username=oauth2"; echo "password=$(cat /run/secrets/GH_TOKEN)"; }; f' \
    && git clone https://github.com/PrismML-Eng/Bonsai-image-demo.git . \
    && SKIP_DOWNLOAD=1 BONSAI_PACKAGE_MIN_AGE_DAYS=0 ./setup.sh \
    && git config --global --unset credential.helper

# ── pre-build the Next.js frontend ───────────────────────────────────────────
# Baking the build into the image so the first browser visit doesn't pay
# `npm install + next build` (~2 min) on top of model load. NEXT_PUBLIC_*
# vars are inlined at build time and don't change at runtime, so the
# backend URL (always loopback inside this container) is baked too.
# PATH and NEXT_PUBLIC_BACKEND_URL are exported inside the RUN only, so they
# apply to both npm commands without leaking into the runtime environment.
# NOTE(review): if the frontend ships a package-lock.json, `npm ci` would make
# this install reproducible — confirm against the demo repo before switching.
WORKDIR /home/user/app/vendor/image-studio/frontend
RUN export PATH="$HOME/app/.venv/bin:$PATH" NEXT_PUBLIC_BACKEND_URL=http://127.0.0.1:8000 \
    && npm install --no-audit --no-fund \
    && npm run build

# Restore the app root as the working directory for the remaining steps and
# for the container's default cwd at runtime.
WORKDIR /home/user/app

# ── Space-local files ────────────────────────────────────────────────────────
# All Space-specific code lives under space/ (Python package + sidecar +
# dashboard + nginx config + entrypoint). The demo's own code stays at the
# repo root (cloned earlier) so the two namespaces don't collide.
COPY --chown=user space/ /home/user/app/space/
RUN chmod +x /home/user/app/space/entrypoint.sh

EXPOSE 7860

CMD ["/home/user/app/space/entrypoint.sh"]
README.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Bonsai Image (1-bit + 1.58-bit) GPU
3
+ emoji: 🎨
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ app_port: 7860
8
+ suggested_hardware: l40sx1
9
+ pinned: true
10
+ short_description: Run 1-bit and 1.58-bit Bonsai-Image-4B on GPU
11
+ models:
12
+ - prism-ml/bonsai-image-ternary-4B-mlx-2bit
13
+ - prism-ml/bonsai-image-ternary-4B-gemlite-2bit
14
+ - prism-ml/bonsai-image-ternary-4B-unpacked
15
+ - prism-ml/bonsai-image-binary-4B-mlx-1bit
16
+ - prism-ml/bonsai-image-binary-4B-gemlite-1bit
17
+ - prism-ml/bonsai-image-binary-4B-unpacked
18
+ ---
19
+
20
+ # Bonsai Image Demo
21
+
22
+ - Ternary (1.58-bit)
23
+ - Binary (1-bit)
24
+
25
+ ## Privacy
26
+
27
+ - **We do not log prompts or generated images.** Generation runs in-process and outputs are streamed back over HTTPS.
28
+ - The studio UI keeps your prompt history **in your browser's local storage only**. Clearing your browser cache erases it.
29
+ - Please do not submit sensitive, private, or confidential content in your prompts.
30
+
31
+ ## Fair Use
32
+
33
+ Single-GPU demo, shared across all visitors. Heavy load may queue requests. Please avoid bursts of automated traffic so everyone can try it.
space/__init__.py ADDED
File without changes
space/app.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HF Space wrapper around scripts.local_backend.
2
+
3
+ Adds a metrics middleware that:
4
+ - tracks total / success / error counters (cumulative since first launch)
5
+ - per-shape latency histogram (rolling)
6
+ - rolling 1000-request log with hashed-IP for unique-user count
7
+ - per-day buckets (UTC date) for the daily archives the metrics_pusher
8
+ sidecar writes under $BONSAI_STATE_DIR/daily/YYYY-MM-DD.json
9
+
10
+ State loaded at boot from $BONSAI_STATE_DIR/state.json so counters survive
11
+ Space restarts (assuming a persistent storage bucket is mounted; entrypoint
12
+ falls back to ephemeral disk otherwise).
13
+
14
+ Run with: uvicorn space.app:app
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import hashlib
20
+ import json
21
+ import os
22
+ import time
23
+ from collections import defaultdict, deque
24
+ from datetime import datetime, timezone
25
+ from threading import Lock
26
+
27
+ from fastapi import Request
28
+
29
+ # Re-export the real backend's app object so /generate, /backends, /docs
30
+ # are served untouched.
31
+ from scripts.local_backend import app # noqa: F401 (re-exported)
32
+
33
+ # ── in-memory state ──────────────────────────────────────────────────────────
34
+ _lock = Lock()
35
+ _started_at = time.monotonic()
36
+ _total = {"requests": 0, "success": 0, "errors": 0}
37
+ _by_shape: dict[str, dict] = defaultdict(
38
+ lambda: {"count": 0, "duration_ms_total": 0, "durations": deque(maxlen=200)}
39
+ )
40
+ # Cumulative by-variant counter. The `variant` key is "ternary", "binary",
41
+ # or "unknown" (parsed from the request's `backend` field β€” see middleware).
42
+ # Mirrors by_shape's shape so the dashboard can show "ternary: X Β· binary: Y"
43
+ # across all time without re-summing the by_day history.
44
+ _by_variant: dict[str, dict] = defaultdict(
45
+ lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}
46
+ )
47
+ _recent: deque = deque(maxlen=1000)
48
+
49
+ # Per-day buckets keyed by UTC YYYY-MM-DD. Last 30 days kept in memory;
50
+ # older days remain on disk (metrics_pusher writes one file per day under
51
+ # $BONSAI_STATE_DIR/daily/).
52
+ _MAX_DAYS_IN_MEMORY = 30
53
+ _by_day: dict[str, dict] = {}
54
+
55
+ # UTC bucketing. (We tried Pacific Time, but `zoneinfo.ZoneInfo` needs
56
+ # /usr/share/zoneinfo/ which our CUDA Ubuntu base image strips with
57
+ # --no-install-recommends. To re-enable PT, install `tzdata` in the
58
+ # Dockerfile and swap these back to ZoneInfo("America/Los_Angeles").)
59
+
60
+
61
+ def _today() -> str:
62
+ return datetime.now(timezone.utc).strftime("%Y-%m-%d")
63
+
64
+
65
+ def _now_hour() -> int:
66
+ return datetime.now(timezone.utc).hour
67
+
68
+
69
+ def _empty_day() -> dict:
70
+ return {
71
+ "requests": 0,
72
+ "success": 0,
73
+ "errors": 0,
74
+ # queue_ms_total at three levels: day-total + per_shape + per_gpu.
75
+ # Day-total powers the dashboard's "today avg queue" tile; the
76
+ # per-shape and per-gpu views surface where queueing pressure is
77
+ # actually landing (e.g. is the slow GPU starving on small shapes?).
78
+ "by_shape": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
79
+ "by_hour": [0] * 24,
80
+ "unique_ips": set(),
81
+ # Per-GPU attribution for this day. Persisted to state.json +
82
+ # written into daily/YYYY-MM-DD.json so historical days retain
83
+ # their original GPU split even after a tier swap.
84
+ "by_gpu": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
85
+ # Per-variant attribution (ternary/binary/unknown). Tells you which
86
+ # arm took the traffic on this day independent of which GPU served
87
+ # it β€” useful for "did users actually click binary today, or are
88
+ # they all defaulting to ternary?" analysis.
89
+ "by_variant": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
90
+ "queue_ms_total": 0,
91
+ }
92
+
93
+
94
def _bump_day(date: str, ok: bool, shape: str, dt_ms: int, queue_ms: int, hour: int, ip_hash: str, variant: str) -> None:
    """Record one finished request in the bucket for ``date``.

    Caller must hold _lock. Creates the day's bucket on first use, updates its
    totals and its per-shape / per-hour / per-GPU / per-variant breakdowns,
    and finally drops the oldest days once more than ``_MAX_DAYS_IN_MEMORY``
    are held in memory.
    """
    try:
        day = _by_day[date]
    except KeyError:
        day = _by_day[date] = _empty_day()

    def _accumulate(slot: dict) -> None:
        # Every breakdown entry carries the same three counters.
        slot["count"] += 1
        slot["duration_ms_total"] += dt_ms
        slot["queue_ms_total"] += queue_ms

    day["requests"] += 1
    day["success" if ok else "errors"] += 1
    _accumulate(day["by_shape"][shape])
    day["by_hour"][hour] += 1
    day["unique_ips"].add(ip_hash)
    _accumulate(day["by_gpu"][_GPU_NAME])
    _accumulate(day["by_variant"][variant])
    day["queue_ms_total"] += queue_ms

    # Date keys are YYYY-MM-DD, so lexical order is chronological order.
    overflow = len(_by_day) - _MAX_DAYS_IN_MEMORY
    if overflow > 0:
        for stale in sorted(_by_day)[:overflow]:
            del _by_day[stale]
119
+
120
+
121
+ # ── persisted state ──────────────────────────────────────────────────────────
122
+ # $BONSAI_STATE_DIR is set by entrypoint.sh β€” /data/state if a persistent
123
+ # storage bucket is mounted, else $APP_DIR/outputs/.state (ephemeral).
124
+ _STATE_DIR = os.environ.get("BONSAI_STATE_DIR", "/tmp")
125
+ _STATE_PATH = os.path.join(_STATE_DIR, "state.json")
126
+ # entrypoint.sh sets this to "1" when /data is mounted + writable, else "0".
127
+ # Surfaced to the dashboard so it can show a "counters won't persist" warning.
128
+ _PERSISTENT_STORAGE = os.environ.get("BONSAI_PERSISTENT_STORAGE", "0") == "1"
129
+
130
+
131
def _load_state() -> dict:
    """Load persisted metrics state from ``_STATE_PATH``.

    Returns:
        A dict with the keys ``pepper`` (bytes), ``totals``, ``by_shape``,
        ``by_variant``, ``recent`` and ``by_day``. When the file is missing,
        unreadable, or not valid JSON text, fresh defaults are returned (with
        a newly generated random pepper). When the file parses as JSON but a
        field is malformed, the fields converted before the failure are kept
        and the remaining ones stay at their defaults.

    Never raises for a bad state file: the module calls this at import time,
    so a corrupt file must not prevent the app from booting.
    """

    def _bucket(raw: dict) -> dict:
        # Shared layout of by_variant / per-day breakdown entries. Missing
        # keys default to 0 so state files written by older builds still load.
        return {
            "count": int(raw.get("count", 0)),
            "duration_ms_total": int(raw.get("duration_ms_total", 0)),
            "queue_ms_total": int(raw.get("queue_ms_total", 0)),
        }

    fresh = {
        "pepper": os.urandom(16).hex().encode(),
        "totals": {"requests": 0, "success": 0, "errors": 0},
        "by_shape": {},
        "by_variant": {},  # parallel to by_shape; may be missing in old state files
        "recent": [],
        "by_day": {},
    }
    try:
        with open(_STATE_PATH) as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # OSError covers a missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (a file containing
        # invalid bytes, e.g. after a torn write), which would otherwise
        # escape and crash the import.
        print(f"[space.app] no prior state ({type(exc).__name__}: {exc}); starting fresh", flush=True)
        return fresh
    try:
        fresh["pepper"] = (data.get("ip_pepper") or fresh["pepper"].decode()).encode()
        fresh["totals"] = {
            "requests": int(data.get("total_requests", 0)),
            "success": int(data.get("success", 0)),
            "errors": int(data.get("errors", 0)),
        }
        fresh["by_shape"] = {
            shape: {
                "count": int(b.get("count", 0)),
                "duration_ms_total": int(b.get("duration_ms_total", 0)),
                "durations": deque(maxlen=200),  # p50/p95 starts fresh after a boot
            }
            for shape, b in (data.get("by_shape", {}) or {}).items()
        }
        # by_variant: parallel to by_shape, but without a `durations` window.
        fresh["by_variant"] = {
            variant: _bucket(b)
            for variant, b in (data.get("by_variant", {}) or {}).items()
        }
        fresh["recent"] = data.get("recent", []) or []
        # Per-day buckets. Built in a local dict and assigned only once every
        # day has been converted successfully.
        by_day_loaded: dict[str, dict] = {}
        for date, d in (data.get("by_day", {}) or {}).items():
            bd = _empty_day()
            bd["requests"] = int(d.get("requests", 0))
            bd["success"] = int(d.get("success", 0))
            bd["errors"] = int(d.get("errors", 0))
            bd["queue_ms_total"] = int(d.get("queue_ms_total", 0))
            for shape, s in (d.get("by_shape", {}) or {}).items():
                bd["by_shape"][shape] = _bucket(s)
            bh = d.get("by_hour") or [0] * 24
            # Pad short histograms out to 24 hourly slots.
            bd["by_hour"] = list(bh) + [0] * max(0, 24 - len(bh))
            bd["unique_ips"] = set(d.get("unique_ips", []) or [])
            for gpu_name, g in (d.get("by_gpu", {}) or {}).items():
                bd["by_gpu"][gpu_name] = _bucket(g)
            for variant_name, v in (d.get("by_variant", {}) or {}).items():
                bd["by_variant"][variant_name] = _bucket(v)
            by_day_loaded[date] = bd
        fresh["by_day"] = by_day_loaded
    except Exception as exc:
        # Deliberate best-effort: keep whatever was converted before the
        # malformed field instead of discarding the whole state file.
        print(f"[space.app] state file partially malformed ({exc}); using what we could parse", flush=True)
    return fresh
212
+
213
+
214
+ # ── replica gating for multi-GPU deploys ─────────────────────────────────────
215
# Each uvicorn process (one per GPU) sets BONSAI_REPLICA_INDEX via entrypoint.
# Only replica 0 seeds its in-memory counters from state.json — other
# replicas start at zero. metrics_pusher polls every replica and sums them,
# so this avoids N-way inflation of cumulative counts. Pepper comes from
# the env (set by entrypoint), shared across all replicas so unique-user
# hashing is consistent.
# An unset or blank BONSAI_REPLICA_INDEX is treated as replica 0.
_REPLICA_INDEX = int(os.environ.get("BONSAI_REPLICA_INDEX", "").strip() or "0")
# Name of the GPU this replica is pinned to (entrypoint sets it from
# `nvidia-smi --query-gpu=name`). Exposed in /metrics so the pusher can
# aggregate per-GPU averages on the dashboard.
# Defaults to "NVIDIA L40S" if entrypoint didn't supply a name (or supplied a
# blank one) — that's the tier we ran on for most of the demo's history, so
# unattributed counters get folded into the L40S bucket rather than a
# misleading "unknown".
_GPU_NAME = os.environ.get("BONSAI_GPU_NAME", "").strip() or "NVIDIA L40S"
_loaded = _load_state()
if _REPLICA_INDEX == 0:
    _total.update(_loaded["totals"])
    for _s, _b in _loaded["by_shape"].items():
        _by_shape[_s] = _b
    for _v, _b in _loaded["by_variant"].items():
        _by_variant[_v] = _b
    # _recent is capped at 1000 entries; only the newest ones are restored.
    for _r in _loaded["recent"][-1000:]:
        _recent.append(_r)
    _by_day.update(_loaded["by_day"])
    print(
        f"[space.app] replica 0: seeded counters from {_STATE_PATH} "
        f"(requests={_total['requests']} days={len(_by_day)} "
        f"persistent_storage={_PERSISTENT_STORAGE})",
        flush=True,
    )
else:
    print(
        f"[space.app] replica {_REPLICA_INDEX}: starting counters at 0 "
        f"(replica 0 owns cumulative state)",
        flush=True,
    )

# Pepper: prefer env (entrypoint exports a single value for all replicas).
# Fall back to whatever _load_state surfaced (typically random on first
# launch) — fine for single-replica or testing. An empty env value is treated
# as unset, so identities are never hashed with an empty pepper.
_IP_PEPPER = (os.environ.get("BONSAI_IP_PEPPER", "") or _loaded["pepper"].decode()).encode()
257
+
258
+
259
def _hash_ip(ip: str) -> str:
    """Return a short peppered fingerprint of ``ip``.

    The value is the first 12 hex characters of SHA-256 over the module
    pepper followed by the encoded identity string.
    """
    digest = hashlib.sha256()
    digest.update(_IP_PEPPER)
    digest.update(ip.encode())
    return digest.hexdigest()[:12]
261
+
262
+
263
+ # Concurrency cap per replica. Image-gen is compute-bound; two concurrent
264
+ # requests at one GPU just contend for the same SMs and serialize at the
265
+ # kernel-launch level, wasting time. With Semaphore(1), additional requests
266
+ # queue at the asyncio level, and nginx's least_conn sees them as "this
267
+ # replica is busy" β†’ routes to a free GPU when one's available.
268
+ _GENERATE_CONCURRENCY = int(os.environ.get("BONSAI_GENERATE_CONCURRENCY", "1"))
269
+ _generate_sem = asyncio.Semaphore(_GENERATE_CONCURRENCY)
270
+
271
+ # In-flight gauge. Incremented when a /generate request enters the middleware
272
+ # (before semaphore acquire β€” so queued requests count), decremented in
273
+ # finally. metrics_pusher sums across replicas and derives queue depth as
274
+ # max(0, total_inflight - total_concurrency).
275
+ _inflight = 0
276
+ _inflight_lock = Lock()
277
+
278
+
279
+ # ── middleware ───────────────────────────────────────────────────────────────
280
def _classify_generate_body(body: bytes) -> tuple[str, str]:
    """Derive the ``(shape, variant)`` metric labels from a raw /generate body.

    ``shape`` is ``"<width>x<height>"`` when both are non-zero integers, else
    ``"unknown"``. ``variant`` is ``"ternary"`` or ``"binary"`` when the
    request's ``backend`` string contains that word, ``"unknown"`` for any
    other non-empty backend, and ``"ternary"`` when no backend is given
    (mirroring the default arm described for entrypoint.sh). Never raises:
    an unparseable body simply yields the defaults.
    """
    shape, variant = "unknown", "ternary"
    try:
        payload = json.loads(body or b"{}")
    except Exception:
        return shape, variant
    if not isinstance(payload, dict):
        return shape, variant

    # Shape and variant are parsed independently so a bad width/height does
    # not also discard a perfectly valid `backend` field.
    try:
        w, h = int(payload.get("width", 0)), int(payload.get("height", 0))
        if w and h:
            shape = f"{w}x{h}"
    except (TypeError, ValueError):
        pass

    backend = payload.get("backend")
    backend = backend.lower() if isinstance(backend, str) else ""
    if "ternary" in backend:
        variant = "ternary"
    elif "binary" in backend:
        variant = "binary"
    elif backend:
        variant = "unknown"
    # else: backend missing → keep the default "ternary"
    return shape, variant


@app.middleware("http")
async def _track_generate(request: Request, call_next):
    """Meter and serialize ``POST /generate``; pass everything else through.

    For each /generate request this middleware:
      * labels it with shape / variant / hashed identity,
      * counts it in the in-flight gauge for the whole time it is queued or
        running,
      * waits on the per-replica semaphore, measuring the wait as queue time,
      * records the outcome in the cumulative totals, per-shape and
        per-variant counters, the recent-request log and the per-day bucket.

    An exception raised by the downstream handler is recorded as an error
    and re-raised unchanged.
    """
    global _inflight

    if request.url.path != "/generate" or request.method != "POST":
        return await call_next(request)

    # Read + replay the body so the downstream handler still sees it.
    body = await request.body()

    async def _receive() -> dict:
        return {"type": "http.request", "body": body, "more_body": False}

    request._receive = _receive  # type: ignore[attr-defined]

    shape, variant = _classify_generate_body(body)

    # Identity for unique-user counting. Preference order:
    #   1. X-IP-Token — set by HF when the visitor is logged into
    #      huggingface.co and viewing the Space via the embed.
    #   2. X-Forwarded-For — first address in the list, as forwarded by the
    #      proxy in front of this app.
    #   3. request.client.host — direct-connection fallback.
    # The "hf:" / "ip:" prefix keeps the two namespaces from colliding.
    hf_token = request.headers.get("x-ip-token")
    if hf_token:
        identity = f"hf:{hf_token}"
    else:
        forwarded = request.headers.get("x-forwarded-for")
        if forwarded:
            ip = forwarded.split(",")[0].strip()
        else:
            ip = request.client.host if request.client else "0.0.0.0"
        identity = f"ip:{ip}"
    ip_hash = _hash_ip(identity)

    # Day/hour attribution is fixed at arrival time, not completion time.
    date = _today()
    hour = _now_hour()

    def _record(ok: bool, dt_ms: int, queue_ms: int) -> None:
        # Single recording path for every outcome, so the cumulative
        # per-shape counters always agree with the totals, the per-variant
        # counters and the per-day buckets.
        with _lock:
            _total["requests"] += 1
            _total["success" if ok else "errors"] += 1
            bucket = _by_shape[shape]
            bucket["count"] += 1
            bucket["duration_ms_total"] += dt_ms
            bucket["durations"].append(dt_ms)
            per_variant = _by_variant[variant]
            per_variant["count"] += 1
            per_variant["duration_ms_total"] += dt_ms
            per_variant["queue_ms_total"] += queue_ms
            _recent.append({"ts": int(time.time()), "shape": shape, "duration_ms": dt_ms, "queue_ms": queue_ms, "ip_hash": ip_hash, "gpu": _GPU_NAME, "variant": variant, "ok": ok})
            _bump_day(date, ok, shape, dt_ms, queue_ms, hour, ip_hash, variant)

    # Increment the in-flight gauge BEFORE the semaphore so queued requests
    # are visible to the dashboard. Decrement in finally so the gauge stays
    # accurate even on exceptions.
    t_enqueue = time.monotonic()
    with _inflight_lock:
        _inflight += 1
    try:
        # Queue at the semaphore so only N requests per replica run at once.
        # The HTTP connection stays open while we wait.
        async with _generate_sem:
            t_start = time.monotonic()
            queue_ms = int((t_start - t_enqueue) * 1000)
            try:
                response = await call_next(request)
            except Exception:
                _record(False, int((time.monotonic() - t_start) * 1000), queue_ms)
                raise

            _record(response.status_code < 400, int((time.monotonic() - t_start) * 1000), queue_ms)
            return response
    finally:
        with _inflight_lock:
            _inflight -= 1
387
+
388
+
389
+ # ── /metrics endpoint (loopback-only via nginx) ──────────────────────────────
390
+ def _percentile(xs: list[int], p: int) -> int | None:
391
+ if not xs:
392
+ return None
393
+ s = sorted(xs)
394
+ idx = min(int(len(s) * p / 100), len(s) - 1)
395
+ return s[idx]
396
+
397
+
398
@app.get("/metrics")
def get_metrics() -> dict:
    """Scraped by metrics_pusher every few seconds. Returns the full in-memory
    state so the sidecar can rebuild analytics.json + write daily archives.

    Every shared counter structure is copied while holding ``_lock``, so the
    response is a consistent snapshot and no container is iterated while a
    request is being recorded concurrently. The in-flight gauge is read under
    its own lock.
    """

    def _bucket_view(b: dict) -> dict:
        # Plain-dict copy of a count/duration/queue bucket. queue_ms_total may
        # be absent on buckets restored from older state, hence the default.
        return {
            "count": b["count"],
            "duration_ms_total": b["duration_ms_total"],
            "queue_ms_total": b.get("queue_ms_total", 0),
        }

    with _lock:
        by_shape = {}
        for shape, b in _by_shape.items():
            durs = list(b["durations"])
            by_shape[shape] = {
                "count": b["count"],
                "duration_ms_total": b["duration_ms_total"],
                "duration_ms_p50": _percentile(durs, 50),
                "duration_ms_p95": _percentile(durs, 95),
            }

        by_day_out = {}
        for date, d in _by_day.items():
            by_day_out[date] = {
                "requests": d["requests"],
                "success": d["success"],
                "errors": d["errors"],
                # queue_ms_total is exposed at day, per-shape, per-gpu and
                # per-variant level so the pusher can slice average queue
                # time without re-summing recent[].
                "queue_ms_total": d.get("queue_ms_total", 0),
                "by_shape": {s: _bucket_view(b) for s, b in d["by_shape"].items()},
                "by_hour": list(d["by_hour"]),
                "unique_users": len(d["unique_ips"]),
                "unique_ips": list(d["unique_ips"]),  # for round-trip persistence
                "by_gpu": {g: _bucket_view(v) for g, v in d["by_gpu"].items()},
                "by_variant": {v: _bucket_view(b) for v, b in d.get("by_variant", {}).items()},
            }

        by_variant = {v: _bucket_view(b) for v, b in _by_variant.items()}
        totals = dict(_total)
        # Replica's own cumulative duration sum (sum across all shapes).
        # Used by the pusher to compute per-GPU avg latency without
        # rebuilding it from `recent` (which would lose history).
        total_duration_ms = sum(b["duration_ms_total"] for b in _by_shape.values())
        recent = list(_recent)

    with _inflight_lock:
        inflight = _inflight

    return {
        "uptime_s": int(time.monotonic() - _started_at),
        "replica_index": _REPLICA_INDEX,
        "gpu_name": _GPU_NAME,
        "inflight": inflight,
        "generate_concurrency": _GENERATE_CONCURRENCY,
        "total_requests": totals["requests"],
        "success": totals["success"],
        "errors": totals["errors"],
        "total_duration_ms": total_duration_ms,
        "by_shape": by_shape,
        "by_variant": by_variant,
        "by_day": by_day_out,
        "recent": recent,
        "ip_pepper": _IP_PEPPER.decode(),
        "persistent_storage": _PERSISTENT_STORAGE,
        "state_dir": _STATE_DIR,
    }
space/dashboard.html ADDED
@@ -0,0 +1,594 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1">
6
+ <title>Bonsai-Image Dashboard</title>
7
+ <script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.4/dist/chart.umd.min.js"></script>
8
+ <style>
9
+ :root {
10
+ --bg: #0e1116;
11
+ --panel: #161b22;
12
+ --panel-border: #1f2630;
13
+ --text: #d7dde6;
14
+ --muted: #7d8694;
15
+ --accent: #4cb583;
16
+ --warn: #d97757;
17
+ --grid: #21272f;
18
+ }
19
+ * { box-sizing: border-box; }
20
+ body {
21
+ margin: 0; padding: 24px;
22
+ font: 14px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
23
+ background: var(--bg); color: var(--text);
24
+ }
25
+ header { display: flex; align-items: baseline; justify-content: space-between; margin-bottom: 20px; }
26
+ h1 { margin: 0; font-size: 18px; font-weight: 600; }
27
+ .subtitle { color: var(--muted); font-size: 12px; }
28
+ .grid { display: grid; gap: 16px; grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); margin-bottom: 16px; }
29
+ .card { background: var(--panel); border: 1px solid var(--panel-border); border-radius: 8px; padding: 16px; }
30
+ .card h2 { margin: 0 0 12px 0; font-size: 12px; font-weight: 600; color: var(--muted); text-transform: uppercase; letter-spacing: 0.05em; }
31
+ .metric { font-size: 28px; font-weight: 600; line-height: 1; }
32
+ .metric-sub { color: var(--muted); font-size: 12px; margin-top: 6px; }
33
+ .row { display: grid; gap: 16px; grid-template-columns: 2fr 1fr; margin-bottom: 16px; }
34
+ .row.single { grid-template-columns: 1fr; }
35
+ .row.equal { grid-template-columns: 1fr 1fr; }
36
+ .row.three { grid-template-columns: 1fr 1fr 1fr; }
37
+ @media (max-width: 1100px) { .row.three { grid-template-columns: 1fr 1fr; } }
38
+ @media (max-width: 900px) { .row, .row.equal, .row.three { grid-template-columns: 1fr; } }
39
+ /* Replica pills: one chip per active uvicorn worker, color-coded by GPU
40
+ tier so a glance at the Replicas tile shows mixed vs homogeneous fleets. */
41
+ .replicas { display: flex; flex-wrap: wrap; gap: 6px; margin-top: 8px; }
42
+ .replica-pill { display: inline-flex; align-items: center; gap: 4px; font-size: 11px; padding: 2px 8px; border-radius: 999px; background: var(--grid); }
43
+ .replica-pill .dot { width: 6px; height: 6px; border-radius: 50%; background: var(--accent); }
44
+ .status-warn { color: var(--warn); }
45
+ canvas { max-width: 100%; }
46
+ table { width: 100%; border-collapse: collapse; font-size: 12px; }
47
+ th, td { padding: 6px 8px; text-align: left; border-bottom: 1px solid var(--grid); }
48
+ th { color: var(--muted); font-weight: 500; }
49
+ th.num, td.num { text-align: right; font-variant-numeric: tabular-nums; }
50
+ .status-ok { color: var(--accent); }
51
+ .status-err { color: var(--warn); }
52
+ footer { color: var(--muted); font-size: 11px; margin-top: 20px; text-align: right; }
53
+ .gpu-bar { background: var(--grid); border-radius: 4px; height: 6px; overflow: hidden; margin-top: 4px; }
54
+ .gpu-bar-fill { background: var(--accent); height: 100%; transition: width 0.3s; }
55
+ .banner { display: none; padding: 12px 16px; border-radius: 8px; margin-bottom: 16px; font-size: 13px; line-height: 1.5; border: 1px solid; }
56
+ .banner.warn { background: rgba(217, 119, 87, 0.08); border-color: rgba(217, 119, 87, 0.4); color: #e8a280; }
57
+ .banner.error { background: rgba(217, 119, 87, 0.15); border-color: rgba(217, 119, 87, 0.5); color: #f0a890; }
58
+ </style>
59
+ </head>
60
+ <body>
61
+
62
+ <header>
63
+ <div>
64
+ <h1>🌿 Bonsai-Image Dashboard</h1>
65
+ <div class="subtitle" id="updated">loading...</div>
66
+ </div>
67
+ <div class="subtitle" id="refresh-label">auto-refresh every 2s</div>
68
+ </header>
69
+
70
+ <div id="storage-banner" class="banner warn"></div>
71
+ <div id="stale-banner" class="banner warn"></div>
72
+ <div id="error-banner" class="banner error"></div>
73
+
74
+ <div class="grid">
75
+ <div class="card"><h2>Total images</h2><div class="metric" id="total-requests">β€”</div><div class="metric-sub" id="total-sub">β€” ok / β€” errors</div></div>
76
+ <div class="card"><h2>Today (UTC)</h2><div class="metric" id="req-today">β€”</div><div class="metric-sub" id="users-today">β€” unique users</div></div>
77
+ <div class="card"><h2>Last 7 days</h2><div class="metric" id="req-7d">β€”</div><div class="metric-sub" id="users-7d">β€” unique users</div></div>
78
+ <div class="card"><h2>Last 30 days</h2><div class="metric" id="req-30d">β€”</div><div class="metric-sub" id="users-30d">β€” unique users</div></div>
79
+ <div class="card"><h2>Pending</h2><div class="metric" id="pending">β€”</div><div class="metric-sub" id="pending-sub">β€” running / β€” capacity</div></div>
80
+ <div class="card">
81
+ <h2>Replicas</h2>
82
+ <div class="metric" id="replicas-metric">β€”</div>
83
+ <div class="metric-sub" id="replicas-sub">β€”</div>
84
+ <div class="replicas" id="replicas-pills"></div>
85
+ </div>
86
+ <div class="card">
87
+ <h2>By Variant</h2>
88
+ <div class="metric" id="variant-metric">β€”</div>
89
+ <div class="metric-sub" id="variant-sub">all-time Β· β€” today</div>
90
+ </div>
91
+ <div class="card"><h2>Uptime</h2><div class="metric" id="uptime">β€”</div><div class="metric-sub">since last restart</div></div>
92
+ </div>
93
+
94
+ <!-- Row: both charts side-by-side. Daily covers 30d, hourly covers today. -->
95
+ <div class="row equal">
96
+ <div class="card">
97
+ <h2>Requests per day (last 30d)</h2>
98
+ <canvas id="daily-chart" height="80"></canvas>
99
+ </div>
100
+ <div class="card">
101
+ <h2>Today's requests by hour (UTC)</h2>
102
+ <canvas id="hourly-chart" height="80"></canvas>
103
+ </div>
104
+ </div>
105
+
106
+ <!-- Row: image-time stats. Three views of latency: rolling 50, all-time
107
+ per-resolution, today per-resolution. Same column shape so eye can
108
+ scan left→right and spot drift. -->
109
+ <div class="row three">
110
+ <div class="card">
111
+ <h2>Average latency (last 50 requests)</h2>
112
+ <div style="display: flex; align-items: baseline; gap: 16px; margin-bottom: 12px;">
113
+ <div class="metric" id="avg-latency">β€”</div>
114
+ <div class="metric-sub" id="avg-latency-sub">across last β€” requests</div>
115
+ </div>
116
+ <table>
117
+ <thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg latency</th><th class="num">Avg queue</th></tr></thead>
118
+ <tbody id="latency-tbody"></tbody>
119
+ </table>
120
+ </div>
121
+ <div class="card">
122
+ <h2>By resolution (all-time)</h2>
123
+ <table>
124
+ <thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg time</th></tr></thead>
125
+ <tbody id="shape-tbody"></tbody>
126
+ </table>
127
+ </div>
128
+ <div class="card">
129
+ <h2>By resolution (today)</h2>
130
+ <table>
131
+ <thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg time</th><th class="num">Avg queue</th></tr></thead>
132
+ <tbody id="shape-today-tbody"></tbody>
133
+ </table>
134
+ </div>
135
+ </div>
136
+
137
+ <!-- Row: GPU stats. Live nvidia-smi snapshot, today's per-GPU breakdown,
138
+ all-time per-GPU breakdown. Lets you spot tier mix today vs total. -->
139
+ <div class="row three">
140
+ <div class="card">
141
+ <h2>GPUs (live)</h2>
142
+ <div id="gpus"></div>
143
+ </div>
144
+ <div class="card">
145
+ <h2>By GPU (today)</h2>
146
+ <table>
147
+ <thead><tr><th>GPU</th><th class="num">Count</th><th class="num">Avg latency</th><th class="num">Avg queue</th></tr></thead>
148
+ <tbody id="gpu-today-tbody"></tbody>
149
+ </table>
150
+ </div>
151
+ <div class="card">
152
+ <h2>By GPU (all-time)</h2>
153
+ <table>
154
+ <thead><tr><th>GPU</th><th class="num">Reps</th><th class="num">Count</th><th class="num">Errors</th><th class="num">Avg latency</th></tr></thead>
155
+ <tbody id="gpu-tbody"></tbody>
156
+ </table>
157
+ </div>
158
+ </div>
159
+
160
+ <div class="row single">
161
+ <div class="card">
162
+ <h2>Recent requests (last 50)</h2>
163
+ <table>
164
+ <thead><tr><th>Time</th><th>Shape</th><th>GPU</th><th class="num">Queued</th><th class="num">Duration</th><th>User</th><th>Status</th></tr></thead>
165
+ <tbody id="recent-tbody"></tbody>
166
+ </table>
167
+ </div>
168
+ </div>
169
+
170
+ <footer id="footer">β€”</footer>
171
+
172
+ <script>
173
+ // Absolute paths — the dashboard URL has no trailing slash, so relative
174
+ // `analytics.json` would resolve to `/analytics.json` (wrong) rather than
175
+ // `/dash-…/analytics.json`. nginx has explicit location blocks for these.
176
+ const ANALYTICS_URL = "/dash-10a08e9c1ee4/analytics.json";
177
+ const GPU_URL = "/dash-10a08e9c1ee4/gpu-stats.json";
178
+
179
// Render a duration given in seconds as a coarse two-unit label:
// "<d>d <h>h" when at least a day, "<h>h <m>m" when at least an hour,
// otherwise "<m>m". Falsy input (0, null, undefined) yields the placeholder.
function fmtDuration(s) {
  if (!s) return "β€”";
  const SEC_PER_MIN = 60;
  const SEC_PER_HOUR = 3600;
  const SEC_PER_DAY = 86400;
  const d = Math.floor(s / SEC_PER_DAY);
  const h = Math.floor((s % SEC_PER_DAY) / SEC_PER_HOUR);
  const m = Math.floor((s % SEC_PER_HOUR) / SEC_PER_MIN);
  if (d) return `${d}d ${h}h`;
  return h ? `${h}h ${m}m` : `${m}m`;
}
188
// Format a Unix timestamp (seconds) in the browser's locale; falsy input
// yields the placeholder.
function fmtTime(ts) {
  return ts ? new Date(ts * 1000).toLocaleString() : "β€”";
}
192
// Format a Unix timestamp (seconds) as a coarse "N<unit> ago" label relative
// to the browser clock.
//
// Missing or non-numeric timestamps return the placeholder rather than
// "NaNd ago" (or, for null, an age measured from the epoch). Timestamps
// slightly in the future, e.g. from clock skew between browser and server,
// clamp to "0s ago" instead of showing a negative age.
function fmtRelative(ts) {
  const t = Number(ts);
  if (ts == null || !Number.isFinite(t)) return "β€”";
  const dt = Math.max(0, Date.now() / 1000 - t);
  if (dt < 60) return `${Math.floor(dt)}s ago`;
  if (dt < 3600) return `${Math.floor(dt / 60)}m ago`;
  if (dt < 86400) return `${Math.floor(dt / 3600)}h ago`;
  return `${Math.floor(dt / 86400)}d ago`;
}
199
+
200
+ let hourlyChart, dailyChart;
201
// Configure Chart.js global defaults and create the two line charts, storing
// them in the module-level `hourlyChart` / `dailyChart` variables. Both charts
// start empty; the render functions fill in labels and data on each refresh.
function initCharts() {
  Chart.defaults.color = "#7d8694";
  Chart.defaults.borderColor = "#21272f";
  Chart.defaults.font.family = "-apple-system, BlinkMacSystemFont, Segoe UI, Roboto, sans-serif";

  // Both charts share one single-series style; `datasetExtras` carries the
  // per-chart dataset options that differ.
  const makeLineChart = (canvasId, datasetExtras) =>
    new Chart(document.getElementById(canvasId), {
      type: "line",
      data: {
        labels: [],
        datasets: [{
          label: "requests",
          data: [],
          borderColor: "#4cb583",
          backgroundColor: "rgba(76, 181, 131, 0.12)",
          fill: true,
          tension: 0.25,
          pointRadius: 3,
          pointBackgroundColor: "#4cb583",
          pointHoverRadius: 6,
          ...datasetExtras,
        }],
      },
      options: {
        plugins: { legend: { display: false } },
        scales: { y: { beginAtZero: true, ticks: { precision: 0 } } },
      },
    });

  hourlyChart = makeLineChart("hourly-chart", {});
  // The daily chart plots request volume only. The per-GPU breakdown is not
  // drawn here; the By GPU tables cover it. It additionally sets spanGaps.
  dailyChart = makeLineChart("daily-chart", { spanGaps: true });
}
253
+
254
// Show the storage warning only when the analytics payload explicitly reports
// `persistent_storage === false`; hide it for a missing payload or any other
// value.
function renderStorageBanner(a) {
  const el = document.getElementById("storage-banner");
  const storageMissing = Boolean(a) && a.persistent_storage === false;
  if (!storageMissing) {
    el.style.display = "none";
    return;
  }
  el.style.display = "block";
  el.textContent = "⚠ Persistent storage bucket not mounted at /data β€” counters, model weights, and kernel caches reset on every Space restart. Enable a Storage Bucket in Space Settings β†’ Storage.";
}
264
+
265
// Show `msg` in the error banner, or hide the banner when `msg` is falsy.
function renderErrorBanner(msg) {
  const el = document.getElementById("error-banner");
  if (msg) {
    el.style.display = "block";
    el.textContent = msg;
  } else {
    el.style.display = "none";
  }
}
271
+
272
// Warn when the analytics snapshot is older than 10 seconds, measured as the
// gap between the browser clock and `a.updated_at` (Unix seconds). The banner
// is hidden when the payload or its timestamp is missing, so that stale
// numbers are only flagged when their age is actually known.
function renderStaleBanner(a) {
  const STALE_AFTER_SEC = 10;
  const el = document.getElementById("stale-banner");
  const hasStamp = Boolean(a) && Boolean(a.updated_at);
  const ageSec = hasStamp ? Math.floor(Date.now() / 1000 - a.updated_at) : null;
  if (ageSec === null || !(ageSec > STALE_AFTER_SEC)) {
    el.style.display = "none";
    return;
  }
  el.style.display = "block";
  el.textContent = `⚠ Metrics are ${ageSec}s stale β€” the backend likely couldn't answer the last few /metrics polls (often because it's busy with /generate). Numbers shown are the last good scrape.`;
}
288
+
289
// Fetch both JSON snapshots (analytics + GPU stats) with caching disabled and
// repaint every tile. Any failure -- network error, non-2xx status, bad JSON,
// or an exception thrown by a renderer -- is reported in the error banner and
// the "updated" label.
async function refresh() {
  const setUpdated = (text) => { document.getElementById("updated").textContent = text; };
  try {
    const noStore = { cache: "no-store" };
    const [aResp, gResp] = await Promise.all([fetch(ANALYTICS_URL, noStore), fetch(GPU_URL, noStore)]);
    if (!aResp.ok || !gResp.ok) throw new Error(`http ${aResp.status}/${gResp.status}`);
    const a = await aResp.json();
    const g = await gResp.json();

    renderStorageBanner(a);
    renderErrorBanner(null);
    const beforeGpu = [
      renderStaleBanner, renderSummary, renderReplicas, renderVariant, renderHourly,
      renderShapeList, renderShapeToday, renderGpuToday, renderDaily, renderRecent,
    ];
    for (const render of beforeGpu) render(a);
    renderGPUs(g);
    for (const render of [renderLatency, renderByGPU]) render(a);

    setUpdated(`updated ${fmtRelative(a.updated_at)}`);
  } catch (e) {
    renderErrorBanner(`Could not load metrics: ${e.message}. Sidecar may be down, or the bucket isn't ready yet.`);
    setUpdated(`error: ${e.message}`);
  }
}
316
+
317
// Format a millisecond quantity for display: whole milliseconds below one
// second ("250 ms"), otherwise seconds with one decimal ("1.5 s").
// null / undefined / NaN render as the placeholder.
//
// Sub-second values are rounded to the nearest millisecond so fractional
// averages do not print as e.g. "12.3456 ms". The threshold is applied to the
// rounded value, so 999.6 renders as "1.0 s" rather than "1000 ms".
function fmtMs(ms) {
  if (ms == null || isNaN(ms)) return "β€”";
  const value = Number(ms);
  const whole = Math.round(value);
  if (whole < 1000) return `${whole} ms`;
  return `${(value / 1000).toFixed(1)} s`;
}
322
+
323
// Fill the headline tiles: all-time totals, the today / 7-day / 30-day
// windows, the pending-queue tile, and uptime. Missing summary objects fall
// back to zeroed defaults.
function renderSummary(a) {
  const setText = (id, text) => { document.getElementById(id).textContent = text; };

  const totals = a.summary_total || { requests: 0, success: 0, errors: 0 };
  setText("total-requests", totals.requests.toLocaleString());
  document.getElementById("total-sub").innerHTML = `<span class="status-ok">${totals.success.toLocaleString()} ok</span> Β· <span class="status-err">${totals.errors.toLocaleString()} errors</span>`;

  // The three time windows share one tile layout: req-<suffix> / users-<suffix>.
  const today = a.summary_today || { requests: 0, unique_users: 0 };
  const windows = [
    ["today", today],
    ["7d", a.summary_7d || { requests: 0, unique_users: 0 }],
    ["30d", a.summary_30d || { requests: 0, unique_users: 0 }],
  ];
  for (const [suffix, win] of windows) {
    setText(`req-${suffix}`, win.requests.toLocaleString());
    setText(`users-${suffix}`, `${win.unique_users.toLocaleString()} unique users`);
  }

  const queue = a.queue_depth ?? 0;
  const running = a.running ?? 0;
  const cap = a.capacity ?? 0;
  const todayQueueAvg = a.today_avg_queue_ms ?? 0;
  setText("pending", queue.toLocaleString());

  // Pending subtitle: live slot utilisation, plus today's average queue time
  // once at least one request has been recorded today.
  const queueAvgPart = today.requests ? ` Β· today queue avg ${fmtMs(todayQueueAvg)}` : "";
  const slotSuffix = cap === 1 ? "" : "s";
  setText("pending-sub", `${running} running / ${cap} GPU slot${slotSuffix}${queueAvgPart}`);
  setText("uptime", fmtDuration(a.uptime_s));
}
349
+
350
+ // Variant breakdown tile β€” shows the ternary vs binary mix at a glance.
351
+ // Big number: "T:1234 Β· B:567" (all-time). Subtitle: today's split. Variants
352
+ // keyed by name ("ternary" / "binary" / "unknown") from the request's
353
+ // `backend` field; metrics_pusher exposes them under by_variant + by_variant_today.
354
// Variant tile: the headline shows the all-time ternary/binary mix as
// "T:<n> . B:<n>" (plus "?:<n>" when unknown-variant requests exist); the
// subtitle shows today's mix, or a "no requests yet today" note.
function renderVariant(a) {
  const countFor = (data, key) => data?.[key]?.count || 0;
  const describeMix = (data) => {
    const ternary = countFor(data, "ternary");
    const binary = countFor(data, "binary");
    const unknown = countFor(data, "unknown");
    const labels = [`T:${ternary.toLocaleString()}`, `B:${binary.toLocaleString()}`];
    if (unknown) labels.push(`?:${unknown.toLocaleString()}`);
    return labels.join(" Β· ");
  };

  document.getElementById("variant-metric").textContent = describeMix(a.by_variant);

  const todayMix = describeMix(a.by_variant_today);
  let todayTotal = 0;
  for (const bucket of Object.values(a.by_variant_today || {})) {
    todayTotal = todayTotal + (bucket.count || 0);
  }
  const subtitle = todayTotal
    ? `all-time Β· today: ${todayMix}`
    : `all-time Β· no requests yet today`;
  document.getElementById("variant-sub").textContent = subtitle;
}
370
+
371
+ // Multi-GPU health card. Shows replicas_seen/expected up top and a row of
372
+ // pills below β€” one per active replica, dot color reflects healthy or
373
+ // errored. If the seen count is below expected, "X/Y (1 down)" + warn tint.
374
// Replicas tile: "seen / expected" headline (warn-tinted when fewer replicas
// answered than expected), a subtitle summarising the GPU tier mix, and one
// pill per replica showing short tier name plus inflight/capacity.
//
// Fixes relative to the previous version:
//  - A replica entry without `gpu_name` no longer throws in the tier summary
//    (it used to call `.replace` on undefined, aborting the whole refresh);
//    it is now counted under "?" like the pill label already did.
//  - Values interpolated into the pill markup and its `title` attribute are
//    HTML-escaped, so a quote or angle bracket in a name cannot break the
//    markup.
//  - Missing inflight/capacity render as 0 / "?" instead of "undefined".
function renderReplicas(a) {
  const shortTier = (name) => (name || "?").replace(/^(NVIDIA |Tesla )/, "");
  const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  const seen = a.replicas_seen ?? 0;
  const expected = a.replicas_expected ?? seen;
  const per = a.per_replica || [];

  const metricEl = document.getElementById("replicas-metric");
  metricEl.textContent = expected ? `${seen} / ${expected}` : String(seen);
  metricEl.className = "metric" + (seen < expected ? " status-warn" : "");

  const down = Math.max(0, expected - seen);
  const subParts = [];
  if (per.length) {
    // Count replicas per short tier name, preserving first-seen order.
    const tierCounts = new Map();
    for (const r of per) {
      const tier = shortTier(r.gpu_name);
      tierCounts.set(tier, (tierCounts.get(tier) || 0) + 1);
    }
    const tierStr = [...tierCounts.entries()]
      .map(([tier, count]) => (count > 1 ? `${tier} Γ— ${count}` : tier))
      .join(" + ");
    subParts.push(tierStr);
  } else {
    subParts.push("no replicas responding");
  }
  if (down) subParts.push(`${down} down`);
  document.getElementById("replicas-sub").textContent = subParts.join(" Β· ");

  // Per-replica pills. The dot is warn-coloured while the replica has work
  // in flight; the hover title carries the full GPU name, uptime and total.
  const pillsEl = document.getElementById("replicas-pills");
  pillsEl.innerHTML = per.map((r) => {
    const short = shortTier(r.gpu_name);
    const busy = r.inflight > 0;
    const dotColor = busy ? "var(--warn)" : "var(--accent)";
    const title = `${r.gpu_name || "unknown"} Β· uptime ${fmtDuration(r.uptime_s)} Β· total ${(r.total_requests ?? 0).toLocaleString()}`;
    return `<span class="replica-pill" title="${esc(title)}"><span class="dot" style="background: ${dotColor}"></span>${esc(short)} ${esc(r.inflight ?? 0)}/${esc(r.capacity ?? "?")}</span>`;
  }).join("");
}
408
+
409
// Push the per-day request counts into the daily chart. Labels are each
// entry's `date` with its first five characters dropped.
function renderDaily(a) {
  const series = a.requests_by_day || [];
  const labels = [];
  const counts = [];
  for (const day of series) {
    labels.push(day.date.slice(5)); // MM-DD
    counts.push(day.count);
  }
  dailyChart.data.labels = labels;
  dailyChart.data.datasets[0].data = counts;
  dailyChart.update("none");
}
415
+
416
// All-time per-resolution table: one row per key of `a.by_shape`, ordered by
// request count descending, or a single placeholder row when empty.
function renderShapeList(a) {
  const rows = Object.entries(a.by_shape || {});
  rows.sort((lhs, rhs) => rhs[1].count - lhs[1].count);
  const tbody = document.getElementById("shape-tbody");
  if (rows.length === 0) {
    tbody.innerHTML = `<tr><td colspan="3" class="metric-sub">no requests yet</td></tr>`;
    return;
  }
  const html = [];
  for (const [shape, stats] of rows) {
    html.push(`
    <tr>
      <td>${shape}</td>
      <td class="num">${stats.count.toLocaleString()}</td>
      <td class="num">${fmtMs(stats.duration_ms_avg)}</td>
    </tr>
  `);
  }
  tbody.innerHTML = html.join("");
}
432
+
433
// All-time per-GPU table: replicas, request count, errors and average latency
// for each key of `a.by_gpu`, ordered by count descending. The error cell gets
// the error colour when nonzero.
function renderByGPU(a) {
  const rows = Object.entries(a.by_gpu || {});
  rows.sort((lhs, rhs) => rhs[1].count - lhs[1].count);
  const tbody = document.getElementById("gpu-tbody");
  if (rows.length === 0) {
    tbody.innerHTML = `<tr><td colspan="5" class="metric-sub">no per-GPU data yet</td></tr>`;
    return;
  }
  // Strip the vendor prefix so the GPU column stays narrow.
  const stripVendor = (n) => (n || "β€”").replace(/^(NVIDIA |Tesla )/, "");
  const html = [];
  for (const [name, stats] of rows) {
    const errors = stats.errors ?? 0;
    html.push(`
    <tr>
      <td>${stripVendor(name)}</td>
      <td class="num">${(stats.replicas ?? 0).toLocaleString()}</td>
      <td class="num">${stats.count.toLocaleString()}</td>
      <td class="num ${errors > 0 ? "status-err" : ""}">${errors.toLocaleString()}</td>
      <td class="num">${fmtMs(stats.duration_ms_avg)}</td>
    </tr>
  `);
  }
  tbody.innerHTML = html.join("");
}
455
+
456
+ // Today-scoped mirrors of renderShapeList / renderByGPU. Same shape of input
457
+ // from metrics_pusher (count + duration_ms_avg per key) so the table markup
458
+ // matches; columns are trimmed since today's per-GPU bucket doesn't carry
459
+ // replicas/success/errors splits.
460
// Today's per-resolution table: count, average duration and average queue
// time for each key of `a.by_shape_today`, ordered by count descending.
function renderShapeToday(a) {
  const rows = Object.entries(a.by_shape_today || {});
  rows.sort((lhs, rhs) => rhs[1].count - lhs[1].count);
  const tbody = document.getElementById("shape-today-tbody");
  if (rows.length === 0) {
    tbody.innerHTML = `<tr><td colspan="4" class="metric-sub">no requests yet today</td></tr>`;
    return;
  }
  const html = [];
  for (const [shape, stats] of rows) {
    html.push(`
    <tr>
      <td>${shape}</td>
      <td class="num">${stats.count.toLocaleString()}</td>
      <td class="num">${fmtMs(stats.duration_ms_avg)}</td>
      <td class="num">${fmtMs(stats.queue_ms_avg)}</td>
    </tr>
  `);
  }
  tbody.innerHTML = html.join("");
}
477
+
478
// Today's per-GPU table: count, average latency and average queue time for
// each key of `a.by_gpu_today`, ordered by count descending.
function renderGpuToday(a) {
  const rows = Object.entries(a.by_gpu_today || {});
  rows.sort((lhs, rhs) => rhs[1].count - lhs[1].count);
  const tbody = document.getElementById("gpu-today-tbody");
  if (rows.length === 0) {
    tbody.innerHTML = `<tr><td colspan="4" class="metric-sub">no requests yet today</td></tr>`;
    return;
  }
  // Strip the vendor prefix so the GPU column stays narrow.
  const stripVendor = (n) => (n || "β€”").replace(/^(NVIDIA |Tesla )/, "");
  const html = [];
  for (const [name, stats] of rows) {
    html.push(`
    <tr>
      <td>${stripVendor(name)}</td>
      <td class="num">${stats.count.toLocaleString()}</td>
      <td class="num">${fmtMs(stats.duration_ms_avg)}</td>
      <td class="num">${fmtMs(stats.queue_ms_avg)}</td>
    </tr>
  `);
  }
  tbody.innerHTML = html.join("");
}
498
+
499
// Recent-window latency card: headline average (`recent_avg_latency_ms`), the
// number of requests it covers (`recent_count`), and a per-resolution table
// built from `recent_by_shape`, ordered by count descending.
function renderLatency(a) {
  document.getElementById("avg-latency").textContent = fmtMs(a.recent_avg_latency_ms);
  const sampleSize = a.recent_count ?? 0;
  document.getElementById("avg-latency-sub").textContent = `across last ${sampleSize.toLocaleString()} requests`;

  const rows = Object.entries(a.recent_by_shape || {});
  rows.sort((lhs, rhs) => rhs[1].count - lhs[1].count);
  const tbody = document.getElementById("latency-tbody");
  if (rows.length === 0) {
    tbody.innerHTML = `<tr><td colspan="4" class="metric-sub">no recent requests yet</td></tr>`;
    return;
  }
  const html = [];
  for (const [shape, stats] of rows) {
    html.push(`
    <tr>
      <td>${shape}</td>
      <td class="num">${stats.count.toLocaleString()}</td>
      <td class="num">${fmtMs(stats.duration_ms_avg)}</td>
      <td class="num">${fmtMs(stats.queue_ms_avg)}</td>
    </tr>
  `);
  }
  tbody.innerHTML = html.join("");
}
521
+
522
// Push `a.requests_by_hour` into the hourly chart. Each array index becomes a
// zero-padded "HH:00" label.
function renderHourly(a) {
  const counts = a.requests_by_hour || [];
  const hourLabel = (hour) => `${hour.toString().padStart(2, "0")}:00`;
  hourlyChart.data.labels = counts.map((_count, hour) => hourLabel(hour));
  hourlyChart.data.datasets[0].data = counts;
  hourlyChart.update("none");
}
529
+
530
// Recent-requests table: the last 50 entries of `a.recent`, shown in reverse
// array order. Entries lacking a field render the placeholder for that cell.
function renderRecent(a) {
  // Taking the trailing 50 and reversing equals reversing then taking 50.
  const latestFirst = (a.recent || []).slice(-50).reverse();
  const tbody = document.getElementById("recent-tbody");
  // Strip the vendor prefix so the GPU column stays narrow.
  const stripVendor = (g) => (g || "β€”").replace(/^(NVIDIA |Tesla )/, "");
  const html = [];
  for (const r of latestFirst) {
    const queued = r.queue_ms != null ? fmtMs(r.queue_ms) : "β€”";
    const duration = r.duration_ms ? (r.duration_ms / 1000).toFixed(1) + "s" : "β€”";
    html.push(`
    <tr>
      <td>${fmtRelative(r.ts)}</td>
      <td>${r.shape || "β€”"}</td>
      <td>${stripVendor(r.gpu)}</td>
      <td class="num">${queued}</td>
      <td class="num">${duration}</td>
      <td>${(r.ip_hash || "β€”").slice(0, 8)}</td>
      <td class="${r.ok ? "status-ok" : "status-err"}">${r.ok ? "ok" : "err"}</td>
    </tr>
  `);
  }
  tbody.innerHTML = html.join("");
}
549
+
550
// Live GPU card: one block per entry of `g.gpus` with utilisation and memory
// bars, plus a footer noting the sample age. When no GPUs are reported, shows
// `g.error` (or a default note) instead; the footer is left untouched then.
//
// Fixes relative to the previous version:
//  - `g.error` and `gpu.name` are HTML-escaped before going into innerHTML.
//    Error strings containing angle brackets were otherwise parsed as markup
//    and rendered blank or broken.
//  - A non-numeric `power_w` renders the placeholder instead of throwing on
//    `.toFixed`.
//  - A null/undefined payload is treated as empty instead of throwing.
//  - Bar widths are clamped to 0-100% so an out-of-range value cannot produce
//    an invalid CSS width.
function renderGPUs(g) {
  const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
  const clampPct = (value) => Math.min(100, Math.max(0, Number(value) || 0));

  const data = g || {};
  const div = document.getElementById("gpus");
  const gpus = data.gpus || [];
  if (!gpus.length) {
    div.innerHTML = `<div class="metric-sub">${esc(data.error || "no GPU data yet")}</div>`;
    return;
  }
  div.innerHTML = gpus.map((gpu) => {
    const memPct = gpu.memory_total_mb ? Math.round(100 * (gpu.memory_used_mb || 0) / gpu.memory_total_mb) : 0;
    const util = gpu.util_pct ?? 0;
    const power = Number.isFinite(gpu.power_w) ? gpu.power_w.toFixed(0) : "β€”";
    return `
    <div style="margin-bottom: 12px;">
      <div style="display: flex; justify-content: space-between;"><span><b>GPU ${esc(gpu.index)}</b> ${esc(gpu.name || "")}</span><span class="metric-sub">${esc(gpu.temp_c ?? "β€”")}Β°C Β· ${power}W</span></div>
      <div class="metric-sub" style="margin-top: 4px;">util ${esc(util)}%</div>
      <div class="gpu-bar"><div class="gpu-bar-fill" style="width: ${clampPct(util)}%"></div></div>
      <div class="metric-sub" style="margin-top: 4px;">mem ${esc(gpu.memory_used_mb ?? "β€”")} / ${esc(gpu.memory_total_mb ?? "β€”")} MB (${memPct}%)</div>
      <div class="gpu-bar"><div class="gpu-bar-fill" style="width: ${clampPct(memPct)}%; background: #d97757;"></div></div>
    </div>
  `;
  }).join("");
  document.getElementById("footer").textContent = `GPU sample: ${fmtRelative(data.ts)}`;
}
572
+
573
+ // Refresh cadence: default 2s, override with `#refresh=N` in the URL
574
+ // (where N is seconds, e.g. #refresh=1 for 1s, #refresh=0.5 for 500ms).
575
+ // metrics_pusher writes JSON every 2s by default β€” polling faster than
576
+ // that just re-reads the same file. Bump METRICS_INTERVAL env on the
577
+ // Space too if you genuinely need sub-2s.
578
// Read the polling interval from a `refresh=<seconds>` token in the URL hash.
// Values from 0.25 to 60 seconds are accepted and returned in milliseconds;
// anything else (or no token) falls back to 2000 ms.
function readRefreshMs() {
  const DEFAULT_MS = 2000;
  const match = /refresh=([0-9.]+)/.exec(location.hash || "");
  if (!match) return DEFAULT_MS;
  const seconds = parseFloat(match[1]);
  const inRange = seconds >= 0.25 && seconds <= 60;
  return inRange ? Math.round(seconds * 1000) : DEFAULT_MS;
}
586
+ const REFRESH_MS = readRefreshMs();
587
+ document.getElementById("refresh-label").textContent = `auto-refresh every ${(REFRESH_MS / 1000).toString()}s`;
588
+
589
+ initCharts();
590
+ refresh();
591
+ setInterval(refresh, REFRESH_MS);
592
+ </script>
593
+ </body>
594
+ </html>
space/entrypoint.sh ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Bonsai-Image HF Space entrypoint.
3
+ #
4
+ # Boot order:
5
+ # 1. Download the ternary gemlite model (~3.5 GB) β€” idempotent.
6
+ # 2. Generate /tmp/.htpasswd from $DASHBOARD_KEY for the basic-auth gate.
7
+ # 3. Build /tmp/nginx-upstream.conf from `nvidia-smi -L`. One server line
8
+ # per GPU. At N=1 the upstream has one entry; at N>1 we prepend
9
+ # `least_conn;` for variable-duration request routing.
10
+ # 4. Spawn one `uvicorn space.app:app` per GPU on consecutive ports
11
+ # (CUDA_VISIBLE_DEVICES pinned). Each worker's lifespan warms the
12
+ # shapes listed in BONSAI_WARMUP_SHAPES.
13
+ # 5. Wait for the first worker to be ready, then `next start` on :3000
14
+ # (internal β€” nginx will expose it on :7860).
15
+ # 6. Start metrics_pusher sidecar with a watchdog.
16
+ # 7. Exec nginx on :7860 (the one public port HF sees).
17
+ #
18
+ # Env (HF Space secrets):
19
+ # HF_TOKEN model + tokenizer downloads
20
+ # DASHBOARD_KEY basic-auth password for /dash-<obfuscated>
21
+ # BONSAI_WARMUP_SHAPES default "512x512,1024x1024"
22
+ set -euo pipefail
23
+
24
+ APP_DIR="${HOME:-/home/user}/app"
25
+ cd "$APP_DIR"
26
+
27
+ export PATH="$APP_DIR/.venv/bin:$PATH"
28
+ export HF_HUB_ENABLE_HF_TRANSFER=1
29
+
30
+ # ── GPU detection (early — needed for cache namespacing + tier-aware warmup) ─
31
+ # nvidia-smi might not return data in some odd container states; treat as
32
+ # "unknown" rather than crashing so the rest of the boot can still run.
33
# Probe the first GPU's name and compute capability (dots stripped, e.g.
# "8.9" -> "89").
#
# The script runs under `set -euo pipefail`, so a failing pipeline inside a
# plain `VAR=$(...)` assignment aborts the whole boot. That happens when
# nvidia-smi is missing or errors out, and can also happen if `head -1` closes
# the pipe early. The trailing `|| true` keeps the probe best-effort so the
# fallbacks below actually take effect.
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | xargs || true)
GPU_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -1 | tr -d '.' || true)
# Empty results mean the probe failed; fall back to placeholder values.
GPU_NAME="${GPU_NAME:-unknown}"
GPU_CAP="${GPU_CAP:-00}"
echo "[OK] GPU: $GPU_NAME (sm_${GPU_CAP})"
38
+
39
+ # Slow GPUs (T4, older Tesla cards): warm only the two square presets we
40
+ # benchmark against (512Β² and 1024Β²) and extend the readiness deadline.
41
+ # Skipping warmup entirely would shift the multi-minute first-call JIT
42
+ # onto the first user request, which corrupts benchmark numbers β€” better
43
+ # to bake it into boot. BONSAI_WARMUP_SHAPES + BACKEND_READY_TIMEOUT can
44
+ # be overridden via Space Variables if you want different shapes or a
45
+ # longer/shorter deadline.
46
+ case "$GPU_NAME" in
47
+ *T4*|*P100*|*V100*|*K80*|*M60*)
48
+ echo "[WARN] $GPU_NAME is slow β€” warming only 512x512 + 1024x1024."
49
+ echo " Extending readiness timeout to 30 min for the longer JIT."
50
+ : "${BONSAI_WARMUP_SHAPES:=512x512,1024x1024}"
51
+ : "${BACKEND_READY_TIMEOUT:=1800}"
52
+ export BONSAI_WARMUP_SHAPES BACKEND_READY_TIMEOUT
53
+ ;;
54
+ esac
55
+
56
+ # ── persistent storage detection ─────────────────────────────────────────────
57
+ # Try to use /data (a Storage Bucket if mounted) for the model + kernel
58
+ # caches + stats. Every filesystem op is wrapped so that if anything fails
59
+ # midway β€” bucket detached mid-build, mkdir denied, symlink races β€” we
60
+ # silently fall back to ephemeral storage and keep going. The dashboard
61
+ # banner alerts the user via BONSAI_PERSISTENT_STORAGE.
62
# Point the model directory and kernel caches at /data.
#
# Returns 0 when /data is a writable directory and every directory and symlink
# was created; returns 1 on the first failing step so the caller can fall back
# to ephemeral storage. Sets the globals _gemlite_dir and _triton_dir to the
# cache paths, which are namespaced by compute capability (GPU_CAP).
_setup_persistent() {
    if [ ! -d /data ] || [ ! -w /data ]; then
        return 1
    fi

    _gemlite_dir="/data/cache/gemlite-sm${GPU_CAP}"
    _triton_dir="/data/cache/triton-sm${GPU_CAP}"

    # One-shot migration: move a legacy non-namespaced cache under the current
    # GPU's namespace, but only if the namespaced path does not exist yet.
    # A failed move is ignored.
    local kind legacy target
    for kind in gemlite triton; do
        legacy="/data/cache/${kind}"
        target="/data/cache/${kind}-sm${GPU_CAP}"
        if [ -d "$legacy" ] && [ ! -e "$target" ]; then
            echo "[INFO] migrating /data/cache/${kind} β†’ ${kind}-sm${GPU_CAP}"
            mv "$legacy" "$target" 2>/dev/null || true
        fi
    done

    mkdir -p /data/models "$_gemlite_dir" "$_triton_dir" /data/state /data/state/daily 2>/dev/null || return 1

    # Replace the app's models directory with a symlink into /data.
    rm -rf "$APP_DIR/models" 2>/dev/null || return 1
    ln -s /data/models "$APP_DIR/models" 2>/dev/null || return 1

    # Same for the two kernel-cache directories under outputs/.
    mkdir -p "$APP_DIR/outputs" 2>/dev/null || return 1
    rm -rf "$APP_DIR/outputs/.gemlite_cache" "$APP_DIR/outputs/.triton_cache" 2>/dev/null || true
    ln -s "$_gemlite_dir" "$APP_DIR/outputs/.gemlite_cache" 2>/dev/null || return 1
    ln -s "$_triton_dir" "$APP_DIR/outputs/.triton_cache" 2>/dev/null || return 1
    return 0
}
92
+
93
+ if _setup_persistent; then
94
+ echo "[OK] /data Storage Bucket attached β€” model + caches + counters will persist"
95
+ export BONSAI_STATE_DIR=/data/state
96
+ export BONSAI_PERSISTENT_STORAGE=1
97
+ else
98
+ if [ -d /data ]; then
99
+ echo "[WARN] /data is present but couldn't be set up (read-only? quota?). Falling back to ephemeral."
100
+ else
101
+ echo "[WARN] /data not mounted β€” model, kernel caches, and dashboard"
102
+ echo " counters will reset on every Space restart. Enable a"
103
+ echo " Storage Bucket in Space Settings β†’ Storage to fix."
104
+ fi
105
+ export BONSAI_STATE_DIR="$APP_DIR/outputs/.state"
106
+ export BONSAI_PERSISTENT_STORAGE=0
107
+ mkdir -p "$BONSAI_STATE_DIR/daily" 2>/dev/null || true
108
+ fi
109
+
110
# ── shared IP-hash pepper across all replicas ──────────────────────────────
# All replicas have to hash IPs with one pepper, otherwise unique-user counts
# would double across replicas. Prefer the pepper stored in state.json (so it
# survives restarts); generate a new one only when none is available. Workers
# pick it up from the environment.

# Print the "ip_pepper" field of the JSON file given as $1, or nothing when
# the file is unreadable, malformed, or has no pepper. Never fails.
_saved_ip_pepper() {
  python3 - "$1" <<'PY' 2>/dev/null || true
import json, sys
try:
    with open(sys.argv[1]) as f:
        print(json.load(f).get("ip_pepper") or "")
except Exception:
    pass
PY
}

if [ -f "$BONSAI_STATE_DIR/state.json" ]; then
  BONSAI_IP_PEPPER=$(_saved_ip_pepper "$BONSAI_STATE_DIR/state.json")
fi
[ -n "${BONSAI_IP_PEPPER:-}" ] || BONSAI_IP_PEPPER=$(python3 -c "import secrets; print(secrets.token_hex(16))")
export BONSAI_IP_PEPPER
130
# Warmup shapes: only the two square presets users hit most often (512² and
# 1024²). Every other resolution JITs on its first request and then joins the
# on-disk caches (/data/cache/{gemlite,triton}-smXX/). The warmup-skip
# sentinel (warmup-done.json, next to the gemlite autotune data) records the
# completed (backend, shape) pairs across boots, so later boots skip these
# two as well once they are cached.
#
# The list is short on purpose: on multi-GPU boots all N workers compete for
# /data bandwidth and CPU during the gemlite layer pack, and 4-worker launches
# have been seen to hang past BACKEND_READY_TIMEOUT. Two shapes cover the
# common case without inflating cold-boot wall time.
export BONSAI_WARMUP_SHAPES="${BONSAI_WARMUP_SHAPES:-512x512,1024x1024}"

# Binary warmup is off unless explicitly requested. With it on, every replica
# swaps to the binary transformer at the same time after primary warmup
# (4 parallel 3.5 GB state_dict reads from /data plus 4 parallel gemlite layer
# packs), which has been seen to hang multi-GPU boots indefinitely. The cost
# of leaving it off is a one-time JIT (~30s for an unwarmed shape) on the
# first binary request; the cache covers it afterwards.
#
# Single-GPU rigs, where that collision does not apply, can re-enable it by
# setting the Space Variable BONSAI_WARMUP_EXTRA_BACKENDS=bonsai-binary-gemlite
export BONSAI_WARMUP_EXTRA_BACKENDS="${BONSAI_WARMUP_EXTRA_BACKENDS:-}"
156
+
157
# ── token sanity check ─────────────────────────────────────────────────────
# Refuse to boot without HF_TOKEN, then expose it under the name
# download_model.sh expects.
[ -n "${HF_TOKEN:-}" ] || {
  echo "[ERR] HF_TOKEN not set β€” add it as a Space Secret so the model can download." >&2
  exit 1
}
BONSAI_TOKEN="$HF_TOKEN"
export BONSAI_TOKEN
163
+
164
# ── model download / sync ──────────────────────────────────────────────────
# Both the ternary and the binary repo are shipped so both picker options
# work. Each repo is ~3.5 GB, so the first cold boot pulls ~7 GB; the Storage
# Bucket (/data/models, symlinked earlier) keeps them across restarts.
#
# download_model.sh is invoked on every boot, with no file-exists guard. It
# calls huggingface_hub.snapshot_download with `local_dir` set, which HEADs
# each file and skips those whose etag matches the copy on disk, so a cached
# boot costs ~10-30s of metadata checks rather than a full redownload. In
# return, new weights pushed to HF are picked up on the next Space restart
# without a force flag or manual cache wipe.
MODEL_DIR="$APP_DIR/models/bonsai-image-4B-ternary-gemlite"
BINARY_MODEL_DIR="$APP_DIR/models/bonsai-image-4B-binary-gemlite"

# Each entry is "<variant>:<bit-width label>"; the label only appears in the log.
for _model_spec in ternary:2bit binary:1bit; do
  _variant="${_model_spec%%:*}"
  _bits="${_model_spec##*:}"
  echo "==> syncing bonsai-image-${_variant}-4B-gemlite-${_bits} ..."
  ./scripts/download_model.sh --model "${_variant}-gemlite"
done
181
+
182
# ── htpasswd for the dashboard ─────────────────────────────────────────────
# DASHBOARD_KEY is a Space Secret. When it is missing we still write an
# htpasswd file so the Space comes up, and print a loud warning so the
# missing secret is obvious in the log.
if [ -n "${DASHBOARD_KEY:-}" ]; then
  # Pass the secret on stdin, not as an argument: argv is readable by other
  # processes through ps / /proc/<pid>/cmdline for as long as openssl runs.
  HASH=$(printf '%s\n' "$DASHBOARD_KEY" | openssl passwd -apr1 -stdin)
  printf 'admin:%s\n' "$HASH" > /tmp/.htpasswd
  echo "[OK] dashboard: auth enabled (user=admin)"
else
  # `$apr1$open$open` is a sentinel, not the apr1 hash of any password (a real
  # apr1 entry ends in a 22-character digest), so no credentials can verify
  # against it. The previous message advertised "admin:open", which this
  # entry never accepted; say what actually happens instead.
  echo "[WARN] DASHBOARD_KEY not set — dashboard login is disabled (no password matches the placeholder entry). Add DASHBOARD_KEY as a Space Secret to enable user=admin."
  printf 'admin:$apr1$open$open\n' > /tmp/.htpasswd
fi
194
+
195
# ── nginx scratch dirs ─────────────────────────────────────────────────────
mkdir -p /tmp/nginx-{body,proxy,fastcgi,uwsgi,scgi}

# ── pre-seed dashboard JSON so the page doesn't 502 before first scrape ────
# The only dynamic field is persistent_storage, rendered as a JSON boolean
# from BONSAI_PERSISTENT_STORAGE.
if [ "${BONSAI_PERSISTENT_STORAGE:-0}" = "1" ]; then
  _persistent_json=true
else
  _persistent_json=false
fi
printf '{"updated_at":null,"persistent_storage":%s,"summary_total":{"requests":0,"success":0,"errors":0},"summary_today":{"requests":0,"unique_users":0},"summary_7d":{"requests":0,"unique_users":0},"by_shape":{},"requests_by_hour":[],"requests_by_day":[],"recent":[]}\n' \
  "$_persistent_json" > /tmp/analytics.json
printf '%s\n' '{"ts":null,"gpus":[]}' > /tmp/gpu-stats.json
203
+
204
# ── pin model paths once; shared across all workers ────────────────────────
# backend_gpu/pipeline_gpu.py reads SEPARATE env vars per variant
# (TERNARY_TRANSFORMER_PATH vs BINARY_TRANSFORMER_PATH), and the packed
# transformer subdir name differs per variant (transformer-gemlite-int2 for
# ternary, transformer-gemlite-int1 for binary). We therefore look up
# whichever transformer-gemlite-* directory each variant ships and assign it
# to the matching env var. Without the BINARY var the pipeline falls back to
# its hardcoded /root/models/bonsai-binary/ default → PermissionError on a
# non-root container as soon as a user picks binary in the UI.
#
# Note: text_encoder + vae + tokenizer are the SAME artifacts across both
# variants (Qwen3-4B-4bit + BFL VAE). Pointing them at the ternary copy is
# fine; binary's copy sits idle on disk after download. That is a one-time
# ~1 GB of duplication, accepted so download_model.sh can pull the standard
# HF layout for each repo.
export MFLUX_STUDIO_GPU_DEFAULT_BACKEND="bonsai-ternary-gemlite"

# Print the first transformer-gemlite-* DIRECTORY under $1 (glob order), or
# nothing when there is none. Always returns 0 so the caller's own check
# produces the diagnostic: the previous `ls -d … | head -1` exits non-zero on
# no match, which would abort the script before the [ERR] line if it runs
# under `set -e -o pipefail` (not visible from this section). It also
# accepted plain files that happen to match the pattern.
_first_gemlite_dir() {
  local candidate
  for candidate in "$1"/transformer-gemlite-*; do
    if [ -d "$candidate" ]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 0
}

_ternary_transformer_dir=$(_first_gemlite_dir "$MODEL_DIR")
if [ -z "$_ternary_transformer_dir" ]; then
  echo "[ERR] no transformer-gemlite-* subdir under $MODEL_DIR" >&2
  exit 1
fi
_binary_transformer_dir=$(_first_gemlite_dir "$BINARY_MODEL_DIR")
if [ -z "$_binary_transformer_dir" ]; then
  echo "[ERR] no transformer-gemlite-* subdir under $BINARY_MODEL_DIR" >&2
  exit 1
fi
export MFLUX_STUDIO_GPU_TERNARY_TRANSFORMER_PATH="$_ternary_transformer_dir"
export MFLUX_STUDIO_GPU_BINARY_TRANSFORMER_PATH="$_binary_transformer_dir"
export MFLUX_STUDIO_GPU_TEXT_ENCODER_PATH="$MODEL_DIR/text_encoder-hqq-4bit"
export MFLUX_STUDIO_GPU_VAE_PATH="$MODEL_DIR/vae"
export MFLUX_STUDIO_GPU_TOKENIZER_PATH="$MODEL_DIR/text_encoder-hqq-4bit/tokenizer"
235
+
236
# ── detect GPUs + spawn one uvicorn per device ─────────────────────────────
# Count only the "GPU <n>: ..." lines of `nvidia-smi -L`, so any other output
# line is not mistaken for a device. `|| true` keeps the substitution
# successful without adding output: the old `| wc -l || echo 1` either never
# ran the fallback (wc always succeeds) or, with pipefail enabled, appended a
# second line and produced the non-numeric value "0<newline>1".
GPU_COUNT=$(nvidia-smi -L 2>/dev/null | grep -c '^GPU ' || true)
case "$GPU_COUNT" in
  '' | *[!0-9]*) GPU_COUNT=1 ;;   # anything that is not a plain integer
esac
# Always start at least one worker, even when no GPU was listed.
[ "$GPU_COUNT" -ge 1 ] || GPU_COUNT=1
echo "[OK] detected $GPU_COUNT GPU(s)"
240
+
241
# Consecutive worker starts are staggered. Otherwise all N uvicorns hit the
# /data bucket at once and contend for ~5 GB state_dict reads plus the
# CPU-bound fp16 cast and gemlite layer conversion; 4-worker launches have
# blown through BACKEND_READY_TIMEOUT that way. ~30s is slightly more than
# the single-worker transformer-load wall time observed on a warm bucket /
# sm_86, so each worker is past torch.load + gemlite convert before the next
# one touches the same files.
WORKER_START_STAGGER_SECONDS="${BONSAI_WORKER_START_STAGGER_SECONDS:-30}"

BACKEND_URLS=""
UPSTREAM_SERVERS=""
_last_gpu=$((GPU_COUNT - 1))
for ((i = 0; i <= _last_gpu; i++)); do
  PORT=$((8000 + i))
  # Look the GPU name up per physical index (mixed-GPU rigs are rare but
  # possible) and fall back to the top-level GPU_NAME when the query is empty.
  REPLICA_GPU=$(nvidia-smi --query-gpu=name --format=csv,noheader -i "$i" 2>/dev/null | head -1 | xargs)
  if [ -z "$REPLICA_GPU" ]; then
    REPLICA_GPU="$GPU_NAME"
  fi
  echo "==> starting backend on GPU $i ($REPLICA_GPU) β†’ :$PORT (warmup: $BONSAI_WARMUP_SHAPES)"
  # BONSAI_REPLICA_INDEX: only replica 0 seeds counters from state.json;
  #   replicas 1+ start at 0 and report deltas, so the sum metrics_pusher
  #   computes is the correct cumulative value without N-way inflation.
  # BONSAI_GPU_NAME: surfaced via /metrics so the pusher can aggregate
  #   request counts/latencies per GPU model for the dashboard.
  CUDA_VISIBLE_DEVICES=$i BONSAI_REPLICA_INDEX=$i BONSAI_GPU_NAME="$REPLICA_GPU" \
    uvicorn space.app:app --host 127.0.0.1 --port "$PORT" --no-access-log &
  UPSTREAM_SERVERS+=" server 127.0.0.1:$PORT;"$'\n'
  BACKEND_URLS="${BACKEND_URLS:+$BACKEND_URLS,}http://127.0.0.1:$PORT"
  # Pause before the next worker, but not after the last one. Setting
  # BONSAI_WORKER_START_STAGGER_SECONDS=0 disables the pause when cold-boot
  # wall time matters more than first-boot reliability.
  if [ "$i" -lt "$_last_gpu" ] && [ "$WORKER_START_STAGGER_SECONDS" -gt 0 ]; then
    echo " ↳ sleeping ${WORKER_START_STAGGER_SECONDS}s before next worker (avoid /data + CPU contention)"
    sleep "$WORKER_START_STAGGER_SECONDS"
  fi
done
279
+
280
# Write the nginx upstream block listing every worker. With more than one
# worker, least_conn is used because request durations vary (see
# space/nginx.conf).
LB_DIRECTIVE=""
if [ "$GPU_COUNT" -gt 1 ]; then
  LB_DIRECTIVE=" least_conn;"$'\n'
fi
{
  printf 'upstream bonsai_workers {\n'
  printf '%s%s' "$LB_DIRECTIVE" "$UPSTREAM_SERVERS"
  printf '}\n'
} > /tmp/nginx-upstream.conf
export BACKEND_URLS
288
+
289
# ── wait for backend readiness ─────────────────────────────────────────────
# Workers answer /backends only once lifespan has finished (kernels compiled
# and warmup shapes JITed). The first worker is polled as a stand-in for
# "ready enough". One probe per iteration, at most $_ready_timeout iterations.
_ready_timeout="${BACKEND_READY_TIMEOUT:-600}"
echo "==> waiting for backend on :8000 (up to ${_ready_timeout}s) ..."
_probe=0
while [ "$_probe" -lt "$_ready_timeout" ]; do
  _probe=$((_probe + 1))
  if ! curl -fsS -m 2 http://127.0.0.1:8000/backends > /dev/null 2>&1; then
    sleep 1
    # Give up after the final probe has failed.
    if [ "$_probe" -eq "$_ready_timeout" ]; then
      echo "[ERR] backend did not come up within ${_ready_timeout}s" >&2
      exit 1
    fi
    continue
  fi
  echo "[OK] backend ready after ${_probe}s"
  break
done
305
+
306
# ── frontend (next start) on internal :3000 ────────────────────────────────
echo "==> starting frontend (next start) on :3000"
(
  cd vendor/image-studio/frontend || exit
  exec npm start -- --port 3000 --hostname 127.0.0.1
) &

# ── metrics_pusher sidecar (watchdog restart on crash) ─────────────────────
# Runs metrics_pusher.py forever: whenever it exits, for any reason, wait 5s
# and start it again.
start_metrics_pusher() {
  while :; do
    echo "[watchdog] starting metrics_pusher.py"
    python3 /home/user/app/space/metrics_pusher.py || true
    echo "[watchdog] metrics_pusher.py exited, restarting in 5s"
    sleep 5
  done
}
start_metrics_pusher &

# ── nginx: front everything on :7860 (the HF-exposed port) ─────────────────
# exec replaces this shell, so nginx becomes the script's foreground process.
echo "==> nginx on :7860"
exec nginx -c /home/user/app/space/nginx.conf -p /home/user/app/
space/metrics_pusher.py ADDED
@@ -0,0 +1,599 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Sidecar that aggregates backend /metrics + nvidia-smi into JSON files.

On every tick (default 2 s, override with METRICS_INTERVAL) it writes:
    /tmp/analytics.json    current totals, today + 7d summaries, GPU info flag
    /tmp/gpu-stats.json    nvidia-smi snapshot

Every Nth tick (default 12 → ~24 s at the default interval; override with
STATE_WRITE_EVERY_N_TICKS) it also writes:
    $BONSAI_STATE_DIR/state.json               boot-recovery snapshot
    $BONSAI_STATE_DIR/daily/YYYY-MM-DD.json    per-UTC-day archive (one file/day)

Robust to:
    - missing /data bucket (writes go to ephemeral $BONSAI_STATE_DIR fallback)
    - missing nvidia-smi
    - backend not yet up (HTTP errors logged, tick continues)
    - FUSE-backed mounts that don't support atomic rename (falls back to in-place)
"""
from __future__ import annotations

import json
import os
import subprocess
import time
import urllib.request
from collections import defaultdict


def _env_int(name: str, default: int, minimum: int = 1) -> int:
    """Read an integer setting from the environment.

    A missing, blank, or non-numeric value yields ``default`` instead of
    raising at import time (which would put the watchdog into a restart
    loop); the result is clamped to ``minimum``.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return max(minimum, default)
    try:
        value = int(raw)
    except ValueError:
        print(f"[metrics_pusher] ignoring non-integer {name}={raw!r}; using {default}", flush=True)
        value = default
    return max(minimum, value)


# Day bucketing is in UTC — matches what space.app uses for `_by_day` keys
# (we tried PT but the CUDA Ubuntu base image strips tzdata).

# Comma-separated list of backend base URLs; blanks are dropped.
BACKEND_URLS = [u.strip() for u in os.environ.get("BACKEND_URLS", "http://127.0.0.1:8000").split(",") if u.strip()]
# Seconds between ticks.
INTERVAL = _env_int("METRICS_INTERVAL", 2)
ANALYTICS_PATH = "/tmp/analytics.json"
GPU_PATH = "/tmp/gpu-stats.json"

# Persisted state. STATE_DIR is /data/state when a bucket is mounted, else
# ephemeral under outputs/ (gone on Space restart).
STATE_DIR = os.environ.get("BONSAI_STATE_DIR", "/tmp")
STATE_PATH = os.path.join(STATE_DIR, "state.json")
DAILY_DIR = os.path.join(STATE_DIR, "daily")

# Write durable files (state.json + daily archives) every Nth tick to amortize
# disk traffic. Losing N*INTERVAL seconds of counter increments on unclean
# shutdown is acceptable.
STATE_WRITE_EVERY_N_TICKS = _env_int("STATE_WRITE_EVERY_N_TICKS", 12)

# Surfaces in analytics.json so the dashboard shows a "counters won't persist"
# banner when a bucket is not mounted. Set by entrypoint.sh.
PERSISTENT_STORAGE = os.environ.get("BONSAI_PERSISTENT_STORAGE", "0") == "1"
48
+
49
+
50
+ def _fetch_json(url: str, timeout: float = 5.0) -> dict | None:
51
+ # 5s timeout (was 2s): under 16-concurrent /generate load the uvicorn
52
+ # event loop can briefly queue /metrics behind in-flight responses.
53
+ # 5s is still well under the dashboard's polling cadence (so the user
54
+ # doesn't see a delay) and gives the backend headroom under stress.
55
+ try:
56
+ with urllib.request.urlopen(url, timeout=timeout) as resp:
57
+ return json.loads(resp.read())
58
+ except Exception:
59
+ return None
60
+
61
+
62
def _zero_bucket(*fields: str):
    """Return a factory that builds a fresh ``{field: 0, ...}`` counter dict."""
    return lambda: {name: 0 for name in fields}


def _add_counters(dst: dict, src: dict | None, fields: tuple) -> None:
    """Add ``src[field]`` onto ``dst[field]`` for every field; missing or null counts as 0."""
    if not isinstance(src, dict):
        return
    for name in fields:
        dst[name] += src.get(name) or 0


def _new_day_bucket() -> dict:
    """Return an empty per-day accumulator (counters, per-hour list, unique-IP set)."""
    timed = ("count", "duration_ms_total", "queue_ms_total")
    return {
        "requests": 0, "success": 0, "errors": 0,
        "by_shape": defaultdict(_zero_bucket(*timed)),
        "by_hour": [0] * 24,
        "unique_ips": set(),
        "by_gpu": defaultdict(_zero_bucket(*timed)),
        "by_variant": defaultdict(_zero_bucket(*timed)),
        "queue_ms_total": 0,
    }


def _merge_day(existing: dict, day: dict) -> None:
    """Fold one replica's bucket for a single date into ``existing``."""
    timed = ("count", "duration_ms_total", "queue_ms_total")
    _add_counters(existing, day, ("requests", "success", "errors", "queue_ms_total"))
    for section in ("by_shape", "by_gpu", "by_variant"):
        for key, bucket in (day.get(section) or {}).items():
            _add_counters(existing[section][key], bucket, timed)
    # Only the first 24 slots are meaningful; anything beyond is ignored.
    for hour, count in zip(range(24), day.get("by_hour") or []):
        existing["by_hour"][hour] += count or 0
    existing["unique_ips"].update(day.get("unique_ips") or [])


def fetch_backend_metrics() -> dict:
    """Aggregate /metrics from every backend replica.

    Replicas that do not answer (or answer with something that is not a JSON
    object) are skipped. JSON ``null`` in any counter or section is treated
    like a missing key, so one odd payload cannot abort the whole tick.

    Returns:
        A dict with summed counters, ``per_replica`` details, cumulative
        ``by_shape`` / ``by_variant`` / ``by_gpu`` buckets, merged ``by_day``
        buckets (``unique_ips`` as a set), the newest 2000 ``recent`` entries
        sorted by ``ts``, and the first non-empty ``ip_pepper`` seen.
    """
    timed = ("count", "duration_ms_total", "queue_ms_total")
    agg: dict = {
        "total_requests": 0,
        "success": 0,
        "errors": 0,
        "uptime_s": 0,
        "inflight": 0,           # sum across replicas: total in-flight requests
        "generate_capacity": 0,  # sum of per-replica concurrency caps
        "replicas_seen": 0,      # how many replicas answered /metrics this tick
        # Per-replica details. Used to compute queue depth per replica
        # (summing first and subtracting afterwards hides imbalance) and to
        # render the multi-GPU health card on the dashboard.
        "per_replica": [],
        "by_shape": defaultdict(_zero_bucket("count", "duration_ms_total")),
        # Cumulative per-variant counters, summed over replicas. Variants are
        # "ternary", "binary", or "unknown" (parsed from the request's
        # `backend` field).
        "by_variant": defaultdict(_zero_bucket(*timed)),
        "by_day": {},  # date -> day bucket, see _new_day_bucket()
        # Per-GPU-model breakdown. Several replicas on the same GPU model
        # (e.g. l40sx4 = 4× "NVIDIA L40S") merge into one bucket.
        "by_gpu": defaultdict(_zero_bucket("count", "success", "errors", "duration_ms_total", "replicas")),
        "recent": [],
        "ip_pepper": None,
    }
    for url in BACKEND_URLS:
        data = _fetch_json(f"{url}/metrics")
        if not data or not isinstance(data, dict):
            continue

        total = data.get("total_requests") or 0
        succeeded = data.get("success") or 0
        failed = data.get("errors") or 0
        uptime = data.get("uptime_s") or 0
        replica_inflight = data.get("inflight") or 0
        replica_capacity = data.get("generate_concurrency")
        if replica_capacity is None:
            replica_capacity = 1  # assume one slot when the replica does not say

        agg["replicas_seen"] += 1
        agg["total_requests"] += total
        agg["success"] += succeeded
        agg["errors"] += failed
        agg["uptime_s"] = max(agg["uptime_s"], uptime)
        agg["inflight"] += replica_inflight
        agg["generate_capacity"] += replica_capacity

        # Default to NVIDIA L40S when missing so historical /metrics without
        # gpu_name don't show up as "unknown".
        gpu = data.get("gpu_name") or "NVIDIA L40S"
        # Keep gpu_name + cap per replica so the dashboard's health card can
        # render "L40S · 1/1 busy" style rows and the queue calculation can
        # subtract per replica.
        agg["per_replica"].append({
            "url": url,
            "gpu_name": gpu,
            "inflight": replica_inflight,
            "capacity": replica_capacity,
            "uptime_s": uptime,
            "total_requests": total,
            "replica_index": data.get("replica_index"),
        })

        gpu_bucket = agg["by_gpu"][gpu]
        gpu_bucket["count"] += total
        gpu_bucket["success"] += succeeded
        gpu_bucket["errors"] += failed
        gpu_bucket["duration_ms_total"] += data.get("total_duration_ms") or 0
        gpu_bucket["replicas"] += 1

        for shape, bucket in (data.get("by_shape") or {}).items():
            _add_counters(agg["by_shape"][shape], bucket, ("count", "duration_ms_total"))
        for variant, bucket in (data.get("by_variant") or {}).items():
            _add_counters(agg["by_variant"][variant], bucket, timed)

        # Per-day merge: each replica returns its own by_day; sum the
        # counters and union the unique_ips.
        for date, day in (data.get("by_day") or {}).items():
            if not isinstance(day, dict):
                continue
            if date not in agg["by_day"]:
                agg["by_day"][date] = _new_day_bucket()
            _merge_day(agg["by_day"][date], day)

        agg["recent"].extend(data.get("recent") or [])
        agg["ip_pepper"] = agg["ip_pepper"] or data.get("ip_pepper")

    # `or 0` keeps entries with a null ts sortable.
    agg["recent"].sort(key=lambda r: r.get("ts") or 0)
    agg["recent"] = agg["recent"][-2000:]
    return agg
172
+
173
+
174
def fetch_gpu_stats() -> dict:
    """Snapshot per-GPU stats by running ``nvidia-smi`` once.

    Returns:
        ``{"ts": <unix seconds>, "gpus": [...]}`` with one dict per parsed
        row. When nvidia-smi cannot be run, fails, or times out (2 s) the
        list is empty and an ``"error"`` key carries the exception text.
        Rows that cannot be parsed are skipped; numeric fields that
        nvidia-smi reports as non-numeric come back as ``None``.
    """
    query = "index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit"
    try:
        out = subprocess.check_output(
            ["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader,nounits"],
            timeout=2,
        ).decode(errors="replace")
    except Exception as exc:
        return {"ts": int(time.time()), "gpus": [], "error": str(exc)}

    def _maybe_int(s: str) -> int | None:
        s = s.strip()
        return int(s) if s.isdigit() else None

    def _maybe_float(s: str) -> float | None:
        try:
            return float(s.strip())
        except ValueError:
            return None

    gpus = []
    for line in out.strip().splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 8:
            continue
        index = _maybe_int(parts[0])
        if index is None:
            # A malformed row must not take the whole snapshot down.
            continue
        # The name is the only free-text column: the six numeric columns are
        # taken from the right, and whatever sits between the index and them
        # is the name, even if it contains a comma itself.
        util, mem_used, mem_total, temp, power, power_limit = parts[-6:]
        gpus.append({
            "index": index,
            "name": ", ".join(parts[1:-6]),
            "util_pct": _maybe_int(util),
            "memory_used_mb": _maybe_int(mem_used),
            "memory_total_mb": _maybe_int(mem_total),
            "temp_c": _maybe_int(temp),
            "power_w": _maybe_float(power),
            "power_limit_w": _maybe_float(power_limit),
        })
    return {"ts": int(time.time()), "gpus": gpus}
213
+
214
+
215
def _avg_rows(buckets: dict | None, with_queue: bool = True) -> dict:
    """Turn ``{key: {count, duration_ms_total[, queue_ms_total]}}`` into per-key averages.

    Averages use integer division and are 0 when ``count`` is 0.
    ``queue_ms_avg`` is emitted only when ``with_queue`` is true.
    """
    rows = {}
    for key, b in (buckets or {}).items():
        n = b.get("count", 0)
        row = {
            "count": n,
            "duration_ms_avg": (b.get("duration_ms_total", 0) // n) if n else 0,
        }
        if with_queue:
            row["queue_ms_avg"] = (b.get("queue_ms_total", 0) // n) if n else 0
        rows[key] = row
    return rows


def build_analytics(backend_data: dict) -> dict:
    """Build the JSON document the dashboard polls, from aggregated /metrics.

    Args:
        backend_data: The aggregate produced by ``fetch_backend_metrics()``.

    Returns:
        A JSON-serializable dict: live load (inflight / running / queue_depth
        / capacity), cumulative and today-only breakdowns by shape, GPU and
        variant, 7-day and 30-day summaries, per-hour and per-day request
        series, latency over the newest 50 requests, and the newest 100
        recent entries.
    """
    now = int(time.time())
    today = time.strftime("%Y-%m-%d", time.gmtime(now))
    by_day = backend_data["by_day"]

    by_shape_total = _avg_rows(backend_data["by_shape"], with_queue=False)

    # Latency stats come from the last 50 requests only, so the numbers react
    # to current load rather than being smoothed by old data.
    recent_by_shape_acc: dict[str, dict] = {}
    for r in backend_data["recent"][-50:]:
        acc = recent_by_shape_acc.setdefault(
            r.get("shape") or "unknown",
            {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0},
        )
        acc["count"] += 1
        acc["duration_ms_total"] += int(r.get("duration_ms") or 0)
        acc["queue_ms_total"] += int(r.get("queue_ms") or 0)
    recent_by_shape = _avg_rows(recent_by_shape_acc)
    recent_count_total = sum(b["count"] for b in recent_by_shape_acc.values())
    recent_duration_total = sum(b["duration_ms_total"] for b in recent_by_shape_acc.values())
    recent_avg_latency_ms = recent_duration_total // recent_count_total if recent_count_total else 0

    today_bucket = by_day.get(today, {})
    today_ips = today_bucket.get("unique_ips", set())
    today_unique = len(today_ips if isinstance(today_ips, set) else list(today_ips))

    # Today-only mirrors of the cumulative tables (same row layout, scope
    # resets at UTC midnight), plus the cumulative per-variant table.
    by_shape_today = _avg_rows(today_bucket.get("by_shape"))
    by_gpu_today = _avg_rows(today_bucket.get("by_gpu"))
    by_variant_total = _avg_rows(backend_data["by_variant"])
    by_variant_today = _avg_rows(today_bucket.get("by_variant"))

    # Today's overall average queue time across all shapes/GPUs.
    today_count = today_bucket.get("requests", 0)
    today_avg_queue_ms = (today_bucket.get("queue_ms_total", 0) // today_count) if today_count else 0

    def _summary_for_last(n_days: int) -> dict:
        # Calendar window ending today (UTC). Day keys are "YYYY-MM-DD", so
        # string comparison is date comparison. Selecting the last N
        # *recorded* days instead would pull in stale days after idle gaps.
        cutoff = time.strftime("%Y-%m-%d", time.gmtime(now - (n_days - 1) * 86400))
        days = [d for d in by_day if d >= cutoff]
        uniques: set = set()
        for d in days:
            uniques.update(by_day[d].get("unique_ips", set()))
        return {
            "requests": sum(by_day[d].get("requests", 0) for d in days),
            "unique_users": len(uniques),
        }

    # Per-GPU counts are attached to each day so the dashboard can stack the
    # daily chart by GPU. A day's by_gpu only carries GPUs that served
    # traffic that day. duration_ms_total is included as well.
    requests_by_day = [
        {
            "date": d,
            "count": by_day[d].get("requests", 0),
            "by_gpu": {
                g_name: {
                    "count": g.get("count", 0),
                    "duration_ms_total": g.get("duration_ms_total", 0),
                }
                for g_name, g in (by_day[d].get("by_gpu") or {}).items()
            },
        }
        for d in sorted(by_day)[-30:]
    ]
    requests_by_hour = list(today_bucket.get("by_hour", [0] * 24))

    # Overall average latency, derived from by_shape (the duration totals
    # live there, not in the cumulative counter).
    total_duration_ms = sum(b["duration_ms_total"] for b in backend_data["by_shape"].values())
    total_durations_count = sum(b["count"] for b in backend_data["by_shape"].values())
    avg_latency_ms = total_duration_ms // total_durations_count if total_durations_count else 0

    # Queue depth = in-flight work beyond running capacity, summed PER
    # REPLICA. With 4 in flight on replica 0 and replica 1 idle,
    # sum(inflight) - sum(capacity) reports 2 and hides replica 0's 3-deep
    # queue; per-replica max(0, inflight - capacity) attributes it correctly.
    per_replica = backend_data.get("per_replica", [])
    if per_replica:
        inflight = sum(r["inflight"] for r in per_replica)
        capacity = sum(r["capacity"] for r in per_replica)
        running = sum(min(r["inflight"], r["capacity"]) for r in per_replica)
    else:
        inflight = backend_data.get("inflight", 0)
        capacity = backend_data.get("generate_capacity", 0) or backend_data.get("replicas_seen", 1)
        running = min(inflight, capacity)
    queue_depth = sum(max(0, r["inflight"] - r["capacity"]) for r in per_replica)

    # Per-GPU-model breakdown: count, success/error split, average latency.
    by_gpu_out = {}
    for gpu_name, b in backend_data["by_gpu"].items():
        c = b["count"]
        by_gpu_out[gpu_name] = {
            "count": c,
            "success": b["success"],
            "errors": b["errors"],
            "duration_ms_avg": b["duration_ms_total"] // c if c else 0,
            "duration_ms_total": b["duration_ms_total"],
            "replicas": b["replicas"],
        }

    return {
        "updated_at": now,
        "uptime_s": backend_data.get("uptime_s", 0),
        "persistent_storage": PERSISTENT_STORAGE,
        "state_dir": STATE_DIR,
        "replicas_seen": backend_data.get("replicas_seen", 0),
        # BACKEND_URLS is fixed per boot, so its length is the number of
        # replicas we expect; comparing it with replicas_seen tells the
        # dashboard whether a replica is missing.
        "replicas_expected": len(BACKEND_URLS),
        "per_replica": backend_data.get("per_replica", []),
        "inflight": inflight,
        "running": running,
        "queue_depth": queue_depth,
        "capacity": capacity,
        "today_avg_queue_ms": today_avg_queue_ms,
        "summary_total": {
            "requests": backend_data["total_requests"],
            "success": backend_data["success"],
            "errors": backend_data["errors"],
        },
        "summary_today": {
            "requests": today_bucket.get("requests", 0),
            "unique_users": today_unique,
        },
        "summary_7d": _summary_for_last(7),
        "summary_30d": _summary_for_last(30),
        "avg_latency_ms": avg_latency_ms,
        "by_shape": by_shape_total,
        "by_shape_today": by_shape_today,
        "by_gpu": by_gpu_out,
        "by_gpu_today": by_gpu_today,
        "by_variant": by_variant_total,
        "by_variant_today": by_variant_today,
        "recent_by_shape": recent_by_shape,
        "recent_avg_latency_ms": recent_avg_latency_ms,
        "recent_count": recent_count_total,
        "requests_by_hour": requests_by_hour,
        "requests_by_day": requests_by_day,
        "recent": backend_data["recent"][-100:],
    }
402
+
403
+
404
+ def _atomic_write(path: str, payload: dict, indent: int | None = None) -> None:
405
+ """Write JSON atomically. Falls back to direct overwrite if rename fails
406
+ (some FUSE-backed mounts don't support rename within a dir)."""
407
+ text = json.dumps(payload, indent=indent, sort_keys=indent is not None)
408
+ tmp = path + ".tmp"
409
+ try:
410
+ with open(tmp, "w") as f:
411
+ f.write(text)
412
+ os.replace(tmp, path)
413
+ except OSError as exc:
414
+ print(f"[metrics_pusher] atomic rename failed for {path} ({exc}); writing in place", flush=True)
415
+ try:
416
+ with open(path, "w") as f:
417
+ f.write(text)
418
+ except OSError as exc2:
419
+ print(f"[metrics_pusher] direct write also failed for {path} ({exc2})", flush=True)
420
+ finally:
421
+ try:
422
+ os.unlink(tmp)
423
+ except OSError:
424
+ pass
425
+
426
+
427
def write_state(backend_data: dict) -> None:
    """Write the boot-recovery snapshot to ``STATE_PATH``.

    The snapshot holds the lifetime totals, the per-shape and per-variant
    totals, every per-day bucket in ``backend_data["by_day"]``, the last 100
    entries of ``recent``, the ``ip_pepper`` value and a ``saved_at``
    timestamp. Per-day ``unique_ips`` is converted to a list (sorted when it
    is a set) so it can be JSON-encoded.

    If ``STATE_DIR`` cannot be created the failure is logged and nothing is
    written.
    """

    def _bucket(src: dict) -> dict:
        # Counter triple shared by the shape / gpu / variant breakdowns.
        return {
            "count": src["count"],
            "duration_ms_total": src["duration_ms_total"],
            "queue_ms_total": src.get("queue_ms_total", 0),
        }

    days = {}
    for day_key, day in backend_data["by_day"].items():
        ip_entries = day["unique_ips"]
        days[day_key] = {
            "requests": day["requests"],
            "success": day["success"],
            "errors": day["errors"],
            "queue_ms_total": day.get("queue_ms_total", 0),
            "by_shape": {name: _bucket(b) for name, b in day["by_shape"].items()},
            "by_hour": list(day["by_hour"]),
            "unique_ips": sorted(ip_entries) if isinstance(ip_entries, set) else list(ip_entries),
            "by_gpu": {name: _bucket(b) for name, b in (day.get("by_gpu") or {}).items()},
            "by_variant": {name: _bucket(b) for name, b in (day.get("by_variant") or {}).items()},
        }

    # The lifetime per-shape totals carry no queue_ms_total in the snapshot.
    shape_totals = {}
    for shape_name, bucket in backend_data["by_shape"].items():
        shape_totals[shape_name] = {
            "count": bucket["count"],
            "duration_ms_total": bucket["duration_ms_total"],
        }

    snapshot = {
        "total_requests": backend_data["total_requests"],
        "success": backend_data["success"],
        "errors": backend_data["errors"],
        "by_shape": shape_totals,
        "by_variant": {name: _bucket(b) for name, b in backend_data["by_variant"].items()},
        "by_day": days,
        "recent": backend_data["recent"][-100:],
        "ip_pepper": backend_data.get("ip_pepper"),
        "saved_at": int(time.time()),
    }

    try:
        os.makedirs(STATE_DIR, exist_ok=True)
    except OSError as exc:
        print(f"[metrics_pusher] mkdir {STATE_DIR} failed ({exc}); skipping state write", flush=True)
        return
    _atomic_write(STATE_PATH, snapshot)
492
+
493
+
494
def write_daily_archives(backend_data: dict) -> None:
    """Write one indented JSON file per entry of ``backend_data["by_day"]``.

    Each file is named ``<date>.json`` under ``DAILY_DIR`` and contains the
    day's counters, the per-shape / per-gpu / per-variant breakdowns with
    integer (floor) averages, the hourly histogram and the unique-user count.

    Nothing is written when ``by_day`` is empty or when ``DAILY_DIR`` cannot
    be created (the latter is logged).
    """
    per_day = backend_data["by_day"]
    if not per_day:
        return
    try:
        os.makedirs(DAILY_DIR, exist_ok=True)
    except OSError as exc:
        print(f"[metrics_pusher] mkdir {DAILY_DIR} failed ({exc}); skipping daily writes", flush=True)
        return

    def _with_averages(buckets: dict) -> dict:
        # Copy each bucket's totals and add floor-divided averages; a bucket
        # with a zero count reports 0 for both averages.
        enriched = {}
        for label, bucket in buckets.items():
            n = bucket["count"]
            enriched[label] = {
                "count": n,
                "duration_ms_total": bucket["duration_ms_total"],
                "duration_ms_avg": bucket["duration_ms_total"] // n if n else 0,
                "queue_ms_total": bucket.get("queue_ms_total", 0),
                "queue_ms_avg": bucket.get("queue_ms_total", 0) // n if n else 0,
            }
        return enriched

    for date, day in per_day.items():
        shapes = _with_averages(day["by_shape"])
        gpus = _with_averages(day.get("by_gpu") or {})
        variants = _with_averages(day.get("by_variant") or {})

        ip_entries = day["unique_ips"]
        if isinstance(ip_entries, set):
            unique_users = len(ip_entries)
        else:
            unique_users = len(list(ip_entries))

        request_count = day["requests"]
        queue_total = day.get("queue_ms_total", 0)
        archive = {
            "date": date,
            "updated_at": int(time.time()),
            "requests": request_count,
            "success": day["success"],
            "errors": day["errors"],
            "queue_ms_total": queue_total,
            "queue_ms_avg": queue_total // request_count if request_count else 0,
            "unique_users": unique_users,
            "by_shape": shapes,
            "by_hour": list(day["by_hour"]),
            "by_gpu": gpus,
            "by_variant": variants,
        }
        _atomic_write(os.path.join(DAILY_DIR, f"{date}.json"), archive, indent=2)
553
+
554
+
555
def main() -> None:
    """Run the scrape-and-publish loop; never returns.

    Each tick fetches backend metrics and GPU stats, always rewrites the GPU
    stats file, and rewrites analytics.json only when at least one replica
    answered. Any exception raised inside a tick is logged and the loop
    continues after sleeping ``INTERVAL`` seconds.
    """
    print(
        f"[metrics_pusher] backends={BACKEND_URLS} interval={INTERVAL}s "
        f"state_dir={STATE_DIR} persistent_storage={PERSISTENT_STORAGE}",
        flush=True,
    )
    tick = 0
    # Number of ticks in a row on which no replica answered; only used for
    # the log lines below.
    consecutive_zero = 0
    while True:
        try:
            backend_data = fetch_backend_metrics()
            gpu_data = fetch_gpu_stats()
            # nvidia-smi runs locally and is independent of backend health,
            # so always refresh GPU stats.
            _atomic_write(GPU_PATH, gpu_data)

            if backend_data["replicas_seen"] == 0:
                # NO replicas answered /metrics this tick — usually means
                # they're all saturated. DON'T overwrite analytics.json
                # with zero-everywhere defaults; keep the prior file so
                # the dashboard stays meaningful. Updated_at age will
                # naturally drift to indicate staleness.
                consecutive_zero += 1
                print(
                    f"[metrics_pusher] tick {tick}: no replicas responded "
                    f"(consecutive={consecutive_zero}); keeping prior analytics.json",
                    flush=True,
                )
            else:
                if consecutive_zero > 0:
                    print(f"[metrics_pusher] backends recovered after {consecutive_zero} miss(es)", flush=True)
                consecutive_zero = 0
                analytics = build_analytics(backend_data)
                _atomic_write(ANALYTICS_PATH, analytics)
                # NOTE(review): indentation was lost in the reviewed copy;
                # the state/archive writes are assumed to live in this
                # healthy branch so an all-replicas-down tick cannot persist
                # empty data — confirm against the original file.
                if tick % STATE_WRITE_EVERY_N_TICKS == 0:
                    write_state(backend_data)
                    write_daily_archives(backend_data)
        except Exception as exc:
            # Best-effort loop: log the failure and try again next tick.
            print(f"[metrics_pusher] tick error: {exc}", flush=True)
        tick += 1
        time.sleep(INTERVAL)
596
+
597
+
598
# Start the polling loop only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    main()
space/nginx.conf ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# nginx config for the Bonsai-Image HF Space.
#   - :7860 is the only public port (HF exposes it).
#   - / and /api/* go to the Next.js frontend on :3000.
#   - /generate, /backends, /docs and /openapi.json go to one (or many)
#     uvicorn backends via the upstream block, which entrypoint.sh builds
#     from `nvidia-smi -L`. At N=1 it's just one server line; at N>1 we add
#     least_conn.
#   - /dash-<obfuscated> is the metrics dashboard, basic-auth gated.
#
# Run as: nginx -c /home/user/app/space/nginx.conf -p /home/user/app/

worker_processes 1;
daemon off;
pid /tmp/nginx.pid;
error_log /dev/stderr warn;

events {
    worker_connections 256;
}

http {
    default_type application/octet-stream;
    sendfile on;
    keepalive_timeout 65;

    # nginx's stock /var/log/... isn't writable by uid 1000 on the HF image,
    # so redirect everything into /tmp where we have write access.
    client_body_temp_path /tmp/nginx-body;
    proxy_temp_path /tmp/nginx-proxy;
    fastcgi_temp_path /tmp/nginx-fastcgi;
    uwsgi_temp_path /tmp/nginx-uwsgi;
    scgi_temp_path /tmp/nginx-scgi;
    access_log /tmp/nginx-access.log;

    # Only ask the frontend for a protocol upgrade when the client actually
    # sent an Upgrade header (WebSocket). Plain HTTP requests get nginx's
    # normal "Connection: close" towards the upstream instead of a bogus
    # "Connection: upgrade" with no Upgrade header.
    map $http_upgrade $connection_upgrade {
        default upgrade;
        ""      close;
    }

    # Built at boot by entrypoint.sh from `nvidia-smi -L` — one server line
    # per GPU. Today: one server at :8000.
    include /tmp/nginx-upstream.conf;

    server {
        listen 7860 default_server;
        client_max_body_size 16M;

        # ── frontend ────────────────────────────────────────────────────────
        location / {
            proxy_pass http://127.0.0.1:3000;
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection $connection_upgrade;
            proxy_set_header Host $host;
            # APPEND $remote_addr to existing X-Forwarded-For (the chain HF's
            # edge proxy already set with the real visitor IP). Using
            # $remote_addr alone would overwrite that with the edge proxy's
            # IP — same for all visitors — collapsing every user to one hash
            # in the dashboard's unique-user counter.
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            # Generations can run several seconds; Next.js streams the
            # response back so don't time the connection out.
            proxy_read_timeout 600s;
        }

        # ── backend API surface (called by Next.js api/generate route + curl) ─
        location ~ ^/(generate|backends|docs|openapi\.json)$ {
            proxy_pass http://bonsai_workers;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            # APPEND $remote_addr to existing X-Forwarded-For (the chain HF's
            # edge proxy already set with the real visitor IP). Using
            # $remote_addr alone would overwrite that with the edge proxy's
            # IP — same for all visitors — collapsing every user to one hash
            # in the dashboard's unique-user counter.
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_read_timeout 600s;
            proxy_buffering off;  # stream PNG bytes back immediately
        }

        # ── dashboard ───────────────────────────────────────────────────────
        # Obfuscated path + basic auth. Path suffix is in source (visible to
        # anyone with repo read access); auth is the actual gate.
        # Exact-match (`=`) locations serve only these three URLs; anything
        # else under /dash- falls through to the 404 catchall below.
        location = /dash-10a08e9c1ee4 {
            auth_basic "Bonsai Dashboard";
            auth_basic_user_file /tmp/.htpasswd;
            alias /home/user/app/space/dashboard.html;
            default_type text/html;
            add_header Cache-Control "no-store" always;
        }

        location = /dash-10a08e9c1ee4/analytics.json {
            auth_basic "Bonsai Dashboard";
            auth_basic_user_file /tmp/.htpasswd;
            alias /tmp/analytics.json;
            default_type application/json;
            add_header Cache-Control "no-store" always;
        }

        location = /dash-10a08e9c1ee4/gpu-stats.json {
            auth_basic "Bonsai Dashboard";
            auth_basic_user_file /tmp/.htpasswd;
            alias /tmp/gpu-stats.json;
            default_type application/json;
            add_header Cache-Control "no-store" always;
        }

        # Catchall under the dashboard prefix → 404 (don't reveal what else
        # might exist there).
        location ~ ^/dash- {
            return 404;
        }

        # /metrics on the backend is loopback-only; nginx doesn't forward it.
        # (metrics_pusher.py scrapes it directly at 127.0.0.1:8000/metrics.)
        location = /metrics {
            return 404;
        }
    }
}