Spaces:
Running on L40S
Running on L40S
Commit Β·
17895f4
0
Parent(s):
init
Browse files- .gitattributes +35 -0
- Dockerfile +61 -0
- README.md +33 -0
- space/__init__.py +0 -0
- space/app.py +483 -0
- space/dashboard.html +594 -0
- space/entrypoint.sh +323 -0
- space/metrics_pusher.py +599 -0
- space/nginx.conf +117 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# syntax=docker/dockerfile:1.6
# CUDA 12.8 runtime — gemlite/Triton kernels JIT against the runtime ptxas
# that comes with this image; no need for the larger -devel variant.
FROM nvidia/cuda:12.8.0-runtime-ubuntu22.04

# Run every RUN under bash with pipefail. Without it, a failed download in a
# `curl ... | sh` pipeline is masked by the exit status of the last command
# in the pipe and the build carries on with a half-installed tool.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# ── system deps ──────────────────────────────────────────────────────────────
# build-essential is needed because some sdists (gemlite among them) compile C
# extensions at install time. python3 is the host interpreter that bootstraps
# uv; uv then provisions its own pinned interpreter for the venv. nginx fronts
# everything on :7860 (frontend, backend API, dashboard).
# The chown targets uid/gid 1000 numerically, so it is valid even though the
# matching user is only created in the next step.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        libgomp1 \
        libssl3 \
        nginx \
        openssl \
        python3 \
        python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    && chown -R 1000:1000 /var/lib/nginx /var/log/nginx /run

# Non-root user: HF Spaces convention is uid 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user PATH="/home/user/.local/bin:$PATH"

# uv (Python venv + package manager). The demo's setup.sh assumes it's on PATH.
# pipefail (see SHELL above) makes a failed curl abort this step.
RUN curl -fsSL https://astral.sh/uv/install.sh | sh

WORKDIR /home/user/app

# ── clone + run setup.sh in one RUN so GH_TOKEN never lands in a layer ───────
# GH_TOKEN is supplied by `--mount=type=secret`; the secret file is only
# visible during this single RUN and is not stored in the image.
# The credential helper stored in ~/.gitconfig only contains the command that
# reads the secret file, never the token itself, and is removed again at the
# end of the step.
# SKIP_DOWNLOAD=1 keeps setup.sh from pulling the 3.5 GB model at build time
# — entrypoint.sh handles that at boot so a Space restart doesn't have to
# rebuild the image.
RUN --mount=type=secret,id=GH_TOKEN,uid=1000,required=true \
    git config --global credential.helper '!f() { echo "username=oauth2"; echo "password=$(cat /run/secrets/GH_TOKEN)"; }; f' \
    && git clone https://github.com/PrismML-Eng/Bonsai-image-demo.git . \
    && SKIP_DOWNLOAD=1 BONSAI_PACKAGE_MIN_AGE_DAYS=0 ./setup.sh \
    && git config --global --unset credential.helper

# ── pre-build the Next.js frontend ───────────────────────────────────────────
# Baking the build into the image so the first browser visit doesn't pay
# `npm install + next build` (~2 min) on top of model load. NEXT_PUBLIC_*
# vars are inlined at build time and don't change at runtime, so the
# backend URL (always loopback inside this container) is baked too.
# PATH is extended with the venv's bin directory for both npm invocations;
# the export is scoped to this RUN and does not leak into the runtime env.
WORKDIR /home/user/app/vendor/image-studio/frontend
RUN export PATH="$HOME/app/.venv/bin:$PATH" \
           NEXT_PUBLIC_BACKEND_URL=http://127.0.0.1:8000 \
    && npm install --no-audit --no-fund \
    && npm run build

# Back to the app root: it must remain the working directory for the COPY
# below and for the container's CMD.
WORKDIR /home/user/app

# ── Space-local files ────────────────────────────────────────────────────────
# All Space-specific code lives under space/ (Python package + sidecar +
# dashboard + nginx config + entrypoint). The demo's own code stays at the
# repo root (cloned earlier) so the two namespaces don't collide.
COPY --chown=user space/ /home/user/app/space/
RUN chmod +x /home/user/app/space/entrypoint.sh

EXPOSE 7860

CMD ["/home/user/app/space/entrypoint.sh"]
|
README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Bonsai Image (1-bit + 1.58-bit) GPU
|
| 3 |
+
emoji: 🎨
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
suggested_hardware: l40sx1
|
| 9 |
+
pinned: true
|
| 10 |
+
short_description: Run 1-bit and 1.58-bit Bonsai-Image-4B on GPU
|
| 11 |
+
models:
|
| 12 |
+
- prism-ml/bonsai-image-ternary-4B-mlx-2bit
|
| 13 |
+
- prism-ml/bonsai-image-ternary-4B-gemlite-2bit
|
| 14 |
+
- prism-ml/bonsai-image-ternary-4B-unpacked
|
| 15 |
+
- prism-ml/bonsai-image-binary-4B-mlx-1bit
|
| 16 |
+
- prism-ml/bonsai-image-binary-4B-gemlite-1bit
|
| 17 |
+
- prism-ml/bonsai-image-binary-4B-unpacked
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
# Bonsai Image Demo
|
| 21 |
+
|
| 22 |
+
- Ternary (1.58-bit)
|
| 23 |
+
- Binary (1-bit)
|
| 24 |
+
|
| 25 |
+
## Privacy
|
| 26 |
+
|
| 27 |
+
- **We do not log prompts or generated images.** Generation runs in-process and outputs are streamed back over HTTPS.
|
| 28 |
+
- The studio UI keeps your prompt history **in your browser's local storage only**. Clearing your browser cache erases it.
|
| 29 |
+
- Please do not submit sensitive, private, or confidential content in your prompts.
|
| 30 |
+
|
| 31 |
+
## Fair Use
|
| 32 |
+
|
| 33 |
+
Single-GPU demo, shared across all visitors. Heavy load may queue requests. Please avoid bursts of automated traffic so everyone can try it.
|
space/__init__.py
ADDED
|
File without changes
|
space/app.py
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HF Space wrapper around scripts.local_backend.
|
| 2 |
+
|
| 3 |
+
Adds a metrics middleware that:
|
| 4 |
+
- tracks total / success / error counters (cumulative since first launch)
|
| 5 |
+
- per-shape latency histogram (rolling)
|
| 6 |
+
- rolling 1000-request log with hashed-IP for unique-user count
|
| 7 |
+
- per-day buckets (UTC date) for the daily archives the metrics_pusher
|
| 8 |
+
sidecar writes under $BONSAI_STATE_DIR/daily/YYYY-MM-DD.json
|
| 9 |
+
|
| 10 |
+
State loaded at boot from $BONSAI_STATE_DIR/state.json so counters survive
|
| 11 |
+
Space restarts (assuming a persistent storage bucket is mounted; entrypoint
|
| 12 |
+
falls back to ephemeral disk otherwise).
|
| 13 |
+
|
| 14 |
+
Run with: uvicorn space.app:app
|
| 15 |
+
"""
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import asyncio
|
| 19 |
+
import hashlib
|
| 20 |
+
import json
|
| 21 |
+
import os
|
| 22 |
+
import time
|
| 23 |
+
from collections import defaultdict, deque
|
| 24 |
+
from datetime import datetime, timezone
|
| 25 |
+
from threading import Lock
|
| 26 |
+
|
| 27 |
+
from fastapi import Request
|
| 28 |
+
|
| 29 |
+
# Re-export the real backend's app object so /generate, /backends, /docs
|
| 30 |
+
# are served untouched.
|
| 31 |
+
from scripts.local_backend import app # noqa: F401 (re-exported)
|
| 32 |
+
|
| 33 |
+
# ── in-memory state ──────────────────────────────────────────────────────────
|
| 34 |
+
_lock = Lock()
|
| 35 |
+
_started_at = time.monotonic()
|
| 36 |
+
_total = {"requests": 0, "success": 0, "errors": 0}
|
| 37 |
+
_by_shape: dict[str, dict] = defaultdict(
|
| 38 |
+
lambda: {"count": 0, "duration_ms_total": 0, "durations": deque(maxlen=200)}
|
| 39 |
+
)
|
| 40 |
+
# Cumulative by-variant counter. The `variant` key is "ternary", "binary",
|
| 41 |
+
# or "unknown" (parsed from the request's `backend` field — see middleware).
|
| 42 |
+
# Mirrors by_shape's shape so the dashboard can show "ternary: X · binary: Y"
|
| 43 |
+
# across all time without re-summing the by_day history.
|
| 44 |
+
_by_variant: dict[str, dict] = defaultdict(
|
| 45 |
+
lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}
|
| 46 |
+
)
|
| 47 |
+
_recent: deque = deque(maxlen=1000)
|
| 48 |
+
|
| 49 |
+
# Per-day buckets keyed by UTC YYYY-MM-DD. Last 30 days kept in memory;
|
| 50 |
+
# older days remain on disk (metrics_pusher writes one file per day under
|
| 51 |
+
# $BONSAI_STATE_DIR/daily/).
|
| 52 |
+
_MAX_DAYS_IN_MEMORY = 30
|
| 53 |
+
_by_day: dict[str, dict] = {}
|
| 54 |
+
|
| 55 |
+
# UTC bucketing. (We tried Pacific Time, but `zoneinfo.ZoneInfo` needs
|
| 56 |
+
# /usr/share/zoneinfo/ which our CUDA Ubuntu base image strips with
|
| 57 |
+
# --no-install-recommends. To re-enable PT, install `tzdata` in the
|
| 58 |
+
# Dockerfile and swap these back to ZoneInfo("America/Los_Angeles").)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _today() -> str:
|
| 62 |
+
return datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _now_hour() -> int:
|
| 66 |
+
return datetime.now(timezone.utc).hour
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _empty_day() -> dict:
|
| 70 |
+
return {
|
| 71 |
+
"requests": 0,
|
| 72 |
+
"success": 0,
|
| 73 |
+
"errors": 0,
|
| 74 |
+
# queue_ms_total at three levels: day-total + per_shape + per_gpu.
|
| 75 |
+
# Day-total powers the dashboard's "today avg queue" tile; the
|
| 76 |
+
# per-shape and per-gpu views surface where queueing pressure is
|
| 77 |
+
# actually landing (e.g. is the slow GPU starving on small shapes?).
|
| 78 |
+
"by_shape": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
|
| 79 |
+
"by_hour": [0] * 24,
|
| 80 |
+
"unique_ips": set(),
|
| 81 |
+
# Per-GPU attribution for this day. Persisted to state.json +
|
| 82 |
+
# written into daily/YYYY-MM-DD.json so historical days retain
|
| 83 |
+
# their original GPU split even after a tier swap.
|
| 84 |
+
"by_gpu": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
|
| 85 |
+
# Per-variant attribution (ternary/binary/unknown). Tells you which
|
| 86 |
+
# arm took the traffic on this day independent of which GPU served
|
| 87 |
+
# it β useful for "did users actually click binary today, or are
|
| 88 |
+
# they all defaulting to ternary?" analysis.
|
| 89 |
+
"by_variant": defaultdict(lambda: {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}),
|
| 90 |
+
"queue_ms_total": 0,
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _bump_day(date: str, ok: bool, shape: str, dt_ms: int, queue_ms: int, hour: int, ip_hash: str, variant: str) -> None:
    """Record one finished request in the bucket for ``date``.

    Caller must hold ``_lock``. The day's bucket is created on first use, and
    once more than ``_MAX_DAYS_IN_MEMORY`` days are held the oldest ones
    (by sorted date key) are dropped from memory.
    """
    day = _by_day.get(date)
    if day is None:
        day = _by_day[date] = _empty_day()

    def _add(bucket: dict) -> None:
        # Each breakdown entry carries the same three running totals.
        bucket["count"] += 1
        bucket["duration_ms_total"] += dt_ms
        bucket["queue_ms_total"] += queue_ms

    day["requests"] += 1
    day["success" if ok else "errors"] += 1
    _add(day["by_shape"][shape])
    day["by_hour"][hour] += 1
    day["unique_ips"].add(ip_hash)
    _add(day["by_gpu"][_GPU_NAME])
    _add(day["by_variant"][variant])
    day["queue_ms_total"] += queue_ms

    if len(_by_day) > _MAX_DAYS_IN_MEMORY:
        # Date keys are YYYY-MM-DD, so lexical order is chronological order.
        expired = sorted(_by_day)[:-_MAX_DAYS_IN_MEMORY]
        for old_date in expired:
            del _by_day[old_date]
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ββ persisted state ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 122 |
+
# $BONSAI_STATE_DIR is set by entrypoint.sh — /data/state if a persistent
|
| 123 |
+
# storage bucket is mounted, else $APP_DIR/outputs/.state (ephemeral).
|
| 124 |
+
_STATE_DIR = os.environ.get("BONSAI_STATE_DIR", "/tmp")
|
| 125 |
+
_STATE_PATH = os.path.join(_STATE_DIR, "state.json")
|
| 126 |
+
# entrypoint.sh sets this to "1" when /data is mounted + writable, else "0".
|
| 127 |
+
# Surfaced to the dashboard so it can show a "counters won't persist" warning.
|
| 128 |
+
_PERSISTENT_STORAGE = os.environ.get("BONSAI_PERSISTENT_STORAGE", "0") == "1"
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _load_state() -> dict:
    """Load persisted metrics from ``_STATE_PATH``, tolerating a damaged file.

    Returns:
        A dict with the keys ``pepper`` (bytes), ``totals``, ``by_shape``,
        ``by_variant``, ``recent`` (list) and ``by_day``. Fresh defaults
        (zero counters, a newly generated random pepper) are returned when
        the file is missing, unreadable, not valid UTF-8 or not valid JSON.

    Never raises for a bad state file: a malformed top-level field stops
    parsing at that point and keeps everything parsed before it, and a
    malformed single day is skipped without discarding the other days.
    """
    fresh = {
        "pepper": os.urandom(16).hex().encode(),
        "totals": {"requests": 0, "success": 0, "errors": 0},
        "by_shape": {},
        "by_variant": {},  # parallel to by_shape; may be missing in old state files
        "recent": [],
        "by_day": {},
    }

    def _counts(raw: dict) -> dict:
        """Coerce one persisted count/duration/queue bucket to ints (missing keys -> 0)."""
        return {
            "count": int(raw.get("count", 0)),
            "duration_ms_total": int(raw.get("duration_ms_total", 0)),
            "queue_ms_total": int(raw.get("queue_ms_total", 0)),
        }

    try:
        with open(_STATE_PATH, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (a file containing
        # bytes that are not valid UTF-8), so a corrupt file cannot crash
        # the module at import time.
        print(f"[space.app] no prior state ({type(exc).__name__}: {exc}); starting fresh", flush=True)
        return fresh

    try:
        if not isinstance(data, dict):
            raise TypeError(f"top-level JSON is {type(data).__name__}, expected an object")

        fresh["pepper"] = (data.get("ip_pepper") or fresh["pepper"].decode()).encode()
        fresh["totals"] = {
            "requests": int(data.get("total_requests", 0)),
            "success": int(data.get("success", 0)),
            "errors": int(data.get("errors", 0)),
        }
        fresh["by_shape"] = {
            shape: {
                "count": int(b.get("count", 0)),
                "duration_ms_total": int(b.get("duration_ms_total", 0)),
                "durations": deque(maxlen=200),  # p50/p95 starts fresh after a boot
            }
            for shape, b in (data.get("by_shape") or {}).items()
        }
        # by_variant has no `durations` deque: only cumulative count,
        # duration and queue totals are kept.
        fresh["by_variant"] = {
            variant: _counts(b)
            for variant, b in (data.get("by_variant") or {}).items()
        }
        # The module slices this value right after loading, so anything that
        # is not a list is replaced by an empty one.
        recent = data.get("recent")
        fresh["recent"] = recent if isinstance(recent, list) else []

        # Per-day buckets are stored into the result one by one, so a bad
        # day only costs that day.
        by_day_loaded: dict[str, dict] = fresh["by_day"]
        for date, d in (data.get("by_day") or {}).items():
            try:
                bd = _empty_day()
                bd["requests"] = int(d.get("requests", 0))
                bd["success"] = int(d.get("success", 0))
                bd["errors"] = int(d.get("errors", 0))
                # queue_ms_total fields default to 0 for state files that
                # were persisted without them.
                bd["queue_ms_total"] = int(d.get("queue_ms_total", 0))
                for shape, s in (d.get("by_shape") or {}).items():
                    bd["by_shape"][shape] = _counts(s)
                # Force exactly 24 integer slots: _bump_day indexes this
                # list by hour and increments the entry.
                hours = [int(x) for x in list(d.get("by_hour") or [])[:24]]
                bd["by_hour"] = hours + [0] * (24 - len(hours))
                bd["unique_ips"] = set(d.get("unique_ips") or [])
                for gpu_name, g in (d.get("by_gpu") or {}).items():
                    bd["by_gpu"][gpu_name] = _counts(g)
                for variant_name, v in (d.get("by_variant") or {}).items():
                    bd["by_variant"][variant_name] = _counts(v)
            except Exception as exc:
                print(f"[space.app] skipping malformed day {date!r} ({type(exc).__name__}: {exc})", flush=True)
                continue
            by_day_loaded[date] = bd
    except Exception as exc:
        print(f"[space.app] state file partially malformed ({exc}); using what we could parse", flush=True)
    return fresh
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# ββ replica gating for multi-GPU deploys βββββββββββββββββββββββββββββββββββββ
|
| 215 |
+
# Each uvicorn process (one per GPU) sets BONSAI_REPLICA_INDEX via entrypoint.
|
| 216 |
+
# Only replica 0 seeds its in-memory counters from state.json β other
|
| 217 |
+
# replicas start at zero. metrics_pusher polls every replica and sums them,
|
| 218 |
+
# so this avoids N-way inflation of cumulative counts. Pepper comes from
|
| 219 |
+
# the env (set by entrypoint), shared across all replicas so unique-user
|
| 220 |
+
# hashing is consistent.
|
| 221 |
+
_REPLICA_INDEX = int(os.environ.get("BONSAI_REPLICA_INDEX", "0"))
|
| 222 |
+
# Name of the GPU this replica is pinned to (entrypoint sets it from
|
| 223 |
+
# `nvidia-smi --query-gpu=name`). Exposed in /metrics so the pusher can
|
| 224 |
+
# aggregate per-GPU averages on the dashboard. When the variable is unset
|
| 225 |
+
# or blank, the "NVIDIA L40S" default described below applies.
|
| 226 |
+
# Default to NVIDIA L40S if entrypoint didn't supply a name β that's the
|
| 227 |
+
# tier we ran on for most of the demo's history, so unattributed counters
|
| 228 |
+
# get folded into the L40S bucket rather than a misleading "unknown".
|
| 229 |
+
_GPU_NAME = os.environ.get("BONSAI_GPU_NAME", "").strip() or "NVIDIA L40S"
|
| 230 |
+
_loaded = _load_state()
|
| 231 |
+
if _REPLICA_INDEX == 0:
|
| 232 |
+
_total.update(_loaded["totals"])
|
| 233 |
+
for _s, _b in _loaded["by_shape"].items():
|
| 234 |
+
_by_shape[_s] = _b
|
| 235 |
+
for _v, _b in _loaded["by_variant"].items():
|
| 236 |
+
_by_variant[_v] = _b
|
| 237 |
+
for _r in _loaded["recent"][-1000:]:
|
| 238 |
+
_recent.append(_r)
|
| 239 |
+
_by_day.update(_loaded["by_day"])
|
| 240 |
+
print(
|
| 241 |
+
f"[space.app] replica 0: seeded counters from {_STATE_PATH} "
|
| 242 |
+
f"(requests={_total['requests']} days={len(_by_day)} "
|
| 243 |
+
f"persistent_storage={_PERSISTENT_STORAGE})",
|
| 244 |
+
flush=True,
|
| 245 |
+
)
|
| 246 |
+
else:
|
| 247 |
+
print(
|
| 248 |
+
f"[space.app] replica {_REPLICA_INDEX}: starting counters at 0 "
|
| 249 |
+
f"(replica 0 owns cumulative state)",
|
| 250 |
+
flush=True,
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
# Pepper: prefer env (entrypoint exports a single value for all replicas).
# Fall back to whatever _load_state surfaced (typically random on first
# launch) — fine for single-replica or testing.
# `or` is used instead of a .get() default so that a BONSAI_IP_PEPPER that is
# set but empty also falls back; otherwise IP hashes would be computed with
# an empty pepper, i.e. effectively unsalted.
_IP_PEPPER = (os.environ.get("BONSAI_IP_PEPPER") or _loaded["pepper"].decode()).encode()
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def _hash_ip(ip: str) -> str:
    """Return the first 12 hex characters of SHA-256(pepper + identity).

    ``ip`` is whatever identity string the caller supplies; it is encoded
    with the default ``str.encode()`` codec and prefixed with ``_IP_PEPPER``
    before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(_IP_PEPPER)
    hasher.update(ip.encode())
    full_hex = hasher.hexdigest()
    return full_hex[:12]
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# Concurrency cap per replica. Image-gen is compute-bound; two concurrent
|
| 264 |
+
# requests at one GPU just contend for the same SMs and serialize at the
|
| 265 |
+
# kernel-launch level, wasting time. With Semaphore(1), additional requests
|
| 266 |
+
# queue at the asyncio level, and nginx's least_conn sees them as "this
|
| 267 |
+
# replica is busy" β routes to a free GPU when one's available.
|
| 268 |
+
_GENERATE_CONCURRENCY = int(os.environ.get("BONSAI_GENERATE_CONCURRENCY", "1"))
|
| 269 |
+
_generate_sem = asyncio.Semaphore(_GENERATE_CONCURRENCY)
|
| 270 |
+
|
| 271 |
+
# In-flight gauge. Incremented when a /generate request enters the middleware
|
| 272 |
+
# (before semaphore acquire β so queued requests count), decremented in
|
| 273 |
+
# finally. metrics_pusher sums across replicas and derives queue depth as
|
| 274 |
+
# max(0, total_inflight - total_concurrency).
|
| 275 |
+
_inflight = 0
|
| 276 |
+
_inflight_lock = Lock()
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# ββ middleware βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 280 |
+
@app.middleware("http")
async def _track_generate(request: Request, call_next):
    """Meter and rate-limit ``POST /generate``; pass everything else through.

    For a generate request the middleware:
      1. reads the JSON body (and replays it for the downstream handler) to
         derive the ``WxH`` shape and the model variant,
      2. derives a hashed caller identity for unique-user counting,
      3. waits on ``_generate_sem`` so only a bounded number of requests per
         replica run at once, measuring the time spent waiting,
      4. records the outcome in the cumulative, recent and per-day counters.

    Exceptions raised by the downstream handler are counted as errors and
    re-raised unchanged.
    """
    global _inflight

    if request.url.path != "/generate" or request.method != "POST":
        return await call_next(request)

    # Read + replay the body so the downstream handler still sees it.
    body = await request.body()

    async def _receive() -> dict:
        return {"type": "http.request", "body": body, "more_body": False}

    request._receive = _receive  # type: ignore[attr-defined]

    shape = "unknown"
    # variant: "ternary" / "binary" / "unknown", parsed from the request's
    # `backend` field (values look like "bonsai-ternary-gemlite" or
    # "bonsai-binary-mlx"). When the client omits backend we attribute the
    # request to "ternary"; per the original author's note that mirrors the
    # default arm configured in entrypoint.sh.
    variant = "ternary"
    try:
        payload = json.loads(body or b"{}")
    except Exception:
        payload = None  # unparseable body: keep the defaults above
    if isinstance(payload, dict):
        # Shape and variant are parsed independently, so a non-numeric or
        # null width/height no longer prevents `backend` from being read
        # (previously such a request was always attributed to "ternary").
        try:
            w, h = int(payload.get("width", 0)), int(payload.get("height", 0))
            if w and h:
                shape = f"{w}x{h}"
        except Exception:
            pass  # best-effort: the shape stays "unknown"
        backend = payload.get("backend") or ""
        if isinstance(backend, str):
            backend = backend.lower()
            if "ternary" in backend:
                variant = "ternary"
            elif "binary" in backend:
                variant = "binary"
            elif backend:
                variant = "unknown"
        # A missing or non-string backend keeps the default "ternary".

    # Identity for unique-user counting. Preference order:
    #   1. X-IP-Token header (per the original note, set by HF for
    #      logged-in visitors and tied to their HF session).
    #   2. First entry of X-Forwarded-For (client IP as forwarded by the
    #      proxy in front of this app).
    #   3. request.client.host, or "0.0.0.0" when no client info exists.
    # The "hf:" / "ip:" prefix keeps the two namespaces from colliding.
    hf_token = request.headers.get("x-ip-token")
    if hf_token:
        identity = f"hf:{hf_token}"
    else:
        forwarded = request.headers.get("x-forwarded-for")
        ip = forwarded.split(",")[0].strip() if forwarded else (request.client.host if request.client else "0.0.0.0")
        identity = f"ip:{ip}"
    ip_hash = _hash_ip(identity)

    # Day/hour attribution is fixed at arrival time, not completion time.
    date = _today()
    hour = _now_hour()

    # Increment the in-flight gauge BEFORE the semaphore so queued requests
    # are counted too. The decrement lives in `finally` so the gauge stays
    # accurate on exceptions.
    t_enqueue = time.monotonic()
    with _inflight_lock:
        _inflight += 1
    try:
        # Queue at the semaphore so only N requests per replica run at once.
        # The HTTP connection stays open while waiting; per the original
        # note this lets nginx's least_conn treat the replica as busy.
        async with _generate_sem:
            t_start = time.monotonic()
            queue_ms = int((t_start - t_enqueue) * 1000)

            def _record(ok: bool, with_shape_histogram: bool) -> None:
                """Fold one finished request into every counter under _lock."""
                dt_ms = int((time.monotonic() - t_start) * 1000)
                with _lock:
                    _total["requests"] += 1
                    _total["success" if ok else "errors"] += 1
                    if with_shape_histogram:
                        bucket = _by_shape[shape]
                        bucket["count"] += 1
                        bucket["duration_ms_total"] += dt_ms
                        bucket["durations"].append(dt_ms)
                    _by_variant[variant]["count"] += 1
                    _by_variant[variant]["duration_ms_total"] += dt_ms
                    _by_variant[variant]["queue_ms_total"] += queue_ms
                    _recent.append({"ts": int(time.time()), "shape": shape, "duration_ms": dt_ms, "queue_ms": queue_ms, "ip_hash": ip_hash, "gpu": _GPU_NAME, "variant": variant, "ok": ok})
                    _bump_day(date, ok, shape, dt_ms, queue_ms, hour, ip_hash, variant)

            try:
                response = await call_next(request)
            except Exception:
                # NOTE(review): as in the original code, requests that raise
                # are left out of the cumulative per-shape histogram, while
                # >=400 responses are included — confirm this is intended.
                _record(False, with_shape_histogram=False)
                raise

            _record(response.status_code < 400, with_shape_histogram=True)
            return response
    finally:
        with _inflight_lock:
            _inflight -= 1
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
# ββ /metrics endpoint (loopback-only via nginx) ββββββββββββββββββββββββββββββ
|
| 390 |
+
def _percentile(xs: list[int], p: int) -> int | None:
|
| 391 |
+
if not xs:
|
| 392 |
+
return None
|
| 393 |
+
s = sorted(xs)
|
| 394 |
+
idx = min(int(len(s) * p / 100), len(s) - 1)
|
| 395 |
+
return s[idx]
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
@app.get("/metrics")
def get_metrics() -> dict:
    """Scraped by metrics_pusher every few seconds. Returns the full in-memory
    state so the sidecar can rebuild analytics.json + write daily archives.

    The payload carries replica identity (index, GPU name, uptime), the
    current in-flight count, lifetime request totals, per-shape and
    per-variant aggregates, the per-day buckets, the recent-request log, and
    the storage settings (IP pepper, persistent-storage flag, state dir).
    """
    # NOTE(review): the nesting under `_lock` below was reconstructed from a
    # copy of this file whose indentation was lost - confirm the lock scope.
    with _lock:
        # Lifetime per-shape stats; p50/p95 are computed from a snapshot of
        # the durations recorded in each shape bucket.
        by_shape = {}
        for shape, b in _by_shape.items():
            durs = list(b["durations"])
            by_shape[shape] = {
                "count": b["count"],
                "duration_ms_total": b["duration_ms_total"],
                "duration_ms_p50": _percentile(durs, 50),
                "duration_ms_p95": _percentile(durs, 95),
            }

        # Per-day buckets, copied into plain dicts/lists for the response.
        by_day_out = {}
        for date, d in _by_day.items():
            by_day_out[date] = {
                "requests": d["requests"],
                "success": d["success"],
                "errors": d["errors"],
                # queue_ms_total exposed at all three levels (day + per-shape +
                # per-gpu) so the pusher can compute today's average queue at
                # arbitrary slicing without re-summing recent[].
                "queue_ms_total": d.get("queue_ms_total", 0),
                "by_shape": {
                    s: {
                        "count": b["count"],
                        "duration_ms_total": b["duration_ms_total"],
                        "queue_ms_total": b.get("queue_ms_total", 0),
                    }
                    for s, b in d["by_shape"].items()
                },
                "by_hour": list(d["by_hour"]),
                "unique_users": len(d["unique_ips"]),
                "unique_ips": list(d["unique_ips"]),  # for round-trip persistence
                "by_gpu": {
                    g: {
                        "count": v["count"],
                        "duration_ms_total": v["duration_ms_total"],
                        "queue_ms_total": v.get("queue_ms_total", 0),
                    }
                    for g, v in d["by_gpu"].items()
                },
                # Day buckets may lack "by_variant"; it then serializes as {}.
                "by_variant": {
                    v: {
                        "count": b["count"],
                        "duration_ms_total": b["duration_ms_total"],
                        "queue_ms_total": b.get("queue_ms_total", 0),
                    }
                    for v, b in d.get("by_variant", {}).items()
                },
            }

        # The in-flight counter is guarded by its own lock.
        with _inflight_lock:
            inflight = _inflight
        # Replica's own cumulative duration sum (sum across all shapes).
        # Used by the pusher to compute per-GPU avg latency without
        # rebuilding it from `recent` (which would lose history).
        total_duration_ms = sum(b["duration_ms_total"] for b in _by_shape.values())
        return {
            "uptime_s": int(time.monotonic() - _started_at),
            "replica_index": _REPLICA_INDEX,
            "gpu_name": _GPU_NAME,
            "inflight": inflight,
            "generate_concurrency": _GENERATE_CONCURRENCY,
            "total_requests": _total["requests"],
            "success": _total["success"],
            "errors": _total["errors"],
            "total_duration_ms": total_duration_ms,
            "by_shape": by_shape,
            "by_variant": {
                v: {
                    "count": b["count"],
                    "duration_ms_total": b["duration_ms_total"],
                    "queue_ms_total": b.get("queue_ms_total", 0),
                }
                for v, b in _by_variant.items()
            },
            "by_day": by_day_out,
            "recent": list(_recent),
            # NOTE(review): the IP-hash pepper is returned in the response
            # body; the section header says this endpoint is loopback-only
            # via nginx - confirm it is never reachable from outside.
            "ip_pepper": _IP_PEPPER.decode(),
            "persistent_storage": _PERSISTENT_STORAGE,
            "state_dir": _STATE_DIR,
        }
|
space/dashboard.html
ADDED
|
@@ -0,0 +1,594 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
+
<title>Bonsai-Image Dashboard</title>
|
| 7 |
+
<script src="https://cdn.jsdelivr.net/npm/chart.js@4.4.4/dist/chart.umd.min.js"></script>
|
| 8 |
+
<style>
|
| 9 |
+
:root {
|
| 10 |
+
--bg: #0e1116;
|
| 11 |
+
--panel: #161b22;
|
| 12 |
+
--panel-border: #1f2630;
|
| 13 |
+
--text: #d7dde6;
|
| 14 |
+
--muted: #7d8694;
|
| 15 |
+
--accent: #4cb583;
|
| 16 |
+
--warn: #d97757;
|
| 17 |
+
--grid: #21272f;
|
| 18 |
+
}
|
| 19 |
+
* { box-sizing: border-box; }
|
| 20 |
+
body {
|
| 21 |
+
margin: 0; padding: 24px;
|
| 22 |
+
font: 14px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
| 23 |
+
background: var(--bg); color: var(--text);
|
| 24 |
+
}
|
| 25 |
+
header { display: flex; align-items: baseline; justify-content: space-between; margin-bottom: 20px; }
|
| 26 |
+
h1 { margin: 0; font-size: 18px; font-weight: 600; }
|
| 27 |
+
.subtitle { color: var(--muted); font-size: 12px; }
|
| 28 |
+
.grid { display: grid; gap: 16px; grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); margin-bottom: 16px; }
|
| 29 |
+
.card { background: var(--panel); border: 1px solid var(--panel-border); border-radius: 8px; padding: 16px; }
|
| 30 |
+
.card h2 { margin: 0 0 12px 0; font-size: 12px; font-weight: 600; color: var(--muted); text-transform: uppercase; letter-spacing: 0.05em; }
|
| 31 |
+
.metric { font-size: 28px; font-weight: 600; line-height: 1; }
|
| 32 |
+
.metric-sub { color: var(--muted); font-size: 12px; margin-top: 6px; }
|
| 33 |
+
.row { display: grid; gap: 16px; grid-template-columns: 2fr 1fr; margin-bottom: 16px; }
|
| 34 |
+
.row.single { grid-template-columns: 1fr; }
|
| 35 |
+
.row.equal { grid-template-columns: 1fr 1fr; }
|
| 36 |
+
.row.three { grid-template-columns: 1fr 1fr 1fr; }
|
| 37 |
+
@media (max-width: 1100px) { .row.three { grid-template-columns: 1fr 1fr; } }
|
| 38 |
+
@media (max-width: 900px) { .row, .row.equal, .row.three { grid-template-columns: 1fr; } }
|
| 39 |
+
/* Replica pills: one chip per active uvicorn worker, color-coded by GPU
|
| 40 |
+
tier so a glance at the Replicas tile shows mixed vs homogeneous fleets. */
|
| 41 |
+
.replicas { display: flex; flex-wrap: wrap; gap: 6px; margin-top: 8px; }
|
| 42 |
+
.replica-pill { display: inline-flex; align-items: center; gap: 4px; font-size: 11px; padding: 2px 8px; border-radius: 999px; background: var(--grid); }
|
| 43 |
+
.replica-pill .dot { width: 6px; height: 6px; border-radius: 50%; background: var(--accent); }
|
| 44 |
+
.status-warn { color: var(--warn); }
|
| 45 |
+
canvas { max-width: 100%; }
|
| 46 |
+
table { width: 100%; border-collapse: collapse; font-size: 12px; }
|
| 47 |
+
th, td { padding: 6px 8px; text-align: left; border-bottom: 1px solid var(--grid); }
|
| 48 |
+
th { color: var(--muted); font-weight: 500; }
|
| 49 |
+
th.num, td.num { text-align: right; font-variant-numeric: tabular-nums; }
|
| 50 |
+
.status-ok { color: var(--accent); }
|
| 51 |
+
.status-err { color: var(--warn); }
|
| 52 |
+
footer { color: var(--muted); font-size: 11px; margin-top: 20px; text-align: right; }
|
| 53 |
+
.gpu-bar { background: var(--grid); border-radius: 4px; height: 6px; overflow: hidden; margin-top: 4px; }
|
| 54 |
+
.gpu-bar-fill { background: var(--accent); height: 100%; transition: width 0.3s; }
|
| 55 |
+
.banner { display: none; padding: 12px 16px; border-radius: 8px; margin-bottom: 16px; font-size: 13px; line-height: 1.5; border: 1px solid; }
|
| 56 |
+
.banner.warn { background: rgba(217, 119, 87, 0.08); border-color: rgba(217, 119, 87, 0.4); color: #e8a280; }
|
| 57 |
+
.banner.error { background: rgba(217, 119, 87, 0.15); border-color: rgba(217, 119, 87, 0.5); color: #f0a890; }
|
| 58 |
+
</style>
|
| 59 |
+
</head>
|
| 60 |
+
<body>
|
| 61 |
+
|
| 62 |
+
<header>
|
| 63 |
+
<div>
|
| 64 |
+
<h1>🌿 Bonsai-Image Dashboard</h1>
|
| 65 |
+
<div class="subtitle" id="updated">loading...</div>
|
| 66 |
+
</div>
|
| 67 |
+
<div class="subtitle" id="refresh-label">auto-refresh every 2s</div>
|
| 68 |
+
</header>
|
| 69 |
+
|
| 70 |
+
<div id="storage-banner" class="banner warn"></div>
|
| 71 |
+
<div id="stale-banner" class="banner warn"></div>
|
| 72 |
+
<div id="error-banner" class="banner error"></div>
|
| 73 |
+
|
| 74 |
+
<div class="grid">
|
| 75 |
+
<div class="card"><h2>Total images</h2><div class="metric" id="total-requests">β</div><div class="metric-sub" id="total-sub">β ok / β errors</div></div>
|
| 76 |
+
<div class="card"><h2>Today (UTC)</h2><div class="metric" id="req-today">β</div><div class="metric-sub" id="users-today">β unique users</div></div>
|
| 77 |
+
<div class="card"><h2>Last 7 days</h2><div class="metric" id="req-7d">β</div><div class="metric-sub" id="users-7d">β unique users</div></div>
|
| 78 |
+
<div class="card"><h2>Last 30 days</h2><div class="metric" id="req-30d">β</div><div class="metric-sub" id="users-30d">β unique users</div></div>
|
| 79 |
+
<div class="card"><h2>Pending</h2><div class="metric" id="pending">β</div><div class="metric-sub" id="pending-sub">β running / β capacity</div></div>
|
| 80 |
+
<div class="card">
|
| 81 |
+
<h2>Replicas</h2>
|
| 82 |
+
<div class="metric" id="replicas-metric">β</div>
|
| 83 |
+
<div class="metric-sub" id="replicas-sub">β</div>
|
| 84 |
+
<div class="replicas" id="replicas-pills"></div>
|
| 85 |
+
</div>
|
| 86 |
+
<div class="card">
|
| 87 |
+
<h2>By Variant</h2>
|
| 88 |
+
<div class="metric" id="variant-metric">β</div>
|
| 89 |
+
<div class="metric-sub" id="variant-sub">all-time Β· β today</div>
|
| 90 |
+
</div>
|
| 91 |
+
<div class="card"><h2>Uptime</h2><div class="metric" id="uptime">β</div><div class="metric-sub">since last restart</div></div>
|
| 92 |
+
</div>
|
| 93 |
+
|
| 94 |
+
<!-- Row: both charts side-by-side. Daily covers 30d, hourly covers today. -->
|
| 95 |
+
<div class="row equal">
|
| 96 |
+
<div class="card">
|
| 97 |
+
<h2>Requests per day (last 30d)</h2>
|
| 98 |
+
<canvas id="daily-chart" height="80"></canvas>
|
| 99 |
+
</div>
|
| 100 |
+
<div class="card">
|
| 101 |
+
<h2>Today's requests by hour (UTC)</h2>
|
| 102 |
+
<canvas id="hourly-chart" height="80"></canvas>
|
| 103 |
+
</div>
|
| 104 |
+
</div>
|
| 105 |
+
|
| 106 |
+
<!-- Row: image-time stats. Three views of latency: rolling 50, all-time
|
| 107 |
+
per-resolution, today per-resolution. Same column shape so eye can
|
| 108 |
+
scan leftβright and spot drift. -->
|
| 109 |
+
<div class="row three">
|
| 110 |
+
<div class="card">
|
| 111 |
+
<h2>Average latency (last 50 requests)</h2>
|
| 112 |
+
<div style="display: flex; align-items: baseline; gap: 16px; margin-bottom: 12px;">
|
| 113 |
+
<div class="metric" id="avg-latency">β</div>
|
| 114 |
+
<div class="metric-sub" id="avg-latency-sub">across last β requests</div>
|
| 115 |
+
</div>
|
| 116 |
+
<table>
|
| 117 |
+
<thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg latency</th><th class="num">Avg queue</th></tr></thead>
|
| 118 |
+
<tbody id="latency-tbody"></tbody>
|
| 119 |
+
</table>
|
| 120 |
+
</div>
|
| 121 |
+
<div class="card">
|
| 122 |
+
<h2>By resolution (all-time)</h2>
|
| 123 |
+
<table>
|
| 124 |
+
<thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg time</th></tr></thead>
|
| 125 |
+
<tbody id="shape-tbody"></tbody>
|
| 126 |
+
</table>
|
| 127 |
+
</div>
|
| 128 |
+
<div class="card">
|
| 129 |
+
<h2>By resolution (today)</h2>
|
| 130 |
+
<table>
|
| 131 |
+
<thead><tr><th>Resolution</th><th class="num">Count</th><th class="num">Avg time</th><th class="num">Avg queue</th></tr></thead>
|
| 132 |
+
<tbody id="shape-today-tbody"></tbody>
|
| 133 |
+
</table>
|
| 134 |
+
</div>
|
| 135 |
+
</div>
|
| 136 |
+
|
| 137 |
+
<!-- Row: GPU stats. Live nvidia-smi snapshot, today's per-GPU breakdown,
|
| 138 |
+
all-time per-GPU breakdown. Lets you spot tier mix today vs total. -->
|
| 139 |
+
<div class="row three">
|
| 140 |
+
<div class="card">
|
| 141 |
+
<h2>GPUs (live)</h2>
|
| 142 |
+
<div id="gpus"></div>
|
| 143 |
+
</div>
|
| 144 |
+
<div class="card">
|
| 145 |
+
<h2>By GPU (today)</h2>
|
| 146 |
+
<table>
|
| 147 |
+
<thead><tr><th>GPU</th><th class="num">Count</th><th class="num">Avg latency</th><th class="num">Avg queue</th></tr></thead>
|
| 148 |
+
<tbody id="gpu-today-tbody"></tbody>
|
| 149 |
+
</table>
|
| 150 |
+
</div>
|
| 151 |
+
<div class="card">
|
| 152 |
+
<h2>By GPU (all-time)</h2>
|
| 153 |
+
<table>
|
| 154 |
+
<thead><tr><th>GPU</th><th class="num">Reps</th><th class="num">Count</th><th class="num">Errors</th><th class="num">Avg latency</th></tr></thead>
|
| 155 |
+
<tbody id="gpu-tbody"></tbody>
|
| 156 |
+
</table>
|
| 157 |
+
</div>
|
| 158 |
+
</div>
|
| 159 |
+
|
| 160 |
+
<div class="row single">
|
| 161 |
+
<div class="card">
|
| 162 |
+
<h2>Recent requests (last 50)</h2>
|
| 163 |
+
<table>
|
| 164 |
+
<thead><tr><th>Time</th><th>Shape</th><th>GPU</th><th class="num">Queued</th><th class="num">Duration</th><th>User</th><th>Status</th></tr></thead>
|
| 165 |
+
<tbody id="recent-tbody"></tbody>
|
| 166 |
+
</table>
|
| 167 |
+
</div>
|
| 168 |
+
</div>
|
| 169 |
+
|
| 170 |
+
<footer id="footer">β</footer>
|
| 171 |
+
|
| 172 |
+
<script>
|
| 173 |
+
// Absolute paths — the dashboard URL has no trailing slash, so relative
|
| 174 |
+
// `analytics.json` would resolve to `/analytics.json` (wrong) rather than
|
| 175 |
+
// `/dash-…/analytics.json`. nginx has explicit location blocks for these.
|
| 176 |
+
const ANALYTICS_URL = "/dash-10a08e9c1ee4/analytics.json";
|
| 177 |
+
const GPU_URL = "/dash-10a08e9c1ee4/gpu-stats.json";
|
| 178 |
+
|
| 179 |
+
/**
 * Format a duration given in seconds as a compact two-unit string.
 *
 * @param {number} s Duration in seconds.
 * @returns {string} "Xd Yh", "Xh Ym" or "Xm"; an em dash for falsy input
 *   (0, null, undefined, NaN).
 */
function fmtDuration(s) {
  // Placeholder was a mis-encoded character; restored to an em dash.
  if (!s) return "—";
  const days = Math.floor(s / 86400);
  const hours = Math.floor((s % 86400) / 3600);
  const mins = Math.floor((s % 3600) / 60);
  if (days) return `${days}d ${hours}h`;
  if (hours) return `${hours}h ${mins}m`;
  return `${mins}m`;
}
|
| 188 |
+
/**
 * Format a Unix timestamp (seconds) in the viewer's locale.
 *
 * @param {number} ts Seconds since the epoch.
 * @returns {string} Locale date-time string, or an em dash for falsy input.
 */
function fmtTime(ts) {
  // Placeholder was a mis-encoded character; restored to an em dash.
  if (!ts) return "—";
  return new Date(ts * 1000).toLocaleString();
}
|
| 192 |
+
/**
 * Describe how long ago a Unix timestamp (seconds) was, e.g. "12s ago".
 *
 * @param {number} ts Seconds since the epoch.
 * @returns {string} Age in the largest fitting unit (s / m / h / d), or an
 *   em dash when `ts` is missing or not a finite number.
 */
function fmtRelative(ts) {
  const t = Number(ts);
  // A missing timestamp used to render as "NaNd ago".
  if (ts == null || !Number.isFinite(t)) return "—";
  // Clamp at zero so clock skew never yields a negative age.
  const dt = Math.max(0, Date.now() / 1000 - t);
  if (dt < 60) return `${Math.floor(dt)}s ago`;
  if (dt < 3600) return `${Math.floor(dt / 60)}m ago`;
  if (dt < 86400) return `${Math.floor(dt / 3600)}h ago`;
  return `${Math.floor(dt / 86400)}d ago`;
}
|
| 199 |
+
|
| 200 |
+
let hourlyChart, dailyChart;
|
| 201 |
+
/**
 * Apply the shared Chart.js defaults and create the two line charts
 * (hourly first, then daily), storing them in `hourlyChart` / `dailyChart`.
 */
function initCharts() {
  Chart.defaults.color = "#7d8694";
  Chart.defaults.borderColor = "#21272f";
  Chart.defaults.font.family = "-apple-system, BlinkMacSystemFont, Segoe UI, Roboto, sans-serif";

  // Builds a fresh single-series line chart on the given canvas. Every call
  // returns new config objects so the two charts never share state.
  const makeLineChart = (canvasId, extraDatasetOptions) => {
    const series = {
      label: "requests",
      data: [],
      borderColor: "#4cb583",
      backgroundColor: "rgba(76, 181, 131, 0.12)",
      fill: true,
      tension: 0.25,
      pointRadius: 3,
      pointBackgroundColor: "#4cb583",
      pointHoverRadius: 6,
      ...extraDatasetOptions,
    };
    const config = {
      type: "line",
      data: { labels: [], datasets: [series] },
      options: {
        plugins: { legend: { display: false } },
        scales: { y: { beginAtZero: true, ticks: { precision: 0 } } },
      },
    };
    return new Chart(document.getElementById(canvasId), config);
  };

  hourlyChart = makeLineChart("hourly-chart", {});
  // Daily chart: single-series line, same style as the hourly chart. The
  // per-GPU breakdown in requests_by_day[].by_gpu is not rendered here; this
  // chart sticks to volume-over-time.
  dailyChart = makeLineChart("daily-chart", { spanGaps: true });
}
|
| 253 |
+
|
| 254 |
+
/**
 * Show or hide the "no persistent storage" warning banner.
 *
 * @param {object|null} a Parsed analytics payload. The banner is shown only
 *   when `a.persistent_storage` is exactly `false`.
 */
function renderStorageBanner(a) {
  const banner = document.getElementById("storage-banner");
  if (!a) { banner.style.display = "none"; return; }
  if (a.persistent_storage === false) {
    banner.style.display = "block";
    // Warning sign, dash and arrow were mis-encoded; restored here.
    banner.textContent = "⚠ Persistent storage bucket not mounted at /data — counters, model weights, and kernel caches reset on every Space restart. Enable a Storage Bucket in Space Settings → Storage.";
  } else {
    banner.style.display = "none";
  }
}
|
| 264 |
+
|
| 265 |
+
/**
 * Display `msg` in the error banner, or hide the banner when `msg` is falsy.
 *
 * @param {string|null} msg Text to show; falsy hides the banner.
 */
function renderErrorBanner(msg) {
  const el = document.getElementById("error-banner");
  if (msg) {
    el.style.display = "block";
    el.textContent = msg;
  } else {
    el.style.display = "none";
  }
}
|
| 271 |
+
|
| 272 |
+
/**
 * Show a warning when the analytics snapshot is more than 10 seconds old.
 *
 * @param {object|null} a Parsed analytics payload; `a.updated_at` is read as
 *   a Unix timestamp in seconds. The banner is hidden when it is missing.
 */
function renderStaleBanner(a) {
  // analytics.json gets rewritten every metrics_pusher tick (~2s). If the
  // age creeps past ~10s the pusher is either struggling to reach the
  // backends (load, restart, /metrics timeouts) or the pusher itself is
  // wedged. Either way, surface it so the user doesn't mistake stale
  // numbers for a real lull or zero-out.
  const banner = document.getElementById("stale-banner");
  if (!a || !a.updated_at) { banner.style.display = "none"; return; }
  const ageSec = Math.floor(Date.now() / 1000 - a.updated_at);
  if (ageSec > 10) {
    banner.style.display = "block";
    // Warning sign and dash were mis-encoded; restored here.
    banner.textContent = `⚠ Metrics are ${ageSec}s stale — the backend likely couldn't answer the last few /metrics polls (often because it's busy with /generate). Numbers shown are the last good scrape.`;
  } else {
    banner.style.display = "none";
  }
}
|
| 288 |
+
|
| 289 |
+
/**
 * Fetch the analytics and GPU JSON feeds and repaint every dashboard
 * section. On any failure (network error, non-2xx status, bad JSON, or an
 * exception thrown by a renderer) the error banner and the "updated" label
 * show the error message instead.
 */
async function refresh() {
  try {
    // Both feeds are requested in parallel, bypassing the HTTP cache.
    const [aResp, gResp] = await Promise.all([fetch(ANALYTICS_URL, { cache: "no-store" }), fetch(GPU_URL, { cache: "no-store" })]);
    // Either feed failing aborts the whole repaint.
    if (!aResp.ok || !gResp.ok) throw new Error(`http ${aResp.status}/${gResp.status}`);
    const a = await aResp.json();
    const g = await gResp.json();
    renderStorageBanner(a);
    // A successful fetch clears any error left over from a previous tick.
    renderErrorBanner(null);
    renderStaleBanner(a);
    renderSummary(a);
    renderReplicas(a);
    renderVariant(a);
    renderHourly(a);
    renderShapeList(a);
    renderShapeToday(a);
    renderGpuToday(a);
    renderDaily(a);
    renderRecent(a);
    renderGPUs(g);
    renderLatency(a);
    renderByGPU(a);
    document.getElementById("updated").textContent = `updated ${fmtRelative(a.updated_at)}`;
  } catch (e) {
    renderErrorBanner(`Could not load metrics: ${e.message}. Sidecar may be down, or the bucket isn't ready yet.`);
    document.getElementById("updated").textContent = `error: ${e.message}`;
  }
}
|
| 316 |
+
|
| 317 |
+
/**
 * Format a millisecond value for display.
 *
 * @param {number|null|undefined} ms Duration in milliseconds.
 * @returns {string} "N ms" below one second (rounded to a whole number),
 *   "X.Y s" otherwise, or an em dash for null / undefined / NaN.
 */
function fmtMs(ms) {
  // Placeholder was a mis-encoded character; restored to an em dash.
  if (ms == null || isNaN(ms)) return "—";
  // Averages can be fractional; round so they don't print with a long
  // decimal tail. Integer inputs are unaffected.
  const whole = Math.round(ms);
  if (whole < 1000) return `${whole} ms`;
  return `${(ms / 1000).toFixed(1)} s`;
}
|
| 322 |
+
|
| 323 |
+
/**
 * Fill the headline tiles: lifetime totals, today / 7-day / 30-day request
 * and unique-user counts, the pending queue tile, and uptime.
 *
 * @param {object} a Parsed analytics payload. Missing summary objects fall
 *   back to zeroed defaults.
 */
function renderSummary(a) {
  const t = a.summary_total || { requests: 0, success: 0, errors: 0 };
  document.getElementById("total-requests").textContent = t.requests.toLocaleString();
  // Separator was a mis-encoded middle dot; restored. Values are
  // locale-formatted numbers, so building this via innerHTML is safe.
  document.getElementById("total-sub").innerHTML = `<span class="status-ok">${t.success.toLocaleString()} ok</span> · <span class="status-err">${t.errors.toLocaleString()} errors</span>`;
  const today = a.summary_today || { requests: 0, unique_users: 0 };
  document.getElementById("req-today").textContent = today.requests.toLocaleString();
  document.getElementById("users-today").textContent = `${today.unique_users.toLocaleString()} unique users`;
  const d7 = a.summary_7d || { requests: 0, unique_users: 0 };
  document.getElementById("req-7d").textContent = d7.requests.toLocaleString();
  document.getElementById("users-7d").textContent = `${d7.unique_users.toLocaleString()} unique users`;
  const d30 = a.summary_30d || { requests: 0, unique_users: 0 };
  document.getElementById("req-30d").textContent = d30.requests.toLocaleString();
  document.getElementById("users-30d").textContent = `${d30.unique_users.toLocaleString()} unique users`;
  const queue = a.queue_depth ?? 0;
  const running = a.running ?? 0;
  const cap = a.capacity ?? 0;
  const todayQueueAvg = a.today_avg_queue_ms ?? 0;
  document.getElementById("pending").textContent = queue.toLocaleString();
  // Pending subtitle: live cap utilization + today's avg queue. The avg is
  // computed in metrics_pusher from today_bucket.queue_ms_total / requests,
  // so it includes successful + errored requests but not currently-queued
  // ones (those haven't tripped queue_ms yet).
  const queueAvgPart = today.requests ? ` · today queue avg ${fmtMs(todayQueueAvg)}` : "";
  document.getElementById("pending-sub").textContent = `${running} running / ${cap} GPU slot${cap === 1 ? "" : "s"}${queueAvgPart}`;
  document.getElementById("uptime").textContent = fmtDuration(a.uptime_s);
}
|
| 349 |
+
|
| 350 |
+
// Variant breakdown tile — shows the ternary vs binary mix at a glance.
// Big number: "T:1234 · B:567" (all-time). Subtitle: today's split. Variants
// are keyed by name ("ternary" / "binary" / "unknown"); the payload exposes
// them under by_variant + by_variant_today.
/**
 * @param {object} a Parsed analytics payload.
 */
function renderVariant(a) {
  // Formats one variant map as "T:n · B:n", appending "?:n" only when there
  // are unknown-variant requests. (Separators were mis-encoded; restored.)
  const fmtMix = (data) => {
    const t = data?.ternary?.count || 0;
    const b = data?.binary?.count || 0;
    const u = data?.unknown?.count || 0;
    const parts = [`T:${t.toLocaleString()}`, `B:${b.toLocaleString()}`];
    if (u) parts.push(`?:${u.toLocaleString()}`);
    return parts.join(" · ");
  };
  document.getElementById("variant-metric").textContent = fmtMix(a.by_variant);
  const todayMix = fmtMix(a.by_variant_today);
  // Sum across every key so variants beyond the three named ones still
  // count as "requests today".
  const todayTotal = Object.values(a.by_variant_today || {}).reduce((s, b) => s + (b.count || 0), 0);
  document.getElementById("variant-sub").textContent = todayTotal
    ? `all-time · today: ${todayMix}`
    : `all-time · no requests yet today`;
}
|
| 370 |
+
|
| 371 |
+
// Multi-GPU health card. Shows replicas seen / expected up top and a row of
// pills below — one per reporting replica; the dot color reflects whether
// the replica currently has in-flight work. If the seen count is below
// expected, the metric gets the warn tint and the subtitle adds "N down".
/**
 * @param {object} a Parsed analytics payload (`replicas_seen`,
 *   `replicas_expected`, `per_replica[]`).
 */
function renderReplicas(a) {
  // Pills are built as an HTML string, so every interpolated value is escaped.
  const esc = (v) => String(v).replace(/[&<>"']/g, (ch) => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" }[ch]));
  // Strip the vendor prefix; a missing name becomes "?" (previously the tier
  // summary called .replace on undefined and threw).
  const shortName = (n) => (n || "?").replace(/^(NVIDIA |Tesla )/, "");

  const seen = a.replicas_seen ?? 0;
  const expected = a.replicas_expected ?? seen;
  const per = a.per_replica || [];
  const metricEl = document.getElementById("replicas-metric");
  metricEl.textContent = expected ? `${seen} / ${expected}` : String(seen);
  metricEl.className = "metric" + (seen < expected ? " status-warn" : "");
  const down = Math.max(0, expected - seen);
  const subParts = [];
  if (per.length) {
    // Summarize tier mix: count GPUs by name. "L40S × 2" or "L40S + A10G".
    const tierCounts = new Map();
    for (const r of per) {
      const key = r.gpu_name || "?";
      tierCounts.set(key, (tierCounts.get(key) || 0) + 1);
    }
    const tierStr = [...tierCounts.entries()]
      .map(([n, c]) => (c > 1 ? `${shortName(n)} × ${c}` : shortName(n)))
      .join(" + ");
    subParts.push(tierStr);
  } else {
    subParts.push("no replicas responding");
  }
  if (down) subParts.push(`${down} down`);
  document.getElementById("replicas-sub").textContent = subParts.join(" · ");

  // Per-replica pills: short tier label + current inflight/capacity. Hover
  // shows the full gpu_name, uptime and request total via the title attribute.
  const pillsEl = document.getElementById("replicas-pills");
  pillsEl.innerHTML = per.map(r => {
    const short = shortName(r.gpu_name);
    const busy = r.inflight > 0;
    const dotColor = busy ? "var(--warn)" : "var(--accent)";
    const title = `${r.gpu_name || "unknown"} · uptime ${fmtDuration(r.uptime_s)} · total ${(r.total_requests ?? 0).toLocaleString()}`;
    return `<span class="replica-pill" title="${esc(title)}"><span class="dot" style="background: ${dotColor}"></span>${esc(short)} ${esc(r.inflight)}/${esc(r.capacity)}</span>`;
  }).join("");
}
|
| 408 |
+
|
| 409 |
+
/**
 * Push the per-day request counts into the daily chart.
 *
 * @param {object} a Parsed analytics payload; `a.requests_by_day` is a list
 *   of `{ date, count }` entries (absent is treated as an empty list).
 */
function renderDaily(a) {
  const series = a.requests_by_day || [];
  const labels = [];
  const counts = [];
  for (const day of series) {
    labels.push(day.date.slice(5)); // MM-DD
    counts.push(day.count);
  }
  dailyChart.data.labels = labels;
  dailyChart.data.datasets[0].data = counts;
  // "none" is passed straight through to Chart.js as the update mode.
  dailyChart.update("none");
}
|
| 415 |
+
|
| 416 |
+
/**
 * Render the all-time "By resolution" table, busiest shape first.
 *
 * @param {object} a Parsed analytics payload; `a.by_shape` maps a shape
 *   label to `{ count, duration_ms_avg }`.
 */
function renderShapeList(a) {
  // Shape labels are interpolated into innerHTML, so escape them.
  const esc = (v) => String(v).replace(/[&<>"']/g, (ch) => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" }[ch]));
  const by = a.by_shape || {};
  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
  const tbody = document.getElementById("shape-tbody");
  if (!entries.length) {
    tbody.innerHTML = `<tr><td colspan="3" class="metric-sub">no requests yet</td></tr>`;
    return;
  }
  tbody.innerHTML = entries.map(([shape, b]) => `
    <tr>
      <td>${esc(shape)}</td>
      <td class="num">${b.count.toLocaleString()}</td>
      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
    </tr>
  `).join("");
}
|
| 432 |
+
|
| 433 |
+
/**
 * Render the all-time "By GPU" table, busiest GPU first.
 *
 * @param {object} a Parsed analytics payload; `a.by_gpu` maps a GPU name to
 *   `{ replicas, count, errors, duration_ms_avg }`.
 */
function renderByGPU(a) {
  // GPU names are interpolated into innerHTML, so escape them.
  const esc = (v) => String(v).replace(/[&<>"']/g, (ch) => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" }[ch]));
  const by = a.by_gpu || {};
  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
  const tbody = document.getElementById("gpu-tbody");
  if (!entries.length) {
    tbody.innerHTML = `<tr><td colspan="5" class="metric-sub">no per-GPU data yet</td></tr>`;
    return;
  }
  // Dropped the explicit Success column from this table when we shrunk it
  // into a 3-col row — success is implied by count - errors and rarely
  // useful at a glance. Error count gets the warn color when nonzero.
  // (The empty-name placeholder was mis-encoded; restored to an em dash.)
  const shortName = (n) => (n || "—").replace(/^(NVIDIA |Tesla )/, "");
  tbody.innerHTML = entries.map(([name, b]) => `
    <tr>
      <td>${esc(shortName(name))}</td>
      <td class="num">${(b.replicas ?? 0).toLocaleString()}</td>
      <td class="num">${b.count.toLocaleString()}</td>
      <td class="num ${(b.errors ?? 0) > 0 ? "status-err" : ""}">${(b.errors ?? 0).toLocaleString()}</td>
      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
    </tr>
  `).join("");
}
|
| 455 |
+
|
| 456 |
+
// Today-scoped mirrors of renderShapeList / renderByGPU. Same shape of input
|
| 457 |
+
// from metrics_pusher (count + duration_ms_avg per key) so the table markup
|
| 458 |
+
// matches; columns are trimmed since today's per-GPU bucket doesn't carry
|
| 459 |
+
// replicas/success/errors splits.
|
| 460 |
+
/**
 * Render today's "By resolution" table, busiest shape first.
 *
 * @param {object} a Parsed analytics payload; `a.by_shape_today` maps a shape
 *   label to `{ count, duration_ms_avg, queue_ms_avg }`.
 */
function renderShapeToday(a) {
  // Shape labels are interpolated into innerHTML, so escape them.
  const esc = (v) => String(v).replace(/[&<>"']/g, (ch) => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" }[ch]));
  const by = a.by_shape_today || {};
  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
  const tbody = document.getElementById("shape-today-tbody");
  if (!entries.length) {
    tbody.innerHTML = `<tr><td colspan="4" class="metric-sub">no requests yet today</td></tr>`;
    return;
  }
  tbody.innerHTML = entries.map(([shape, b]) => `
    <tr>
      <td>${esc(shape)}</td>
      <td class="num">${b.count.toLocaleString()}</td>
      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
      <td class="num">${fmtMs(b.queue_ms_avg)}</td>
    </tr>
  `).join("");
}
|
| 477 |
+
|
| 478 |
+
/**
 * Fill the "by GPU, today" table (#gpu-today-tbody).
 *
 * Reads `a.by_gpu_today`, an object keyed by GPU name whose values carry
 * `count`, `duration_ms_avg` and `queue_ms_avg`. Rows are ordered by
 * descending count; a placeholder row is shown when there is no data.
 */
function renderGpuToday(a) {
  const by = a.by_gpu_today || {};
  const entries = Object.entries(by).sort((x, y) => y[1].count - x[1].count);
  const tbody = document.getElementById("gpu-today-tbody");
  if (!entries.length) {
    tbody.innerHTML = `<tr><td colspan="4" class="metric-sub">no requests yet today</td></tr>`;
    return;
  }
  // Drop the "NVIDIA " / "Tesla " prefix to keep the GPU column narrow in the
  // 3-column row layout. A missing/empty name renders as an em dash; the
  // placeholder is written as an escape so it cannot be mis-decoded again.
  const shortName = (n) => (n || "\u2014").replace(/^(NVIDIA |Tesla )/, "");
  tbody.innerHTML = entries.map(([name, b]) => `
    <tr>
      <td>${shortName(name)}</td>
      <td class="num">${b.count.toLocaleString()}</td>
      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
      <td class="num">${fmtMs(b.queue_ms_avg)}</td>
    </tr>
  `).join("");
}
|
| 498 |
+
|
| 499 |
+
/**
 * Render the latency card: headline average, its sub-label, and the
 * per-shape table (#latency-tbody).
 *
 * Uses the recent-window fields (`recent_avg_latency_ms`, `recent_count`,
 * `recent_by_shape`) so the numbers reflect current behaviour rather than
 * the long-term per-shape averages.
 */
function renderLatency(a) {
  const headline = document.getElementById("avg-latency");
  headline.textContent = fmtMs(a.recent_avg_latency_ms);

  const n = a.recent_count ?? 0;
  const subLabel = document.getElementById("avg-latency-sub");
  subLabel.textContent = `across last ${n.toLocaleString()} requests`;

  const ranked = Object.entries(a.recent_by_shape || {});
  ranked.sort((lhs, rhs) => rhs[1].count - lhs[1].count);
  const target = document.getElementById("latency-tbody");

  // No recent traffic: one full-width placeholder row.
  if (ranked.length === 0) {
    target.innerHTML = `<tr><td colspan="4" class="metric-sub">no recent requests yet</td></tr>`;
    return;
  }

  const toRow = ([shape, b]) => `
    <tr>
      <td>${shape}</td>
      <td class="num">${b.count.toLocaleString()}</td>
      <td class="num">${fmtMs(b.duration_ms_avg)}</td>
      <td class="num">${fmtMs(b.queue_ms_avg)}</td>
    </tr>
  `;
  target.innerHTML = ranked.map(toRow).join("");
}
|
| 521 |
+
|
| 522 |
+
/**
 * Push today's per-hour request counts into the hourly chart.
 *
 * `a.requests_by_hour` is an array indexed by UTC hour; each index becomes
 * an "HH:00" label. The chart is redrawn with update("none").
 */
function renderHourly(a) {
  const counts = a.requests_by_hour || [];
  const twoDigits = (hour) => String(hour).padStart(2, "0");
  hourlyChart.data.labels = counts.map((_count, hour) => `${twoDigits(hour)}:00`);
  hourlyChart.data.datasets[0].data = counts;
  hourlyChart.update("none");
}
|
| 529 |
+
|
| 530 |
+
/**
 * Fill the recent-requests table (#recent-tbody) with up to 50 rows,
 * newest first (`a.recent` is reversed before slicing).
 *
 * Each entry may carry `ts`, `shape`, `gpu`, `queue_ms`, `duration_ms`,
 * `ip_hash` and `ok`; absent values render as an em dash.
 */
function renderRecent(a) {
  // Placeholder for absent values, written as an escape so the character
  // survives any charset mix-up when the page is served.
  const DASH = "\u2014";
  const rows = (a.recent || []).slice().reverse().slice(0, 50);
  const tbody = document.getElementById("recent-tbody");
  // GPU shorthand: drop the "NVIDIA " / "Tesla " prefix so the column stays
  // narrow ("L40S" reads cleaner than "NVIDIA L40S"). Entries without r.gpu
  // fall back to the dash.
  const shortGpu = (g) => (g || DASH).replace(/^(NVIDIA |Tesla )/, "");
  tbody.innerHTML = rows.map(r => `
    <tr>
      <td>${fmtRelative(r.ts)}</td>
      <td>${r.shape || DASH}</td>
      <td>${shortGpu(r.gpu)}</td>
      <td class="num">${r.queue_ms != null ? fmtMs(r.queue_ms) : DASH}</td>
      <td class="num">${r.duration_ms ? (r.duration_ms / 1000).toFixed(1) + "s" : DASH}</td>
      <td>${(r.ip_hash || DASH).slice(0, 8)}</td>
      <td class="${r.ok ? "status-ok" : "status-err"}">${r.ok ? "ok" : "err"}</td>
    </tr>
  `).join("");
}
|
| 549 |
+
|
| 550 |
+
/**
 * Render one card per GPU into #gpus and stamp the sample time in #footer.
 *
 * `g.gpus` is an array of objects with `index`, `name`, `temp_c`, `power_w`,
 * `util_pct`, `memory_used_mb` and `memory_total_mb`. When the array is
 * empty, `g.error` (or a default message) is shown instead and the footer is
 * left untouched.
 */
function renderGPUs(g) {
  // Non-ASCII glyphs are written as escapes so they cannot be mis-decoded
  // if the page is served with the wrong charset.
  const DASH = "\u2014";     // em dash, shown for missing readings
  const DEGREE = "\u00b0";   // degree sign
  const MIDDOT = "\u00b7";   // middle dot separator

  const div = document.getElementById("gpus");
  const gpus = g.gpus || [];
  if (!gpus.length) {
    div.innerHTML = `<div class="metric-sub">${g.error || "no GPU data yet"}</div>`;
    return;
  }
  div.innerHTML = gpus.map(gpu => {
    // Memory percentage is 0 when the total is missing/zero (avoids NaN).
    const memPct = gpu.memory_total_mb ? Math.round(100 * (gpu.memory_used_mb || 0) / gpu.memory_total_mb) : 0;
    const util = gpu.util_pct ?? 0;
    return `
      <div style="margin-bottom: 12px;">
        <div style="display: flex; justify-content: space-between;"><span><b>GPU ${gpu.index}</b> ${gpu.name || ""}</span><span class="metric-sub">${gpu.temp_c ?? DASH}${DEGREE}C ${MIDDOT} ${gpu.power_w ? gpu.power_w.toFixed(0) : DASH}W</span></div>
        <div class="metric-sub" style="margin-top: 4px;">util ${util}%</div>
        <div class="gpu-bar"><div class="gpu-bar-fill" style="width: ${util}%"></div></div>
        <div class="metric-sub" style="margin-top: 4px;">mem ${gpu.memory_used_mb ?? DASH} / ${gpu.memory_total_mb ?? DASH} MB (${memPct}%)</div>
        <div class="gpu-bar"><div class="gpu-bar-fill" style="width: ${memPct}%; background: #d97757;"></div></div>
      </div>
    `;
  }).join("");
  document.getElementById("footer").textContent = `GPU sample: ${fmtRelative(g.ts)}`;
}
|
| 572 |
+
|
| 573 |
+
// Refresh cadence: 2s unless the URL hash carries `refresh=N`, where N is a
// number of seconds (e.g. #refresh=1 for 1s, #refresh=0.5 for 500ms).
// Values outside 0.25..60 seconds, or unparsable ones, fall back to 2s.
// metrics_pusher writes JSON every 2s by default, so polling faster than
// that just re-reads the same file. Bump METRICS_INTERVAL env on the
// Space too if you genuinely need sub-2s.
function readRefreshMs() {
  const DEFAULT_MS = 2000;
  const found = /refresh=([0-9.]+)/.exec(location.hash || "");
  if (!found) return DEFAULT_MS;

  const seconds = parseFloat(found[1]);
  // NaN fails both comparisons, so it lands on the default as well.
  const withinBounds = seconds >= 0.25 && seconds <= 60;
  return withinBounds ? Math.round(seconds * 1000) : DEFAULT_MS;
}
|
| 586 |
+
// Resolve the polling interval once at page load and show it in the header
// label (milliseconds converted back to seconds for display).
const REFRESH_MS = readRefreshMs();
document.getElementById("refresh-label").textContent = `auto-refresh every ${(REFRESH_MS / 1000).toString()}s`;

// Bootstrap order: initCharts() runs first, then one immediate refresh(),
// then refresh() repeats every REFRESH_MS. Both helpers are defined earlier
// in this script.
initCharts();
refresh();
setInterval(refresh, REFRESH_MS);
|
| 592 |
+
</script>
|
| 593 |
+
</body>
|
| 594 |
+
</html>
|
space/entrypoint.sh
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Bonsai-Image HF Space entrypoint.
|
| 3 |
+
#
|
| 4 |
+
# Boot order:
|
| 5 |
+
#   1. Download the ternary and binary gemlite models (~3.5 GB each) — idempotent.
|
| 6 |
+
# 2. Generate /tmp/.htpasswd from $DASHBOARD_KEY for the basic-auth gate.
|
| 7 |
+
# 3. Build /tmp/nginx-upstream.conf from `nvidia-smi -L`. One server line
|
| 8 |
+
# per GPU. At N=1 the upstream has one entry; at N>1 we prepend
|
| 9 |
+
# `least_conn;` for variable-duration request routing.
|
| 10 |
+
# 4. Spawn one `uvicorn space.app:app` per GPU on consecutive ports
|
| 11 |
+
# (CUDA_VISIBLE_DEVICES pinned). Each worker's lifespan warms the
|
| 12 |
+
# shapes listed in BONSAI_WARMUP_SHAPES.
|
| 13 |
+
# 5. Wait for the first worker to be ready, then `next start` on :3000
|
| 14 |
+
# (internal β nginx will expose it on :7860).
|
| 15 |
+
# 6. Start metrics_pusher sidecar with a watchdog.
|
| 16 |
+
# 7. Exec nginx on :7860 (the one public port HF sees).
|
| 17 |
+
#
|
| 18 |
+
# Env (HF Space secrets):
|
| 19 |
+
# HF_TOKEN model + tokenizer downloads
|
| 20 |
+
# DASHBOARD_KEY basic-auth password for /dash-<obfuscated>
|
| 21 |
+
#   BONSAI_WARMUP_SHAPES   default "512x512,1024x1024"
|
| 22 |
+
set -euo pipefail
|
| 23 |
+
|
| 24 |
+
# Application root: $HOME/app, falling back to /home/user/app when HOME is
# unset. All relative paths below (./scripts, vendor/...) resolve from here.
APP_DIR="${HOME:-/home/user}/app"
cd "$APP_DIR"

# Put the project virtualenv's executables first on PATH.
export PATH="$APP_DIR/.venv/bin:$PATH"
# Opt in to huggingface_hub's hf_transfer download backend.
export HF_HUB_ENABLE_HF_TRANSFER=1
|
| 29 |
+
|
| 30 |
+
# ── GPU detection (early — needed for cache namespacing + tier-aware warmup) ──
# nvidia-smi may be missing or return nothing in some odd container states;
# treat that as "unknown" rather than crashing so the rest of the boot can
# still run. The trailing `|| true` is load-bearing: this script runs under
# `set -euo pipefail`, so a failing nvidia-smi inside $(...) would otherwise
# make the assignment fail and abort the script before the fallbacks below.
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1 | xargs) || true
GPU_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -1 | tr -d '.') || true
GPU_NAME="${GPU_NAME:-unknown}"
GPU_CAP="${GPU_CAP:-00}"
echo "[OK] GPU: $GPU_NAME (sm_${GPU_CAP})"

# Slow GPUs (T4, older Tesla cards): warm only the two square presets we
# benchmark against (512² and 1024²) and extend the readiness deadline.
# Skipping warmup entirely would shift the multi-minute first-call JIT
# onto the first user request, which corrupts benchmark numbers — better
# to bake it into boot. BONSAI_WARMUP_SHAPES + BACKEND_READY_TIMEOUT can
# be overridden via Space Variables if you want different shapes or a
# longer/shorter deadline (`:=` only assigns when the variable is unset/empty).
case "$GPU_NAME" in
    *T4*|*P100*|*V100*|*K80*|*M60*)
        echo "[WARN] $GPU_NAME is slow — warming only 512x512 + 1024x1024."
        echo "       Extending readiness timeout to 30 min for the longer JIT."
        : "${BONSAI_WARMUP_SHAPES:=512x512,1024x1024}"
        : "${BACKEND_READY_TIMEOUT:=1800}"
        export BONSAI_WARMUP_SHAPES BACKEND_READY_TIMEOUT
        ;;
esac
|
| 55 |
+
|
| 56 |
+
# ── persistent storage detection ─────────────────────────────────────────────
# Try to use /data (a Storage Bucket if mounted) for the model + kernel
# caches + stats. Every filesystem op is wrapped so that if anything fails
# midway — bucket detached mid-build, mkdir denied, symlink races — we
# fall back to ephemeral storage (with a [WARN] in the log) and keep going.
# The dashboard banner alerts the user via BONSAI_PERSISTENT_STORAGE.
|
| 62 |
+
# Wire the model dir and kernel caches to the /data bucket via symlinks.
# Returns 0 when every step succeeded, 1 as soon as a required step fails
# (the caller then falls back to ephemeral storage). Reads APP_DIR and
# GPU_CAP from the enclosing script.
_setup_persistent() {
    # Keep the scratch variables out of the global namespace.
    local _gemlite_dir _triton_dir

    [ -d /data ] && [ -w /data ] || return 1

    # Kernel caches namespaced by compute capability so a tier swap (e.g.
    # L40S sm_89 → T4 sm_75 → back to L40S) doesn't pollute either GPU's
    # autotune configs / Triton kernels.
    _gemlite_dir="/data/cache/gemlite-sm${GPU_CAP}"
    _triton_dir="/data/cache/triton-sm${GPU_CAP}"

    # One-shot migration: if a non-namespaced cache exists from older
    # builds, move it under the current GPU's namespace so we don't lose
    # the pre-existing autotune work. Best-effort: a failed mv is ignored.
    if [ -d /data/cache/gemlite ] && [ ! -e "$_gemlite_dir" ]; then
        echo "[INFO] migrating /data/cache/gemlite → gemlite-sm${GPU_CAP}"
        mv /data/cache/gemlite "$_gemlite_dir" 2>/dev/null || true
    fi
    if [ -d /data/cache/triton ] && [ ! -e "$_triton_dir" ]; then
        echo "[INFO] migrating /data/cache/triton → triton-sm${GPU_CAP}"
        mv /data/cache/triton "$_triton_dir" 2>/dev/null || true
    fi

    mkdir -p /data/models "$_gemlite_dir" "$_triton_dir" /data/state /data/state/daily 2>/dev/null || return 1

    # Replace $APP_DIR/models with a symlink into the bucket.
    rm -rf "$APP_DIR/models" 2>/dev/null || return 1
    ln -s /data/models "$APP_DIR/models" 2>/dev/null || return 1

    # Same for the two kernel caches under outputs/. Removing stale entries
    # is best-effort; the ln -s calls are what decide success.
    mkdir -p "$APP_DIR/outputs" 2>/dev/null || return 1
    rm -rf "$APP_DIR/outputs/.gemlite_cache" "$APP_DIR/outputs/.triton_cache" 2>/dev/null || true
    ln -s "$_gemlite_dir" "$APP_DIR/outputs/.gemlite_cache" 2>/dev/null || return 1
    ln -s "$_triton_dir" "$APP_DIR/outputs/.triton_cache" 2>/dev/null || return 1
    return 0
}
|
| 92 |
+
|
| 93 |
+
# Pick the state directory based on whether the /data bucket could be wired
# up, and export the result for the workers and the metrics sidecar.
if _setup_persistent; then
    echo "[OK] /data Storage Bucket attached — model + caches + counters will persist"
    export BONSAI_STATE_DIR=/data/state
    export BONSAI_PERSISTENT_STORAGE=1
else
    # Distinguish "bucket present but unusable" from "no bucket at all" so
    # the log points at the right fix.
    if [ -d /data ]; then
        echo "[WARN] /data is present but couldn't be set up (read-only? quota?). Falling back to ephemeral."
    else
        echo "[WARN] /data not mounted — model, kernel caches, and dashboard"
        echo "       counters will reset on every Space restart. Enable a"
        echo "       Storage Bucket in Space Settings → Storage to fix."
    fi
    export BONSAI_STATE_DIR="$APP_DIR/outputs/.state"
    export BONSAI_PERSISTENT_STORAGE=0
    # Best-effort: a failure here must not abort the boot.
    mkdir -p "$BONSAI_STATE_DIR/daily" 2>/dev/null || true
fi
|
| 109 |
+
|
| 110 |
+
# ββ shared IP-hash pepper across all replicas ββββββββββββββββββββββββββββββββ
|
| 111 |
+
# Every replica must hash IPs with the same pepper so unique-user counts
|
| 112 |
+
# don't double across replicas. Extract from state.json if present (so the
|
| 113 |
+
# pepper survives restarts), else generate a fresh one. Each worker reads
|
| 114 |
+
# this via env, regardless of whether it loads cumulative state.
|
| 115 |
+
# Step 1: if a state snapshot exists, read its "ip_pepper" field. The inline
# Python prints an empty string when the key is missing and prints nothing on
# any read/parse error; stderr is discarded and `|| true` keeps a failing
# interpreter from aborting the boot.
if [ -f "$BONSAI_STATE_DIR/state.json" ]; then
    BONSAI_IP_PEPPER=$(python3 - "$BONSAI_STATE_DIR/state.json" <<'PY' 2>/dev/null || true
import json, sys
try:
    with open(sys.argv[1]) as f:
        print(json.load(f).get("ip_pepper") or "")
except Exception:
    pass
PY
)
fi
# Step 2: nothing recovered (or nothing set) -> mint a fresh 16-byte hex pepper.
if [ -z "${BONSAI_IP_PEPPER:-}" ]; then
    BONSAI_IP_PEPPER=$(python3 -c "import secrets; print(secrets.token_hex(16))")
fi
# Exported so every process started later in this script inherits the same value.
export BONSAI_IP_PEPPER
|
| 130 |
+
# Warm only the two square presets users hit most often (512Β² and 1024Β²).
|
| 131 |
+
# Other resolutions JIT on first user request and join the on-disk caches
|
| 132 |
+
# (/data/cache/{gemlite,triton}-smXX/) organically. The warmup-skip sentinel
|
| 133 |
+
# (warmup-done.json next to gemlite autotune) tracks completed (backend,shape)
|
| 134 |
+
# pairs across boots, so subsequent boots skip even these two if they're
|
| 135 |
+
# already cached.
|
| 136 |
+
#
|
| 137 |
+
# Why so few shapes: multi-GPU boots collide during warmup β all N workers
|
| 138 |
+
# race for /data bandwidth + CPU during the gemlite layer pack, and we've
|
| 139 |
+
# seen 4-worker launches hang past BACKEND_READY_TIMEOUT. Two shapes covers
|
| 140 |
+
# the common case (most users render at 512Β² or 1024Β²) without inflating
|
| 141 |
+
# cold-boot wall time.
|
| 142 |
+
# Default the warmup shape list only when it is unset or empty, then export it.
: "${BONSAI_WARMUP_SHAPES:=512x512,1024x1024}"
export BONSAI_WARMUP_SHAPES

# Binary warmup disabled by default. When enabled, every replica swaps to
# the binary transformer simultaneously after primary warmup — 4 parallel
# 3.5 GB state_dict reads from /data + 4 parallel gemlite layer packs.
# We've seen this hang multi-GPU boots indefinitely. First binary-arm click
# pays a one-time JIT cost (~30s for an unwarmed shape, after which the
# cache covers it forever).
#
# To re-enable on single-GPU rigs where the collision doesn't apply:
#   set Space Variable BONSAI_WARMUP_EXTRA_BACKENDS=bonsai-binary-gemlite
# The `:=` below defaults the variable to the empty string, which also keeps
# later references safe under `set -u`.
: "${BONSAI_WARMUP_EXTRA_BACKENDS:=}"
export BONSAI_WARMUP_EXTRA_BACKENDS
|
| 156 |
+
|
| 157 |
+
# ββ token sanity check βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 158 |
+
# Fail fast when the HF token is missing: the download step below cannot
# succeed without it, and the message tells the operator how to fix it.
if [ -z "${HF_TOKEN:-}" ]; then
    echo "[ERR] HF_TOKEN not set — add it as a Space Secret so the model can download." >&2
    exit 1
fi
export BONSAI_TOKEN="$HF_TOKEN"   # what download_model.sh expects
|
| 163 |
+
|
| 164 |
+
# ββ model download / sync ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 165 |
+
# Ship BOTH ternary + binary so the picker's two options actually work. Each
|
| 166 |
+
# repo is ~3.5 GB; first cold boot downloads ~7 GB total, but Storage Bucket
|
| 167 |
+
# (/data/models, symlinked above) keeps them across restarts.
|
| 168 |
+
#
|
| 169 |
+
# We *always* invoke download_model.sh on boot (no file-exists guard). Under
|
| 170 |
+
# the hood it calls huggingface_hub.snapshot_download with `local_dir` set,
|
| 171 |
+
# which HEADs each file in the repo and skips any whose etag matches what's
|
| 172 |
+
# already on disk β so cached boots cost ~10-30s of metadata checks instead
|
| 173 |
+
# of a full redownload. The upside: pushing new weights to HF auto-propagates
|
| 174 |
+
# on the next Space restart without a force flag or manual cache wipe.
|
| 175 |
+
# Local directories for each variant; the path-pinning section later in this
# script reads both.
MODEL_DIR="$APP_DIR/models/bonsai-image-4B-ternary-gemlite"
BINARY_MODEL_DIR="$APP_DIR/models/bonsai-image-4B-binary-gemlite"
# Both syncs run unconditionally on every boot; under `set -e` a failing
# download aborts the script here.
echo "==> syncing bonsai-image-ternary-4B-gemlite-2bit ..."
./scripts/download_model.sh --model ternary-gemlite
echo "==> syncing bonsai-image-binary-4B-gemlite-1bit ..."
./scripts/download_model.sh --model binary-gemlite
|
| 181 |
+
|
| 182 |
+
# ββ htpasswd for the dashboard βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 183 |
+
# DASHBOARD_KEY is a Space Secret; fall back to a sentinel that prints a
|
| 184 |
+
# big warning so missing-secret is obvious in the build log but the Space
|
| 185 |
+
# still comes up (useful while iterating).
|
| 186 |
+
# Write /tmp/.htpasswd for nginx basic auth (user "admin").
# With DASHBOARD_KEY set, the password is that secret. Without it, fall back
# to the well-known password "open" and warn loudly. The fallback entry must
# be a real apr1 hash of "open": a hand-written placeholder such as
# `$apr1$open$open` is not a valid digest, so no password would ever match
# and the dashboard would be locked instead of open.
if [ -n "${DASHBOARD_KEY:-}" ]; then
    HASH=$(openssl passwd -apr1 "$DASHBOARD_KEY")
    echo "[OK] dashboard: auth enabled (user=admin)"
else
    echo "[WARN] DASHBOARD_KEY not set — /dash-... is open with admin:open"
    HASH=$(openssl passwd -apr1 open)
fi
printf 'admin:%s\n' "$HASH" > /tmp/.htpasswd
|
| 194 |
+
|
| 195 |
+
# ββ nginx scratch dirs βοΏ½οΏ½βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
# Scratch directories under /tmp (presumably the temp paths space/nginx.conf
# points at — confirm against that file).
mkdir -p /tmp/nginx-body /tmp/nginx-proxy /tmp/nginx-fastcgi /tmp/nginx-uwsgi /tmp/nginx-scgi

# ── pre-seed dashboard JSON so the page doesn't 502 before first scrape ──────
# The single %s is filled with the JSON literal true/false derived from
# BONSAI_PERSISTENT_STORAGE; every counter starts at zero / empty.
printf '{"updated_at":null,"persistent_storage":%s,"summary_total":{"requests":0,"success":0,"errors":0},"summary_today":{"requests":0,"unique_users":0},"summary_7d":{"requests":0,"unique_users":0},"by_shape":{},"requests_by_hour":[],"requests_by_day":[],"recent":[]}\n' \
    "$([ "${BONSAI_PERSISTENT_STORAGE:-0}" = "1" ] && echo true || echo false)" \
    > /tmp/analytics.json
# Empty GPU snapshot with the same shape the dashboard reads (ts + gpus list).
echo '{"ts":null,"gpus":[]}' > /tmp/gpu-stats.json
|
| 203 |
+
|
| 204 |
+
# ββ pin model paths once; shared across all workers ββββββββββββββββββββββββββ
|
| 205 |
+
# backend_gpu/pipeline_gpu.py reads SEPARATE env vars per variant
|
| 206 |
+
# (TERNARY_TRANSFORMER_PATH vs BINARY_TRANSFORMER_PATH) and the packed
|
| 207 |
+
# transformer subdir name differs per variant (transformer-gemlite-int2
|
| 208 |
+
# for ternary, transformer-gemlite-int1 for binary). Glob each variant's
|
| 209 |
+
# dir for whichever transformer-gemlite-* it actually ships and assign to
|
| 210 |
+
# the right env var. Without the BINARY env var set, the pipeline falls
|
| 211 |
+
# back to its hardcoded /root/models/bonsai-binary/ default β PermissionError
|
| 212 |
+
# on a non-root container the moment a user picks binary in the UI.
|
| 213 |
+
#
|
| 214 |
+
# Note: text_encoder + vae + tokenizer are the SAME artifacts across both
|
| 215 |
+
# variants (Qwen3-4B-4bit + BFL VAE). Pointing them at the ternary copy
|
| 216 |
+
# is fine; binary's copy of these files sits idle on disk after download.
|
| 217 |
+
# That's a one-time ~1 GB of duplication on disk for the simplicity of
|
| 218 |
+
# letting download_model.sh pull the standard HF layout for each repo.
|
| 219 |
+
export MFLUX_STUDIO_GPU_DEFAULT_BACKEND="bonsai-ternary-gemlite"

# Resolve each variant's packed-transformer subdir (first match of
# transformer-gemlite-*). The `|| true` matters: with `set -euo pipefail`, an
# unmatched glob makes `ls` fail, the pipeline fails, and the script would
# exit silently on the assignment, never reaching the explicit [ERR]
# message below.
_ternary_transformer_dir=$(ls -d "$MODEL_DIR"/transformer-gemlite-* 2>/dev/null | head -1) || true
if [ -z "$_ternary_transformer_dir" ]; then
    echo "[ERR] no transformer-gemlite-* subdir under $MODEL_DIR" >&2
    exit 1
fi
_binary_transformer_dir=$(ls -d "$BINARY_MODEL_DIR"/transformer-gemlite-* 2>/dev/null | head -1) || true
if [ -z "$_binary_transformer_dir" ]; then
    echo "[ERR] no transformer-gemlite-* subdir under $BINARY_MODEL_DIR" >&2
    exit 1
fi

# Per-variant transformer paths; text encoder, VAE and tokenizer all point
# at the ternary copy.
export MFLUX_STUDIO_GPU_TERNARY_TRANSFORMER_PATH="$_ternary_transformer_dir"
export MFLUX_STUDIO_GPU_BINARY_TRANSFORMER_PATH="$_binary_transformer_dir"
export MFLUX_STUDIO_GPU_TEXT_ENCODER_PATH="$MODEL_DIR/text_encoder-hqq-4bit"
export MFLUX_STUDIO_GPU_VAE_PATH="$MODEL_DIR/vae"
export MFLUX_STUDIO_GPU_TOKENIZER_PATH="$MODEL_DIR/text_encoder-hqq-4bit/tokenizer"
|
| 235 |
+
|
| 236 |
+
# ββ detect GPUs + spawn one uvicorn per device βββββββββββββββββββββββββββββββ
|
| 237 |
+
# Count GPUs as the number of lines `nvidia-smi -L` prints. The fallback must
# live OUTSIDE the command substitution: with pipefail on, a failing
# nvidia-smi still lets `wc -l` print "0", so an inner `|| echo 1` would
# capture the two-line value "0<newline>1" and break the integer test and
# the arithmetic that follow.
GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l) || GPU_COUNT=0
# Always start at least one worker, even when no GPU could be enumerated.
if [ "$GPU_COUNT" -lt 1 ]; then
    GPU_COUNT=1
fi
echo "[OK] detected $GPU_COUNT GPU(s)"
|
| 240 |
+
|
| 241 |
+
# Stagger consecutive worker starts. Without this, all N uvicorns hit the
|
| 242 |
+
# /data bucket simultaneously, contending for ~5 GB state_dict reads + the
|
| 243 |
+
# CPU-bound fp16 cast + gemlite layer conversion. We've seen 4-worker
|
| 244 |
+
# launches blow through BACKEND_READY_TIMEOUT this way. Staggering by ~30s
|
| 245 |
+
# (a hair more than the single-worker transformer-load wall time observed
|
| 246 |
+
# on warm bucket / sm_86) lets each worker get past torch.load + gemlite
|
| 247 |
+
# convert before the next starts touching the same files.
|
| 248 |
+
# Seconds to sleep between consecutive worker launches (0 disables).
WORKER_START_STAGGER_SECONDS="${BONSAI_WORKER_START_STAGGER_SECONDS:-30}"

# Accumulators: comma-separated worker URLs (exported later for
# metrics_pusher) and the nginx `server` lines for the upstream block.
BACKEND_URLS=""
UPSTREAM_SERVERS=""
for i in $(seq 0 $((GPU_COUNT - 1))); do
    PORT=$((8000 + i))
    # Per-replica GPU name (mixed-GPU rigs are rare but possible — look it
    # up by physical index rather than reuse the top-level GPU_NAME).
    # `|| true`: under `set -euo pipefail` a missing/failing nvidia-smi would
    # otherwise abort the script on this assignment.
    REPLICA_GPU=$(nvidia-smi --query-gpu=name --format=csv,noheader -i "$i" 2>/dev/null | head -1 | xargs) || true
    REPLICA_GPU="${REPLICA_GPU:-$GPU_NAME}"
    echo "==> starting backend on GPU $i ($REPLICA_GPU) → :$PORT (warmup: $BONSAI_WARMUP_SHAPES)"
    # BONSAI_REPLICA_INDEX: only replica 0 seeds counters from state.json;
    #   replicas 1+ start at 0 and report deltas. metrics_pusher sums them —
    #   correct cumulative without N-way inflation.
    # BONSAI_GPU_NAME: surfaced via /metrics so the pusher can aggregate
    #   request counts/latencies per GPU model for the dashboard.
    CUDA_VISIBLE_DEVICES=$i BONSAI_REPLICA_INDEX=$i BONSAI_GPU_NAME="$REPLICA_GPU" \
        uvicorn space.app:app \
            --host 127.0.0.1 --port "$PORT" \
            --no-access-log &
    UPSTREAM_SERVERS="${UPSTREAM_SERVERS}    server 127.0.0.1:$PORT;"$'\n'
    # Comma-join: the separator is only emitted once the list is non-empty.
    BACKEND_URLS="${BACKEND_URLS:+$BACKEND_URLS,}http://127.0.0.1:$PORT"
    # Sleep between consecutive worker starts (skip after the last one).
    # Set BONSAI_WORKER_START_STAGGER_SECONDS=0 to disable if cold-boot
    # wall time matters more than first-boot reliability.
    if [ "$i" -lt "$((GPU_COUNT - 1))" ] && [ "$WORKER_START_STAGGER_SECONDS" -gt 0 ]; then
        echo "    ⏳ sleeping ${WORKER_START_STAGGER_SECONDS}s before next worker (avoid /data + CPU contention)"
        sleep "$WORKER_START_STAGGER_SECONDS"
    fi
done
|
| 279 |
+
|
| 280 |
+
# Emit /tmp/nginx-upstream.conf: an `upstream bonsai_workers { ... }` block
# holding the accumulated server lines. With more than one GPU the block is
# prefixed with `least_conn;` (variable-duration requests, see
# space/nginx.conf); with a single GPU no balancing directive is written.
LB_DIRECTIVE=""
if [ "$GPU_COUNT" -gt 1 ]; then
    LB_DIRECTIVE=" least_conn;"$'\n'
fi
printf 'upstream bonsai_workers {\n%s%s}\n' "$LB_DIRECTIVE" "$UPSTREAM_SERVERS" > /tmp/nginx-upstream.conf
# Hand the comma-separated worker URL list to child processes.
export BACKEND_URLS
|
| 288 |
+
|
| 289 |
+
# ββ wait for backend readiness βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 290 |
+
# Workers only answer /backends after lifespan finishes (kernels compiled +
|
| 291 |
+
# warmup shapes JITed). We poll the first one as a proxy for "ready enough."
|
| 292 |
+
# Poll the first worker's /backends endpoint once per attempt (2 s curl cap,
# then a 1 s sleep) for up to BACKEND_READY_TIMEOUT attempts (default 600).
# Success breaks out of the loop; exhausting the attempts aborts the boot.
_ready_timeout="${BACKEND_READY_TIMEOUT:-600}"
echo "==> waiting for backend on :8000 (up to ${_ready_timeout}s) ..."
i=1
while [ "$i" -le "$_ready_timeout" ]; do
    if curl -fsS -m 2 http://127.0.0.1:8000/backends > /dev/null 2>&1; then
        echo "[OK] backend ready after ${i}s"
        break
    fi
    sleep 1
    # Final attempt failed: report and stop the boot.
    if [ "$i" -eq "$_ready_timeout" ]; then
        echo "[ERR] backend did not come up within ${_ready_timeout}s" >&2
        exit 1
    fi
    i=$((i + 1))
done
|
| 305 |
+
|
| 306 |
+
# ββ frontend (next start) on internal :3000 ββββββββββββββββββββββββββββββββββ
|
| 307 |
+
# Launch the frontend in a background subshell so the `cd` does not leak into
# the rest of this script; `exec` replaces the subshell with npm, avoiding an
# extra shell process. Bound to loopback only.
echo "==> starting frontend (next start) on :3000"
(cd vendor/image-studio/frontend && exec npm start -- --port 3000 --hostname 127.0.0.1) &
|
| 309 |
+
|
| 310 |
+
# ββ metrics_pusher sidecar (watchdog restart on crash) βββββββββββββββββββββββ
|
| 311 |
+
# Watchdog loop for the metrics sidecar: run metrics_pusher.py in the
# foreground and, whenever it exits for any reason, restart it after 5 s.
# Never returns, so it is meant to be started in the background.
start_metrics_pusher() {
    # Resolve the script relative to APP_DIR (which this script derives from
    # $HOME) instead of hard-coding /home/user/app, so the sidecar is still
    # found when HOME points elsewhere. Falls back to the historical path
    # when APP_DIR is unset.
    local pusher="${APP_DIR:-/home/user/app}/space/metrics_pusher.py"
    while true; do
        echo "[watchdog] starting metrics_pusher.py"
        # `|| true`: a crash must not trip `set -e` and kill the watchdog.
        python3 "$pusher" || true
        echo "[watchdog] metrics_pusher.py exited, restarting in 5s"
        sleep 5
    done
}
|
| 319 |
+
# Run the watchdog loop in the background for the lifetime of the container.
start_metrics_pusher &

# ── nginx — front everything on :7860 (the HF-exposed port) ──────────────────
echo "==> nginx on :7860"
# `exec` replaces this shell with nginx, so nothing after this line runs.
exec nginx -c /home/user/app/space/nginx.conf -p /home/user/app/
|
space/metrics_pusher.py
ADDED
|
@@ -0,0 +1,599 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Sidecar that aggregates backend /metrics + nvidia-smi into JSON files.
|
| 2 |
+
|
| 3 |
+
On every tick (default 2 s, set by METRICS_INTERVAL) it writes:
|
| 4 |
+
/tmp/analytics.json current totals, today + 7d summaries, GPU info flag
|
| 5 |
+
/tmp/gpu-stats.json nvidia-smi snapshot
|
| 6 |
+
|
| 7 |
+
Every Nth tick (default 12 → ~24 s at the default 2 s tick) it also writes:
|
| 8 |
+
$BONSAI_STATE_DIR/state.json boot-recovery snapshot
|
| 9 |
+
$BONSAI_STATE_DIR/daily/YYYY-MM-DD.json per-UTC-day archive (one file/day)
|
| 10 |
+
|
| 11 |
+
Robust to:
|
| 12 |
+
- missing /data bucket (writes go to ephemeral $BONSAI_STATE_DIR fallback)
|
| 13 |
+
- missing nvidia-smi
|
| 14 |
+
- backend not yet up (HTTP errors logged, tick continues)
|
| 15 |
+
- FUSE-backed mounts that don't support atomic rename (falls back to in-place)
|
| 16 |
+
"""
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import json
|
| 20 |
+
import os
|
| 21 |
+
import subprocess
|
| 22 |
+
import time
|
| 23 |
+
import urllib.request
|
| 24 |
+
from collections import defaultdict
|
| 25 |
+
|
| 26 |
+
# Day bucketing is in UTC — matches what space.app uses for `_by_day` keys
# (we tried PT but the CUDA Ubuntu base image strips tzdata).


def _env_int(name: str, default: int, minimum: int = 1) -> int:
    """Read an integer setting from the environment without crashing at import.

    Args:
        name: environment variable to read.
        default: value used when the variable is unset, blank, or not an integer.
        minimum: lower bound applied to a successfully parsed value.

    Returns:
        The parsed value clamped to ``minimum``, or ``default`` on bad input
        (the rejected value is logged so the misconfiguration is visible).
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        print(f"[metrics_pusher] ignoring invalid {name}={raw!r}; using {default}", flush=True)
        return default
    return max(minimum, value)


# Comma-separated list of backend base URLs; blank entries are dropped.
BACKEND_URLS = [
    url.strip()
    for url in os.environ.get("BACKEND_URLS", "http://127.0.0.1:8000").split(",")
    if url.strip()
]
# Seconds between ticks of the main loop.
INTERVAL = _env_int("METRICS_INTERVAL", 2)
ANALYTICS_PATH = "/tmp/analytics.json"
GPU_PATH = "/tmp/gpu-stats.json"

# Persisted state. Per the original note STATE_DIR is /data/state when a bucket
# is mounted (set from outside via BONSAI_STATE_DIR — TODO confirm against
# entrypoint.sh). When the variable is unset the fallback is /tmp, which is
# ephemeral (gone on Space restart).
STATE_DIR = os.environ.get("BONSAI_STATE_DIR", "/tmp")
STATE_PATH = os.path.join(STATE_DIR, "state.json")
DAILY_DIR = os.path.join(STATE_DIR, "daily")

# Write durable files (state.json + daily archives) every Nth tick to amortize
# disk traffic. Losing N*INTERVAL seconds of counter increments on unclean
# shutdown is acceptable. Clamped to >= 1 so the cadence check can never
# divide by zero.
STATE_WRITE_EVERY_N_TICKS = _env_int("STATE_WRITE_EVERY_N_TICKS", 12)

# Surfaces in analytics.json so the dashboard shows a "counters won't persist"
# banner when a bucket is not mounted. Set by entrypoint.sh.
PERSISTENT_STORAGE = os.environ.get("BONSAI_PERSISTENT_STORAGE", "0") == "1"
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _fetch_json(url: str, timeout: float = 5.0) -> dict | None:
|
| 51 |
+
# 5s timeout (was 2s): under 16-concurrent /generate load the uvicorn
|
| 52 |
+
# event loop can briefly queue /metrics behind in-flight responses.
|
| 53 |
+
# 5s is still well under the dashboard's polling cadence (so the user
|
| 54 |
+
# doesn't see a delay) and gives the backend headroom under stress.
|
| 55 |
+
try:
|
| 56 |
+
with urllib.request.urlopen(url, timeout=timeout) as resp:
|
| 57 |
+
return json.loads(resp.read())
|
| 58 |
+
except Exception:
|
| 59 |
+
return None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Counter names shared by every per-shape / per-GPU / per-variant daily bucket.
_BUCKET_KEYS = ("count", "duration_ms_total", "queue_ms_total")


def _add_counters(dst: dict, src: dict | None, keys: tuple) -> None:
    """Add ``src[k]`` onto ``dst[k]`` for each key; missing or null values count as 0."""
    src = src or {}
    for key in keys:
        dst[key] += src.get(key) or 0


def _new_bucket() -> dict:
    """Zeroed count/duration/queue accumulator."""
    return {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0}


def _new_day_bucket() -> dict:
    """Empty per-day accumulator; ``unique_ips`` is a set so replicas can be unioned."""
    return {
        "requests": 0, "success": 0, "errors": 0,
        "by_shape": defaultdict(_new_bucket),
        "by_hour": [0] * 24,
        "unique_ips": set(),
        "by_gpu": defaultdict(_new_bucket),
        "by_variant": defaultdict(_new_bucket),
        "queue_ms_total": 0,
    }


def _merge_day(bucket: dict, day: dict) -> None:
    """Fold one replica's per-day record into the aggregated day bucket."""
    _add_counters(bucket, day, ("requests", "success", "errors", "queue_ms_total"))
    for section in ("by_shape", "by_gpu", "by_variant"):
        for name, stats in (day.get(section) or {}).items():
            _add_counters(bucket[section][name], stats, _BUCKET_KEYS)
    # Only the first 24 slots are meaningful; ignore anything beyond.
    for hour, hits in enumerate((day.get("by_hour") or [])[:24]):
        bucket["by_hour"][hour] += hits or 0
    bucket["unique_ips"].update(day.get("unique_ips") or [])


def fetch_backend_metrics() -> dict:
    """Aggregate /metrics from every backend replica.

    Each URL in ``BACKEND_URLS`` is polled via ``_fetch_json``; replicas that
    do not answer are skipped. Fields that a replica omits or reports as null
    are treated as empty, so one sparse payload cannot abort the aggregation.

    Returns:
        A dict with summed counters, per-replica records, per-shape /
        per-variant / per-GPU rollups, merged per-day buckets (``unique_ips``
        as a set), the newest 2000 ``recent`` entries sorted by ``ts``, and the
        first non-empty ``ip_pepper`` seen.
    """
    agg: dict = {
        "total_requests": 0,
        "success": 0,
        "errors": 0,
        "uptime_s": 0,
        "inflight": 0,           # sum across replicas — total in-flight requests
        "generate_capacity": 0,  # sum of per-replica concurrency caps
        "replicas_seen": 0,      # how many replicas answered /metrics this tick
        # Per-replica details — list of {gpu_name, inflight, capacity,
        # uptime_s, total_requests}. Used to compute accurate queue_depth
        # (sum of per-replica (inflight - capacity)+ rather than the sum-
        # then-subtract approximation that hides imbalance) and to render
        # the multi-GPU health card on the dashboard.
        "per_replica": [],
        "by_shape": defaultdict(lambda: {"count": 0, "duration_ms_total": 0}),
        # Cumulative per-variant counter. Replicas each report their own
        # _by_variant; we sum them here. Variants are "ternary", "binary",
        # or "unknown" — parsed from the request's `backend` field.
        "by_variant": defaultdict(_new_bucket),
        "by_day": {},  # date -> {requests, success, errors, by_shape, by_hour, unique_ips set, queue_ms_total}
        # Per-GPU model breakdown — each replica's gpu_name + counts +
        # duration sum get folded in. Multiple replicas on the same GPU
        # model (e.g. l40sx4 = 4× "NVIDIA L40S") merge into one bucket.
        "by_gpu": defaultdict(lambda: {"count": 0, "success": 0, "errors": 0, "duration_ms_total": 0, "replicas": 0}),
        "recent": [],
        "ip_pepper": None,
    }
    for url in BACKEND_URLS:
        data = _fetch_json(f"{url}/metrics")
        if not data or not isinstance(data, dict):
            continue
        agg["replicas_seen"] += 1
        _add_counters(agg, data, ("total_requests", "success", "errors"))

        replica_total = data.get("total_requests") or 0
        replica_uptime = data.get("uptime_s") or 0
        replica_inflight = data.get("inflight") or 0
        replica_capacity = data.get("generate_concurrency")
        if replica_capacity is None:
            replica_capacity = 1
        agg["uptime_s"] = max(agg["uptime_s"], replica_uptime)
        agg["inflight"] += replica_inflight
        agg["generate_capacity"] += replica_capacity

        # Per-GPU rollup — fold this replica's totals into its GPU bucket.
        # Default to NVIDIA L40S when missing so historical /metrics without
        # gpu_name (pre-this-feature) don't show up as "unknown".
        gpu = data.get("gpu_name") or "NVIDIA L40S"
        # Per-replica record — keep the gpu_name + cap so the dashboard's
        # multi-GPU health card can render "L40S · 1/1 busy" style rows
        # and the queue calc can subtract per-replica.
        agg["per_replica"].append({
            "url": url,
            "gpu_name": gpu,
            "inflight": replica_inflight,
            "capacity": replica_capacity,
            "uptime_s": replica_uptime,
            "total_requests": replica_total,
            "replica_index": data.get("replica_index"),
        })
        gpu_bucket = agg["by_gpu"][gpu]
        gpu_bucket["count"] += replica_total
        gpu_bucket["success"] += data.get("success") or 0
        gpu_bucket["errors"] += data.get("errors") or 0
        gpu_bucket["duration_ms_total"] += data.get("total_duration_ms") or 0
        gpu_bucket["replicas"] += 1

        for shape, stats in (data.get("by_shape") or {}).items():
            _add_counters(agg["by_shape"][shape], stats, ("count", "duration_ms_total"))
        for variant, stats in (data.get("by_variant") or {}).items():
            _add_counters(agg["by_variant"][variant], stats, _BUCKET_KEYS)

        # Per-day merge: when we go multi-replica, each replica returns its
        # own _by_day — we union them here (sum counters, union unique_ips).
        for date, day in (data.get("by_day") or {}).items():
            if date not in agg["by_day"]:
                agg["by_day"][date] = _new_day_bucket()
            _merge_day(agg["by_day"][date], day or {})

        agg["recent"].extend(data.get("recent") or [])
        agg["ip_pepper"] = agg["ip_pepper"] or data.get("ip_pepper")

    # Oldest first, capped so the list cannot grow with the replica count.
    agg["recent"].sort(key=lambda r: r.get("ts") or 0)
    agg["recent"] = agg["recent"][-2000:]
    return agg
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def fetch_gpu_stats() -> dict:
    """Take one nvidia-smi snapshot of every visible GPU.

    Returns:
        ``{"ts": <unix seconds>, "gpus": [...]}`` with one dict per GPU row
        (index, name, util_pct, memory_used_mb, memory_total_mb, temp_c,
        power_w, power_limit_w). Individual readings that cannot be parsed
        are reported as ``None``; rows with too few columns or a non-numeric
        index are skipped. If nvidia-smi cannot be run at all, ``gpus`` is
        empty and an ``"error"`` string is included. Never raises for bad
        nvidia-smi output, so a GPU hiccup cannot abort the caller's tick.
    """
    try:
        out = subprocess.check_output(
            [
                "nvidia-smi",
                "--query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit",
                "--format=csv,noheader,nounits",
            ],
            timeout=2,
        ).decode()
    except Exception as exc:
        return {"ts": int(time.time()), "gpus": [], "error": str(exc)}

    def _maybe_int(s: str) -> int | None:
        # Non-integer fields come back as None instead of raising.
        try:
            return int(s.strip())
        except ValueError:
            return None

    def _maybe_float(s: str) -> float | None:
        try:
            return float(s.strip())
        except ValueError:
            return None

    gpus = []
    for line in out.strip().splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 8:
            continue
        index = _maybe_int(parts[0])
        if index is None:
            # Not a GPU data row (e.g. a warning line that happens to contain
            # commas); skip it rather than failing the whole snapshot.
            continue
        gpus.append({
            "index": index,
            "name": parts[1],
            "util_pct": _maybe_int(parts[2]),
            "memory_used_mb": _maybe_int(parts[3]),
            "memory_total_mb": _maybe_int(parts[4]),
            "temp_c": _maybe_int(parts[5]),
            "power_w": _maybe_float(parts[6]),
            "power_limit_w": _maybe_float(parts[7]),
        })
    return {"ts": int(time.time()), "gpus": gpus}
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def build_analytics(backend_data: dict) -> dict:
    """The JSON the dashboard polls. Derived from /metrics so they stay in sync.

    Args:
        backend_data: the aggregate produced by ``fetch_backend_metrics``
            (summed counters, ``per_replica``, ``by_shape``, ``by_variant``,
            ``by_gpu``, ``by_day`` keyed by ``YYYY-MM-DD``, and ``recent``).

    Returns:
        A JSON-serialisable dict of totals, today / 7-day / 30-day summaries,
        per-shape / per-GPU / per-variant averages (cumulative and today),
        queue figures, the hourly and daily series, and the newest 100
        ``recent`` entries. The 7-day and 30-day summaries cover calendar
        days in UTC ending today, not merely the last N dates that have data.
    """
    now = int(time.time())
    today = time.strftime("%Y-%m-%d", time.gmtime(now))
    by_day = backend_data["by_day"]

    def _with_averages(bucket: dict) -> dict:
        # count plus integer averages of the duration and queue totals.
        n = bucket.get("count", 0)
        return {
            "count": n,
            "duration_ms_avg": (bucket.get("duration_ms_total", 0) // n) if n else 0,
            "queue_ms_avg": (bucket.get("queue_ms_total", 0) // n) if n else 0,
        }

    by_shape_total = {
        shape: {
            "count": b["count"],
            "duration_ms_avg": b["duration_ms_total"] // b["count"] if b["count"] else 0,
        }
        for shape, b in backend_data["by_shape"].items()
    }

    # Latency stats derived from the last 50 requests only, which keeps the
    # latency numbers reactive to current load rather than smoothed by old data.
    recent_acc: dict[str, dict] = {}
    for entry in backend_data["recent"][-50:]:
        shape = entry.get("shape") or "unknown"
        acc = recent_acc.setdefault(shape, {"count": 0, "duration_ms_total": 0, "queue_ms_total": 0})
        acc["count"] += 1
        acc["duration_ms_total"] += int(entry.get("duration_ms") or 0)
        acc["queue_ms_total"] += int(entry.get("queue_ms") or 0)
    recent_by_shape = {shape: _with_averages(acc) for shape, acc in recent_acc.items()}
    recent_count_total = sum(acc["count"] for acc in recent_acc.values())
    recent_duration_total = sum(acc["duration_ms_total"] for acc in recent_acc.values())
    recent_avg_latency_ms = recent_duration_total // recent_count_total if recent_count_total else 0

    today_bucket = by_day.get(today, {})
    today_ips = today_bucket.get("unique_ips", set())
    today_unique = len(today_ips) if isinstance(today_ips, set) else len(list(today_ips))

    # Today-only mirrors of by_shape_total and by_gpu_out. Same shape so the
    # dashboard can render them with the same table helpers; only the scope
    # differs (cumulative vs reset-at-UTC-midnight). queue_ms_avg is included
    # so the tables can show how queueing pressure is distributed.
    by_shape_today = {k: _with_averages(b) for k, b in (today_bucket.get("by_shape") or {}).items()}
    by_gpu_today = {k: _with_averages(b) for k, b in (today_bucket.get("by_gpu") or {}).items()}
    # by_variant slices: cumulative counter + today.
    by_variant_total = {k: _with_averages(b) for k, b in backend_data["by_variant"].items()}
    by_variant_today = {k: _with_averages(b) for k, b in (today_bucket.get("by_variant") or {}).items()}

    # Today's overall avg queue, summed across all shapes/gpus.
    today_count = today_bucket.get("requests", 0)
    today_avg_queue_ms = (today_bucket.get("queue_ms_total", 0) // today_count) if today_count else 0

    def _summary_for_last(n_days: int) -> dict:
        # Window is the n_days calendar dates ending today (UTC). ISO dates
        # sort lexicographically, so a string comparison is enough. Slicing the
        # last n_days keys instead would pull in stale days after a traffic gap.
        cutoff = time.strftime("%Y-%m-%d", time.gmtime(now - (n_days - 1) * 86400))
        days = [d for d in sorted(by_day.keys()) if d >= cutoff]
        req = sum(by_day[d].get("requests", 0) for d in days)
        uniques: set = set()
        for d in days:
            ips = by_day[d].get("unique_ips", set())
            uniques.update(ips if isinstance(ips, set) else list(ips))
        return {"requests": req, "unique_users": len(uniques)}

    # Include per-GPU counts on each day so the dashboard can stack the daily
    # chart by GPU. duration_ms_total is surfaced too so a future "stacked
    # latency view" doesn't need new fields.
    requests_by_day = [
        {
            "date": d,
            "count": by_day[d].get("requests", 0),
            "by_gpu": {
                g_name: {
                    "count": g.get("count", 0),
                    "duration_ms_total": g.get("duration_ms_total", 0),
                }
                for g_name, g in (by_day[d].get("by_gpu") or {}).items()
            },
        }
        for d in sorted(by_day.keys())[-30:]
    ]
    requests_by_hour = list(today_bucket.get("by_hour", [0] * 24))

    # Overall average latency, derived from by_shape (since duration totals
    # live there, not in the cumulative counter).
    total_duration_ms = sum(b["duration_ms_total"] for b in backend_data["by_shape"].values())
    total_durations_count = sum(b["count"] for b in backend_data["by_shape"].values())
    avg_latency_ms = total_duration_ms // total_durations_count if total_durations_count else 0

    # Queue depth = whatever is in-flight beyond GPU-running capacity. Has
    # to be summed PER REPLICA: if 4 are in flight on replica 0 and replica 1
    # is idle, sum(inflight) - sum(capacity) = 4 - 2 = 2 hides the fact that
    # replica 0 has a 3-deep queue while replica 1 idles.
    per_replica = backend_data.get("per_replica", [])
    if per_replica:
        inflight = sum(r["inflight"] for r in per_replica)
        capacity = sum(r["capacity"] for r in per_replica)
        running = sum(min(r["inflight"], r["capacity"]) for r in per_replica)
    else:
        inflight = backend_data.get("inflight", 0)
        capacity = backend_data.get("generate_capacity", 0) or backend_data.get("replicas_seen", 1)
        running = min(inflight, capacity)
    queue_depth = sum(max(0, r["inflight"] - r["capacity"]) for r in per_replica)

    # Per-GPU breakdown: count, success/error split, avg latency per GPU model.
    by_gpu_out = {}
    for gpu_name, b in backend_data["by_gpu"].items():
        c = b["count"]
        by_gpu_out[gpu_name] = {
            "count": c,
            "success": b["success"],
            "errors": b["errors"],
            "duration_ms_avg": b["duration_ms_total"] // c if c else 0,
            "duration_ms_total": b["duration_ms_total"],
            "replicas": b["replicas"],
        }

    return {
        "updated_at": now,
        "uptime_s": backend_data.get("uptime_s", 0),
        "persistent_storage": PERSISTENT_STORAGE,
        "state_dir": STATE_DIR,
        "replicas_seen": backend_data.get("replicas_seen", 0),
        # Number of configured backends; the diff against replicas_seen tells
        # the dashboard "1 replica is unhealthy" vs "2 of 2 happy".
        "replicas_expected": len(BACKEND_URLS),
        "per_replica": backend_data.get("per_replica", []),
        "inflight": inflight,
        "running": running,
        "queue_depth": queue_depth,
        "capacity": capacity,
        "today_avg_queue_ms": today_avg_queue_ms,
        "summary_total": {
            "requests": backend_data["total_requests"],
            "success": backend_data["success"],
            "errors": backend_data["errors"],
        },
        "summary_today": {
            "requests": today_bucket.get("requests", 0),
            "unique_users": today_unique,
        },
        "summary_7d": _summary_for_last(7),
        "summary_30d": _summary_for_last(30),
        "avg_latency_ms": avg_latency_ms,
        "by_shape": by_shape_total,
        "by_shape_today": by_shape_today,
        "by_gpu": by_gpu_out,
        "by_gpu_today": by_gpu_today,
        "by_variant": by_variant_total,
        "by_variant_today": by_variant_today,
        "recent_by_shape": recent_by_shape,
        "recent_avg_latency_ms": recent_avg_latency_ms,
        "recent_count": recent_count_total,
        "requests_by_hour": requests_by_hour,
        "requests_by_day": requests_by_day,
        "recent": backend_data["recent"][-100:],
    }
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
def _atomic_write(path: str, payload: dict, indent: int | None = None) -> None:
|
| 405 |
+
"""Write JSON atomically. Falls back to direct overwrite if rename fails
|
| 406 |
+
(some FUSE-backed mounts don't support rename within a dir)."""
|
| 407 |
+
text = json.dumps(payload, indent=indent, sort_keys=indent is not None)
|
| 408 |
+
tmp = path + ".tmp"
|
| 409 |
+
try:
|
| 410 |
+
with open(tmp, "w") as f:
|
| 411 |
+
f.write(text)
|
| 412 |
+
os.replace(tmp, path)
|
| 413 |
+
except OSError as exc:
|
| 414 |
+
print(f"[metrics_pusher] atomic rename failed for {path} ({exc}); writing in place", flush=True)
|
| 415 |
+
try:
|
| 416 |
+
with open(path, "w") as f:
|
| 417 |
+
f.write(text)
|
| 418 |
+
except OSError as exc2:
|
| 419 |
+
print(f"[metrics_pusher] direct write also failed for {path} ({exc2})", flush=True)
|
| 420 |
+
finally:
|
| 421 |
+
try:
|
| 422 |
+
os.unlink(tmp)
|
| 423 |
+
except OSError:
|
| 424 |
+
pass
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
def write_state(backend_data: dict) -> None:
    """Persist a boot-recovery snapshot of the aggregated counters.

    The snapshot holds the cumulative totals, the per-shape and per-variant
    counters, every per-day bucket (``unique_ips`` converted from a set to a
    sorted list so it is JSON-serialisable), the newest 100 ``recent``
    entries, ``ip_pepper`` and a ``saved_at`` timestamp. It is written to
    ``STATE_PATH``; if ``STATE_DIR`` cannot be created the write is skipped
    with a log line.
    """

    def _totals(bucket: dict) -> dict:
        # count and duration are required; queue_ms_total defaults to 0.
        return {
            "count": bucket["count"],
            "duration_ms_total": bucket["duration_ms_total"],
            "queue_ms_total": bucket.get("queue_ms_total", 0),
        }

    def _day_snapshot(day: dict) -> dict:
        seen_ips = day["unique_ips"]
        return {
            "requests": day["requests"],
            "success": day["success"],
            "errors": day["errors"],
            "queue_ms_total": day.get("queue_ms_total", 0),
            "by_shape": {name: _totals(b) for name, b in day["by_shape"].items()},
            "by_hour": list(day["by_hour"]),
            "unique_ips": sorted(seen_ips) if isinstance(seen_ips, set) else list(seen_ips),
            "by_gpu": {name: _totals(b) for name, b in (day.get("by_gpu") or {}).items()},
            "by_variant": {name: _totals(b) for name, b in (day.get("by_variant") or {}).items()},
        }

    days = {date: _day_snapshot(day) for date, day in backend_data["by_day"].items()}
    snapshot = {
        "total_requests": backend_data["total_requests"],
        "success": backend_data["success"],
        "errors": backend_data["errors"],
        "by_shape": {
            name: {"count": b["count"], "duration_ms_total": b["duration_ms_total"]}
            for name, b in backend_data["by_shape"].items()
        },
        "by_variant": {name: _totals(b) for name, b in backend_data["by_variant"].items()},
        "by_day": days,
        "recent": backend_data["recent"][-100:],
        "ip_pepper": backend_data.get("ip_pepper"),
        "saved_at": int(time.time()),
    }

    try:
        os.makedirs(STATE_DIR, exist_ok=True)
    except OSError as exc:
        print(f"[metrics_pusher] mkdir {STATE_DIR} failed ({exc}); skipping state write", flush=True)
        return
    _atomic_write(STATE_PATH, snapshot)
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
def write_daily_archives(backend_data: dict) -> None:
    """Write one pretty-printed JSON archive per UTC date into ``DAILY_DIR``.

    Every date present in ``backend_data["by_day"]`` is (re)written as
    ``<date>.json`` on each call, with totals plus derived integer averages
    for the per-shape, per-GPU and per-variant buckets. Does nothing when
    there are no day buckets, and skips all writes (with a log line) when
    ``DAILY_DIR`` cannot be created.
    """
    days = backend_data["by_day"]
    if not days:
        return
    try:
        os.makedirs(DAILY_DIR, exist_ok=True)
    except OSError as exc:
        print(f"[metrics_pusher] mkdir {DAILY_DIR} failed ({exc}); skipping daily writes", flush=True)
        return

    def _with_averages(bucket: dict) -> dict:
        # Totals are kept alongside the averages so archives can be re-aggregated.
        n = bucket["count"]
        queue_total = bucket.get("queue_ms_total", 0)
        return {
            "count": n,
            "duration_ms_total": bucket["duration_ms_total"],
            "duration_ms_avg": bucket["duration_ms_total"] // n if n else 0,
            "queue_ms_total": queue_total,
            "queue_ms_avg": queue_total // n if n else 0,
        }

    for date, day in days.items():
        shapes = {name: _with_averages(b) for name, b in day["by_shape"].items()}
        gpus = {name: _with_averages(b) for name, b in (day.get("by_gpu") or {}).items()}
        variants = {name: _with_averages(b) for name, b in (day.get("by_variant") or {}).items()}
        seen_ips = day["unique_ips"]
        request_count = day["requests"]
        queue_total = day.get("queue_ms_total", 0)
        archive = {
            "date": date,
            "updated_at": int(time.time()),
            "requests": request_count,
            "success": day["success"],
            "errors": day["errors"],
            "queue_ms_total": queue_total,
            "queue_ms_avg": queue_total // request_count if request_count else 0,
            "unique_users": len(seen_ips) if isinstance(seen_ips, set) else len(list(seen_ips)),
            "by_shape": shapes,
            "by_hour": list(day["by_hour"]),
            "by_gpu": gpus,
            "by_variant": variants,
        }
        _atomic_write(os.path.join(DAILY_DIR, f"{date}.json"), archive, indent=2)
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
def main() -> None:
    """Run the sidecar loop forever.

    Each tick: poll the backends, refresh the GPU snapshot file, and — when
    at least one replica answered — rewrite analytics.json. The durable files
    (state.json + daily archives) are written once at least
    ``STATE_WRITE_EVERY_N_TICKS`` ticks have passed since the previous durable
    write, so a backend miss on the scheduled tick only delays the write until
    the next healthy tick instead of skipping a whole period. Any exception
    inside a tick is logged and the loop continues.
    """
    print(
        f"[metrics_pusher] backends={BACKEND_URLS} interval={INTERVAL}s "
        f"state_dir={STATE_DIR} persistent_storage={PERSISTENT_STORAGE}",
        flush=True,
    )
    # Guard against misconfiguration: a cadence below 1 means "every tick",
    # and time.sleep() rejects negative values.
    state_every = max(1, STATE_WRITE_EVERY_N_TICKS)
    pause_s = max(0, INTERVAL)

    tick = 0
    consecutive_zero = 0
    # Start "due" so the first healthy tick persists state immediately.
    ticks_since_state_write = state_every
    while True:
        try:
            backend_data = fetch_backend_metrics()
            gpu_data = fetch_gpu_stats()
            # nvidia-smi runs locally and is independent of backend health,
            # so always refresh GPU stats.
            _atomic_write(GPU_PATH, gpu_data)

            if backend_data["replicas_seen"] == 0:
                # NO replicas answered /metrics this tick — usually means
                # they're all saturated. DON'T overwrite analytics.json
                # with zero-everywhere defaults; keep the prior file so
                # the dashboard stays meaningful. Updated_at age will
                # naturally drift to indicate staleness.
                consecutive_zero += 1
                print(
                    f"[metrics_pusher] tick {tick}: no replicas responded "
                    f"(consecutive={consecutive_zero}); keeping prior analytics.json",
                    flush=True,
                )
            else:
                if consecutive_zero > 0:
                    print(f"[metrics_pusher] backends recovered after {consecutive_zero} miss(es)", flush=True)
                consecutive_zero = 0
                analytics = build_analytics(backend_data)
                _atomic_write(ANALYTICS_PATH, analytics)
                if ticks_since_state_write >= state_every:
                    write_state(backend_data)
                    write_daily_archives(backend_data)
                    # Reset only after both writers returned, so a failure
                    # is retried on the next healthy tick.
                    ticks_since_state_write = 0
        except Exception as exc:
            print(f"[metrics_pusher] tick error: {exc}", flush=True)
        tick += 1
        ticks_since_state_write += 1
        time.sleep(pause_s)


if __name__ == "__main__":
    main()
|
space/nginx.conf
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# nginx config for the Bonsai-Image HF Space.
|
| 2 |
+
# - :7860 is the only public port (HF exposes it).
|
| 3 |
+
# - / and /api/* go to the Next.js frontend on :3000.
|
| 4 |
+
# - /generate, /backends, /docs go to one (or many) uvicorn backends via
|
| 5 |
+
# the upstream block, which entrypoint.sh builds from `nvidia-smi -L`.
|
| 6 |
+
# At N=1 it's just one server line; at N>1 we add least_conn.
|
| 7 |
+
# - /dash-<obfuscated> is the metrics dashboard, basic-auth gated.
|
| 8 |
+
#
|
| 9 |
+
# Run as: nginx -c /home/user/app/space/nginx.conf -p /home/user/app/
|
| 10 |
+
|
| 11 |
+
worker_processes 1;
daemon off;
pid /tmp/nginx.pid;
error_log /dev/stderr warn;

events {
    worker_connections 256;
}

http {
    default_type application/octet-stream;
    sendfile on;
    keepalive_timeout 65;

    # nginx's stock /var/log/... isn't writable by uid 1000 on the HF image,
    # so redirect everything into /tmp where we have write access.
    client_body_temp_path /tmp/nginx-body;
    proxy_temp_path /tmp/nginx-proxy;
    fastcgi_temp_path /tmp/nginx-fastcgi;
    uwsgi_temp_path /tmp/nginx-uwsgi;
    scgi_temp_path /tmp/nginx-scgi;
    # NOTE(review): nothing in this config rotates or truncates this file;
    # confirm something else bounds its size on long-running instances.
    access_log /tmp/nginx-access.log;

    # Only ask the upstream for a protocol switch when the client actually
    # sent an Upgrade header (WebSocket); plain requests get
    # "Connection: close" instead of a bogus "Connection: upgrade".
    map $http_upgrade $connection_upgrade {
        default upgrade;
        ''      close;
    }

    # Built at boot by entrypoint.sh from `nvidia-smi -L` — one server line
    # per GPU. Today: one server at :8000.
    include /tmp/nginx-upstream.conf;

    server {
        listen 7860 default_server;
        client_max_body_size 16M;

        # ── frontend ────────────────────────────────────────────────────────
        location / {
            proxy_pass http://127.0.0.1:3000;
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection $connection_upgrade;
            proxy_set_header Host $host;
            # APPEND $remote_addr to existing X-Forwarded-For (the chain HF's
            # edge proxy already set with the real visitor IP). Using
            # $remote_addr alone would overwrite that with the edge proxy's
            # IP — same for all visitors — collapsing every user to one hash
            # in the dashboard's unique-user counter.
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            # Generations can run several seconds; Next.js streams the
            # response back so don't time the connection out.
            proxy_read_timeout 600s;
        }

        # ── backend API surface (called by Next.js api/generate route + curl) ─
        location ~ ^/(generate|backends|docs|openapi\.json)$ {
            proxy_pass http://bonsai_workers;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            # APPEND $remote_addr to existing X-Forwarded-For (the chain HF's
            # edge proxy already set with the real visitor IP). Using
            # $remote_addr alone would overwrite that with the edge proxy's
            # IP — same for all visitors — collapsing every user to one hash
            # in the dashboard's unique-user counter.
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_read_timeout 600s;
            proxy_buffering off;  # stream PNG bytes back immediately
        }

        # ── dashboard ───────────────────────────────────────────────────────
        # Obfuscated path + basic auth. Path suffix is in source (visible to
        # anyone with repo read access); auth is the actual gate.
        # Exact-match (`=`) locations serve only the three known URLs, so
        # nothing else under /dash-... is exposed; every other /dash-* path
        # falls through to the 404 catchall below.
        location = /dash-10a08e9c1ee4 {
            auth_basic "Bonsai Dashboard";
            auth_basic_user_file /tmp/.htpasswd;
            alias /home/user/app/space/dashboard.html;
            default_type text/html;
            add_header Cache-Control "no-store" always;
        }

        location = /dash-10a08e9c1ee4/analytics.json {
            auth_basic "Bonsai Dashboard";
            auth_basic_user_file /tmp/.htpasswd;
            alias /tmp/analytics.json;
            default_type application/json;
            add_header Cache-Control "no-store" always;
        }

        location = /dash-10a08e9c1ee4/gpu-stats.json {
            auth_basic "Bonsai Dashboard";
            auth_basic_user_file /tmp/.htpasswd;
            alias /tmp/gpu-stats.json;
            default_type application/json;
            add_header Cache-Control "no-store" always;
        }

        # Catchall under the dashboard prefix → 404 (don't reveal what else
        # might exist there).
        location ~ ^/dash- {
            return 404;
        }

        # /metrics on the backend is loopback-only; nginx doesn't forward it.
        # (metrics_pusher.py scrapes it directly at 127.0.0.1:8000/metrics.)
        location = /metrics {
            return 404;
        }
    }
}
|