#!/bin/bash # Bonsai-Image HF Space entrypoint. # # Boot order: # 1. Download the ternary gemlite model (~3.5 GB) — idempotent. # 2. Generate /tmp/.htpasswd from $DASHBOARD_KEY for the basic-auth gate. # 3. Build /tmp/nginx-upstream.conf from `nvidia-smi -L`. One server line # per GPU. At N=1 the upstream has one entry; at N>1 we prepend # `least_conn;` for variable-duration request routing. # 4. Spawn one `uvicorn space.app:app` per GPU on consecutive ports # (CUDA_VISIBLE_DEVICES pinned). Each worker's lifespan warms the # shapes listed in BONSAI_WARMUP_SHAPES. # 5. Wait for the first worker to be ready, then `next start` on :3000 # (internal — nginx will expose it on :7860). # 6. Start metrics_pusher sidecar with a watchdog. # 7. Exec nginx on :7860 (the one public port HF sees). # # Env (HF Space secrets): # (no HF_TOKEN needed — model repos are public; any token in env is scrubbed) # DASHBOARD_KEY basic-auth password for /dash- # BONSAI_WARMUP_SHAPES default "512x512,1024x1024,1248x832" set -euo pipefail APP_DIR="${HOME:-/home/user}/app" cd "$APP_DIR" export PATH="$APP_DIR/.venv/bin:$PATH" export HF_HUB_ENABLE_HF_TRANSFER=1 # ── reap orphaned workers from a prior crashed boot ────────────────────────── # HF can restart this entrypoint inside the SAME container. The processes we # launch with `&` (uvicorn × N, next, metrics_pusher) and the exec'd nginx are # NOT torn down when a previous run crashes mid-boot — they survive as orphans # still holding :8000-800N, :3000, and :7860. The next boot then dies on every # bind with EADDRINUSE. SIGKILL any leftovers up front so the ports are free. # On a fresh container these patterns match nothing; `|| true` keeps `set -e` # happy. (None match this entrypoint's own cmdline, so there's no self-kill.) echo "==> reaping any stale processes from a prior boot ..." pkill -9 -f "uvicorn space.app:app" 2>/dev/null || true pkill -9 -f "metrics_pusher.py" 2>/dev/null || true pkill -9 -f "next start" 2>/dev/null || true pkill -9 -f "next-server" 2>/dev/null || true pkill -9 -x "nginx" 2>/dev/null || true sleep 1 # ── GPU detection (early — needed for cache namespacing + tier-aware warmup) ─ # nvidia-smi might not return data in some odd container states; treat as # "unknown" rather than crashing so the rest of the boot can still run. # # IMPORTANT: take the first line with `awk 'NR==1'`, NOT `head -1`. On a # multi-GPU box nvidia-smi emits one line per GPU; `head -1` closes the pipe # after the first line, nvidia-smi gets SIGPIPE writing line 2, and with # `set -o pipefail` + `set -e` that 141 kills the whole entrypoint. `awk` # reads to EOF so the writer never sees a closed pipe. (Single-GPU boxes # only emit one line, which is why this only bit multi-GPU launches.) GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | awk 'NR==1' | xargs) GPU_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | awk 'NR==1' | tr -d '.') [ -z "$GPU_NAME" ] && GPU_NAME="unknown" [ -z "$GPU_CAP" ] && GPU_CAP="00" echo "[OK] GPU: $GPU_NAME (sm_${GPU_CAP})" # Slow GPUs (T4, older Tesla cards): warm only the two square presets we # benchmark against (512² and 1024²) and extend the readiness deadline. # Skipping warmup entirely would shift the multi-minute first-call JIT # onto the first user request, which corrupts benchmark numbers — better # to bake it into boot. BONSAI_WARMUP_SHAPES + BACKEND_READY_TIMEOUT can # be overridden via Space Variables if you want different shapes or a # longer/shorter deadline. case "$GPU_NAME" in *T4*|*P100*|*V100*|*K80*|*M60*) echo "[WARN] $GPU_NAME is slow — warming only 512x512 + 1024x1024." echo " Extending readiness timeout to 30 min for the longer JIT." : "${BONSAI_WARMUP_SHAPES:=512x512,1024x1024}" : "${BACKEND_READY_TIMEOUT:=1800}" export BONSAI_WARMUP_SHAPES BACKEND_READY_TIMEOUT ;; esac # ── persistent storage detection ───────────────────────────────────────────── # Try to use /data (a Storage Bucket if mounted) for the model + kernel # caches + stats. Every filesystem op is wrapped so that if anything fails # midway — bucket detached mid-build, mkdir denied, symlink races — we # silently fall back to ephemeral storage and keep going. The dashboard # banner alerts the user via BONSAI_PERSISTENT_STORAGE. _setup_persistent() { [ -d /data ] && [ -w /data ] || return 1 # Kernel caches namespaced by compute capability so a tier swap (e.g. # L40S sm_89 → T4 sm_75 → back to L40S) doesn't pollute either GPU's # autotune configs / Triton kernels. _gemlite_dir="/data/cache/gemlite-sm${GPU_CAP}" _triton_dir="/data/cache/triton-sm${GPU_CAP}" # One-shot migration: if a non-namespaced cache exists from older # builds, move it under the current GPU's namespace so we don't lose # the pre-existing autotune work. if [ -d /data/cache/gemlite ] && [ ! -e "$_gemlite_dir" ]; then echo "[INFO] migrating /data/cache/gemlite → gemlite-sm${GPU_CAP}" mv /data/cache/gemlite "$_gemlite_dir" 2>/dev/null || true fi if [ -d /data/cache/triton ] && [ ! -e "$_triton_dir" ]; then echo "[INFO] migrating /data/cache/triton → triton-sm${GPU_CAP}" mv /data/cache/triton "$_triton_dir" 2>/dev/null || true fi mkdir -p /data/models "$_gemlite_dir" "$_triton_dir" /data/state /data/state/daily 2>/dev/null || return 1 rm -rf "$APP_DIR/models" 2>/dev/null || return 1 ln -s /data/models "$APP_DIR/models" 2>/dev/null || return 1 mkdir -p "$APP_DIR/outputs" 2>/dev/null || return 1 rm -rf "$APP_DIR/outputs/.gemlite_cache" "$APP_DIR/outputs/.triton_cache" 2>/dev/null || true ln -s "$_gemlite_dir" "$APP_DIR/outputs/.gemlite_cache" 2>/dev/null || return 1 ln -s "$_triton_dir" "$APP_DIR/outputs/.triton_cache" 2>/dev/null || return 1 return 0 } if _setup_persistent; then echo "[OK] /data Storage Bucket attached — model + caches + counters will persist" export BONSAI_STATE_DIR=/data/state export BONSAI_PERSISTENT_STORAGE=1 else if [ -d /data ]; then echo "[WARN] /data is present but couldn't be set up (read-only? quota?). Falling back to ephemeral." else echo "[WARN] /data not mounted — model, kernel caches, and dashboard" echo " counters will reset on every Space restart. Enable a" echo " Storage Bucket in Space Settings → Storage to fix." fi export BONSAI_STATE_DIR="$APP_DIR/outputs/.state" export BONSAI_PERSISTENT_STORAGE=0 mkdir -p "$BONSAI_STATE_DIR/daily" 2>/dev/null || true fi # ── shared IP-hash pepper across all replicas ──────────────────────────────── # Every replica must hash IPs with the same pepper so unique-user counts # don't double across replicas. Extract from state.json if present (so the # pepper survives restarts), else generate a fresh one. Each worker reads # this via env, regardless of whether it loads cumulative state. if [ -f "$BONSAI_STATE_DIR/state.json" ]; then BONSAI_IP_PEPPER=$(python3 - "$BONSAI_STATE_DIR/state.json" <<'PY' 2>/dev/null || true import json, sys try: with open(sys.argv[1]) as f: print(json.load(f).get("ip_pepper") or "") except Exception: pass PY ) fi if [ -z "${BONSAI_IP_PEPPER:-}" ]; then BONSAI_IP_PEPPER=$(python3 -c "import secrets; print(secrets.token_hex(16))") fi export BONSAI_IP_PEPPER # Warm only the two square presets users hit most often (512² and 1024²). # Other resolutions JIT on first user request and join the on-disk caches # (/data/cache/{gemlite,triton}-smXX/) organically. The warmup-skip sentinel # (warmup-done.json next to gemlite autotune) tracks completed (backend,shape) # pairs across boots, so subsequent boots skip even these two if they're # already cached. # # Why so few shapes: multi-GPU boots collide during warmup — all N workers # race for /data bandwidth + CPU during the gemlite layer pack, and we've # seen 4-worker launches hang past BACKEND_READY_TIMEOUT. Two shapes covers # the common case (most users render at 512² or 1024²) without inflating # cold-boot wall time. : "${BONSAI_WARMUP_SHAPES:=512x512,1024x1024}" export BONSAI_WARMUP_SHAPES # Binary warmup disabled by default. When enabled, every replica swaps to # the binary transformer simultaneously after primary warmup — 4 parallel # 3.5 GB state_dict reads from /data + 4 parallel gemlite layer packs. # We've seen this hang multi-GPU boots indefinitely. First binary-arm click # pays a one-time JIT cost (~30s for an unwarmed shape, after which the # cache covers it forever). # # To re-enable on single-GPU rigs where the collision doesn't apply: # set Space Variable BONSAI_WARMUP_EXTRA_BACKENDS=bonsai-binary-gemlite : "${BONSAI_WARMUP_EXTRA_BACKENDS:=}" export BONSAI_WARMUP_EXTRA_BACKENDS # Restrict the studio picker to ternary only. With both variants exposed + # nginx least_conn routing, mixed traffic makes each replica thrash its # resident transformer (ternary↔binary, ~1s per swap on warm cache). A # single-variant picker eliminates that — no binary requests, no swaps. # # This ONLY controls what the picker offers. Binary weights are still # downloaded and wired below, so binary stays fully servable via a direct API # call and re-exposing it in the UI is a one-liner: set this Space Variable to # "bonsai-ternary,bonsai-binary" (or unset it for the default both). Keeping # download/wiring independent of picker visibility is deliberate — when the two # were tied together, hiding binary left its transformer path unset and every # worker crashed on boot. : "${BONSAI_SUPPORTED_FAMILIES:=bonsai-ternary}" export BONSAI_SUPPORTED_FAMILIES # ── token handling ─────────────────────────────────────────────────────────── # The model repos are PUBLIC now, so no token is needed — download_model.sh # calls snapshot_download with no `token=` arg. huggingface_hub still AUTO-READS # HF_TOKEN / HUGGING_FACE_HUB_TOKEN from the environment and sends it on every # request, and a stale/revoked token there makes the Hub return 401 even on # public repos (which crashed the boot). So we scrub any token from the env and # always download anonymously. echo "[INFO] model repos are public — downloading anonymously (any HF token in env is ignored)" unset HF_TOKEN HUGGING_FACE_HUB_TOKEN HUGGINGFACEHUB_API_TOKEN BONSAI_TOKEN 2>/dev/null || true # ── model download / sync ──────────────────────────────────────────────────── # Download BOTH ternary + binary, regardless of which the picker exposes. Each # repo is ~3.5 GB; first cold boot pulls ~7 GB total, but the Storage Bucket # (/data/models, symlinked above) keeps them across restarts so later boots # just etag-check. Shipping binary even while the UI hides it keeps it a # flip-the-variable away from servable, and guarantees its transformer path # resolves so GpuPipeline construction can't fail — the picker restriction # lives entirely in the /backends override, never here. # # We *always* invoke download_model.sh on boot (no file-exists guard). Under # the hood it calls huggingface_hub.snapshot_download with `local_dir` set, # which HEADs each file in the repo and skips any whose etag matches what's # already on disk — so cached boots cost ~10-30s of metadata checks instead # of a full redownload. The upside: pushing new weights to HF auto-propagates # on the next Space restart without a force flag or manual cache wipe. MODEL_DIR="$APP_DIR/models/bonsai-image-4B-ternary-gemlite" BINARY_MODEL_DIR="$APP_DIR/models/bonsai-image-4B-binary-gemlite" echo "==> syncing bonsai-image-ternary-4B-gemlite-2bit ..." ./scripts/download_model.sh --model ternary-gemlite echo "==> syncing bonsai-image-binary-4B-gemlite-1bit ..." ./scripts/download_model.sh --model binary-gemlite # ── htpasswd for the dashboard ─────────────────────────────────────────────── # DASHBOARD_KEY is a Space Secret; fall back to a sentinel that prints a # big warning so missing-secret is obvious in the build log but the Space # still comes up (useful while iterating). if [ -n "${DASHBOARD_KEY:-}" ]; then HASH=$(openssl passwd -apr1 "$DASHBOARD_KEY") printf 'admin:%s\n' "$HASH" > /tmp/.htpasswd echo "[OK] dashboard: auth enabled (user=admin)" else echo "[WARN] DASHBOARD_KEY not set — /dash-... is open with admin:open" printf 'admin:$apr1$open$open\n' > /tmp/.htpasswd fi # ── nginx scratch dirs ─────────────────────────────────────────────────────── mkdir -p /tmp/nginx-body /tmp/nginx-proxy /tmp/nginx-fastcgi /tmp/nginx-uwsgi /tmp/nginx-scgi # ── pre-seed dashboard JSON so the page doesn't 502 before first scrape ────── printf '{"updated_at":null,"persistent_storage":%s,"summary_total":{"requests":0,"success":0,"errors":0},"summary_today":{"requests":0,"unique_users":0},"summary_7d":{"requests":0,"unique_users":0},"by_shape":{},"requests_by_hour":[],"requests_by_day":[],"recent":[]}\n' \ "$([ "${BONSAI_PERSISTENT_STORAGE:-0}" = "1" ] && echo true || echo false)" \ > /tmp/analytics.json echo '{"ts":null,"gpus":[]}' > /tmp/gpu-stats.json # ── pin model paths once; shared across all workers ────────────────────────── # backend_gpu/pipeline_gpu.py reads SEPARATE env vars per variant # (TERNARY_TRANSFORMER_PATH vs BINARY_TRANSFORMER_PATH) and the packed # transformer subdir name differs per variant (transformer-gemlite-int2 # for ternary, transformer-gemlite-int1 for binary). Glob each variant's # dir for whichever transformer-gemlite-* it actually ships and assign to # the right env var. Without the BINARY env var set, the pipeline falls # back to its hardcoded /root/models/bonsai-binary/ default → PermissionError # on a non-root container the moment a user picks binary in the UI. # # Note: text_encoder + vae + tokenizer are the SAME artifacts across both # variants (Qwen3-4B-4bit + BFL VAE). Pointing them at the ternary copy # is fine; binary's copy of these files sits idle on disk after download. # That's a one-time ~1 GB of duplication on disk for the simplicity of # letting download_model.sh pull the standard HF layout for each repo. export MFLUX_STUDIO_GPU_DEFAULT_BACKEND="bonsai-ternary-gemlite" # `awk 'NR==1'` not `head -1` — same SIGPIPE-under-pipefail reasoning as the # nvidia-smi calls above: if a model dir ever has >1 transformer-gemlite-* # match, head closes the pipe early and ls dies 141, killing the script. _ternary_transformer_dir=$(ls -d "$MODEL_DIR"/transformer-gemlite-* 2>/dev/null | awk 'NR==1') if [ -z "$_ternary_transformer_dir" ]; then echo "[ERR] no transformer-gemlite-* subdir under $MODEL_DIR" >&2 exit 1 fi export MFLUX_STUDIO_GPU_TERNARY_TRANSFORMER_PATH="$_ternary_transformer_dir" # Wire the binary transformer too. GpuPipeline.__init__ validates EVERY # backend's path up front (binary AND ternary) even though it loads only the # default one lazily — so this must resolve or every worker dies on boot with # "binary transformer path is unset". We always download binary above, so the # glob always finds the real -int1 dir; the picker just hides it from users. _binary_transformer_dir=$(ls -d "$BINARY_MODEL_DIR"/transformer-gemlite-* 2>/dev/null | awk 'NR==1') if [ -z "$_binary_transformer_dir" ]; then echo "[ERR] no transformer-gemlite-* subdir under $BINARY_MODEL_DIR" >&2 exit 1 fi export MFLUX_STUDIO_GPU_BINARY_TRANSFORMER_PATH="$_binary_transformer_dir" export MFLUX_STUDIO_GPU_TEXT_ENCODER_PATH="$MODEL_DIR/text_encoder-hqq-4bit" export MFLUX_STUDIO_GPU_VAE_PATH="$MODEL_DIR/vae" export MFLUX_STUDIO_GPU_TOKENIZER_PATH="$MODEL_DIR/text_encoder-hqq-4bit/tokenizer" # ── detect GPUs + spawn one uvicorn per device ─────────────────────────────── GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l || echo 1) [ "$GPU_COUNT" -lt 1 ] && GPU_COUNT=1 echo "[OK] detected $GPU_COUNT GPU(s)" # Stagger consecutive worker starts. Without this, all N uvicorns hit the # /data bucket simultaneously, contending for ~5 GB state_dict reads + the # CPU-bound fp16 cast + gemlite layer conversion. We've seen 4-worker # launches blow through BACKEND_READY_TIMEOUT this way. Staggering by ~30s # (a hair more than the single-worker transformer-load wall time observed # on warm bucket / sm_86) lets each worker get past torch.load + gemlite # convert before the next starts touching the same files. WORKER_START_STAGGER_SECONDS="${BONSAI_WORKER_START_STAGGER_SECONDS:-30}" BACKEND_URLS="" UPSTREAM_SERVERS="" for i in $(seq 0 $((GPU_COUNT - 1))); do PORT=$((8000 + i)) # Per-replica GPU name (mixed-GPU rigs are rare but possible — look it # up by physical index rather than reuse the top-level GPU_NAME). REPLICA_GPU=$(nvidia-smi --query-gpu=name --format=csv,noheader -i "$i" 2>/dev/null | awk 'NR==1' | xargs) [ -z "$REPLICA_GPU" ] && REPLICA_GPU="$GPU_NAME" echo "==> starting backend on GPU $i ($REPLICA_GPU) → :$PORT (warmup: $BONSAI_WARMUP_SHAPES)" # BONSAI_REPLICA_INDEX: only replica 0 seeds counters from state.json; # replicas 1+ start at 0 and report deltas. metrics_pusher sums them → # correct cumulative without N-way inflation. # BONSAI_GPU_NAME: surfaced via /metrics so the pusher can aggregate # request counts/latencies per GPU model for the dashboard. CUDA_VISIBLE_DEVICES=$i BONSAI_REPLICA_INDEX=$i BONSAI_GPU_NAME="$REPLICA_GPU" \ uvicorn space.app:app \ --host 127.0.0.1 --port "$PORT" \ --no-access-log & UPSTREAM_SERVERS="${UPSTREAM_SERVERS} server 127.0.0.1:$PORT;"$'\n' [ -n "$BACKEND_URLS" ] && BACKEND_URLS="$BACKEND_URLS," BACKEND_URLS="${BACKEND_URLS}http://127.0.0.1:$PORT" # Sleep between consecutive worker starts (skip after the last one). # Set BONSAI_WORKER_START_STAGGER_SECONDS=0 to disable if cold-boot # wall time matters more than first-boot reliability. if [ "$i" -lt "$((GPU_COUNT - 1))" ] && [ "$WORKER_START_STAGGER_SECONDS" -gt 0 ]; then echo " ↳ sleeping ${WORKER_START_STAGGER_SECONDS}s before next worker (avoid /data + CPU contention)" sleep "$WORKER_START_STAGGER_SECONDS" fi done # At N>1 use least_conn (variable-duration requests — see space/nginx.conf). if [ "$GPU_COUNT" -gt 1 ]; then LB_DIRECTIVE=" least_conn;"$'\n' else LB_DIRECTIVE="" fi printf 'upstream bonsai_workers {\n%s%s}\n' "$LB_DIRECTIVE" "$UPSTREAM_SERVERS" > /tmp/nginx-upstream.conf export BACKEND_URLS # ── wait for backend readiness ─────────────────────────────────────────────── # Workers only answer /backends after lifespan finishes (kernels compiled + # warmup shapes JITed). We poll the first one as a proxy for "ready enough." _ready_timeout="${BACKEND_READY_TIMEOUT:-600}" echo "==> waiting for backend on :8000 (up to ${_ready_timeout}s) ..." for i in $(seq 1 "$_ready_timeout"); do if curl -fsS -m 2 http://127.0.0.1:8000/backends > /dev/null 2>&1; then echo "[OK] backend ready after ${i}s" break fi sleep 1 if [ "$i" -eq "$_ready_timeout" ]; then echo "[ERR] backend did not come up within ${_ready_timeout}s" >&2 exit 1 fi done # ── frontend (next start) on internal :3000 ────────────────────────────────── echo "==> starting frontend (next start) on :3000" (cd vendor/image-studio/frontend && exec npm start -- --port 3000 --hostname 127.0.0.1) & # ── metrics_pusher sidecar (watchdog restart on crash) ─────────────────────── start_metrics_pusher() { while true; do echo "[watchdog] starting metrics_pusher.py" python3 /home/user/app/space/metrics_pusher.py || true echo "[watchdog] metrics_pusher.py exited, restarting in 5s" sleep 5 done } start_metrics_pusher & # ── nginx — front everything on :7860 (the HF-exposed port) ────────────────── echo "==> nginx on :7860" exec nginx -c /home/user/app/space/nginx.conf -p /home/user/app/