#!/usr/bin/env bash
# Live status of every causalgrok_camelyon_v2 process and run dir.
# Survives SSH and is safe to call repeatedly.
#
# Usage:
#   bash scripts/monitor_all.sh                 # one-shot snapshot
#   watch -n 30 bash scripts/monitor_all.sh     # auto-refresh every 30s
#
# Environment:
#   MONITOR_RUN_GLOB  glob matched against run ids; a run that is not
#                     currently active is listed only when its id matches
#                     (default: 20260505-*).

# Don't fail the whole script on any single subcommand error —
# we want a tolerant dashboard, not a strict pipeline.
set +e

ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# Every path below is relative to the repo root; without this check a failed
# cd would make the dashboard silently report on the wrong directory.
if [[ -z "${ROOT}" ]] || ! cd "${ROOT}"; then
  printf 'monitor_all: cannot resolve repository root from %s\n' "$0" >&2
  exit 1
fi

readonly PROC_PATTERN="causalgrok_camelyon_v2"
readonly RUNS_DIR="experiments/runs"
readonly RUN_GLOB="${MONITOR_RUN_GLOB:-20260505-*}"

# --- ANSI helpers: wrap the arguments in an SGR sequence, no newline --------
bold()  { printf "\033[1m%s\033[0m" "$*"; }
green() { printf "\033[32m%s\033[0m" "$*"; }
red()   { printf "\033[31m%s\033[0m" "$*"; }
gray()  { printf "\033[90m%s\033[0m" "$*"; }

# Current UTC time as an ISO-8601 stamp.
stamp() { date -u +%FT%TZ; }

#######################################
# Pads TEXT to WIDTH columns first and colours it afterwards, so the escape
# bytes never count against the column width.
# Arguments: $1 - SGR colour code (e.g. 32), $2 - width, $3 - text
# Outputs:   the coloured, padded cell (no newline)
#######################################
color_cell() {
  local code=$1 width=$2 text=$3 padded
  printf -v padded '%-*s' "$width" "$text"
  printf '\033[%sm%s\033[0m' "$code" "$padded"
}

#######################################
# Formats nvidia-smi CSV rows (used,free,total,util,temp) read from stdin.
# Splits on "," plus any following spaces so the values carry no padding.
# Outputs: one formatted line per input row
#######################################
format_gpu_rows() {
  awk -F', *' '{printf " used=%sMB free=%sMB total=%sMB util=%s%% temp=%s°C\n", $1, $2, $3, $4, $5}'
}

#######################################
# Computes the completion percentage of a run.
# Arguments: $1 - last epoch seen, $2 - total epochs
# Outputs:   "NN%" or "?" when either value is missing, non-numeric or zero
#######################################
percent_done() {
  local last=$1 total=$2 pct
  if [[ ! "$last" =~ ^[0-9]+$ || ! "$total" =~ ^[0-9]+$ ]]; then
    printf '?'
    return 0
  fi
  # 10# forces base 10 so a zero-padded epoch such as "08" is not read as octal.
  if (( 10#$last == 0 || 10#$total == 0 )); then
    printf '?'
    return 0
  fi
  # Values go in through -v rather than being pasted into the awk program.
  pct=$(awk -v a="$last" -v b="$total" 'BEGIN { printf "%.0f", (a / b) * 100 }')
  printf '%s%%' "$pct"
}

#######################################
# Counts periodic checkpoints (checkpoints/ep*.pt) inside a run directory.
# Arguments: $1 - run directory
# Outputs:   the number of matching files
#######################################
count_ckpts() {
  local dir=$1 f n=0
  for f in "$dir"/checkpoints/ep*.pt; do
    # An unmatched glob stays literal; -e filters that case out.
    if [[ -e "$f" ]]; then
      n=$((n + 1))
    fi
  done
  printf '%s' "$n"
}

#######################################
# Reads the maximum numeric 'ood_acc' value from a history JSON file.
# Arguments: $1 - path to history.json
# Outputs:   the maximum with 3 decimals, or "?" if the file is missing,
#            unreadable, has no numeric ood_acc, or python3 is unavailable
#######################################
best_ood_from_history() {
  local history=$1 result
  if [[ ! -f "$history" ]]; then
    printf '?'
    return 0
  fi
  # The path is passed as argv so quotes in it cannot break the Python source.
  result=$(python3 - "$history" 2>/dev/null <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        rows = json.load(fh)
    oods = [r['ood_acc'] for r in rows
            if isinstance(r, dict) and isinstance(r.get('ood_acc'), (int, float))]
    print(f'{max(oods):.3f}' if oods else '?')
except Exception:
    print('?')
PY
  )
  printf '%s' "${result:-?}"
}

# 1. GPU snapshot
section_gpu() {
  echo
  echo "$(bold "GPU")"
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo " $(gray "(nvidia-smi not available)")"
    return 0
  fi
  nvidia-smi --query-gpu=memory.used,memory.free,memory.total,utilization.gpu,temperature.gpu \
    --format=csv,noheader,nounits \
    | format_gpu_rows
}

# 2. All causalgrok_camelyon_v2 processes
section_processes() {
  local -a pids=()
  local pid ppid etime pcpu pmem rid detached

  while IFS= read -r pid; do
    # pgrep -f matches against full command lines, which can include this
    # script's own path; never list ourselves as a training process.
    if [[ -n "$pid" && "$pid" != "$$" ]]; then
      pids+=("$pid")
    fi
  done < <(pgrep -f "$PROC_PATTERN")

  echo
  echo "$(bold "Active training processes: ${#pids[@]}")"
  if (( ${#pids[@]} == 0 )); then
    echo " $(gray "(none)")"
    return 0
  fi

  printf " %-9s %-12s %-8s %-6s %-6s %s\n" "PID" "ELAPSED" "DETACHED" "%CPU" "%MEM" "RUN_ID"
  for pid in "${pids[@]}"; do
    # A process may exit between pgrep and ps; empty fields are shown as "?".
    ppid=$(ps -o ppid= -p "$pid" 2>/dev/null | tr -d ' ')
    etime=$(ps -o etime= -p "$pid" 2>/dev/null | tr -d ' ')
    pcpu=$(ps -o pcpu= -p "$pid" 2>/dev/null | tr -d ' ')
    pmem=$(ps -o pmem= -p "$pid" 2>/dev/null | tr -d ' ')
    rid=$(ps -o cmd= -p "$pid" 2>/dev/null | grep -oP 'experiments/runs/\K[^ ]+')
    # A parent PID of 1 is reported as "detached".
    if [[ "$ppid" == "1" ]]; then
      detached=$(color_cell 32 8 "yes")
    else
      detached=$(color_cell 31 8 "PPID=${ppid:-?}")
    fi
    printf " %-9s %-12s %s %-6s %-6s %s\n" \
      "$pid" "${etime:-?}" "$detached" "${pcpu:-?}" "${pmem:-?}" "${rid:-?}"
  done
}

# 3. Per-run progress (epoch, latest OOD, periodic ckpts)
section_runs() {
  local d rid log pid_in_dir is_active last_ep total_ep pct ckpts best_ood latest state_icon

  echo
  echo "$(bold "Per-run progress")"
  echo " (epoch ← latest train.log; ckpts ← ep*.pt count; best_ood ← latest history.json)"
  echo
  printf " %-46s %-7s %-9s %-8s %-8s %s\n" "RUN_ID" "EPOCH" "% DONE" "CKPTS" "BEST_OOD" "LATEST"

  for d in "$RUNS_DIR"/*/; do
    d=${d%/}
    rid=${d##*/}
    log="$d/logs/train.log"
    [[ -f "$log" ]] || continue

    # Only show runs matching RUN_GLOB or runs whose recorded PID is alive.
    pid_in_dir=""
    if [[ -f "$d/run.pid" ]]; then
      pid_in_dir=$(tr -d '[:space:]' < "$d/run.pid")
    fi
    is_active="no"
    # Probe only well-formed positive PIDs: "0" or "-1" would make kill -0
    # address whole process groups and falsely report the run as active.
    if [[ "$pid_in_dir" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid_in_dir" 2>/dev/null; then
      is_active="yes"
    fi
    # shellcheck disable=SC2053 — RUN_GLOB is deliberately unquoted: it is a glob.
    if [[ "$is_active" == "no" && "$rid" != $RUN_GLOB ]]; then
      continue
    fi

    # Latest epoch from train.log (0 when no epoch line was found).
    last_ep=$(grep -oP "ep\s+\K[0-9]+" "$log" 2>/dev/null | tail -1)
    last_ep=${last_ep:-0}

    # Total epochs from the log header (empty when the header is absent).
    total_ep=$(grep -oP "Camelyon17 v2 \| \K[0-9]+" "$log" 2>/dev/null | head -1)

    pct=$(percent_done "$last_ep" "$total_ep")
    ckpts=$(count_ckpts "$d")
    best_ood=$(best_ood_from_history "$d/results/history.json")

    # Latest epoch line, condensed: drop the "ep N |" prefix and the ‖W‖ tail.
    latest=$(grep -E "ep\s+[0-9]+ \| tr" "$log" 2>/dev/null | tail -1 \
      | sed -E 's/^.*ep\s+[0-9]+ \|//; s/\| ‖W‖.*//' | tr -s ' ')

    # ● running, ✓ stopped at the final epoch, ✗ stopped early.
    state_icon=$(green "●")
    if [[ "$is_active" == "no" ]]; then
      if [[ "$last_ep" == "$total_ep" ]]; then
        state_icon=$(green "✓")
      else
        state_icon=$(red "✗")
      fi
    fi

    printf " %s %-44s %-7s %-9s %-8s %-8s %s\n" \
      "$state_icon" "$rid" "$last_ep" "$pct" "$ckpts" "$best_ood" "${latest:0:60}"
  done
}

# 4. Disk pressure
section_disk() {
  echo
  echo "$(bold "Disk usage on experiments/runs")"
  du -sh "$RUNS_DIR/" 2>/dev/null | awk '{print " " $1 " (" $2 ")"}'
}

echo "═══════════════════════════════════════════════════════════════════════════════"
echo " CausalGrok — all-runs monitor $(stamp)"
echo "═══════════════════════════════════════════════════════════════════════════════"

section_gpu
section_processes
section_runs
section_disk

echo
echo "$(gray "Re-run with: watch -n 30 bash scripts/monitor_all.sh")"