#!/usr/bin/env bash
# goblin_runner.sh — wrap the user's training command with rocprofv3 + amd-smi.
#
# Architecture: brainstorming/architecture.md §5 (Profiling Pipeline).
#
# Inputs (env vars):
#   USER_SCRIPT     Path to the workload python file. Required.
#   OUT_DIR         Directory to write trace.csv / torch_profile.json /
#                   amd_smi.csv. Required. Created if missing.
#   STEPS           --max_steps argument forwarded to the user script. Default 10.
#   GOBLIN_GPU_ID   ROCR_VISIBLE_DEVICES value. Default 0.
#
# Outputs (in $OUT_DIR):
#   trace.csv           rocprofv3 kernel trace
#   torch_profile.json  torch.profiler chrome trace (the user script writes this)
#   amd_smi.csv         amd-smi telemetry; every variant below passes a
#                       watch/interval argument of 1 (amd-smi documents that
#                       value as seconds)
#   stdout.log          user-script stdout
#   stderr.log          user-script stderr
#
# Exit status: 0 on success; the rocprofv3 exit code when profiling fails;
# 1 when OUT_DIR cannot be created; cp's status when trace normalisation fails.
#
# Failure mode: any non-zero rocprofv3 exit short-circuits the script. On
# failure we dump the captured stdout/stderr logs to THIS script's own stderr
# so `subprocess.run(capture_output=True)` in LiveRunner sees the real error
# (not just an empty `[]` tail). LiveRunner then archives the whole OUT_DIR
# under bench_cache/last_runner_failure_/ so you can inspect after-the-fact.

# NOTE: errexit (-e) is deliberately NOT enabled. Every status that matters is
# checked explicitly below, so the rocprofv3 exit code can be captured and
# reported instead of aborting the script before the logs are dumped.
set -uo pipefail

: "${USER_SCRIPT:?USER_SCRIPT env var is required}"
: "${OUT_DIR:?OUT_DIR env var is required}"
STEPS="${STEPS:-10}"

# Without errexit a failed mkdir would otherwise fall through to a cascade of
# "No such file or directory" redirect errors; fail fast with a clear message.
if ! mkdir -p -- "$OUT_DIR"; then
  echo "goblin_runner.sh: cannot create OUT_DIR: $OUT_DIR" >&2
  exit 1
fi

# Pin to a single MI300X so concurrent benchmark runs don't fight.
export ROCR_VISIBLE_DEVICES="${GOBLIN_GPU_ID:-0}"

#######################################
# Start a background amd-smi telemetry sidecar (HBM / power).
#
# Different amd-smi versions ship slightly different flag names (--interval /
# --watch / no flag at all in older builds), so try a few variants and
# gracefully degrade. Telemetry is optional — profile_parser tolerates a
# missing amd_smi.csv.
#
# Globals:   OUT_DIR (read), AMD_SMI_PID (written on success)
# Arguments: none
# Outputs:   $OUT_DIR/amd_smi.csv and $OUT_DIR/amd_smi.err (stdout / stderr
#            of the most recently tried variant); $OUT_DIR/amd_smi.skipped
#            when no variant survives.
# Returns:   0 if a sidecar is running, 1 if telemetry was skipped.
#######################################
start_amd_smi() {
  if ! command -v amd-smi >/dev/null 2>&1; then
    echo "amd-smi not on PATH; skipping telemetry sidecar" \
      > "$OUT_DIR/amd_smi.err"
    return 1
  fi

  # amd-smi telemetry surface drifted hard across rocm versions. We try
  # subcommands in this order, newest-first; first one that survives 0.5s
  # is kept:
  #
  #   1. `amd-smi metric --watch --mem-usage --usage --csv` (ROCm 7.x).
  #      Different code path from `monitor`, so it dodges the known
  #      `AttributeError: 'Namespace' object has no attribute 'violation'`
  #      crash that some ROCm 7.x point releases ship inside the `monitor`
  #      subcommand. Produces VRAM_USED_MB / GFX_ACTIVITY / MEM_ACTIVITY
  #      columns that profile_parser._pick_column already recognises.
  #
  #   2. `amd-smi monitor --watch ...` (ROCm 6.x and the 7.x builds
  #      where `monitor` actually works). Kept as a fallback for older
  #      installs that may not have the `metric --watch` form.
  #
  #   3. `amd-smi monitor --interval 1` (pre-6.0) and finally bare
  #      `amd-smi monitor` (very old / implicit-all).
  local variants=(
    # ROCm 7.x: prefer the metric subcommand — bypasses the monitor bug.
    "amd-smi metric --watch 1 --mem-usage --usage --csv"
    "amd-smi metric -w 1 -m -u --csv"
    # ROCm 7.x monitor (works on builds without the violation-attribute bug)
    "amd-smi monitor --watch 1 --power-usage --gfx --mem --vram-usage --csv"
    "amd-smi monitor -w 1 -p -u -m -v --csv"
    # ROCm 6.x intermediate forms
    "amd-smi monitor --watch 1 --csv"
    "amd-smi monitor --watch 1"
    # Older / fallback
    "amd-smi monitor --interval 1 --csv"
    "amd-smi monitor --interval 1"
    "amd-smi monitor --csv"
    "amd-smi monitor"
  )

  local cmd pid
  local -a argv
  for cmd in "${variants[@]}"; do
    # Split the variant on whitespace into an argv array. Unlike an unquoted
    # $cmd expansion this performs no pathname globbing on the words.
    read -r -a argv <<< "$cmd"
    "${argv[@]}" > "$OUT_DIR/amd_smi.csv" 2> "$OUT_DIR/amd_smi.err" &
    pid=$!
    sleep 0.5
    if kill -0 "$pid" 2>/dev/null; then
      AMD_SMI_PID=$pid
      return 0
    fi
    # The probe already exited (unsupported flags / crash): reap it so it
    # does not linger as a zombie for the whole profiling run. Its exit
    # status is irrelevant, hence the `|| true`.
    wait "$pid" 2>/dev/null || true
  done

  # All variants failed. Telemetry is optional; mark it as deliberately
  # skipped and let the main run proceed — profile_parser tolerates a
  # missing/empty amd_smi.csv.
  echo "amd-smi monitor: no compatible flag set in this build; telemetry skipped" \
    > "$OUT_DIR/amd_smi.skipped"
  rm -f "$OUT_DIR/amd_smi.csv"
  return 1
}

AMD_SMI_PID=
start_amd_smi || true # never block the main run on telemetry

#######################################
# EXIT trap: stop the telemetry sidecar if it is still running.
# Globals:   AMD_SMI_PID (read)
# Returns:   0 always (errors from kill/wait are intentionally ignored).
#######################################
cleanup() {
  if [[ -n "${AMD_SMI_PID:-}" ]] && kill -0 "$AMD_SMI_PID" 2>/dev/null; then
    kill "$AMD_SMI_PID" 2>/dev/null || true
    wait "$AMD_SMI_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT

#######################################
# Dump the captured logs to *our* stderr on a non-zero rocprofv3 exit so the
# Python subprocess that spawned us actually sees the real error message.
# Without this the redirected stdout.log / stderr.log live inside the tempdir
# only and LiveRunner's stderr-tail check sees nothing.
#
# Globals:   USER_SCRIPT, OUT_DIR, ROCR_VISIBLE_DEVICES (read)
# Arguments: $1 - exit code to report (optional; defaults to $? at call time)
# Outputs:   diagnostic report on stderr when the code is non-zero
# Returns:   the exit code it was given
#######################################
dump_failure_logs() {
  # $? is still the caller's last status here because nothing has run yet
  # inside the function, so the no-argument form keeps working.
  local code=${1:-$?}
  if [[ $code -ne 0 ]]; then
    {
      echo "=== goblin_runner.sh failed with exit code $code ==="
      echo "=== USER_SCRIPT: $USER_SCRIPT ==="
      echo "=== OUT_DIR: $OUT_DIR ==="
      echo "=== ROCR_VISIBLE_DEVICES: ${ROCR_VISIBLE_DEVICES:-unset} ==="
      echo
      echo "=== last 50 lines of $OUT_DIR/stdout.log ==="
      tail -n 50 "$OUT_DIR/stdout.log" 2>/dev/null || echo "(stdout.log missing)"
      echo
      echo "=== last 50 lines of $OUT_DIR/stderr.log ==="
      tail -n 50 "$OUT_DIR/stderr.log" 2>/dev/null || echo "(stderr.log missing)"
      echo
      echo "=== last 20 lines of $OUT_DIR/amd_smi.err ==="
      tail -n 20 "$OUT_DIR/amd_smi.err" 2>/dev/null || echo "(amd_smi.err missing)"
    } 1>&2
  fi
  return "$code"
}

# rocprofv3 collects HSA + kernel traces. The user script is responsible for
# writing torch_profile.json (the agent injects torch.profiler around the
# training loop in Phase 3). --output-format csv keeps parsing simple.
rocprofv3 \
  --hsa-trace --kernel-trace \
  --output-directory "$OUT_DIR" \
  --output-file trace \
  --output-format csv \
  -- \
  python "$USER_SCRIPT" \
  --max_steps="$STEPS" \
  --torch_profile_out="$OUT_DIR/torch_profile.json" \
  > "$OUT_DIR/stdout.log" 2> "$OUT_DIR/stderr.log"
ROCPROF_EXIT=$?

if [[ $ROCPROF_EXIT -ne 0 ]]; then
  # dump_failure_logs returns the code it was given; the `|| true` only keeps
  # that (expected) non-zero return from being mistaken for a new failure.
  dump_failure_logs "$ROCPROF_EXIT" || true
  exit "$ROCPROF_EXIT"
fi

# rocprofv3 may write trace_kernel_trace.csv etc. — normalize to trace.csv so
# profile_parser has one stable filename to look for.
if [[ ! -f "$OUT_DIR/trace.csv" ]]; then
  for candidate in "$OUT_DIR"/trace*kernel*.csv "$OUT_DIR"/*kernel_trace.csv; do
    # An unmatched glob stays as the literal pattern; the -f test skips it.
    if [[ -f "$candidate" ]]; then
      # A failed copy leaves no usable trace.csv: abort with cp's own status.
      cp -- "$candidate" "$OUT_DIR/trace.csv" || exit
      break
    fi
  done
fi

exit 0