#!/usr/bin/env bash
#
# Validate FP8 KV cache: start server, check KV capacity, run benchmark.
# Compares against BF16 baseline numbers.
#
# Usage:
#   ./validate-fp8.sh                                     # default FP8 config
#   ./validate-fp8.sh configs/production-fp8kv-safe.env   # safe fallback
#
# Arguments:
#   $1 - optional path to an env-style config file that is sourced by this
#        script and also passed to serve.sh. Defaults to
#        configs/production-fp8kv.env next to this script.
#
# Variables that must be set once the config has been sourced:
#   CONTAINER_NAME, PORT, NUM_SPECULATIVE_TOKENS, MAX_NUM_SEQS
#
# External tools used: docker, grep with -P support, bc, python3.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONFIG="${1:-$SCRIPT_DIR/configs/production-fp8kv.env}"

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# grep wrapper for use under `set -e -o pipefail`: "no match" (exit status 1)
# is reported as success so the caller can handle empty output itself, while
# genuine grep errors (exit status >= 2) are still propagated.
grep_optional() {
  local rc=0
  grep "$@" || rc=$?
  ((rc <= 1)) || return "$rc"
}

echo "=== FP8 KV Cache Validation ==="
echo "Config: $CONFIG"
echo ""

[[ -r "$CONFIG" ]] || die "config file not found or not readable: $CONFIG"
# shellcheck source=/dev/null
source "$CONFIG"

# Fail before the server is started rather than part-way through the run:
# `set -u` would otherwise abort at the first use of a missing variable.
for required_var in CONTAINER_NAME PORT NUM_SPECULATIVE_TOKENS MAX_NUM_SEQS; do
  [[ -n "${!required_var+set}" ]] \
    || die "$required_var is not set (expected from $CONFIG)"
done

# Start server
echo "Starting server with FP8 KV cache..."
"$SCRIPT_DIR/serve.sh" "$CONFIG"

# The serve.sh script waits for readiness, so if we get here, it's up.

echo ""
echo "=== Checking KV cache capacity ==="
# Fetch the logs once and keep only the KV-cache lines. A `docker logs`
# failure is still fatal (pipefail); an absence of matching lines is not.
kv_lines=$(docker logs "$CONTAINER_NAME" 2>&1 | grep_optional -i "kv cache")
if [[ -n "$kv_lines" ]]; then
  tail -5 <<<"$kv_lines"
fi

echo ""
BF16_CAPACITY=1316727
echo "BF16 baseline capacity: $BF16_CAPACITY tokens"

# Take the last number with at least four digits that follows "KV cache"
# (case-sensitive) on a log line. Empty when nothing matches.
FP8_CAPACITY=$(
  grep_optional -oP 'KV cache.*?(\d{4,})' <<<"$kv_lines" \
    | grep_optional -oP '\d{4,}' \
    | tail -1
)

if [[ -n "$FP8_CAPACITY" ]]; then
  echo "FP8 measured capacity: $FP8_CAPACITY tokens"
  # The ratio is informational only, so a bc failure must not abort the run.
  if RATIO=$(bc <<<"scale=2; $FP8_CAPACITY / $BF16_CAPACITY"); then
    echo "Ratio: ${RATIO}x"
  else
    echo "Could not compute capacity ratio (is bc installed?)" >&2
  fi
else
  echo "Could not parse FP8 capacity from logs. Check manually:"
  echo "  docker logs $CONTAINER_NAME 2>&1 | grep -i 'kv cache'"
fi

echo ""
echo "=== Running benchmark sweep ==="
# Make sure the directory that receives the --output-json files exists.
mkdir -p "$SCRIPT_DIR/benchmarks"

for CONC in 32 48 64; do
  echo ""
  echo "--- Concurrency: $CONC ---"
  # With pipefail a failing benchmark run aborts the sweep; only the last
  # five lines of its combined stdout/stderr are shown.
  python3 "$SCRIPT_DIR/payload/benchmark_multi_turn.py" \
    --base-url "http://127.0.0.1:${PORT}/v1" \
    --model kimi-k2.6-amd-dflash \
    --sessions "$CONC" \
    --turns-per-session 1 \
    --max-tokens 512 \
    --output-json "$SCRIPT_DIR/benchmarks/fp8kv-dflash-st${NUM_SPECULATIVE_TOKENS}-s${MAX_NUM_SEQS}-c${CONC}.json" \
    2>&1 | tail -5
done

echo ""
echo "=== Validation complete ==="
echo "Results in benchmarks/fp8kv-*.json"
echo ""
echo "BF16 baseline (seqs=32, c=32): 507.6 tok/s"
echo "Expected FP8 (seqs=64, c=64): ~1010 tok/s"