#!/usr/bin/env bash
#
# FP8 KV cache validation.
#
# Usage: <this script> [CONFIG]
#   CONFIG  env file to source. Defaults to configs/production-fp8kv.env
#           next to this script.
#
# Steps:
#   1. Source CONFIG and run serve.sh with it.
#   2. Show the most recent "kv cache" lines from the container log and
#      compare the parsed token capacity with the hard-coded BF16 baseline.
#   3. Run payload/benchmark_multi_turn.py at concurrency 32, 48 and 64,
#      writing one JSON result per run under benchmarks/.
#
# The following variables are read but never assigned here, so CONFIG (or
# the environment) must provide them:
#   CONTAINER_NAME, PORT, NUM_SPECULATIVE_TOKENS, MAX_NUM_SEQS

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONFIG="${1:-$SCRIPT_DIR/configs/production-fp8kv.env}"

#######################################
# Print a message to stderr and exit with status 1.
# Arguments: $* - message text
#######################################
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Extract the KV cache token capacity from log text.
# Finds, after each "KV cache", the first integer of 4+ digits and prints
# the last such integer seen. Thousands separators ("1,316,727") are
# removed first so grouped numbers are parsed as a single integer.
# Inputs:   log text on stdin
# Outputs:  the integer on stdout, or nothing when no match exists
# Returns:  0 in both cases; callers test for empty output
#######################################
parse_kv_capacity() {
  # grep exits 1 on "no match"; with pipefail + errexit that would abort the
  # whole script, so "no match" is mapped to success and empty output.
  sed -E -e ':a' -e 's/([0-9]),([0-9]{3})([^0-9]|$)/\1\2\3/' -e 'ta' \
    | { grep -oP 'KV cache.*?(\d{4,})' || true; } \
    | { grep -oP '\d{4,}' || true; } \
    | tail -n 1
}

#######################################
# Print numerator/denominator truncated to two decimals (e.g. "1.98").
# Uses shell integer arithmetic, so no external calculator is required.
# Arguments: $1 - numerator (non-negative integer)
#            $2 - denominator (positive integer)
# Returns:   1 if the denominator is not positive
#######################################
capacity_ratio() {
  local -r num=$1 den=$2
  # 10# forces base 10 so a leading zero is not read as octal.
  ((10#$den > 0)) || return 1
  local -r hundredths=$((10#$num * 100 / 10#$den))
  printf '%d.%02d\n' "$((hundredths / 100))" "$((hundredths % 100))"
}

#######################################
# Show recent KV cache log lines and compare capacity with the baseline.
# Globals: CONTAINER_NAME (read), BF16_CAPACITY (read)
#######################################
check_capacity() {
  local logs kv_lines fp8_capacity ratio

  echo ""
  echo "=== Checking KV cache capacity ==="
  # Fetch the log once and reuse it for both the display and the parse.
  logs="$(docker logs "$CONTAINER_NAME" 2>&1)" \
    || die "docker logs failed for container $CONTAINER_NAME: $logs"
  # No matching line is not an error here; it is reported further down.
  kv_lines="$(grep -i "kv cache" <<<"$logs" || true)"
  if [[ -n "$kv_lines" ]]; then
    tail -n 5 <<<"$kv_lines"
  fi
  echo ""

  echo "BF16 baseline capacity: $BF16_CAPACITY tokens"
  fp8_capacity="$(parse_kv_capacity <<<"$kv_lines")"
  if [[ -n "$fp8_capacity" ]]; then
    echo "FP8 measured capacity: $fp8_capacity tokens"
    ratio="$(capacity_ratio "$fp8_capacity" "$BF16_CAPACITY")"
    echo "Ratio: ${ratio}x"
  else
    echo "Could not parse FP8 capacity from logs. Check manually:"
    echo " docker logs $CONTAINER_NAME 2>&1 | grep -i 'kv cache'"
  fi
}

#######################################
# Run the multi-turn benchmark at each concurrency level.
# Only the last five lines of each run's output are shown; a failing run
# aborts the script (errexit + pipefail).
# Globals: SCRIPT_DIR, PORT, NUM_SPECULATIVE_TOKENS, MAX_NUM_SEQS (read)
#######################################
run_benchmarks() {
  local conc

  echo ""
  echo "=== Running benchmark sweep ==="

  # Make sure the result directory exists before the first run writes to it.
  mkdir -p -- "$SCRIPT_DIR/benchmarks"

  for conc in 32 48 64; do
    echo ""
    echo "--- Concurrency: $conc ---"
    python3 "$SCRIPT_DIR/payload/benchmark_multi_turn.py" \
      --base-url "http://127.0.0.1:${PORT}/v1" \
      --model kimi-k2.6-amd-dflash \
      --sessions "$conc" \
      --turns-per-session 1 \
      --max-tokens 512 \
      --output-json "$SCRIPT_DIR/benchmarks/fp8kv-dflash-st${NUM_SPECULATIVE_TOKENS}-s${MAX_NUM_SEQS}-c${conc}.json" \
      2>&1 | tail -n 5
  done
}

echo "=== FP8 KV Cache Validation ==="
echo "Config: $CONFIG"
echo ""

[[ -r "$CONFIG" ]] || die "config file not found or unreadable: $CONFIG"
# Sourced at top level so every assignment in CONFIG lands in global scope.
# shellcheck source=/dev/null
source "$CONFIG"

# Fail before the server is launched rather than part-way through the run.
: "${CONTAINER_NAME:?must be set (expected from $CONFIG)}"
: "${PORT:?must be set (expected from $CONFIG)}"
: "${NUM_SPECULATIVE_TOKENS:?must be set (expected from $CONFIG)}"
: "${MAX_NUM_SEQS:?must be set (expected from $CONFIG)}"

# Assigned after sourcing, so a value of the same name in CONFIG is overridden.
BF16_CAPACITY=1316727

echo "Starting server with FP8 KV cache..."
# NOTE(review): the steps below assume serve.sh returns once the container
# named CONTAINER_NAME exists and has logged its KV cache size - confirm
# against serve.sh.
"$SCRIPT_DIR/serve.sh" "$CONFIG"

check_capacity
run_benchmarks

echo ""
echo "=== Validation complete ==="
echo "Results in benchmarks/fp8kv-*.json"
echo ""
echo "BF16 baseline (seqs=32, c=32): 507.6 tok/s"
echo "Expected FP8 (seqs=64, c=64): ~1010 tok/s"