kimi-k26-dflash-mi300x / validate-fp8.sh
florianleibert's picture
Upload folder using huggingface_hub
d6061c3 verified
Raw
History Blame Contribute Delete
1.99 kB
#!/usr/bin/env bash
#
# Validate FP8 KV cache: start server, check KV capacity, run benchmark.
# Compares against BF16 baseline numbers.
#
# Usage:
# ./validate-fp8.sh # default FP8 config
# ./validate-fp8.sh configs/production-fp8kv-safe.env # safe fallback
# Strict mode: abort on any failing command, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -euo pipefail

# Absolute directory of this script; serve.sh, configs/, payload/ and
# benchmarks/ are all addressed relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Config file: first positional argument, otherwise the default FP8 KV config.
CONFIG="${1:-$SCRIPT_DIR/configs/production-fp8kv.env}"

echo "=== FP8 KV Cache Validation ==="
echo "Config: $CONFIG"
echo ""

# Load the configuration into this shell. A missing or unreadable file makes
# `source` fail, which aborts the script under `set -e`.
# shellcheck source=/dev/null
source "$CONFIG"

# Fail fast: these variables are dereferenced later in this script (docker
# logs, benchmark URL, result file names). Without this check an incomplete
# config would only be noticed by `set -u` after the server has been launched.
for required_var in CONTAINER_NAME PORT NUM_SPECULATIVE_TOKENS MAX_NUM_SEQS; do
  if [[ -z "${!required_var+set}" ]]; then
    echo "ERROR: $required_var is not set (expected from $CONFIG or the environment)" >&2
    exit 1
  fi
done
# Start server
# serve.sh is run as a child process with the same config file; because the
# script runs under `set -e`, a non-zero exit from it stops the validation.
echo "Starting server with FP8 KV cache..."
"$SCRIPT_DIR/serve.sh" "$CONFIG"
# The serve.sh script waits for readiness, so if we get here, it's up.
echo ""
echo "=== Checking KV cache capacity ==="
# Show the last five container log lines that mention the KV cache.
# grep exits with status 1 when nothing matches; combined with `pipefail` and
# `set -e` that would silently kill the script right here. Treat "no match"
# as success and let only genuine grep errors (status >= 2) propagate.
docker logs "$CONTAINER_NAME" 2>&1 \
  | { grep -i "kv cache" || [[ $? -eq 1 ]]; } \
  | tail -5
echo ""
#######################################
# Extract the KV cache token capacity from server log text.
# Looks for "KV cache" (any letter case, matching the display/hint greps in
# this script) followed on the same line by a number of at least four digits,
# and reports the number from the last such line.
# If no plain digit run is found, falls back to comma-grouped numbers such as
# 1,316,727 (NOTE(review): assumes the server may format counts that way —
# confirm against real logs); the commas are stripped from the result.
# Arguments: none (log text is read from stdin)
# Outputs:   the capacity on stdout, or nothing when it cannot be parsed
# Returns:   0 always, so a failed parse is never fatal under set -e/pipefail
#######################################
extract_kv_capacity() {
  local logs plain grouped
  logs=$(cat)

  # grep exits 1 on "no match"; `|| true` keeps that from aborting the caller
  # when `set -e` and `pipefail` are active. An empty result means "not found".
  plain=$(grep -oiP 'KV cache.*?\d{4,}' <<<"$logs" \
    | grep -oP '\d{4,}' \
    | tail -n 1) || true
  if [[ -n "$plain" ]]; then
    printf '%s\n' "$plain"
    return 0
  fi

  # Fallback only: never overrides a result the plain pattern produced.
  grouped=$(grep -oiP 'KV cache.*?\d{1,3}(,\d{3})+' <<<"$logs" \
    | grep -oP '\d{1,3}(,\d{3})+' \
    | tail -n 1) || true
  if [[ -n "$grouped" ]]; then
    printf '%s\n' "${grouped//,/}"
  fi
  return 0
}

#######################################
# Print measured/baseline truncated to two decimals (same truncation as
# `bc` with scale=2) using shell arithmetic only, so `bc` need not be
# installed. Values below 1 are printed with a leading zero (e.g. 0.49).
# Arguments: $1 - measured capacity (decimal digits)
#            $2 - baseline capacity (decimal digits, must be > 0)
# Outputs:   the ratio on stdout
# Returns:   0 on success, 1 if the baseline is not positive
#######################################
capacity_ratio() {
  local measured baseline hundredths
  # 10# forces base 10 so a value with leading zeros is not read as octal.
  measured=$((10#$1))
  baseline=$((10#$2))
  (( baseline > 0 )) || return 1
  hundredths=$(( measured * 100 / baseline ))
  printf '%d.%02d\n' "$(( hundredths / 100 ))" "$(( hundredths % 100 ))"
}

# Reference capacity (in tokens) that the FP8 measurement is compared against.
BF16_CAPACITY=1316727
echo "BF16 baseline capacity: $BF16_CAPACITY tokens"

# An unparseable log yields an empty string here instead of terminating the
# script, which is what makes the manual-check branch below reachable.
FP8_CAPACITY=$(docker logs "$CONTAINER_NAME" 2>&1 | extract_kv_capacity)
if [[ -n "$FP8_CAPACITY" ]]; then
  echo "FP8 measured capacity: $FP8_CAPACITY tokens"
  RATIO=$(capacity_ratio "$FP8_CAPACITY" "$BF16_CAPACITY")
  echo "Ratio: ${RATIO}x"
else
  echo "Could not parse FP8 capacity from logs. Check manually:"
  echo " docker logs $CONTAINER_NAME 2>&1 | grep -i 'kv cache'"
fi
echo ""
echo "=== Running benchmark sweep ==="
# Result files are written under benchmarks/ next to this script. Create the
# directory up front so a missing directory cannot make a finished run fail
# while saving its JSON (whether the benchmark script creates it itself is not
# visible from here). `mkdir -p` is a no-op when the directory already exists.
mkdir -p -- "$SCRIPT_DIR/benchmarks"
# One run per concurrency level; the level is passed as the session count and
# becomes the -c<level> suffix of the result file name.
for CONC in 32 48 64; do
  echo ""
  echo "--- Concurrency: $CONC ---"
  # Only the last five lines of the benchmark's combined stdout/stderr are
  # shown. With `pipefail` + `set -e`, a failing benchmark stops the sweep.
  python3 "$SCRIPT_DIR/payload/benchmark_multi_turn.py" \
    --base-url "http://127.0.0.1:${PORT}/v1" \
    --model kimi-k2.6-amd-dflash \
    --sessions "$CONC" \
    --turns-per-session 1 \
    --max-tokens 512 \
    --output-json "$SCRIPT_DIR/benchmarks/fp8kv-dflash-st${NUM_SPECULATIVE_TOKENS}-s${MAX_NUM_SEQS}-c${CONC}.json" \
    2>&1 | tail -5
done
# Closing summary: where the result files are, followed by the fixed reference
# throughput lines to compare the new results against. Each argument is
# printed on its own line; an empty argument yields a blank line.
printf '%s\n' \
  "" \
  "=== Validation complete ===" \
  "Results in benchmarks/fp8kv-*.json" \
  "" \
  "BF16 baseline (seqs=32, c=32): 507.6 tok/s" \
  "Expected FP8 (seqs=64, c=64): ~1010 tok/s"