# Kimi K2.6 DFlash Production Configuration — FP8 KV Cache
# Target: ~1000 tok/s on 8x AMD Instinct MI300X (gfx942)
#
# Format: one KEY=VALUE assignment per line, no spaces around "=".
# "#" starts a full-line comment; do not append comments after a value.
# NOTE(review): the consumer of this file is not visible here (presumably a
# launcher script or a docker --env-file) — confirm before changing syntax.
#
# FP8 KV cache halves KV memory (8-bit vs 16-bit per element).
# Measured capacity: 2,469,568 tokens (up from 1,230,368 with BF16) = 2.01x.
#
# STATUS: Server starts and allocates FP8 KV correctly, BUT output quality
# is broken with default scales (1.0). The Kimi-K2.6 checkpoint has no
# pre-computed KV scales. Without calibrated per-layer scales, FP8 E4M3
# clips KV values and produces degenerate output (repeating tokens).
#
# TO FIX: Run calibrate_kv_scales.py to generate per-layer scales, then
# load them via --quantization-param-path. See docs for details.
#
# Prerequisites:
#   - NUMA balancing disabled: echo 0 > /proc/sys/kernel/numa_balancing
#   - Calibrated KV scales (payload/calibrate_kv_scales.py)
#   - Docker with ROCm support
#   - vllm/vllm-openai-rocm:nightly image
#   - Model: moonshotai/Kimi-K2.6 on local NVMe
#   - Draft: z-lab/Kimi-K2.5-DFlash on local NVMe

# Target model, draft model, container image, and serving port
MODEL_DIR=/mnt/nvme5n1p1/hydra/models/Kimi-K2.6
DRAFT_MODEL_DIR=/mnt/nvme5n1p1/hydra/models/Kimi-K2.5-DFlash
IMAGE=vllm/vllm-openai-rocm:nightly
PORT=8262

# DFlash speculative decoding
SPEC_METHOD=dflash
NUM_SPECULATIVE_TOKENS=2
BLOCK_SIZE=16

# KV cache — fp8 halves memory, doubles capacity
# WARNING: per the STATUS note above, fp8 output is degenerate until
# calibrated per-layer KV scales are loaded via --quantization-param-path.
# NOTE(review): this file defines no key for the scales path — confirm how
# the launcher passes it.
KV_CACHE_DTYPE=fp8

# Scheduler — push seqs to 64 with FP8 KV headroom
MAX_NUM_SEQS=64
MAX_NUM_BATCHED_TOKENS=32768
MAX_MODEL_LEN=262144
GPU_MEMORY_UTILIZATION=0.92

# Runtime
TENSOR_PARALLEL_SIZE=8
ENFORCE_EAGER=true
MOE_BACKEND=aiter
OPTIMIZATION_LEVEL=2
PERFORMANCE_MODE=throughput
SAFETENSORS_LOAD_STRATEGY=lazy
ENABLE_PREFIX_CACHING=false
ENABLE_CHUNKED_PREFILL=true

# ROCm environment
PYTORCH_ROCM_ARCH=gfx942
AITER_ROCM_ARCH=gfx942
GPU_ARCHS=gfx942
VLLM_ROCM_USE_AITER=1
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
VLLM_ROCM_USE_AITER_RMSNORM=0
HSA_ENABLE_SDMA=0
HSA_NO_SCRATCH_RECLAIM=1
OMP_NUM_THREADS=1