# Kimi K2.6 DFlash — FP8 KV + Matched K2.6 Drafter
# Target: 1000+ tok/s on 8x AMD Instinct MI300X (gfx942)
#
# Format: one KEY=VALUE assignment per line, no spaces around "=".
# Comments must sit on their own line starting with "#"; do not append
# trailing comments to a value line.
#
# This config combines both optimizations:
#   1. FP8 KV cache: 2x token capacity => max_num_seqs=64
#   2. K2.6-matched drafter: ~60-80% acceptance => st=8 viable
#      (st = number of speculative tokens, NUM_SPECULATIVE_TOKENS below)
#
# Expected: 64 slots × ~25 tok/s per slot (st=8 matched) = ~1600 tok/s
#
# Prerequisites:
#   - Trained K2.6 DFlash drafter (run train-drafter.sh first)
#   - NUMA balancing disabled

# Target model, matched draft model, container image, and serving port
MODEL_DIR=/mnt/nvme5n1p1/hydra/models/Kimi-K2.6
DRAFT_MODEL_DIR=/mnt/nvme5n1p1/hydra/models/Kimi-K2.6-DFlash
IMAGE=vllm/vllm-openai-rocm:nightly
PORT=8262

# DFlash — matched drafter can handle 8 spec tokens
SPEC_METHOD=dflash
NUM_SPECULATIVE_TOKENS=8
BLOCK_SIZE=16

# KV cache — fp8
KV_CACHE_DTYPE=fp8

# Scheduler — max out with FP8 headroom
MAX_NUM_SEQS=64
MAX_NUM_BATCHED_TOKENS=32768
MAX_MODEL_LEN=262144
GPU_MEMORY_UTILIZATION=0.92

# Runtime
TENSOR_PARALLEL_SIZE=8
# NOTE(review): presumably forwarded as vLLM's --enforce-eager, which skips
# graph capture — confirm this is intended alongside PERFORMANCE_MODE=throughput.
ENFORCE_EAGER=true
MOE_BACKEND=aiter
OPTIMIZATION_LEVEL=2
PERFORMANCE_MODE=throughput
SAFETENSORS_LOAD_STRATEGY=lazy
ENABLE_PREFIX_CACHING=false
ENABLE_CHUNKED_PREFILL=true

# ROCm environment
PYTORCH_ROCM_ARCH=gfx942
AITER_ROCM_ARCH=gfx942
GPU_ARCHS=gfx942
VLLM_ROCM_USE_AITER=1
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
VLLM_ROCM_USE_AITER_RMSNORM=0
HSA_ENABLE_SDMA=0
HSA_NO_SCRATCH_RECLAIM=1
OMP_NUM_THREADS=1