#!/usr/bin/env bash
#
# Launch the Kimi K2.6 DFlash vLLM server in a Docker container (8x MI300X)
# and block until its OpenAI-compatible endpoint answers on 127.0.0.1.
#
# Usage:
#   <this script> [CONFIG_FILE]
#
# Arguments:
#   CONFIG_FILE  Shell env file that is sourced for all settings.
#                Default: configs/production.env next to this script.
#
# Environment / config:
#   CONTAINER_NAME            Container name (default: kimi26-dflash).
#   VLLM_ROCM_USE_AITER_MOE   Optional; forwarded to the container only when
#                             set and non-empty.
#   All names listed in REQUIRED_VARS below must be set (normally by the
#   config file).
#
# Exit status:
#   0  server answered on /v1/models
#   1  bad configuration, container exited early, or readiness timeout
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CONFIG_FILE="${1:-$SCRIPT_DIR/configs/production.env}"
PATCH_SCRIPT="$SCRIPT_DIR/patches/patch_dflash_rocm.py"

# Every variable the script reads from the config without a default.
REQUIRED_VARS=(
  IMAGE MODEL_DIR PORT
  SPEC_METHOD DRAFT_MODEL_DIR NUM_SPECULATIVE_TOKENS
  KV_CACHE_DTYPE TENSOR_PARALLEL_SIZE MAX_MODEL_LEN GPU_MEMORY_UTILIZATION
  MAX_NUM_BATCHED_TOKENS MAX_NUM_SEQS BLOCK_SIZE MOE_BACKEND
  OPTIMIZATION_LEVEL PERFORMANCE_MODE SAFETENSORS_LOAD_STRATEGY
  PYTORCH_ROCM_ARCH AITER_ROCM_ARCH GPU_ARCHS
  VLLM_ROCM_USE_AITER VLLM_ROCM_QUICK_REDUCE_QUANTIZATION
  VLLM_ROCM_USE_AITER_RMSNORM
  HSA_ENABLE_SDMA HSA_NO_SCRATCH_RECLAIM OMP_NUM_THREADS
)

# Print an error to stderr and abort with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Abort unless every named executable is on PATH.
# Without this, a missing curl would be hidden by the readiness probe's
# output redirection and the script would just wait for the full timeout.
require_commands() {
  local tool
  for tool in "$@"; do
    command -v "$tool" >/dev/null || die "required command not found: $tool"
  done
}

# Abort, listing all offenders at once, unless every named variable is set.
# Empty values are accepted (an empty env value may be intentional).
require_vars() {
  local name
  local -a missing=()
  for name in "$@"; do
    [[ -v "$name" ]] || missing+=("$name")
  done
  if (( ${#missing[@]} > 0 )); then
    die "not set (expected in $CONFIG_FILE): ${missing[*]}"
  fi
}

# Escape backslashes and double quotes so $1 can sit inside a JSON string.
# Control characters are not handled; paths are not expected to contain them.
json_escape() {
  local text=$1
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  printf '%s' "$text"
}

# Warn (never fail) when automatic NUMA balancing is on or cannot be read.
warn_numa_balancing() {
  local numa_status
  numa_status=$(cat /proc/sys/kernel/numa_balancing 2>/dev/null || echo "unknown")
  case "$numa_status" in
    0) ;;
    unknown)
      {
        echo "WARNING: could not read /proc/sys/kernel/numa_balancing; NUMA balancing state is unknown."
        echo ""
      } >&2
      ;;
    *)
      {
        echo "WARNING: NUMA balancing is enabled ($numa_status). Disable it:"
        echo "  sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'"
        echo ""
      } >&2
      ;;
  esac
}

# Remove any previous container with the same name and start a new one.
# Arguments: $1 - served model name advertised by the API server
start_container() {
  local served_model_name=$1
  local model_parent spec_config inner_cmd
  local -a docker_args server_cmd

  [[ "$NUM_SPECULATIVE_TOKENS" =~ ^[0-9]+$ ]] \
    || die "NUM_SPECULATIVE_TOKENS must be a non-negative integer, got '$NUM_SPECULATIVE_TOKENS'"

  printf -v spec_config '{"method":"%s","model":"%s","num_speculative_tokens":%s}' \
    "$(json_escape "$SPEC_METHOD")" \
    "$(json_escape "$DRAFT_MODEL_DIR")" \
    "$NUM_SPECULATIVE_TOKENS"

  server_cmd=(
    python3 -m vllm.entrypoints.openai.api_server
    --model "$MODEL_DIR"
    --served-model-name "$served_model_name"
    --host 0.0.0.0
    --port "$PORT"
    --tensor-parallel-size "$TENSOR_PARALLEL_SIZE"
    --trust-remote-code
    --max-model-len "$MAX_MODEL_LEN"
    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
    --kv-cache-dtype "$KV_CACHE_DTYPE"
    --max-num-seqs "$MAX_NUM_SEQS"
    --mm-encoder-tp-mode data
    --block-size "$BLOCK_SIZE"
    --tool-call-parser kimi_k2
    --reasoning-parser kimi_k2
    --enable-auto-tool-choice
    --moe-backend "$MOE_BACKEND"
    --optimization-level "$OPTIMIZATION_LEVEL"
    --performance-mode "$PERFORMANCE_MODE"
    --safetensors-load-strategy "$SAFETENSORS_LOAD_STRATEGY"
    --disable-uvicorn-access-log
    --no-enable-prefix-caching
    --enable-chunked-prefill
    --enforce-eager
    --speculative-config "$spec_config"
  )
  if [[ "$KV_CACHE_DTYPE" == fp8* ]]; then
    server_cmd+=(--calculate-kv-scales)
    echo "FP8 KV cache enabled (dtype=$KV_CACHE_DTYPE)"
  fi

  # The container entrypoint is `bash -lc <string>`, so the argv must be
  # serialized. %q quotes each word for bash, which keeps paths and the JSON
  # blob intact whatever characters they contain.
  printf -v inner_cmd '%q ' "${server_cmd[@]}"
  inner_cmd="python3 /patches/patch_dflash_rocm.py && ${inner_cmd}"

  # Only the parent directory of MODEL_DIR is bind-mounted.
  # NOTE(review): DRAFT_MODEL_DIR is not mounted separately; presumably it
  # lives under the same parent or is resolved inside the container — confirm.
  model_parent=$(dirname "$MODEL_DIR")

  docker_args=(
    --name "$CONTAINER_NAME"
    --network host
    --device=/dev/kfd
    --device=/dev/dri
    --security-opt seccomp=unconfined
    --group-add video
    --ipc=host
    -e "PYTORCH_ROCM_ARCH=$PYTORCH_ROCM_ARCH"
    -e "AITER_ROCM_ARCH=$AITER_ROCM_ARCH"
    -e "GPU_ARCHS=$GPU_ARCHS"
    -e "VLLM_ROCM_USE_AITER=$VLLM_ROCM_USE_AITER"
    -e "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=$VLLM_ROCM_QUICK_REDUCE_QUANTIZATION"
    -e "VLLM_ROCM_USE_AITER_RMSNORM=$VLLM_ROCM_USE_AITER_RMSNORM"
    -e "HSA_ENABLE_SDMA=$HSA_ENABLE_SDMA"
    -e "HSA_NO_SCRATCH_RECLAIM=$HSA_NO_SCRATCH_RECLAIM"
    -e "OMP_NUM_THREADS=$OMP_NUM_THREADS"
    -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    -v "$model_parent:$model_parent"
    -v "$SCRIPT_DIR/patches:/patches:ro"
    --entrypoint bash
  )
  # Forwarded only when configured, so the image's own default applies otherwise.
  if [[ -n "${VLLM_ROCM_USE_AITER_MOE:-}" ]]; then
    docker_args+=(-e "VLLM_ROCM_USE_AITER_MOE=$VLLM_ROCM_USE_AITER_MOE")
  fi

  # Best effort: there is usually no stale container, so a failure is expected.
  docker rm -f "$CONTAINER_NAME" 2>/dev/null || true

  docker run -d "${docker_args[@]}" "$IMAGE" -lc "$inner_cmd"
}

# Poll the models endpoint until it answers.
# Arguments: $1 - URL to probe, $2 - max attempts, $3 - seconds between attempts
# Returns:   0 ready, 1 timed out, 2 container is no longer running
wait_for_server() {
  local url=$1 max_attempts=$2 interval=$3
  local attempt state
  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    if curl -sf "$url" >/dev/null 2>&1; then
      return 0
    fi
    # Give up early when docker positively reports the container as stopped;
    # an inspect failure is ignored so a transient daemon error cannot abort us.
    if state=$(docker inspect -f '{{.State.Running}}' "$CONTAINER_NAME" 2>/dev/null) \
      && [[ "$state" == "false" ]]; then
      return 2
    fi
    sleep "$interval"
  done
  return 1
}

main() {
  local served_model_name="kimi-k2.6-amd-dflash"
  local poll_interval=5
  local max_attempts=360 # 360 probes x 5 s = 30 minutes
  local base_url wait_status=0

  require_commands docker curl
  require_vars "${REQUIRED_VARS[@]}"
  [[ -f "$PATCH_SCRIPT" ]] || die "patch script not found: $PATCH_SCRIPT"

  echo "Kimi K2.6 DFlash — 8x MI300X"
  echo "Config: $CONFIG_FILE"
  echo "=============================="

  warn_numa_balancing
  start_container "$served_model_name"

  base_url="http://127.0.0.1:${PORT}"
  echo ""
  echo "Container '$CONTAINER_NAME' started on port $PORT"
  echo "Waiting for server ready (model load takes ~5 min)..."

  wait_for_server "$base_url/v1/models" "$max_attempts" "$poll_interval" \
    || wait_status=$?

  case "$wait_status" in
    0)
      echo "Server ready at $base_url"
      echo ""
      printf "Test: curl %s/v1/chat/completions -H 'Content-Type: application/json' -d '%s'\n" \
        "$base_url" \
        "{\"model\":\"${served_model_name}\",\"messages\":[{\"role\":\"user\",\"content\":\"Hello\"}],\"max_tokens\":32}"
      return 0
      ;;
    2)
      printf 'ERROR: container %s exited before the server became ready\n' \
        "$CONTAINER_NAME" >&2
      ;;
    *)
      printf 'ERROR: Server did not become ready in %d minutes\n' \
        "$(( max_attempts * poll_interval / 60 ))" >&2
      ;;
  esac

  # Show the tail of the container log for diagnosis; never let a logs
  # failure replace the exit status below.
  docker logs --tail 20 "$CONTAINER_NAME" >&2 || true
  exit 1
}

# The config is sourced at top level (not inside a function) so that any
# `declare` statements in it create global rather than function-local variables.
[[ -r "$CONFIG_FILE" ]] || die "config file not found or unreadable: $CONFIG_FILE"
# shellcheck source=/dev/null
source "$CONFIG_FILE"
CONTAINER_NAME="${CONTAINER_NAME:-kimi26-dflash}"

main