#!/bin/bash
# entrypoint.sh — vLLM OpenAI-compatible server for Huihui-Qwen3.6-27B-abliterated-NVFP4-MTP.
# Qwen3.5-family hybrid (linear + periodic full attention) + MTP speculative decoding,
# NVFP4 (modelopt W4A4). Targets 4× 16 GB Blackwell (SM120), PCIe.
# Every tunable below is env-overridable — see README.md / USAGE.md.
#
# Flow: resolve env defaults -> build the `vllm serve` argument array ->
# append optional flags (reasoning parser, MTP speculation, tool calling) ->
# print a one-line summary -> exec vllm (replaces this shell, so vllm
# receives container signals directly).
set -euo pipefail

MODEL_DIR="${MODEL_DIR:-/model}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"

# ---- tunables ----
MAX_MODEL_LEN="${MAX_MODEL_LEN:-65536}"                    # up to 262144 native
MAX_NUM_SEQS="${MAX_NUM_SEQS:-8}"                          # concurrent seqs (24 = peak throughput on 64K)
MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-16384}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.85}"                       # model ~7.2 GiB/GPU at TP=4; rest is KV
KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}"
TP_SIZE="${TP_SIZE:-4}"
DTYPE="${DTYPE:-auto}"
SPEC_TOKENS="${SPEC_TOKENS:-3}"                            # MTP draft tokens; 0 disables speculative decoding
# NOTE: "-" (not ":-") on purpose: the default applies only when the variable
# is UNSET, so an explicitly empty REASONING_PARSER="" really disables the flag.
REASONING_PARSER="${REASONING_PARSER-qwen3}"               # -> reasoning_content; "" to disable
TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-qwen3_xml}"
ENABLE_TOOLS="${ENABLE_TOOLS:-1}"                          # 1 = agentic (auto tool choice); 0 = plain chat
SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-huihui-qwen36-27b-local}"

# Arguments that are always passed. Kept in an array so values containing
# spaces survive intact all the way to exec.
ARGS=(
  "$MODEL_DIR"
  --served-model-name "$SERVED_MODEL_NAME"
  --trust-remote-code
  --tensor-parallel-size "$TP_SIZE"
  --quantization modelopt
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_NUM_SEQS"
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
  --gpu-memory-utilization "$GPU_MEM_UTIL"
  --kv-cache-dtype "$KV_CACHE_DTYPE"
  --dtype "$DTYPE"
  --chat-template "$MODEL_DIR/chat_template.jinja"
  --host "$HOST"
  --port "$PORT"
)

# Reasoning parser is optional; skipped when REASONING_PARSER is empty.
if [[ -n "$REASONING_PARSER" ]]; then
  ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi

# MTP speculative decoding. vLLM >=0.22 deprecates method "qwen3_5_mtp" -> "mtp"
# (harmless warning, auto-mapped). Set SPEC_TOKENS=0 to disable.
#
# SPEC_TOKENS is interpolated into a JSON document, so it is validated and
# normalised to a plain decimal first (10# stops "08" being read as octal and
# strips leading zeros, which JSON does not allow). Anything that is not a
# non-negative integer keeps the historical outcome — speculation off — but
# now says so on stderr instead of failing silently.
spec_tokens_re='^[[:space:]]*([0-9]+)[[:space:]]*$'
if [[ "$SPEC_TOKENS" =~ $spec_tokens_re ]]; then
  SPEC_TOKENS=$(( 10#${BASH_REMATCH[1]} ))
else
  printf '[entrypoint] WARNING: SPEC_TOKENS=%q is not a non-negative integer; speculative decoding disabled\n' \
    "$SPEC_TOKENS" >&2
  SPEC_TOKENS=0
fi
if (( SPEC_TOKENS > 0 )); then
  ARGS+=(--speculative-config "{\"method\":\"qwen3_5_mtp\",\"num_speculative_tokens\":${SPEC_TOKENS}}")
fi

# Agentic tool-calling (Qwen3 emits XML tool calls). Set ENABLE_TOOLS=0 for pure chat.
if [[ "$ENABLE_TOOLS" == "1" ]]; then
  ARGS+=(--enable-auto-tool-choice --tool-call-parser "$TOOL_CALL_PARSER")
fi

echo "[entrypoint] vllm serve TP=$TP_SIZE len=$MAX_MODEL_LEN seqs=$MAX_NUM_SEQS spec=$SPEC_TOKENS tools=$ENABLE_TOOLS kv=$KV_CACHE_DTYPE"

# exec so vllm replaces this shell rather than running as its child.
exec vllm serve "${ARGS[@]}"