#!/bin/bash
# entrypoint.sh — vLLM OpenAI-compatible server for Huihui-Qwen3.6-27B-abliterated-NVFP4-MTP.
# Qwen3.5-family hybrid (linear + periodic full attention) + MTP speculative decoding,
# NVFP4 (modelopt W4A4). Targets 4× 16 GB Blackwell (SM120), PCIe.
# Every flag is env-overridable — see README.md / USAGE.md.
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failed pipeline stage. Every variable used below is given a default here, so
# -u cannot trip on a missing override.
set -euo pipefail

# ---- model location / listen address ----
MODEL_DIR="${MODEL_DIR:-/model}"
HOST="${HOST:-0.0.0.0}"
PORT="${PORT:-8000}"

# ---- tunables ----
# Each setting keeps the value exported by the caller; when it is unset or
# empty the default shown here is used (REASONING_PARSER is the one exception,
# see below).
MAX_MODEL_LEN="${MAX_MODEL_LEN:-65536}"                 # up to 262144 native
MAX_NUM_SEQS="${MAX_NUM_SEQS:-8}"                       # concurrent seqs (24 = peak throughput on 64K)
MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-16384}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.85}"                    # model ~7.2 GiB/GPU at TP=4; rest is KV
KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}"
TP_SIZE="${TP_SIZE:-4}"
DTYPE="${DTYPE:-auto}"
SPEC_TOKENS="${SPEC_TOKENS:-3}"                         # MTP draft tokens; 0 disables speculative decoding
# "-" rather than ":-" on purpose: the default applies only when the variable
# is unset, so an explicitly empty REASONING_PARSER="" survives and disables
# the parser. With ":-" the empty value was replaced by "qwen3" and the parser
# could never be turned off.
REASONING_PARSER="${REASONING_PARSER-qwen3}"           # <think>…</think> -> reasoning_content; "" to disable
TOOL_CALL_PARSER="${TOOL_CALL_PARSER:-qwen3_xml}"
ENABLE_TOOLS="${ENABLE_TOOLS:-1}"                      # 1 = agentic (auto tool choice); 0 = plain chat
SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-huihui-qwen36-27b-local}"

# Build the `vllm serve` argument vector as a bash array so that every value
# is passed as exactly one argument, whatever characters it contains.

# Positional model path, public model name, and remote-code opt-in.
ARGS=("$MODEL_DIR" --served-model-name "$SERVED_MODEL_NAME" --trust-remote-code)

# Parallelism and quantization scheme.
ARGS+=(--tensor-parallel-size "$TP_SIZE" --quantization modelopt)

# Context length and batching limits.
ARGS+=(
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-seqs "$MAX_NUM_SEQS"
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
)

# Memory budget and numeric formats.
ARGS+=(
  --gpu-memory-utilization "$GPU_MEM_UTIL"
  --kv-cache-dtype "$KV_CACHE_DTYPE"
  --dtype "$DTYPE"
)

# Chat template is read from inside the model directory.
ARGS+=(--chat-template "$MODEL_DIR/chat_template.jinja")

# Listen address.
ARGS+=(--host "$HOST" --port "$PORT")

# The reasoning parser is optional: an empty REASONING_PARSER omits the flag.
if [ -n "$REASONING_PARSER" ]; then
  ARGS+=(--reasoning-parser "$REASONING_PARSER")
fi

# MTP speculative decoding. vLLM >=0.22 deprecates method "qwen3_5_mtp" -> "mtp"
# (harmless warning, auto-mapped). Set SPEC_TOKENS=0 (or leave it empty) to disable.

#######################################
# Append --speculative-config to ARGS when SPEC_TOKENS is a positive integer.
# Globals:
#   SPEC_TOKENS (read), ARGS (appended to)
# Outputs:
#   A warning on stderr when SPEC_TOKENS is not an integer. Such values used
#   to be swallowed by "2>/dev/null", disabling the feature with no trace.
# Returns:
#   0 always: a bad value disables speculative decoding rather than aborting
#   server startup.
#######################################
add_speculative_args() {
  local raw="${SPEC_TOKENS:-0}"
  # Optional surrounding whitespace and sign, as the `[ -gt ]` test accepted.
  local int_re='^[[:space:]]*([+-]?)([0-9]+)[[:space:]]*$'
  local count=0

  if [[ ! "$raw" =~ $int_re ]]; then
    printf '[entrypoint] WARNING: SPEC_TOKENS=%q is not an integer; speculative decoding disabled\n' \
      "$raw" >&2
    return 0
  fi

  # Negative values stay "disabled". For the rest, force base 10 so a value
  # such as "03" or "+3" is emitted as plain 3: JSON forbids leading zeros
  # and a leading plus sign, so the raw text would make the payload invalid.
  if [[ "${BASH_REMATCH[1]}" != "-" ]]; then
    count=$((10#${BASH_REMATCH[2]}))
  fi

  if (( count > 0 )); then
    ARGS+=(--speculative-config "{\"method\":\"qwen3_5_mtp\",\"num_speculative_tokens\":${count}}")
  fi
  return 0
}
add_speculative_args

# Tool calling for agentic use (Qwen3 emits XML tool calls). Only the exact
# value ENABLE_TOOLS=1 switches it on; any other value, e.g. 0, leaves the
# server in pure-chat mode with no tool flags.
case "$ENABLE_TOOLS" in
  1)
    ARGS+=(--enable-auto-tool-choice)
    ARGS+=(--tool-call-parser "$TOOL_CALL_PARSER")
    ;;
esac

# Print a one-line summary of the effective settings, then hand over to vLLM.
# `exec` replaces this shell with the vllm process instead of running it as a
# child, so nothing after this line executes.
summary="[entrypoint] vllm serve  TP=$TP_SIZE  len=$MAX_MODEL_LEN  seqs=$MAX_NUM_SEQS  spec=$SPEC_TOKENS  tools=$ENABLE_TOOLS  kv=$KV_CACHE_DTYPE"
printf '%s\n' "$summary"
exec vllm serve "${ARGS[@]}"