# Huihui-Qwen3.6-27B-abliterated-NVFP4-MTP — vLLM serving stack.
#
#   ./run.sh up        # start (build-free; uses the official vLLM image)
#   ./run.sh test      # poll /v1/models until ready
#   ./run.sh bench     # one-shot chat completion
#   ./run.sh logs      # tail
#   ./run.sh down      # stop
#
# Defaults to the official vLLM image — it already ships the qwen3_5 architecture
# (Qwen3_5ForConditionalGeneration) AND the Qwen3_5MTP draft, so NO build is needed.
# Pick the GPUs with CUDA_VISIBLE_DEVICES (compose default: 0,1,2,3). The list must
# expose exactly TP_SIZE GPUs (compose default: 4) and must avoid your display GPU.
# Example: CUDA_VISIBLE_DEVICES=0,1,2,3 TP_SIZE=4 ./run.sh up
services:
  vllm:
    container_name: vllm-huihui
    # Upstream image by default; export VLLM_IMAGE to run a different image/tag.
    image: "${VLLM_IMAGE:-vllm/vllm-openai:v0.22.0}"
    # Optional self-contained build: comment out `image:` above, uncomment the
    # `build:` block below, then run `./run.sh rebuild`. The Dockerfile only
    # pip-installs vLLM on a CUDA 13.1 base, so the official image is equivalent.
    # build:
    #   context: .
    #   dockerfile: Dockerfile
    runtime: nvidia
    restart: unless-stopped
    shm_size: "32g"
    # Start through the bind-mounted script (see `volumes`) via bash.
    entrypoint:
      - /bin/bash
      - /entrypoint.sh
    ports:
      # Host port follows PORT (default 8000); the container side is always 8000.
      - "${PORT:-8000}:8000"
    volumes:
      # This directory doubles as the model folder; it and the entrypoint script
      # are both mounted read-only.
      - ".:/model:ro"
      - "./entrypoint.sh:/entrypoint.sh:ro"
    # Container environment. Entries written as ${NAME:-default} can be overridden
    # from the host environment; the remaining entries are fixed values.
    environment:
      # Fixed in-container locations (match the volume and port mapping above).
      MODEL_DIR: "/model"
      PORT: "8000"
      SERVED_MODEL_NAME: "huihui-qwen36-27b-local"
      # Host-overridable serving knobs.
      MAX_MODEL_LEN: "${MAX_MODEL_LEN:-65536}"
      MAX_NUM_SEQS: "${MAX_NUM_SEQS:-8}"
      MAX_NUM_BATCHED_TOKENS: "${MAX_NUM_BATCHED_TOKENS:-16384}"
      GPU_MEM_UTIL: "${GPU_MEM_UTIL:-0.85}"
      KV_CACHE_DTYPE: "${KV_CACHE_DTYPE:-fp8}"
      SPEC_TOKENS: "${SPEC_TOKENS:-3}"
      TP_SIZE: "${TP_SIZE:-4}"
      ENABLE_TOOLS: "${ENABLE_TOOLS:-1}"
      # GPU selection: the host's CUDA_VISIBLE_DEVICES is handed to the container
      # as NVIDIA_VISIBLE_DEVICES (falls back to 0,1,2,3).
      NVIDIA_VISIBLE_DEVICES: "${CUDA_VISIBLE_DEVICES:-0,1,2,3}"
      # Fixed runtime toggles.
      VLLM_USE_FLASHINFER_SAMPLER: "1"
      TORCH_MATMUL_PRECISION: "high"