# Huihui-Qwen3.6-27B-abliterated-NVFP4-MTP — vLLM serving stack.
#
#   ./run.sh up      # start (build-free; uses the official vLLM image)
#   ./run.sh test    # poll /v1/models until ready
#   ./run.sh bench   # one-shot chat completion
#   ./run.sh logs    # tail
#   ./run.sh down    # stop
#
# Defaults to the official vLLM image — it already ships the qwen3_5 architecture
# (Qwen3_5ForConditionalGeneration) AND the Qwen3_5MTP draft, so NO build is needed.
# Pick the GPUs with CUDA_VISIBLE_DEVICES (must expose exactly TP_SIZE of them, and
# avoid your display GPU). Example: CUDA_VISIBLE_DEVICES=0,1,2,3 TP_SIZE=4 ./run.sh up

services:
  vllm:
    # Image is overridable from the host environment via VLLM_IMAGE.
    image: ${VLLM_IMAGE:-vllm/vllm-openai:v0.22.0}
    # --- Build a self-contained image instead (optional): comment `image:` above,
    #     uncomment the block below, then `./run.sh rebuild`. The Dockerfile just
    #     pip-installs vLLM on a CUDA 13.1 base — the official image is equivalent. ---
    # build:
    #   context: .
    #   dockerfile: Dockerfile
    container_name: vllm-huihui
    runtime: nvidia
    restart: unless-stopped
    # Replace the image's default entrypoint with the bind-mounted script (see volumes).
    entrypoint: ["/bin/bash", "/entrypoint.sh"]
    ports:
      # Host port is configurable via PORT; the container side is always 8000.
      - "${PORT:-8000}:8000"
    environment:
      # Container-side paths/ports: fixed so they match the volume and port mappings
      # in this file.
      - MODEL_DIR=/model
      - PORT=8000
      # Tunables forwarded from the host environment with defaults.
      # NOTE(review): presumably consumed by entrypoint.sh (not visible here) — confirm.
      - MAX_MODEL_LEN=${MAX_MODEL_LEN:-65536}
      - MAX_NUM_SEQS=${MAX_NUM_SEQS:-8}
      - MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-16384}
      - GPU_MEM_UTIL=${GPU_MEM_UTIL:-0.85}
      - KV_CACHE_DTYPE=${KV_CACHE_DTYPE:-fp8}
      - SPEC_TOKENS=${SPEC_TOKENS:-3}
      - TP_SIZE=${TP_SIZE:-4}
      - ENABLE_TOOLS=${ENABLE_TOOLS:-1}
      - SERVED_MODEL_NAME=huihui-qwen36-27b-local
      # GPU selection: the host's CUDA_VISIBLE_DEVICES is forwarded as
      # NVIDIA_VISIBLE_DEVICES. The default lists four devices, matching the
      # default TP_SIZE of 4; change both together.
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3}
      - VLLM_USE_FLASHINFER_SAMPLER=1
      - TORCH_MATMUL_PRECISION=high
    volumes:
      # the model folder (this directory) + the entrypoint, both read-only
      - .:/model:ro
      - ./entrypoint.sh:/entrypoint.sh:ro
    shm_size: 32g