Spaces:

build-small-hackathon
/

ai-prof

Running

App Files Files Community

ai-prof / modal_app.py

pranavkarthik10

Remove development status from README

1dff814 verified 17 days ago

Raw

History Blame Contribute Delete

6.12 kB

	"""Modal backend for AI Prof — the brain (Nemotron 3 Nano) served over vLLM.

	OpenAI-compatible endpoint that the Gradio app points ``BRAIN_BASE_URL`` at. The
	GPU container scales to zero when idle.

	Nemotron 3 Nano is a hybrid Mamba-2 + MoE reasoning model. On first run vLLM
	JIT-compiles CUDA kernels — notably FlashInfer's CUTLASS fused-MoE kernel (slow,
	minutes). We persist those compile caches to Volumes and warm ONCE, so every
	later cold start reuses them (~tens of seconds) instead of recompiling on an
	expensive GPU. Cost discipline: ``max_containers=1`` so a request burst can never
	fan out into multiple GPUs, and warming is a single controlled ``modal run`` —
	never a curl storm against the live endpoint.

	Bring-up:
	modal run modal_app.py::download_model # 1. pull weights to a Volume (CPU, cheap)
	modal run modal_app.py::warm # 2. ONE GPU run: compile + cache kernels
	modal deploy modal_app.py # 3. serve; cold starts now reuse the cache
	# the printed *.modal.run URL + "/v1" is BRAIN_BASE_URL
	"""

	from __future__ import annotations

	import json
	import subprocess
	import time
	import urllib.request

	import modal

	# --- what to serve -----------------------------------------------------------
	MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
	SERVED_NAME = "nemotron-3-nano" # must match BRAIN_MODEL in the app's .env
	GPU = "H100"
	MAX_MODEL_LEN = 16384 # ample for slide reading + outline + history; small KV cache
	VLLM_PORT = 8000
	MINUTES = 60

	app = modal.App("ai-prof-brain")

	# Persistent caches: weights, vLLM torch.compile artifacts, and FlashInfer's JIT
	# kernels. Mounting the FlashInfer cache is what stops the CUTLASS MoE kernel from
	# recompiling on every cold start.
	hf_cache = modal.Volume.from_name("ai-prof-hf-cache", create_if_missing=True)
	vllm_cache = modal.Volume.from_name("ai-prof-vllm-cache", create_if_missing=True)
	flashinfer_cache = modal.Volume.from_name("ai-prof-flashinfer-cache", create_if_missing=True)

	triton_cache = modal.Volume.from_name("ai-prof-triton-cache", create_if_missing=True)

	VOLUMES = {
	"/root/.cache/huggingface": hf_cache,
	"/root/.cache/vllm": vllm_cache,
	"/root/.cache/flashinfer": flashinfer_cache,
	"/root/.triton": triton_cache, # Mamba2 SSD Triton kernel cache (~11 min compile)
	}

	# CUDA devel base ships nvcc — Nemotron-H's Mamba-2 kernels JIT-compile at init.
	vllm_image = (
	modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu22.04", add_python="3.12")
	.entrypoint([])
	.pip_install("vllm>=0.12.0", "huggingface_hub[hf_transfer]>=0.27")
	.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
	)


	def _vllm_cmd() -> list[str]:
	# No --enforce-eager: we WANT torch.compile + CUDA-graph capture so the
	# artifacts get cached to the vLLM volume (FAST_BOOT=False pattern).
	return [
	"vllm", "serve", MODEL_NAME,
	"--served-model-name", SERVED_NAME,
	"--host", "0.0.0.0", "--port", str(VLLM_PORT),
	"--max-model-len", str(MAX_MODEL_LEN),
	"--max-num-seqs", "8",
	"--tensor-parallel-size", "1",
	# Modal Volumes mount as 9P, which vLLM does not recognize as a network
	# filesystem. Force parallel prefetch instead of reading 13 shards
	# serially; the serial path takes roughly ten minutes for this checkpoint.
	"--safetensors-load-strategy", "prefetch",
	"--trust-remote-code",
	"--reasoning-parser", "nemotron_v3",
	]


	def _wait_healthy(timeout_s: int = 25 * MINUTES) -> None:
	base = f"http://127.0.0.1:{VLLM_PORT}"
	deadline = time.time() + timeout_s
	while time.time() < deadline:
	try:
	urllib.request.urlopen(f"{base}/health", timeout=5)
	return
	except Exception:
	time.sleep(5)
	raise TimeoutError("vLLM did not become healthy in time")


	@app.function(
	image=vllm_image,
	volumes={"/root/.cache/huggingface": hf_cache},
	timeout=60 * MINUTES,
	)
	def download_model(model_name: str = MODEL_NAME) -> None:
	"""Pull weights to the Volume on CPU (no GPU billed during the big download)."""
	from huggingface_hub import snapshot_download

	print(f"Downloading {model_name} -> /root/.cache/huggingface ...")
	snapshot_download(model_name, ignore_patterns=[".pt", ".pth"])
	hf_cache.commit()
	print("Done.")


	@app.function(image=vllm_image, gpu=GPU, volumes=VOLUMES, timeout=60 * MINUTES)
	def warm() -> None:
	"""One controlled GPU run: boot vLLM, fire a single request to trigger every
	kernel compile, then commit the compile caches. Run with ``modal run``."""
	proc = subprocess.Popen(_vllm_cmd())
	try:
	print("Waiting for vLLM to compile + become healthy (first time is slow)...")
	_wait_healthy()
	req = urllib.request.Request(
	f"http://127.0.0.1:{VLLM_PORT}/v1/chat/completions",
	data=json.dumps(
	{
	"model": SERVED_NAME,
	"messages": [{"role": "user", "content": "Say hello."}],
	"max_tokens": 32,
	}
	).encode(),
	headers={"Content-Type": "application/json"},
	)
	print("Response:", urllib.request.urlopen(req, timeout=120).read().decode()[:400])
	finally:
	proc.terminate()
	try:
	proc.wait(timeout=30)
	except Exception:
	proc.kill()
	vllm_cache.commit()
	flashinfer_cache.commit()
	triton_cache.commit()
	print("Warm complete — compile caches committed. Cold starts will now be fast.")


	@app.function(
	image=vllm_image,
	gpu=GPU,
	volumes=VOLUMES,
	scaledown_window=20 * MINUTES, # keep warm between requests; cold start is ~2 min
	timeout=60 * MINUTES,
	max_containers=1, # cost guard: a request burst can never fan out into many GPUs
	)
	@modal.concurrent(max_inputs=8) # vLLM batches concurrent requests on the one replica
	@modal.web_server(port=VLLM_PORT, startup_timeout=30 * MINUTES)
	def serve() -> None:
	print("Launching:", " ".join(_vllm_cmd()))
	subprocess.Popen(_vllm_cmd())