Spaces:

build-small-hackathon
/

ai-prof

Sleeping

App Files Files Community

ai-prof / modal_app_vision.py

pranavkarthik10

Deploy AI Prof hackathon submission

81e3ca2 verified 18 days ago

Raw

History Blame Contribute Delete

8.31 kB

	"""Modal backend for AI Prof's eyes: MiniCPM-V-4 served by llama.cpp.

	The endpoint is OpenAI-compatible and accepts the image_url content produced by
	``ai_prof/vision.py``. Point ``VISION_BASE_URL`` at the deployed URL; the app
	adds ``/v1`` when needed.

	The GGUF route is intentional: it gives this small, bursty vision service quick
	cold starts and keeps the project on llama.cpp for the hackathon's Llama
	Champion track. An L4 comfortably holds the Q4_K_M language model, F16 vision
	projector, and an 8K context.

	Bring-up:
	modal run modal_app_vision.py::download_model
	modal run modal_app_vision.py::warm
	modal deploy modal_app_vision.py
	# .env: VISION_BASE_URL=<serve URL> VISION_MODEL=minicpm-v
	"""

	from __future__ import annotations

	import base64
	import binascii
	import contextlib
	import json
	import struct
	import subprocess
	import time
	import urllib.request
	import zlib

	import modal

	MODEL_REPO = "openbmb/MiniCPM-V-4-gguf"
	MODEL_REVISION = "a5a17436782fb15dff8df61ab0ec3126c3564695"
	MODEL_FILE = "ggml-model-Q4_K_M.gguf"
	MMPROJ_FILE = "mmproj-model-f16.gguf"
	SERVED_NAME = "minicpm-v"

	# Official llama.cpp CUDA server image b9570, pinned to its amd64 manifest so an
	# upstream image update cannot silently alter multimodal behavior during judging.
	LLAMA_CPP_IMAGE = (
	"ghcr.io/ggml-org/llama.cpp:server-cuda"
	"@sha256:3f167c81f5f281f642be62d1d9750b609fa38e7aa7be9b9ea2017a7f43a0d5eb"
	)
	LLAMA_SERVER = "/app/llama-server"
	MODEL_DIR = "/models/minicpm-v-4"

	GPU = "L4"
	LLAMA_PORT = 8081
	MAX_MODEL_LEN = 8192
	MINUTES = 60

	app = modal.App("ai-prof-vision")
	model_volume = modal.Volume.from_name("ai-prof-vision-models", create_if_missing=True)

	llama_image = (
	modal.Image.from_registry(LLAMA_CPP_IMAGE, add_python="3.12")
	.entrypoint([])
	.pip_install("fastapi[standard]", "httpx", "huggingface_hub>=1.0")
	.env({"HF_XET_HIGH_PERFORMANCE": "1"})
	)


	def _server_cmd() -> list[str]:
	return [
	LLAMA_SERVER,
	"--model",
	f"{MODEL_DIR}/{MODEL_FILE}",
	"--mmproj",
	f"{MODEL_DIR}/{MMPROJ_FILE}",
	"--alias",
	SERVED_NAME,
	"--host",
	"127.0.0.1",
	"--port",
	str(LLAMA_PORT),
	"--ctx-size",
	str(MAX_MODEL_LEN),
	"--n-gpu-layers",
	"99",
	"--parallel",
	"1",
	]


	def _wait_healthy(timeout_s: int = 10 * MINUTES) -> None:
	deadline = time.time() + timeout_s
	while time.time() < deadline:
	try:
	with urllib.request.urlopen(
	f"http://127.0.0.1:{LLAMA_PORT}/health", timeout=5
	) as response:
	if response.status == 200:
	return
	except Exception:
	time.sleep(2)
	raise TimeoutError("llama-server did not become healthy in time")


	def _png_data_uri(width: int = 64, height: int = 64) -> str:
	"""Create a tiny valid RGB test image without adding Pillow to the image."""

	def chunk(kind: bytes, data: bytes) -> bytes:
	body = kind + data
	return struct.pack(">I", len(data)) + body + struct.pack(">I", binascii.crc32(body))

	# A white image with one blue horizontal stripe gives the vision encoder
	# real, non-uniform pixels while keeping the warm-up payload tiny.
	rows = []
	for y in range(height):
	pixel = b"\x20\x70\xd0" if height // 3 <= y < 2 * height // 3 else b"\xf8\xf8\xf8"
	rows.append(b"\x00" + pixel * width)
	png = (
	b"\x89PNG\r\n\x1a\n"
	+ chunk(b"IHDR", struct.pack(">IIBBBBB", width, height, 8, 2, 0, 0, 0))
	+ chunk(b"IDAT", zlib.compress(b"".join(rows)))
	+ chunk(b"IEND", b"")
	)
	return "data:image/png;base64," + base64.b64encode(png).decode("ascii")


	@app.function(
	image=llama_image,
	volumes={"/models": model_volume},
	timeout=30 * MINUTES,
	)
	def download_model() -> None:
	"""Download only the two GGUF files required by llama-server on CPU."""
	from huggingface_hub import hf_hub_download

	for filename in (MODEL_FILE, MMPROJ_FILE):
	print(f"Downloading {MODEL_REPO}/{filename} ...")
	hf_hub_download(
	repo_id=MODEL_REPO,
	filename=filename,
	revision=MODEL_REVISION,
	local_dir=MODEL_DIR,
	)
	model_volume.commit()
	print("MiniCPM-V model and projector downloaded.")


	@app.function(
	image=llama_image,
	gpu=GPU,
	volumes={"/models": model_volume},
	timeout=20 * MINUTES,
	)
	def warm() -> None:
	"""Smoke-test model loading and the complete multimodal request path."""
	proc = subprocess.Popen(_server_cmd())
	try:
	print("Waiting for llama-server to load MiniCPM-V...")
	_wait_healthy()
	payload = {
	"model": SERVED_NAME,
	"messages": [
	{
	"role": "user",
	"content": [
	{"type": "text", "text": "What color is the stripe? Answer briefly."},
	{"type": "image_url", "image_url": {"url": _png_data_uri()}},
	],
	}
	],
	"temperature": 0,
	"max_tokens": 16,
	}
	request = urllib.request.Request(
	f"http://127.0.0.1:{LLAMA_PORT}/v1/chat/completions",
	data=json.dumps(payload).encode(),
	headers={"Content-Type": "application/json"},
	)
	response = urllib.request.urlopen(request, timeout=180).read().decode()
	print("Multimodal warm-up response:", response[:800])
	finally:
	proc.terminate()
	try:
	proc.wait(timeout=30)
	except subprocess.TimeoutExpired:
	proc.kill()


	@app.function(
	image=llama_image,
	gpu=GPU,
	volumes={"/models": model_volume},
	scaledown_window=5 * MINUTES,
	timeout=30 * MINUTES,
	max_containers=1,
	)
	@modal.concurrent(max_inputs=4)
	@modal.asgi_app()
	def serve():
	"""Expose llama.cpp only after the model has finished loading."""
	import httpx
	from fastapi import FastAPI, Request
	from fastapi.responses import StreamingResponse

	# ``from __future__ import annotations`` makes the route annotation a
	# string; FastAPI resolves it through module globals.
	globals()["_ProxyRequest"] = Request

	@contextlib.asynccontextmanager
	async def lifespan(_app: FastAPI):
	print("Launching:", " ".join(_server_cmd()))
	proc = subprocess.Popen(_server_cmd())
	try:
	# Modal does not route traffic to the ASGI app until startup exits.
	# This avoids llama-server's temporary 503 while weights are loading.
	await __import__("asyncio").to_thread(_wait_healthy)
	print("MiniCPM-V is ready.")
	yield
	finally:
	proc.terminate()
	try:
	proc.wait(timeout=30)
	except subprocess.TimeoutExpired:
	proc.kill()

	proxy = FastAPI(lifespan=lifespan)
	upstream = f"http://127.0.0.1:{LLAMA_PORT}"

	@proxy.api_route(
	"/{path:path}",
	methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS", "HEAD"],
	)
	async def forward(path: str, request: _ProxyRequest):
	client = httpx.AsyncClient(timeout=None)
	upstream_request = client.build_request(
	request.method,
	f"{upstream}/{path}",
	params=request.query_params,
	headers={
	key: value
	for key, value in request.headers.items()
	if key.lower() not in {"host", "content-length"}
	},
	content=await request.body(),
	)
	response = await client.send(upstream_request, stream=True)

	async def body():
	try:
	async for chunk in response.aiter_raw():
	yield chunk
	finally:
	await response.aclose()
	await client.aclose()

	return StreamingResponse(
	body(),
	status_code=response.status_code,
	headers={
	key: value
	for key, value in response.headers.items()
	if key.lower() not in {"content-length", "transfer-encoding", "connection"}
	},
	)

	return proxy