ai-prof / modal_app_vision.py
pranavkarthik10's picture
Deploy AI Prof hackathon submission
81e3ca2 verified
Raw
History Blame Contribute Delete
8.31 kB
"""Modal backend for AI Prof's eyes: MiniCPM-V-4 served by llama.cpp.
The endpoint is OpenAI-compatible and accepts the image_url content produced by
``ai_prof/vision.py``. Point ``VISION_BASE_URL`` at the deployed URL; the app
adds ``/v1`` when needed.
The GGUF route is intentional: it gives this small, bursty vision service quick
cold starts and keeps the project on llama.cpp for the hackathon's Llama
Champion track. An L4 comfortably holds the Q4_K_M language model, F16 vision
projector, and an 8K context.
Bring-up:
modal run modal_app_vision.py::download_model
modal run modal_app_vision.py::warm
modal deploy modal_app_vision.py
# .env: VISION_BASE_URL=<serve URL> VISION_MODEL=minicpm-v
"""
from __future__ import annotations
import base64
import binascii
import contextlib
import json
import struct
import subprocess
import time
import urllib.request
import zlib
import modal
# --- Model artifacts -------------------------------------------------------
# Hugging Face repository and the exact commit the GGUF files are fetched from.
MODEL_REPO = "openbmb/MiniCPM-V-4-gguf"
MODEL_REVISION = "a5a17436782fb15dff8df61ab0ec3126c3564695"
# Language-model weights (passed to llama-server as --model).
MODEL_FILE = "ggml-model-Q4_K_M.gguf"
# Vision projector weights (passed to llama-server as --mmproj).
MMPROJ_FILE = "mmproj-model-f16.gguf"
# Model alias passed to llama-server via --alias and used as the "model" field
# of the warm-up request.
SERVED_NAME = "minicpm-v"
# Official llama.cpp CUDA server image b9570, pinned to its amd64 manifest so an
# upstream image update cannot silently alter multimodal behavior during judging.
LLAMA_CPP_IMAGE = (
    "ghcr.io/ggml-org/llama.cpp:server-cuda"
    "@sha256:3f167c81f5f281f642be62d1d9750b609fa38e7aa7be9b9ea2017a7f43a0d5eb"
)
# --- Runtime layout --------------------------------------------------------
# Path of the llama-server binary that is launched as a subprocess.
LLAMA_SERVER = "/app/llama-server"
# Directory the GGUF files are downloaded to; it lives under the "/models"
# mount point used for ``model_volume`` in the Modal functions of this file.
MODEL_DIR = "/models/minicpm-v-4"
# GPU type requested for the functions that run llama-server.
GPU = "L4"
# Loopback port llama-server listens on.
LLAMA_PORT = 8081
# Context size handed to llama-server via --ctx-size.
MAX_MODEL_LEN = 8192
# Seconds per minute, so timeouts below can be written as ``N * MINUTES``.
MINUTES = 60
# --- Modal objects ---------------------------------------------------------
app = modal.App("ai-prof-vision")
# Named volume that holds the downloaded model files; created on first use.
model_volume = modal.Volume.from_name("ai-prof-vision-models", create_if_missing=True)
# Container image: the pinned llama.cpp server image plus Python 3.12 and the
# Python packages imported by the functions of this file.
# NOTE(review): ``.entrypoint([])`` presumably clears the image's default
# entrypoint so Modal controls the container command - confirm.
# NOTE(review): HF_XET_HIGH_PERFORMANCE is presumably a huggingface_hub
# download-speed switch - confirm against the huggingface_hub documentation.
llama_image = (
    modal.Image.from_registry(LLAMA_CPP_IMAGE, add_python="3.12")
    .entrypoint([])
    .pip_install("fastapi[standard]", "httpx", "huggingface_hub>=1.0")
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})
)
def _server_cmd() -> list[str]:
    """Return the argv used to launch llama-server for MiniCPM-V.

    The command starts with the server binary and is followed by
    ``flag, value`` pairs in a fixed order: model and projector paths, the
    served alias, the loopback host and port, the context size, the number of
    GPU layers, and the parallel slot count.
    """
    # Insertion order of the mapping is the order of the flags on the command
    # line, so the resulting list is stable between calls.
    options = {
        "--model": f"{MODEL_DIR}/{MODEL_FILE}",
        "--mmproj": f"{MODEL_DIR}/{MMPROJ_FILE}",
        "--alias": SERVED_NAME,
        "--host": "127.0.0.1",
        "--port": str(LLAMA_PORT),
        "--ctx-size": str(MAX_MODEL_LEN),
        "--n-gpu-layers": "99",
        "--parallel": "1",
    }
    argv = [LLAMA_SERVER]
    for flag, value in options.items():
        argv.extend((flag, value))
    return argv
def _wait_healthy(timeout_s: int = 10 * MINUTES) -> None:
    """Block until llama-server's ``/health`` endpoint answers with HTTP 200.

    The endpoint on the loopback port is polled roughly every two seconds.

    Args:
        timeout_s: Maximum number of seconds to keep polling.

    Raises:
        TimeoutError: If no 200 response was received before the deadline.
    """
    import http.client

    health_url = f"http://127.0.0.1:{LLAMA_PORT}/health"
    # A monotonic clock keeps the deadline correct even if the wall clock is
    # adjusted while the model is loading.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(health_url, timeout=5) as response:
                if response.status == 200:
                    return
        except (OSError, http.client.HTTPException):
            # Connection refused, socket timeouts and HTTP error statuses
            # (``URLError``/``HTTPError`` are ``OSError`` subclasses) as well
            # as malformed replies are expected while the server starts up.
            # Anything else is a programming error and should surface.
            pass
        # Sleep after every unsuccessful attempt, including a non-200 success
        # status, so the loop never spins without a pause.
        time.sleep(2)
    raise TimeoutError("llama-server did not become healthy in time")
def _png_data_uri(width: int = 64, height: int = 64) -> str:
    """Build a small striped RGB PNG and return it as a base64 ``data:`` URI.

    The picture is light grey-white with a blue horizontal band across its
    middle third, which gives the vision encoder non-uniform pixels while the
    payload stays tiny. Only the standard library is used, so Pillow does not
    have to be installed in the image.
    """
    signature = b"\x89PNG\r\n\x1a\n"
    # Every scanline starts with filter type 0 (no filtering).
    background_row = b"\x00" + b"\xf8\xf8\xf8" * width
    stripe_row = b"\x00" + b"\x20\x70\xd0" * width
    stripe_top, stripe_bottom = height // 3, 2 * height // 3

    def png_chunk(tag: bytes, payload: bytes) -> bytes:
        # Chunk layout: big-endian payload length, tag, payload, then the
        # CRC-32 computed over tag + payload.
        tagged = tag + payload
        length = struct.pack(">I", len(payload))
        checksum = struct.pack(">I", binascii.crc32(tagged))
        return length + tagged + checksum

    scanlines = b"".join(
        stripe_row if stripe_top <= y < stripe_bottom else background_row
        for y in range(height)
    )
    # IHDR fields: width, height, bit depth 8, colour type 2 (RGB), then
    # compression, filter and interlace methods all 0.
    header = struct.pack(">IIBBBBB", width, height, 8, 2, 0, 0, 0)
    image = (
        signature
        + png_chunk(b"IHDR", header)
        + png_chunk(b"IDAT", zlib.compress(scanlines))
        + png_chunk(b"IEND", b"")
    )
    return "data:image/png;base64," + base64.b64encode(image).decode("ascii")
@app.function(
    image=llama_image,
    volumes={"/models": model_volume},
    timeout=30 * MINUTES,
)
def download_model() -> None:
    """Fetch the model and projector GGUF files into the model volume.

    Only the two files llama-server needs are downloaded, at the pinned
    repository revision. No GPU is requested for this function. The volume is
    committed once both downloads have finished.
    """
    from huggingface_hub import hf_hub_download

    # Arguments shared by both downloads; only the file name differs.
    shared_kwargs = {
        "repo_id": MODEL_REPO,
        "revision": MODEL_REVISION,
        "local_dir": MODEL_DIR,
    }
    required_files = (MODEL_FILE, MMPROJ_FILE)
    for filename in required_files:
        print(f"Downloading {MODEL_REPO}/{filename} ...")
        hf_hub_download(filename=filename, **shared_kwargs)

    model_volume.commit()
    print("MiniCPM-V model and projector downloaded.")
@app.function(
    image=llama_image,
    gpu=GPU,
    volumes={"/models": model_volume},
    timeout=20 * MINUTES,
)
def warm() -> None:
    """Smoke-test model loading and the complete multimodal request path.

    Starts llama-server, waits until it reports healthy, sends one chat
    completion that contains a generated test image, prints the first 800
    characters of the reply, and finally shuts the server down again.

    Raises:
        TimeoutError: If llama-server does not become healthy in time.
        urllib.error.URLError: If the warm-up request fails.
    """
    proc = subprocess.Popen(_server_cmd())
    try:
        print("Waiting for llama-server to load MiniCPM-V...")
        _wait_healthy()
        payload = {
            "model": SERVED_NAME,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What color is the stripe? Answer briefly."},
                        {"type": "image_url", "image_url": {"url": _png_data_uri()}},
                    ],
                }
            ],
            # Deterministic, short answer: this is only a smoke test.
            "temperature": 0,
            "max_tokens": 16,
        }
        request = urllib.request.Request(
            f"http://127.0.0.1:{LLAMA_PORT}/v1/chat/completions",
            data=json.dumps(payload).encode(),
            headers={"Content-Type": "application/json"},
        )
        # The context manager closes the HTTP response (and its socket) even
        # if reading or decoding the body fails.
        with urllib.request.urlopen(request, timeout=180) as reply:
            response = reply.read().decode()
        print("Multimodal warm-up response:", response[:800])
    finally:
        # Ask the server to exit, escalate to SIGKILL if it does not, and reap
        # the child in both cases so no zombie process is left behind.
        proc.terminate()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
@app.function(
    image=llama_image,
    gpu=GPU,
    volumes={"/models": model_volume},
    scaledown_window=5 * MINUTES,
    timeout=30 * MINUTES,
    max_containers=1,
)
@modal.concurrent(max_inputs=4)
@modal.asgi_app()
def serve():
    """Expose llama.cpp only after the model has finished loading.

    Returns a FastAPI application that starts llama-server during its
    lifespan startup, waits for it to become healthy, and then forwards every
    request to the server on the loopback port, streaming the upstream
    response back unchanged.
    """
    import asyncio

    import httpx
    from fastapi import FastAPI, Request
    from fastapi.responses import JSONResponse, StreamingResponse

    # ``from __future__ import annotations`` makes the route annotation a
    # string; FastAPI resolves it through module globals.
    globals()["_ProxyRequest"] = Request

    upstream = f"http://127.0.0.1:{LLAMA_PORT}"
    # Headers that describe one hop's connection or body framing. httpx and the
    # ASGI server re-frame the bodies, so these must not be copied through.
    request_skip = frozenset({"host", "content-length", "connection", "transfer-encoding"})
    response_skip = frozenset({"content-length", "transfer-encoding", "connection"})

    # One client for the lifetime of the app: connections to llama-server are
    # pooled and reused, and there is no per-request client that could leak
    # when forwarding fails. Generation can take arbitrarily long, hence no
    # timeout.
    client = httpx.AsyncClient(timeout=None)

    @contextlib.asynccontextmanager
    async def lifespan(_app: FastAPI):
        print("Launching:", " ".join(_server_cmd()))
        proc = subprocess.Popen(_server_cmd())
        try:
            # Modal does not route traffic to the ASGI app until startup exits.
            # This avoids llama-server's temporary 503 while weights are loading.
            await asyncio.to_thread(_wait_healthy)
            print("MiniCPM-V is ready.")
            yield
        finally:
            try:
                await client.aclose()
            finally:
                # Stop the server, escalate to SIGKILL if needed, and reap the
                # child in both cases.
                proc.terminate()
                try:
                    proc.wait(timeout=30)
                except subprocess.TimeoutExpired:
                    proc.kill()
                    proc.wait()

    proxy = FastAPI(lifespan=lifespan)

    @proxy.api_route(
        "/{path:path}",
        methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS", "HEAD"],
    )
    async def forward(path: str, request: _ProxyRequest):
        """Forward one request to llama-server and stream its reply back."""
        upstream_request = client.build_request(
            request.method,
            f"{upstream}/{path}",
            params=request.query_params,
            headers={
                key: value
                for key, value in request.headers.items()
                if key.lower() not in request_skip
            },
            content=await request.body(),
        )
        try:
            response = await client.send(upstream_request, stream=True)
        except httpx.RequestError as exc:
            # llama-server is unreachable or dropped the connection: answer
            # with a gateway error instead of an unhandled exception.
            return JSONResponse(
                {"error": f"llama-server request failed: {exc}"},
                status_code=502,
            )

        async def body():
            try:
                async for chunk in response.aiter_raw():
                    yield chunk
            finally:
                # Release the pooled connection once streaming ends or aborts.
                await response.aclose()

        return StreamingResponse(
            body(),
            status_code=response.status_code,
            headers={
                key: value
                for key, value in response.headers.items()
                if key.lower() not in response_skip
            },
        )

    return proxy