Spaces:
Sleeping
Sleeping
| """Modal backend for AI Prof's eyes: MiniCPM-V-4 served by llama.cpp. | |
| The endpoint is OpenAI-compatible and accepts the image_url content produced by | |
| ``ai_prof/vision.py``. Point ``VISION_BASE_URL`` at the deployed URL; the app | |
| adds ``/v1`` when needed. | |
| The GGUF route is intentional: it gives this small, bursty vision service quick | |
| cold starts and keeps the project on llama.cpp for the hackathon's Llama | |
| Champion track. An L4 comfortably holds the Q4_K_M language model, F16 vision | |
| projector, and an 8K context. | |
| Bring-up: | |
| modal run modal_app_vision.py::download_model | |
| modal run modal_app_vision.py::warm | |
| modal deploy modal_app_vision.py | |
| # .env: VISION_BASE_URL=<serve URL> VISION_MODEL=minicpm-v | |
| """ | |
| from __future__ import annotations | |
| import base64 | |
| import binascii | |
| import contextlib | |
| import json | |
| import struct | |
| import subprocess | |
| import time | |
| import urllib.request | |
| import zlib | |
| import modal | |
# --- Model artifacts (Hugging Face) ------------------------------------------
# The repo revision is pinned to a commit hash so the downloaded files cannot
# change underneath a deployed service.
MODEL_REPO = "openbmb/MiniCPM-V-4-gguf"
MODEL_REVISION = "a5a17436782fb15dff8df61ab0ec3126c3564695"
MODEL_FILE = "ggml-model-Q4_K_M.gguf"  # quantized language model
MMPROJ_FILE = "mmproj-model-f16.gguf"  # vision projector passed via --mmproj
MODEL_DIR = "/models/minicpm-v-4"  # directory both GGUF files are stored in

# Name passed to llama-server as --alias and used as "model" in requests.
SERVED_NAME = "minicpm-v"

# --- Runtime image -----------------------------------------------------------
# Official llama.cpp CUDA server image b9570. Referencing the amd64 manifest by
# digest means an upstream retag cannot silently change multimodal behavior
# during judging.
LLAMA_CPP_IMAGE = (
    "ghcr.io/ggml-org/llama.cpp:server-cuda"
    "@sha256:3f167c81f5f281f642be62d1d9750b609fa38e7aa7be9b9ea2017a7f43a0d5eb"
)
LLAMA_SERVER = "/app/llama-server"  # server binary path inside that image

# --- Serving parameters ------------------------------------------------------
GPU = "L4"
LLAMA_PORT = 8081  # loopback port llama-server listens on
MAX_MODEL_LEN = 8192  # forwarded to llama-server as --ctx-size
MINUTES = 60  # seconds per minute; lets timeouts read as ``N * MINUTES``
# Modal application object that the functions in this module belong to.
app = modal.App("ai-prof-vision")

# Named, persistent volume for the GGUF files; created on first use.
model_volume = modal.Volume.from_name(
    "ai-prof-vision-models",
    create_if_missing=True,
)

# Container image, built up one layer at a time from the pinned llama.cpp image.
llama_image = modal.Image.from_registry(LLAMA_CPP_IMAGE, add_python="3.12")
# Replace the upstream entrypoint with an empty one; this module launches
# llama-server itself via subprocess.
llama_image = llama_image.entrypoint([])
# Python-side dependencies: the ASGI proxy, its HTTP client, and the HF downloader.
llama_image = llama_image.pip_install(
    "fastapi[standard]",
    "httpx",
    "huggingface_hub>=1.0",
)
llama_image = llama_image.env({"HF_XET_HIGH_PERFORMANCE": "1"})
def _server_cmd() -> list[str]:
    """Return the argv list used to launch ``llama-server``.

    The list starts with the server binary and is followed by flag/value
    pairs in a fixed order, ready to hand to ``subprocess.Popen``.
    """
    # dicts preserve insertion order, so the flags come out exactly as listed.
    options = {
        "--model": f"{MODEL_DIR}/{MODEL_FILE}",
        "--mmproj": f"{MODEL_DIR}/{MMPROJ_FILE}",
        "--alias": SERVED_NAME,
        # Loopback only: llama-server is reached from inside the container.
        "--host": "127.0.0.1",
        "--port": str(LLAMA_PORT),
        "--ctx-size": str(MAX_MODEL_LEN),
        "--n-gpu-layers": "99",
        "--parallel": "1",
    }
    argv = [LLAMA_SERVER]
    for flag, value in options.items():
        argv.extend((flag, value))
    return argv
def _wait_healthy(timeout_s: int = 10 * MINUTES) -> None:
    """Block until llama-server's ``/health`` endpoint answers HTTP 200.

    Polls ``http://127.0.0.1:<LLAMA_PORT>/health`` and pauses two seconds
    after every unsuccessful attempt.

    Args:
        timeout_s: Maximum number of seconds to keep polling.

    Raises:
        TimeoutError: If no 200 response was seen before the deadline.
    """
    import http.client

    health_url = f"http://127.0.0.1:{LLAMA_PORT}/health"
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(health_url, timeout=5) as response:
                if response.status == 200:
                    return
        except (OSError, http.client.HTTPException):
            # Expected while the server is starting: refused connections,
            # socket timeouts, and urllib's HTTPError for non-2xx replies
            # (URLError/HTTPError are OSError subclasses). Anything else is a
            # real bug and is allowed to propagate.
            pass
        # Sleep after *every* failed attempt, including a non-200 success
        # response, so the loop can never spin without pausing.
        time.sleep(2)
    raise TimeoutError("llama-server did not become healthy in time")
| def _png_data_uri(width: int = 64, height: int = 64) -> str: | |
| """Create a tiny valid RGB test image without adding Pillow to the image.""" | |
| def chunk(kind: bytes, data: bytes) -> bytes: | |
| body = kind + data | |
| return struct.pack(">I", len(data)) + body + struct.pack(">I", binascii.crc32(body)) | |
| # A white image with one blue horizontal stripe gives the vision encoder | |
| # real, non-uniform pixels while keeping the warm-up payload tiny. | |
| rows = [] | |
| for y in range(height): | |
| pixel = b"\x20\x70\xd0" if height // 3 <= y < 2 * height // 3 else b"\xf8\xf8\xf8" | |
| rows.append(b"\x00" + pixel * width) | |
| png = ( | |
| b"\x89PNG\r\n\x1a\n" | |
| + chunk(b"IHDR", struct.pack(">IIBBBBB", width, height, 8, 2, 0, 0, 0)) | |
| + chunk(b"IDAT", zlib.compress(b"".join(rows))) | |
| + chunk(b"IEND", b"") | |
| ) | |
| return "data:image/png;base64," + base64.b64encode(png).decode("ascii") | |
def download_model() -> None:
    """Fetch the two GGUF files llama-server needs; no GPU is required.

    Downloads the language model and the vision projector from ``MODEL_REPO``
    at the pinned ``MODEL_REVISION`` into ``MODEL_DIR``, then commits the
    model volume.

    NOTE(review): the module docstring invokes this through ``modal run``,
    which presumably needs a Modal function decorator that is not visible in
    this copy of the file - confirm against the deployed source.
    """
    from huggingface_hub import hf_hub_download

    # Arguments that are the same for both files.
    shared_kwargs = {
        "repo_id": MODEL_REPO,
        "revision": MODEL_REVISION,
        "local_dir": MODEL_DIR,
    }
    for gguf_name in (MODEL_FILE, MMPROJ_FILE):
        print(f"Downloading {MODEL_REPO}/{gguf_name} ...")
        hf_hub_download(filename=gguf_name, **shared_kwargs)

    # Persist the freshly written files to the volume.
    model_volume.commit()
    print("MiniCPM-V model and projector downloaded.")
def warm() -> None:
    """Smoke-test model loading and the complete multimodal request path.

    Starts llama-server, waits for it to report healthy, sends one chat
    completion containing a generated test image, prints the start of the
    reply, and always shuts the server down again.

    Raises:
        TimeoutError: If llama-server does not become healthy in time.
        urllib.error.URLError: If the chat-completion request fails.

    NOTE(review): the module docstring invokes this through ``modal run``,
    which presumably needs a Modal function decorator that is not visible in
    this copy of the file - confirm against the deployed source.
    """
    proc = subprocess.Popen(_server_cmd())
    try:
        print("Waiting for llama-server to load MiniCPM-V...")
        _wait_healthy()
        payload = {
            "model": SERVED_NAME,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What color is the stripe? Answer briefly."},
                        {"type": "image_url", "image_url": {"url": _png_data_uri()}},
                    ],
                }
            ],
            "temperature": 0,
            "max_tokens": 16,
        }
        request = urllib.request.Request(
            f"http://127.0.0.1:{LLAMA_PORT}/v1/chat/completions",
            data=json.dumps(payload).encode(),
            headers={"Content-Type": "application/json"},
        )
        # Context manager so the HTTP response (and its socket) is closed even
        # if reading or decoding fails.
        with urllib.request.urlopen(request, timeout=180) as reply:
            response = reply.read().decode()
        print("Multimodal warm-up response:", response[:800])
    finally:
        # Ask politely first; escalate to SIGKILL if the server ignores it.
        proc.terminate()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            proc.kill()
            # Reap the killed process so it does not linger as a zombie.
            proc.wait()
def serve():
    """Expose llama.cpp only after the model has finished loading.

    Builds a FastAPI app that starts llama-server during ASGI startup, waits
    until it is healthy, and then forwards every request to it, streaming the
    upstream response back to the caller.

    Returns:
        The FastAPI application acting as a reverse proxy.

    NOTE(review): the module docstring deploys this with ``modal deploy``,
    which presumably needs Modal function / ASGI decorators that are not
    visible in this copy of the file - confirm against the deployed source.
    """
    import asyncio

    import httpx
    from fastapi import FastAPI, Request
    from fastapi.responses import StreamingResponse

    # ``from __future__ import annotations`` makes the route annotation a
    # string; FastAPI resolves it through module globals.
    globals()["_ProxyRequest"] = Request

    # FastAPI's ``lifespan`` hook expects an async context manager, so the
    # generator is wrapped explicitly instead of relying on framework fallbacks.
    @contextlib.asynccontextmanager
    async def lifespan(_app: FastAPI):
        cmd = _server_cmd()
        print("Launching:", " ".join(cmd))
        proc = subprocess.Popen(cmd)
        try:
            # Modal does not route traffic to the ASGI app until startup exits.
            # This avoids llama-server's temporary 503 while weights are loading.
            # The blocking poll runs in a worker thread to keep the loop free.
            await asyncio.to_thread(_wait_healthy)
            print("MiniCPM-V is ready.")
            yield
        finally:
            proc.terminate()
            try:
                proc.wait(timeout=30)
            except subprocess.TimeoutExpired:
                proc.kill()
                # Reap the killed process so it does not linger as a zombie.
                proc.wait()

    proxy = FastAPI(lifespan=lifespan)
    upstream = f"http://127.0.0.1:{LLAMA_PORT}"

    # Catch-all route: without registering ``forward`` the proxy would answer
    # every request with 404.
    @proxy.api_route(
        "/{path:path}",
        methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS", "HEAD"],
    )
    async def forward(path: str, request: _ProxyRequest):
        # One client per request; it is closed when the streamed body ends.
        client = httpx.AsyncClient(timeout=None)
        try:
            upstream_request = client.build_request(
                request.method,
                f"{upstream}/{path}",
                params=request.query_params,
                headers={
                    key: value
                    for key, value in request.headers.items()
                    # httpx derives these from the target URL and the body.
                    if key.lower() not in {"host", "content-length"}
                },
                content=await request.body(),
            )
            response = await client.send(upstream_request, stream=True)
        except BaseException:
            # No response stream will ever run its cleanup, so close the
            # client here to avoid leaking its connection pool.
            await client.aclose()
            raise

        async def body():
            try:
                # Raw bytes: any upstream content-encoding is passed through
                # untouched together with its header.
                async for chunk in response.aiter_raw():
                    yield chunk
            finally:
                await response.aclose()
                await client.aclose()

        return StreamingResponse(
            body(),
            status_code=response.status_code,
            headers={
                key: value
                for key, value in response.headers.items()
                # Framing headers are regenerated for the re-streamed body.
                if key.lower() not in {"content-length", "transfer-encoding", "connection"}
            },
        )

    return proxy