Spaces:
Running
Running
| """Modal backend for AI Prof — VoxCPM2 TTS + distil-Whisper STT. | |
| Two endpoints in one Modal app: | |
| serve() → /v1/audio/speech (TTS — VoxCPM2 via vLLM-Omni) | |
| transcribe() → /v1/audio/transcriptions (STT — distil-whisper-large-v3) | |
| OpenAI-compatible /v1/audio/speech endpoint; point TTS_BASE_URL at the | |
| printed *.modal.run URL (no /v1 suffix — the client appends it). | |
| VoxCPM2 (2B, MiniCPM-4 backbone) outputs 48 kHz WAV and supports Voice | |
| Design via a natural-language prefix prepended to the synthesis input — used | |
| in ai_prof/rtc.py to give AI Prof a consistent professor voice. | |
| vLLM-Omni layers omni-modal (TTS/STT) support on top of vLLM's scheduler | |
| and exposes a drop-in /v1/audio/speech endpoint with PagedAttention and | |
| continuous batching. | |
| Bring-up: | |
| modal run modal_app_vox.py::download_model # 1. pull VoxCPM2 weights (CPU, cheap) | |
| modal run modal_app_vox.py::download_stt # 2. pull Whisper weights (CPU, cheap) | |
| modal run modal_app_vox.py::warm # 3. ONE GPU run: compile CUDA kernels | |
| modal deploy modal_app_vox.py # 4. serve both endpoints | |
| # .env: TTS_BASE_URL=<serve URL> TTS_MODEL=voxcpm2 | |
| # STT_BASE_URL=<transcribe URL> STT_MODEL=distil-whisper-large-v3 | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import subprocess | |
| import time | |
| import urllib.request | |
| import modal | |
# --- TTS: VoxCPM2 ------------------------------------------------------------
# Hugging Face repo id: default for download_model() and the model argument
# passed to the `vllm-omni serve` command.
MODEL_NAME = "openbmb/VoxCPM2"
SERVED_NAME = "voxcpm2"  # must match TTS_MODEL in the app's .env
GPU = "A10G"  # 24 GB VRAM; VoxCPM2 needs ~8 GB
MAX_MODEL_LEN = 2048  # generous for any TTS input chunk
VLLM_PORT = 8000  # port the vLLM-Omni server binds to; also used by the health poll
MINUTES = 60  # seconds per minute, so timeouts read as `20 * MINUTES`
# --- STT: distil-Whisper -----------------------------------------------------
# Use the full CTranslate2 HF path; faster-whisper loads these natively.
STT_MODEL = "Systran/faster-distil-whisper-large-v3"
STT_PORT = 8001  # port uvicorn binds to for the STT app
| # Minimal OpenAI-compatible /v1/audio/transcriptions endpoint. | |
| # Written to /tmp/stt_app.py inside the container at startup. | |
| _STT_SERVER = """\ | |
| import io, os, tempfile | |
| from fastapi import FastAPI, File, Form, UploadFile | |
| from faster_whisper import WhisperModel | |
| _MODEL_ID = os.environ.get("WHISPER_MODEL", "Systran/faster-distil-whisper-large-v3") | |
| _model = WhisperModel(_MODEL_ID, device="cuda", compute_type="float16") | |
| app = FastAPI() | |
| @app.post("/v1/audio/transcriptions") | |
| async def transcribe( | |
| file: UploadFile = File(...), | |
| model: str = Form(default=_MODEL_ID), | |
| language: str = Form(default=None), | |
| response_format: str = Form(default="json"), | |
| ): | |
| data = await file.read() | |
| with tempfile.NamedTemporaryFile(suffix=os.path.splitext(file.filename or ".wav")[1] or ".wav", delete=False) as tmp: | |
| tmp.write(data) | |
| tmp_path = tmp.name | |
| try: | |
| segments, _ = _model.transcribe(tmp_path, language=language or None) | |
| text = " ".join(s.text for s in segments).strip() | |
| finally: | |
| os.unlink(tmp_path) | |
| return {"text": text} | |
| """ | |
# The single Modal app that the TTS and STT functions in this file belong to.
app = modal.App("ai-prof-vox")
# Persistent caches — same pattern as modal_app.py (brain).
# FlashInfer cache is kept even though VoxCPM2 is transformer-only:
# vLLM itself may emit FlashInfer kernels depending on the backend.
hf_cache = modal.Volume.from_name("ai-prof-vox-hf-cache", create_if_missing=True)
vllm_cache = modal.Volume.from_name("ai-prof-vox-vllm-cache", create_if_missing=True)
flashinfer_cache = modal.Volume.from_name("ai-prof-vox-fi-cache", create_if_missing=True)
triton_cache = modal.Volume.from_name("ai-prof-vox-triton-cache", create_if_missing=True)
# Container mount path -> Volume. hf_cache is committed by the download_*
# functions; the other three are committed by warm().
VOLUMES = {
    "/root/.cache/huggingface": hf_cache,
    "/root/.cache/vllm": vllm_cache,
    "/root/.cache/flashinfer": flashinfer_cache,
    "/root/.triton": triton_cache,
}
# vLLM-Omni installation recipe, adapted from the official README:
# 1. pip install vllm (pinned to 0.22.1 in the commands below)
# 2. git clone vllm-omni && pip install -e .
# We use uv for faster resolution and to avoid pip's slower solver on large graphs.
# NOTE(review): the vllm-omni clone takes the default-branch head (--depth 1,
# no tag or commit), so an image rebuild can pick up a different vllm-omni
# than the one this vllm pin was tested with — consider pinning a ref.
vox_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .apt_install("git")
    .run_commands(
        "pip install uv 'huggingface_hub[hf_transfer]>=0.27'",
        "uv pip install --system 'vllm==0.22.1'",
        "pip install hf_transfer",  # vllm's uv install drops hf_transfer; reinstall after
        "git clone --depth 1 https://github.com/vllm-project/vllm-omni.git /opt/vllm-omni",
        "cd /opt/vllm-omni && uv pip install --system -e .",
        "pip install 'voxcpm>=2.0'",  # vllm-omni requires this for VoxCPM2; it's a separate PyPI package
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
def _vllm_omni_cmd() -> list[str]:
    """Return the argv used to launch the vLLM-Omni server for VoxCPM2.

    The list is suitable for ``subprocess.Popen`` (no shell involved).
    """
    # Use `vllm-omni` CLI (installed by pip install -e vllm-omni).
    # With vllm 0.22.1 the plugin also patches `vllm serve` to accept --omni,
    # but the explicit CLI is the documented/safe path.
    argv = ["vllm-omni", "serve", MODEL_NAME, "--omni"]

    # Flag/value pairs, in the order they appear on the command line.
    valued_flags = {
        "--served-model-name": SERVED_NAME,
        "--host": "0.0.0.0",
        "--port": str(VLLM_PORT),
        "--max-model-len": str(MAX_MODEL_LEN),
        "--max-num-seqs": "4",
        "--tensor-parallel-size": "1",
    }
    for flag, value in valued_flags.items():
        argv += [flag, value]

    argv.append("--trust-remote-code")
    return argv
def _wait_healthy(timeout_s: int = 20 * MINUTES) -> None:
    """Block until the local vLLM-Omni server answers ``GET /health``.

    Polls ``http://127.0.0.1:{VLLM_PORT}/health`` and sleeps five seconds
    after every failed attempt.

    Args:
        timeout_s: Maximum number of seconds to keep polling.

    Raises:
        TimeoutError: If no successful response arrives within ``timeout_s``.
    """
    import http.client  # local import: only needed for the exception type below

    health_url = f"http://127.0.0.1:{VLLM_PORT}/health"
    # Monotonic clock so a wall-clock adjustment cannot stretch or cut the wait.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            # urlopen raises HTTPError on non-2xx, so entering the `with`
            # body means the server reported healthy. The context manager
            # closes the response instead of leaking one socket per poll.
            with urllib.request.urlopen(health_url, timeout=5):
                return
        except (OSError, http.client.HTTPException):
            # Connection refused, timeouts and HTTP errors are all expected
            # while the server is still booting; retry after a short pause.
            time.sleep(5)
    raise TimeoutError("vLLM-Omni did not become healthy in time")
def download_model(model_name: str = MODEL_NAME) -> None:
    """Download a model snapshot into the Hugging Face cache and commit it.

    Per the bring-up notes this step is meant to run without a GPU.

    Args:
        model_name: Hugging Face repo id to fetch; defaults to ``MODEL_NAME``.
    """
    from huggingface_hub import snapshot_download

    # Files matching these globs are skipped by the download.
    # NOTE(review): confirm the served model needs no *.pt / *.pth artefacts.
    skipped_globs = ["*.pt", "*.pth"]

    print(f"Downloading {model_name} ...")
    snapshot_download(model_name, ignore_patterns=skipped_globs)
    # Persist what was written under the HF cache mount.
    hf_cache.commit()
    print("Done.")
def warm() -> None:
    """Boot vLLM-Omni once, send one TTS request, then commit the caches.

    Starts the server as a child process, waits for it to report healthy,
    posts a short synthesis request to ``/v1/audio/speech`` and, once the
    child has been shut down, commits the vLLM, FlashInfer and Triton cache
    Volumes.

    Raises:
        TimeoutError: If the server does not become healthy in time.
        urllib.error.URLError: If the warm-up request fails.

    The caches are committed only when the warm-up succeeded; on failure the
    exception propagates after the child process has been stopped.
    """
    proc = subprocess.Popen(_vllm_omni_cmd())
    try:
        print("Waiting for vLLM-Omni to compile + become healthy (first run is slow)...")
        _wait_healthy()
        payload = json.dumps({
            "model": SERVED_NAME,
            "input": "Warm-up synthesis complete.",
            "voice": "default",
            "response_format": "wav",
        }).encode()
        request = urllib.request.Request(
            f"http://127.0.0.1:{VLLM_PORT}/v1/audio/speech",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        # Context manager closes the HTTP response once the audio is read.
        with urllib.request.urlopen(request, timeout=120) as response:
            wav_bytes = response.read()
        print(f"Warm-up TTS response: {len(wav_bytes):,} bytes of WAV audio.")
    finally:
        # Always stop the server, whether or not the warm-up succeeded.
        proc.terminate()
        try:
            proc.wait(timeout=30)
        except subprocess.TimeoutExpired:
            # Graceful shutdown took too long: force-kill, then reap the
            # child so it is not left behind as a zombie.
            proc.kill()
            proc.wait()
    vllm_cache.commit()
    flashinfer_cache.commit()
    triton_cache.commit()
    print("Warm complete — caches committed.")
def serve() -> None:
    """Start the vLLM-Omni server as a child process and return immediately.

    The launch command is printed first; the child is not waited on.
    """
    argv = _vllm_omni_cmd()
    print("Launching:", " ".join(argv))
    subprocess.Popen(argv)
| # ============================================================================= | |
| # STT — distil-whisper-large-v3 via faster-whisper (inline FastAPI app, see _STT_SERVER) | |
| # Exposes OpenAI-compatible POST /v1/audio/transcriptions. | |
| # Point STT_BASE_URL at the printed *.modal.run URL; set STT_MODEL=distil-whisper-large-v3. | |
| # ============================================================================= | |
# Container image for the STT endpoint: CUDA 12.1 runtime with cuDNN 8,
# Python 3.12, faster-whisper and the FastAPI/uvicorn stack that serves it.
stt_image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04", add_python="3.12"
    )
    .entrypoint([])
    .pip_install(
        "faster-whisper",
        # The base image ships cuDNN 8, while ctranslate2 >= 4.5 is built
        # against cuDNN 9 and fails to load its cuDNN libraries at runtime.
        # 4.4.0 is the release the faster-whisper README recommends for
        # CUDA 12 + cuDNN 8. Drop this pin if the base image moves to cuDNN 9.
        "ctranslate2==4.4.0",
        "fastapi",
        "uvicorn[standard]",
        "python-multipart",  # required by FastAPI to parse multipart form uploads
        "hf_transfer",
        "huggingface_hub",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
def download_stt(model_name: str = STT_MODEL) -> None:
    """Download the STT model snapshot into the HF cache and commit it.

    Per the bring-up notes this step is meant to run without a GPU.

    Args:
        model_name: Hugging Face repo id to fetch; defaults to ``STT_MODEL``.
    """
    from huggingface_hub import snapshot_download

    banner = f"Downloading {model_name} ..."
    print(banner)
    snapshot_download(model_name)
    # Persist what was written under the HF cache mount.
    hf_cache.commit()
    print("Done.")
def transcribe() -> None:
    """Write the embedded STT app to /tmp and launch it under uvicorn.

    The child process is started in ``/tmp`` so ``stt_app:app`` resolves to
    the file written here, and it receives ``WHISPER_MODEL`` through its
    environment. The call returns without waiting for the child.
    """
    script_path = "/tmp/stt_app.py"
    with open(script_path, "w") as script:
        script.write(_STT_SERVER)

    uvicorn_argv = [
        "uvicorn",
        "stt_app:app",
        "--host",
        "0.0.0.0",
        "--port",
        str(STT_PORT),
    ]
    # Inherit the current environment and add the model id for the server.
    child_env = dict(os.environ, WHISPER_MODEL=STT_MODEL)

    print("Launching STT server (model:", STT_MODEL, ")")
    subprocess.Popen(uvicorn_argv, cwd="/tmp", env=child_env)