""" Local OpenAI-compatible /v1 shim in front of the ZeroGPU Gradio Space. Talks to the Space's dedicated api_name="/generate" endpoint over the raw Gradio REST route (no gradio_client, so the Space's broken schema introspection doesn't matter). pip install fastapi "uvicorn[standard]" httpx export HF_TOKEN=hf_xxx # your Pro token -> your ZeroGPU quota python openai_shim.py # listens on :11346 Point any OpenAI client at http://localhost:11346/v1 """ import json import os import time import uuid from typing import List, Optional, Union import httpx import uvicorn from fastapi import FastAPI, HTTPException from fastapi.responses import StreamingResponse from pydantic import BaseModel SPACE_SUB = os.environ.get("SPACE_SUB", "chabab-qwen-agentworld-35b-zerogpu") BASE = f"https://{SPACE_SUB}.hf.space/gradio_api/call/generate" MODEL_NAME = os.environ.get("MODEL_ID", "Qwen/Qwen-AgentWorld-35B-A3B") PORT = int(os.environ.get("PORT", "11346")) HEADERS = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}", "Content-Type": "application/json"} class ChatMessage(BaseModel): role: str content: Union[str, List[dict]] class ChatRequest(BaseModel): model: str = MODEL_NAME messages: List[ChatMessage] temperature: float = 0.7 max_tokens: int = 512 stream: bool = False def _flatten(messages: List[ChatMessage]): """Return (prompt_text, image_url). Folds history into the prompt; takes the first image_url found on the last user message.""" image_url = "" lines = [] for i, m in enumerate(messages): if isinstance(m.content, str): lines.append(f"{m.role}: {m.content}") continue texts = [] for part in m.content: if part.get("type") == "text": texts.append(part.get("text", "")) elif part.get("type") == "image_url" and i == len(messages) - 1: image_url = image_url or part["image_url"]["url"] lines.append(f"{m.role}: {' '.join(texts)}") return "\n".join(lines), image_url def _call_space(text: str, image_url: str, max_tokens: int, temperature: float) -> str: with httpx.Client(timeout=300) as c: r = c.post(BASE, headers=HEADERS, json={"data": [text, image_url, max_tokens, temperature]}) r.raise_for_status() event_id = r.json()["event_id"] # Stream the SSE result; the final "complete" event carries the output. result = "" with c.stream("GET", f"{BASE}/{event_id}", headers=HEADERS) as s: event = None for line in s.iter_lines(): if line.startswith("event:"): event = line.split(":", 1)[1].strip() elif line.startswith("data:") and event == "complete": payload = json.loads(line[5:].strip()) result = payload[0] if isinstance(payload, list) else payload elif event == "error": raise HTTPException(502, "Space returned an error event") return result or "" app = FastAPI(title="OpenAI shim -> ZeroGPU Space") @app.get("/v1/models") def models(): return {"object": "list", "data": [{"id": MODEL_NAME, "object": "model"}]} @app.post("/v1/chat/completions") def chat_completions(req: ChatRequest): text, image_url = _flatten(req.messages) answer = _call_space(text, image_url, req.max_tokens, req.temperature) cid = f"chatcmpl-{uuid.uuid4().hex}" created = int(time.time()) if req.stream: def sse(): chunk = {"id": cid, "object": "chat.completion.chunk", "created": created, "model": req.model, "choices": [{"index": 0, "delta": {"role": "assistant", "content": answer}, "finish_reason": None}]} yield f"data: {json.dumps(chunk)}\n\n" done = {"id": cid, "object": "chat.completion.chunk", "created": created, "model": req.model, "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]} yield f"data: {json.dumps(done)}\n\n" yield "data: [DONE]\n\n" return StreamingResponse(sse(), media_type="text/event-stream") return {"id": cid, "object": "chat.completion", "created": created, "model": req.model, "choices": [{"index": 0, "message": {"role": "assistant", "content": answer}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}} if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=PORT)