# Spaces: Running on Zero
"""
Local OpenAI-compatible /v1 shim in front of the ZeroGPU Gradio Space.

Talks to the Space's dedicated api_name="/generate" endpoint over the raw
Gradio REST route (no gradio_client, so the Space's broken schema introspection
doesn't matter).

    pip install fastapi "uvicorn[standard]" httpx
    export HF_TOKEN=hf_xxx   # your Pro token -> your ZeroGPU quota
    python openai_shim.py    # listens on :11346

Point any OpenAI client at http://localhost:11346/v1
"""
import json
import os
import time
import uuid
from typing import List, Optional, Tuple, Union

import httpx
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
# Sub-domain of the target Space (https://<SPACE_SUB>.hf.space); override with $SPACE_SUB.
SPACE_SUB = os.environ.get("SPACE_SUB", "chabab-qwen-agentworld-35b-zerogpu")
# Raw Gradio REST route for the Space's api_name="/generate" endpoint.
BASE = f"https://{SPACE_SUB}.hf.space/gradio_api/call/generate"
# Model id reported to OpenAI clients; override with $MODEL_ID.
MODEL_NAME = os.environ.get("MODEL_ID", "Qwen/Qwen-AgentWorld-35B-A3B")
# Local listening port; override with $PORT.
PORT = int(os.environ.get("PORT", "11346"))

# Headers sent on every request to the Space. The Authorization header is only
# added when $HF_TOKEN holds a non-blank value: a bare "Bearer " (empty token,
# trailing space) is at best an authentication failure and at worst rejected
# client-side as an illegal header value, which would break every call.
HEADERS = {"Content-Type": "application/json"}
_HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if _HF_TOKEN:
    HEADERS["Authorization"] = f"Bearer {_HF_TOKEN}"
# One entry of the OpenAI-style ``messages`` array.
class ChatMessage(BaseModel):
    # Speaker label; it is copied verbatim into the flattened prompt.
    role: str
    # Either plain text or a list of content-part dicts
    # (the shim reads parts whose "type" is "text" or "image_url").
    content: Union[str, List[dict]]
# Request body accepted by the chat-completions handler.
class ChatRequest(BaseModel):
    # Echoed back in the response; defaults to the configured model id.
    model: str = MODEL_NAME
    # Conversation history, oldest first.
    messages: List[ChatMessage]
    # Sampling temperature forwarded to the Space.
    temperature: float = 0.7
    # Generation length limit forwarded to the Space.
    max_tokens: int = 512
    # When true, the answer is returned as server-sent events.
    stream: bool = False
def _flatten(messages: "List[ChatMessage]") -> Tuple[str, str]:
    """Collapse an OpenAI-style message list into ``(prompt_text, image_url)``.

    Every message becomes one ``"<role>: <text>"`` line and the lines are joined
    with newlines. For multi-part content the ``text`` parts are joined with
    single spaces.

    ``image_url`` is the first image found on the last message whose role is
    ``"user"`` (or on the final message when no message has that role). Images
    elsewhere in the history are ignored. It is ``""`` when no usable image is
    present.

    Args:
        messages: Objects exposing ``role`` and ``content``, where ``content``
            is a string, a list of content-part dicts, or ``None``.

    Returns:
        A ``(prompt_text, image_url)`` tuple of strings.
    """
    # Index of the message whose image is forwarded: the last user turn, even
    # when other messages follow it; the final message if there is no user turn.
    image_source = max(
        (idx for idx, msg in enumerate(messages) if msg.role == "user"),
        default=len(messages) - 1,
    )

    image_url = ""
    lines = []
    for idx, msg in enumerate(messages):
        if isinstance(msg.content, str):
            lines.append(f"{msg.role}: {msg.content}")
            continue

        texts = []
        # ``or []`` tolerates a missing/None content instead of raising.
        for part in msg.content or []:
            if not isinstance(part, dict):
                continue
            kind = part.get("type")
            if kind == "text":
                texts.append(str(part.get("text") or ""))
            elif kind == "image_url" and idx == image_source and not image_url:
                # Accept both {"image_url": {"url": ...}} and the bare-string
                # form {"image_url": "..."}; skip malformed parts silently.
                ref = part.get("image_url")
                if isinstance(ref, dict):
                    ref = ref.get("url")
                if isinstance(ref, str):
                    image_url = ref
        lines.append(f"{msg.role}: {' '.join(texts)}")

    return "\n".join(lines), image_url
def _read_sse_result(lines) -> str:
    """Scan Gradio SSE lines and return the text carried by the ``complete`` event.

    Args:
        lines: Iterable of decoded SSE lines (``event: ...`` / ``data: ...``).

    Returns:
        The generated text, or ``""`` when the payload is empty or the stream
        ends without a ``complete`` event.

    Raises:
        HTTPException: 502 when the Space emits an ``error`` event or when the
            ``complete`` payload is not valid JSON.
    """
    event = None
    for line in lines:
        if line.startswith("event:"):
            event = line.partition(":")[2].strip()
            continue

        if event == "error":
            # The line after "event: error" is its data (or a blank line);
            # surface whatever the Space reported instead of discarding it.
            detail = line[5:].strip() if line.startswith("data:") else ""
            message = "Space returned an error event"
            if detail and detail != "null":
                message = f"{message}: {detail}"
            raise HTTPException(502, message)

        if event == "complete" and line.startswith("data:"):
            try:
                payload = json.loads(line[5:].strip())
            except ValueError as err:
                raise HTTPException(502, "Space returned a malformed result") from err
            # The output arrives as a one-element list; tolerate an empty one.
            if isinstance(payload, list):
                payload = payload[0] if payload else ""
            if not payload:
                return ""
            # Callers need a string; serialize anything else the Space sent.
            if isinstance(payload, str):
                return payload
            return json.dumps(payload, ensure_ascii=False)

    return ""


def _call_space(text: str, image_url: str, max_tokens: int, temperature: float) -> str:
    """Run one generation on the Space and return the produced text.

    Submits the inputs to the Gradio call route, then reads the server-sent
    event stream for the returned ``event_id`` until the result arrives.

    Args:
        text: Flattened prompt.
        image_url: Image URL to attach, or ``""`` for none.
        max_tokens: Generation length limit.
        temperature: Sampling temperature.

    Returns:
        The generated text (``""`` if the Space produced nothing).

    Raises:
        HTTPException: 502 when the Space cannot be reached, answers with an
            HTTP error status, omits the ``event_id``, or reports an error.
    """
    try:
        with httpx.Client(timeout=300) as client:
            submit = client.post(
                BASE,
                headers=HEADERS,
                json={"data": [text, image_url, max_tokens, temperature]},
            )
            submit.raise_for_status()
            try:
                event_id = submit.json()["event_id"]
            except (ValueError, KeyError, TypeError) as err:
                raise HTTPException(502, "Space did not return an event_id") from err

            # Stream the SSE result; the final "complete" event carries the output.
            with client.stream("GET", f"{BASE}/{event_id}", headers=HEADERS) as stream:
                # Without this check a 4xx/5xx body would be parsed as an
                # event stream and silently produce an empty answer.
                stream.raise_for_status()
                return _read_sse_result(stream.iter_lines())
    except httpx.HTTPStatusError as err:
        raise HTTPException(
            502, f"Space responded with HTTP {err.response.status_code}"
        ) from err
    except httpx.HTTPError as err:
        raise HTTPException(502, f"Could not reach the Space: {err}") from err
# ASGI application object served by uvicorn.
app = FastAPI(title="OpenAI shim -> ZeroGPU Space")


# Registered on the app so OpenAI clients pointed at <host>/v1 can list models;
# without the decorator the function is never reachable over HTTP.
@app.get("/v1/models")
def models():
    """Return the single model this shim exposes, as an OpenAI-style model list."""
    return {"object": "list", "data": [{"id": MODEL_NAME, "object": "model"}]}
# Registered on the app so OpenAI clients pointed at <host>/v1 can reach it;
# without the decorator the function is never reachable over HTTP.
@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    """Answer an OpenAI-style chat completion request using the Space.

    The message history is flattened into one prompt (plus an optional image
    URL), sent to the Space, and the answer is wrapped in the OpenAI response
    shape.

    Args:
        req: Parsed request body.

    Returns:
        A ``chat.completion`` dict, or a ``text/event-stream`` response when
        ``req.stream`` is true. Token usage is always reported as zero.
    """
    text, image_url = _flatten(req.messages)
    # The Space is called before any response is produced, so a streaming
    # client receives the whole answer as a single content delta.
    answer = _call_space(text, image_url, req.max_tokens, req.temperature)
    cid = f"chatcmpl-{uuid.uuid4().hex}"
    created = int(time.time())

    if not req.stream:
        return {
            "id": cid,
            "object": "chat.completion",
            "created": created,
            "model": req.model,
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": answer},
                "finish_reason": "stop",
            }],
            "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
        }

    def event(delta, finish_reason):
        """Serialize one ``chat.completion.chunk`` as an SSE ``data:`` frame."""
        chunk = {
            "id": cid,
            "object": "chat.completion.chunk",
            "created": created,
            "model": req.model,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
        }
        return f"data: {json.dumps(chunk)}\n\n"

    def sse():
        # One chunk with the full answer, one empty chunk closing the choice,
        # then the terminator OpenAI clients wait for.
        yield event({"role": "assistant", "content": answer}, None)
        yield event({}, "stop")
        yield "data: [DONE]\n\n"

    return StreamingResponse(sse(), media_type="text/event-stream")
if __name__ == "__main__":
    # The bind address is configurable through $HOST, like the port and Space
    # settings. The default stays 0.0.0.0 (all interfaces) for compatibility.
    # NOTE(review): the shim has no authentication of its own and forwards
    # requests with the configured HF token, so set HOST=127.0.0.1 unless
    # other machines really need to reach it.
    uvicorn.run(app, host=os.environ.get("HOST", "0.0.0.0"), port=PORT)