"""Chat backend: streams model completions to the browser as Server-Sent Events.

Exposes ``POST /api/chat`` (an SSE stream of completion deltas proxied from the
Hugging Face Router) and ``GET /`` (serves ``index.html`` located next to this
file).
"""

import os
import json
import asyncio
from typing import List, Optional

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse
from gradio import Server
from openai import AsyncOpenAI
from pydantic import BaseModel

# Initialize gradio.Server (which is a subclass of FastAPI)
app = Server()

# Model identifier sent to the upstream completion API.
DEFAULT_MODEL = "zai-org/GLM-5.2:fireworks-ai"

# Headers that keep reverse proxies (like Nginx/Cloudflare) from buffering or
# caching the event stream.
_SSE_HEADERS = {
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "X-Accel-Buffering": "no",
}

# Sentinel event telling the client that the stream is finished.
_SSE_DONE = "data: [DONE]\n\n"


def _sse_event(payload: dict) -> str:
    """Return *payload* JSON-encoded and framed as a single SSE ``data:`` event."""
    return f"data: {json.dumps(payload)}\n\n"


def _sse_response(events) -> StreamingResponse:
    """Wrap an async iterator of SSE-framed strings in a streaming response."""
    return StreamingResponse(
        events,
        media_type="text/event-stream",
        headers=_SSE_HEADERS,
    )


# Define request schemas
class Message(BaseModel):
    """A single chat message."""

    role: str
    content: str


class ChatRequest(BaseModel):
    """Request body accepted by ``POST /api/chat``."""

    messages: List[Message]
    model: str = DEFAULT_MODEL
    temperature: float = 0.7
    max_tokens: int = 2048


async def _missing_token_events():
    """Yield one error event explaining that HF_TOKEN is unset, then ``[DONE]``."""
    yield _sse_event({
        "error": "HF_TOKEN not configured on the backend. Please contact the administrator to set the environment variable."
    })
    yield _SSE_DONE


@app.post("/api/chat")
async def chat_endpoint(request: ChatRequest):
    """Stream a chat completion for ``request.messages`` as Server-Sent Events.

    Each event is ``data: <json>`` where the JSON object carries either a
    ``content`` key (a text delta) or an ``error`` key (a failure message).
    The stream always ends with ``data: [DONE]``.

    Errors are reported inside the stream rather than as an HTTP error
    status, so the client only has to handle one response shape.
    """
    # Determine API key: fall back solely to the host's environment variable
    api_key = os.environ.get("HF_TOKEN", "").strip()
    if not api_key:
        return _sse_response(_missing_token_events())

    async def event_generator():
        # The client is created inside the generator so that its lifetime
        # matches the stream, and it is closed in ``finally`` below. Without
        # that, every request would leave an HTTP connection pool open.
        client = AsyncOpenAI(
            base_url="https://router.huggingface.co/v1",
            api_key=api_key,
            default_headers={
                "X-HF-Bill-To": "huggingface"
            },
        )
        try:
            # NOTE(review): ``request.model`` is accepted by the schema but is
            # not forwarded; the upstream call always uses DEFAULT_MODEL, as
            # the original code did. Confirm this is intentional (requests
            # are billed through the X-HF-Bill-To header).
            stream = await client.chat.completions.create(
                model=DEFAULT_MODEL,
                messages=[
                    {"role": m.role, "content": m.content}
                    for m in request.messages
                ],
                temperature=request.temperature,
                max_tokens=request.max_tokens,
                stream=True,
            )

            # Yield chunks as they arrive asynchronously
            async for chunk in stream:
                if not chunk.choices:
                    continue
                delta_content = chunk.choices[0].delta.content
                if delta_content:
                    yield _sse_event({"content": delta_content})

            # Signal stream completion
            yield _SSE_DONE
        except Exception as e:
            # Report any error occurring during streaming to the client, then
            # terminate the stream in the usual way.
            yield _sse_event({"error": str(e)})
            yield _SSE_DONE
        finally:
            await client.close()

    return _sse_response(event_generator())


@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve ``index.html`` from this file's directory, or a 404 if it is absent."""
    html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.html")
    if not os.path.exists(html_path):
        # NOTE(review): the original literal was split across several physical
        # lines, which is not valid for a single-quoted string. Only the
        # visible message text is kept; restore any wrapping markup if wanted.
        return HTMLResponse(
            "index.html not found. Please ensure it is created in the workspace directory.",
            status_code=404,
        )
    with open(html_path, "r", encoding="utf-8") as f:
        return HTMLResponse(f.read())


if __name__ == "__main__":
    # Launch Gradio Server (which binds to the FastAPI app underneath)
    app.launch(show_error=True)