import os
import json
import asyncio
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse
from gradio import Server
from openai import AsyncOpenAI
from pydantic import BaseModel
from typing import List, Optional

# Application object. Routes below are registered on it with FastAPI-style
# decorators (`app.post`, `app.get`) and it is started via `app.launch()`.
# gradio.Server is used here as a FastAPI subclass — NOTE(review): confirm
# against the installed gradio version.
app = Server()

# Define request schemas
class Message(BaseModel):
    """A single chat message: a role label plus its text content."""

    # Forwarded unchanged as the "role" key of each upstream message.
    role: str
    # Forwarded unchanged as the "content" key of each upstream message.
    content: str

class ChatRequest(BaseModel):
    """Request body accepted by the ``/api/chat`` endpoint."""

    # Conversation to send upstream, in list order.
    messages: List[Message]
    # NOTE(review): chat_endpoint pins the model in its upstream call and does
    # not read this field — confirm whether it should be honored or removed.
    model: str = "zai-org/GLM-5.2:fireworks-ai"
    # Passed through to the completion call as `temperature`.
    temperature: float = 0.7
    # Passed through to the completion call as `max_tokens`.
    max_tokens: int = 2048

def _sse_response(frames) -> StreamingResponse:
    """Wrap an async iterator of SSE frames in a streaming HTTP response.

    The headers bypass buffer layers in reverse proxies (like
    Nginx/Cloudflare) so frames are not held back before reaching the client.
    """
    headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(frames, media_type="text/event-stream", headers=headers)


@app.post("/api/chat")
async def chat_endpoint(request: ChatRequest):
    """Stream a chat completion to the caller as Server-Sent Events.

    Each frame is ``data: <json>\\n\\n`` where the JSON object carries either a
    ``content`` key (a piece of generated text) or an ``error`` key. The
    stream always ends with ``data: [DONE]\\n\\n``, including on failure.

    Args:
        request: Messages plus sampling settings. ``request.model`` is not
            consulted; the upstream model is pinned below.

    Returns:
        A ``text/event-stream`` StreamingResponse.
    """
    # The API key comes solely from the host's environment variable.
    api_key = os.environ.get("HF_TOKEN", "").strip()

    if not api_key:
        async def error_generator():
            yield "data: " + json.dumps({
                "error": "HF_TOKEN not configured on the backend. Please contact the administrator to set the environment variable."
            }) + "\n\n"
            yield "data: [DONE]\n\n"

        return _sse_response(error_generator())

    async def event_generator():
        try:
            # The client is created per request, so it is opened inside the
            # generator and closed by `async with` on every exit path
            # (completion, upstream error, or the caller disconnecting).
            # Previously it was never closed, leaking its HTTP connections.
            async with AsyncOpenAI(
                base_url="https://router.huggingface.co/v1",
                api_key=api_key,
                default_headers={
                    "X-HF-Bill-To": "huggingface"
                },
            ) as client:
                stream = await client.chat.completions.create(
                    # NOTE(review): model is pinned here and request.model is
                    # ignored — confirm this is intentional.
                    model="zai-org/GLM-5.2:fireworks-ai",
                    messages=[{"role": m.role, "content": m.content} for m in request.messages],
                    temperature=request.temperature,
                    max_tokens=request.max_tokens,
                    stream=True,
                )

                # Relay text deltas as they arrive; chunks without choices or
                # with empty content produce no frame.
                async for chunk in stream:
                    if chunk.choices:
                        delta_content = chunk.choices[0].delta.content
                        if delta_content:
                            yield f"data: {json.dumps({'content': delta_content})}\n\n"

            # Signal stream completion
            yield "data: [DONE]\n\n"

        except Exception as e:
            # Boundary handler: the HTTP status is already sent once streaming
            # starts, so failures are reported in-band as an error frame.
            yield f"data: {json.dumps({'error': str(e)})}\n\n"
            yield "data: [DONE]\n\n"

    return _sse_response(event_generator())

@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve ``index.html`` located next to this module.

    Returns:
        The file's contents as an HTML response, or a 404 HTML response with a
        short hint when the file does not exist.
    """
    html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.html")
    # Open directly instead of checking os.path.exists() first, so a file that
    # disappears between the check and the read still yields the 404 page
    # rather than an unhandled error.
    try:
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
    except FileNotFoundError:
        return HTMLResponse("<h3>index.html not found. Please ensure it is created in the workspace directory.</h3>", status_code=404)
    return HTMLResponse(html)

if __name__ == "__main__":
    # Start the server only when this file is run as a script (not on import).
    # `show_error=True` is passed through to the Gradio launcher.
    app.launch(show_error=True)