NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4-etheroi

Paused

App Files Files Community

NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4-etheroi / app.py

akhaliq HF Staff

fix: Add return type hint -> str to chat endpoint to prevent UserWarning and enable client data reception

e759c0c 25 days ago

Raw

History Blame

2.01 kB

	import os
	import json
	from fastapi.responses import HTMLResponse
	from gradio import Server
	from openai import AsyncOpenAI

	app = Server()

	@app.get("/", response_class=HTMLResponse)
	async def homepage():
	    """Return the text of the ``index.html`` file located next to this module."""
	    module_dir = os.path.dirname(os.path.abspath(__file__))
	    index_file = os.path.join(module_dir, "index.html")
	    with open(index_file, "r", encoding="utf-8") as page:
	        markup = page.read()
	    return markup

	@app.get("/hf_token_status")
	async def get_hf_token_status():
	    """Report whether the ``HF_TOKEN`` environment variable holds a non-empty value.

	    Only the boolean is returned; the token itself is never included.
	    """
	    configured_token = os.environ.get("HF_TOKEN")
	    return {"has_token": bool(configured_token)}

	@app.api(name="chat")
	async def chat(messages_json: str, temperature: float = 0.7, max_tokens: int = 1024, custom_token: str = None) -> str:
	    """Stream a chat completion from the Hugging Face router as text chunks.

	    This is an async generator: every ``yield`` is either a piece of the
	    model's reply or a single human-readable error string. Errors are
	    yielded rather than raised so the caller always receives text.

	    Args:
	        messages_json: JSON-encoded array of chat messages to send.
	        temperature: Sampling temperature forwarded to the completion call.
	        max_tokens: Token limit forwarded to the completion call.
	        custom_token: Optional token that overrides the ``HF_TOKEN``
	            environment variable when it is non-blank.

	    Yields:
	        Content deltas from the streamed completion, or one error message.
	    """
	    # NOTE: the ``-> str`` annotation is kept deliberately even though this is
	    # a generator; it describes each yielded chunk.

	    # A non-blank token supplied by the caller wins over the environment.
	    override = custom_token.strip() if custom_token else ""
	    hf_token = override or os.environ.get("HF_TOKEN")

	    if not hf_token:
	        yield "Error: Hugging Face Token (HF_TOKEN) is not configured. Please set it in your environment or provide it in the UI Settings panel."
	        return

	    # Endpoint boundary: any decoding failure is reported to the caller as text.
	    try:
	        messages = json.loads(messages_json)
	    except Exception as e:
	        yield f"Error parsing chat messages: {str(e)}"
	        return

	    # Valid JSON that is not an array (e.g. "{}" or "5") would otherwise only
	    # fail remotely and be reported as a router error; reject it up front.
	    if not isinstance(messages, list):
	        yield "Error parsing chat messages: expected a JSON array of message objects."
	        return

	    try:
	        # ``async with`` closes the client's HTTP resources when streaming ends,
	        # fails, or the generator is closed early, instead of relying on GC.
	        async with AsyncOpenAI(
	            base_url="https://router.huggingface.co/v1",
	            api_key=hf_token,
	            default_headers={
	                # Presumably selects the account billed for the request --
	                # confirm against the Hugging Face Inference Providers docs.
	                "X-HF-Bill-To": "huggingface"
	            },
	        ) as client:
	            stream = await client.chat.completions.create(
	                model="nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4:together",
	                messages=messages,
	                temperature=temperature,
	                max_tokens=max_tokens,
	                stream=True,
	            )

	            async for chunk in stream:
	                # Skip chunks with no choices or no text content.
	                if chunk.choices and chunk.choices[0].delta.content is not None:
	                    yield chunk.choices[0].delta.content

	    except Exception as e:
	        yield f"Error calling Hugging Face Router: {str(e)}"

	# Script entry point: start the server only when this file is run directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    app.launch(show_error=True)