qwen-agentworld-35b-zerogpu

Running on Zero

App Files Files Community

chabab committed 3 days ago

Commit

020f30f

verified ·

1 Parent(s): 62cd9d9

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +28 -6
app.py +109 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,13 +1,35 @@
 ---
-title: Qwen Agentworld 35b Zerogpu
-emoji: 🐨
 colorFrom: indigo
-colorTo: indigo
 sdk: gradio
-sdk_version: 6.19.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Qwen-AgentWorld-35B-A3B
+emoji: 🌍
 colorFrom: indigo
+colorTo: purple
 sdk: gradio
+sdk_version: "5.9.1"
 app_file: app.py
+python_version: "3.12"
 pinned: false
+license: apache-2.0
+short_description: Free ZeroGPU demo of Qwen-AgentWorld-35B-A3B (4-bit)
 ---
+# Qwen-AgentWorld-35B-A3B — ZeroGPU Space
+Free GPU demo of [`Qwen/Qwen-AgentWorld-35B-A3B`](https://hf.co/Qwen/Qwen-AgentWorld-35B-A3B)
+running on **Hugging Face ZeroGPU**. The 35B MoE is loaded **4-bit (nf4)** so it
+fits in a ZeroGPU slot.
+## Why this is "free"
+- ZeroGPU compute is free; an **HF Pro** account gets the **largest daily quota**.
+- No always-on server, no per-hour billing (unlike Inference Endpoints).
+## Deploy
+1. Create a new Space → SDK **Gradio**.
+2. In **Settings → Hardware**, select **ZeroGPU** (free with Pro).
+3. Push `app.py`, `requirements.txt`, and this `README.md`.
+Or push from the CLI (see `push_space.py` in this folder).
+## Notes
+- The per-call ZeroGPU `duration` budget is computed in `app.py` from `max_new_tokens`; lower `max_new_tokens` = less quota used.
+- ZeroGPU's backing GPU and per-slot VRAM change over time — if 4-bit ever stops
+  fitting, switch `MODEL_ID` to a pre-quantized mirror.

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""
+Gradio ZeroGPU Space for Qwen/Qwen-AgentWorld-35B-A3B (multimodal: image + text).
+Runs on Hugging Face ZeroGPU (free GPU compute; HF Pro gives the largest quota).
+The 35B MoE is loaded 4-bit quantized so it fits in a ZeroGPU slot and loads fast.
+"""
+import os
+from threading import Thread
+import gradio as gr
+import spaces
+import torch
+from PIL import Image
+from transformers import (
+    AutoModelForImageTextToText,
+    AutoProcessor,
+    BitsAndBytesConfig,
+    TextIteratorStreamer,
+)
+# Model repo to serve; can be overridden with the MODEL_ID environment variable.
+MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen-AgentWorld-35B-A3B")
+# --- Load once at module scope (ZeroGPU registers the cuda tensors here) ------
+# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
+quant = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+)
+# NOTE(review): trust_remote_code=True allows Python code shipped in the model
+# repo to run at load time. Because MODEL_ID is environment-overridable, only
+# point it at repositories you trust.
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+model = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID,
+    quantization_config=quant,
+    device_map="cuda",
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+)
+# Inference only: switch the module to evaluation mode.
+model.eval()
+# Text tokenizer of the processor; used for streaming decode and pad/eos ids.
+tokenizer = processor.tokenizer
+def _estimate_duration(message, history, max_new_tokens=512, temperature=0.7):
+    """Return the GPU time budget to request for one ``chat`` call.
+    Used as the ``duration`` callable of ``@spaces.GPU`` on ``chat``, so the
+    parameters (and their defaults) mirror ``chat``'s signature; only
+    ``max_new_tokens`` influences the result.
+    Returns:
+        int: 50 plus one unit per 20 requested tokens, capped at 180
+        (presumably seconds, per the ZeroGPU ``duration`` convention).
+    """
+    # Budget at 20 tok/s (slightly below the ~25 tok/s seen on a half-GPU
+    # 4-bit MoE) on top of a fixed 50 of load/vision headroom.
+    return min(180, 50 + int(max_new_tokens / 20))
+def _load_images(paths):
+    """Open each uploaded file as an RGB PIL image, closing the file handle."""
+    loaded = []
+    for path in paths:
+        try:
+            # convert() returns a fully loaded copy, so the file can be closed.
+            with Image.open(path) as img:
+                loaded.append(img.convert("RGB"))
+        except OSError as err:  # also covers PIL.UnidentifiedImageError
+            name = os.path.basename(str(path))
+            raise gr.Error(f"Could not read {name!r} as an image.") from err
+    return loaded
+@spaces.GPU(duration=_estimate_duration)
+def chat(message, history, max_new_tokens=512, temperature=0.7):
+    """Stream the model's reply to ``message``.
+    Args:
+        message: Either a plain string or the multimodal dict produced by
+            ``ChatInterface(multimodal=True)``: ``{"text": str, "files": [paths]}``.
+        history: Prior turns as ``{"role", "content"}`` dicts; only turns whose
+            content is a non-empty string are replayed (media is skipped).
+        max_new_tokens: Upper bound on generated tokens.
+        temperature: Sampling temperature; ``0`` (or less) selects greedy decoding.
+    Yields:
+        str: The accumulated reply text so far, growing with each chunk.
+    Raises:
+        gr.Error: If an attachment is not a readable image, or if generation
+            fails in the background thread.
+    """
+    # ChatInterface(multimodal=True) -> message = {"text": str, "files": [paths]}
+    if isinstance(message, dict):
+        # `or` also covers keys that are present but set to None.
+        text = message.get("text") or ""
+        files = message.get("files") or []
+    else:
+        text, files = str(message), []
+    # Rebuild prior turns as text only (skip historical media for robustness).
+    messages = []
+    for turn in history:
+        content = turn.get("content")
+        if isinstance(content, str) and content.strip():
+            messages.append({"role": turn["role"], "content": content})
+    images = _load_images(files)
+    # One image placeholder per attachment, followed by the user's text.
+    user_content = [{"type": "image"} for _ in images]
+    user_content.append({"type": "text", "text": text})
+    messages.append({"role": "user", "content": user_content})
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = processor(
+        text=[prompt],
+        images=images if images else None,
+        return_tensors="pt",
+    ).to(model.device)
+    streamer = TextIteratorStreamer(
+        tokenizer, skip_prompt=True, skip_special_tokens=True
+    )
+    # `is None` rather than `or`: a pad token id of 0 is valid and must be kept.
+    pad_id = tokenizer.pad_token_id
+    if pad_id is None:
+        pad_id = tokenizer.eos_token_id
+    gen_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=int(max_new_tokens),
+        do_sample=False,
+        pad_token_id=pad_id,
+    )
+    if temperature > 0:
+        # Sampling knobs are only passed when sampling is actually enabled.
+        gen_kwargs.update(
+            do_sample=True, temperature=max(temperature, 0.01), top_p=0.8
+        )
+    failures = []
+    def _generate():
+        # Runs in a worker thread. If generate() dies, the streamer would never
+        # receive its end signal and the consumer loop below would block
+        # forever, so record the error and close the stream explicitly.
+        try:
+            model.generate(**gen_kwargs)
+        except Exception as err:  # re-raised on the consumer side below
+            failures.append(err)
+            streamer.end()
+    Thread(target=_generate, daemon=True).start()
+    acc = ""
+    for piece in streamer:
+        acc += piece
+        yield acc
+    if failures:
+        raise gr.Error(f"Generation failed: {failures[0]}") from failures[0]
+# --- UI wiring ---------------------------------------------------------------
+# Extra controls; their order matches chat(message, history, max_new_tokens,
+# temperature).
+_max_new_tokens_slider = gr.Slider(
+    64, 2048, value=512, step=64, label="max_new_tokens"
+)
+_temperature_slider = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="temperature")
+demo = gr.ChatInterface(
+    fn=chat,
+    type="messages",
+    multimodal=True,
+    title="Qwen-AgentWorld-35B-A3B (ZeroGPU)",
+    description="Free GPU demo via Hugging Face ZeroGPU. Image + text, 4-bit quantized.",
+    additional_inputs=[_max_new_tokens_slider, _temperature_slider],
+)
+if __name__ == "__main__":
+    # Enable the request queue, then start the app.
+    queued_demo = demo.queue()
+    queued_demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+transformers>=4.51
+accelerate>=0.30
+bitsandbytes>=0.43
+pillow>=10.0
+torch