chabab committed on
Commit
020f30f
·
verified ·
1 Parent(s): 62cd9d9

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +28 -6
  2. app.py +109 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,13 +1,35 @@
1
  ---
2
- title: Qwen Agentworld 35b Zerogpu
3
- emoji: 🐨
4
  colorFrom: indigo
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 6.19.0
8
- python_version: '3.13'
9
  app_file: app.py
 
10
  pinned: false
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Qwen-AgentWorld-35B-A3B
3
+ emoji: 🌍
4
  colorFrom: indigo
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "5.9.1"
 
8
  app_file: app.py
9
+ python_version: "3.12"
10
  pinned: false
11
+ license: apache-2.0
12
+ short_description: Free ZeroGPU demo of Qwen-AgentWorld-35B-A3B (4-bit)
13
  ---
14
 
15
+ # Qwen-AgentWorld-35B-A3B ZeroGPU Space
16
+
17
+ Free GPU demo of [`Qwen/Qwen-AgentWorld-35B-A3B`](https://hf.co/Qwen/Qwen-AgentWorld-35B-A3B)
18
+ running on **Hugging Face ZeroGPU**. The 35B MoE is loaded **4-bit (nf4)** so it
19
+ fits in a ZeroGPU slot.
20
+
21
+ ## Why this is "free"
22
+ - ZeroGPU compute is free; an **HF Pro** account gets the **largest daily quota**.
23
+ - No always-on server, no per-hour billing (unlike Inference Endpoints).
24
+
25
+ ## Deploy
26
+ 1. Create a new Space → SDK **Gradio**.
27
+ 2. In **Settings → Hardware**, select **ZeroGPU** (free with Pro).
28
+ 3. Push `app.py`, `requirements.txt`, and this `README.md`.
29
+
30
+ Or push from the CLI (see `push_space.py` in this folder).
31
+
32
+ ## Notes
33
+ - The ZeroGPU `duration` is estimated per request in `app.py` (`_estimate_duration`); lower `max_new_tokens` = less quota used.
34
+ - ZeroGPU's backing GPU and per-slot VRAM change over time — if 4-bit ever stops
35
+ fitting, switch `MODEL_ID` to a pre-quantized mirror.
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio ZeroGPU Space for Qwen/Qwen-AgentWorld-35B-A3B (multimodal: image + text).
3
+
4
+ Runs on Hugging Face ZeroGPU (free GPU compute; HF Pro gives the largest quota).
5
+ The 35B MoE is loaded 4-bit quantized so it fits in a ZeroGPU slot and loads fast.
6
+ """
7
+
8
+ import os
9
+ from threading import Thread
10
+
11
+ import gradio as gr
12
+ import spaces
13
+ import torch
14
+ from PIL import Image
15
+ from transformers import (
16
+ AutoModelForImageTextToText,
17
+ AutoProcessor,
18
+ BitsAndBytesConfig,
19
+ TextIteratorStreamer,
20
+ )
21
+
22
# Checkpoint to serve; the MODEL_ID environment variable overrides the default.
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen-AgentWorld-35B-A3B")

# --- Load once at module scope (ZeroGPU registers the cuda tensors here) ------
# NF4 4-bit weights with double quantization; compute runs in bfloat16.
quant = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The processor builds the chat prompt and image tensors; its tokenizer is
# exposed separately because the text streamer needs it for decoding.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = processor.tokenizer

# Inference only: switch to eval mode right after loading.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    quantization_config=quant,
).eval()
42
+
43
+
44
+ def _estimate_duration(message, history, max_new_tokens, temperature):
45
+ # ~25 tok/s worst case on a half-GPU 4-bit MoE, + load/vision headroom.
46
+ return min(180, 50 + int(max_new_tokens / 20))
47
+
48
+
49
@spaces.GPU(duration=_estimate_duration)
def chat(message, history, max_new_tokens=512, temperature=0.7):
    """Stream a reply to the latest user turn, optionally conditioned on images.

    Args:
        message: The new user turn. With ``ChatInterface(multimodal=True)``
            this is ``{"text": str, "files": [paths]}``; any other value is
            coerced with ``str()`` and treated as text only.
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts. Only
            turns whose content is a non-blank string are replayed; earlier
            media is skipped.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; ``0`` selects greedy decoding.

    Yields:
        The accumulated reply text after each streamed piece.

    Raises:
        gr.Error: If an attached file cannot be opened as an image.
        Exception: Whatever ``model.generate`` raised in the worker thread.
    """
    # ChatInterface(multimodal=True) -> message = {"text": str, "files": [paths]}
    if isinstance(message, dict):
        text = message.get("text", "")
        files = message.get("files", [])
    else:
        text = str(message)
        files = []

    # Rebuild prior turns as text only (skip historical media for robustness).
    messages = []
    for turn in history:
        content = turn.get("content")
        if isinstance(content, str) and content.strip():
            messages.append({"role": turn["role"], "content": content})

    images = []
    for path in files:
        try:
            # The context manager closes the file handle; convert() returns a
            # fully loaded copy that outlives it.
            with Image.open(path) as img:
                images.append(img.convert("RGB"))
        except OSError as err:
            raise gr.Error(
                "One of the attached files could not be read as an image."
            ) from err

    # One image placeholder per attachment, followed by the text itself.
    user_content = [{"type": "image"} for _ in images]
    user_content.append({"type": "text", "text": text})
    messages.append({"role": "user", "content": user_content})

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[prompt],
        images=images if images else None,
        return_tensors="pt",
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # Compare against None explicitly: a pad token id of 0 is valid but falsy.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    gen_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        pad_token_id=pad_id,
    )
    if temperature > 0:
        # Sampling knobs are only meaningful (and only passed) when sampling.
        gen_kwargs.update(
            do_sample=True,
            temperature=max(temperature, 0.01),
            top_p=0.8,
        )
    else:
        gen_kwargs["do_sample"] = False

    failure = []

    def _generate():
        """Run generation in the worker thread, capturing any error."""
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:  # re-raised in the consumer below
            failure.append(exc)
            # generate() did not finish, so close the stream ourselves;
            # otherwise the consumer loop would wait on the queue forever.
            streamer.end()

    worker = Thread(target=_generate, daemon=True)
    worker.start()

    acc = ""
    for piece in streamer:
        acc += piece
        yield acc

    worker.join()
    if failure:
        raise failure[0]
94
+
95
+
96
# Chat UI: multimodal input plus two generation controls. The sliders are
# listed in the same order as chat()'s max_new_tokens and temperature
# parameters.
demo = gr.ChatInterface(
    fn=chat,
    multimodal=True,
    type="messages",
    title="Qwen-AgentWorld-35B-A3B (ZeroGPU)",
    description="Free GPU demo via Hugging Face ZeroGPU. Image + text, 4-bit quantized.",
    additional_inputs=[
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="max_new_tokens",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="temperature",
        ),
    ],
)

# Enable the request queue before serving when run as a script.
if __name__ == "__main__":
    app = demo.queue()
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers>=4.51
2
+ accelerate>=0.30
3
+ bitsandbytes>=0.43
4
+ pillow>=10.0
5
+ torch