caiovicentino1 committed on
Commit
9c3b3db
·
verified ·
1 Parent(s): 015f8ef

Upload GEMMA4_26B_EXPERT_OFFLOAD_FORK.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. GEMMA4_26B_EXPERT_OFFLOAD_FORK.ipynb +236 -0
GEMMA4_26B_EXPERT_OFFLOAD_FORK.ipynb ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ },
16
+ "accelerator": "GPU"
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "metadata": {},
22
+ "source": [
23
+ "# \ud83e\uddca Gemma 4 26B-A4B-it \u2014 Expert Offloading (PolarQuant Fork)\n",
24
+ "\n",
25
+ "**25.2B MoE (3.8B active)** on consumer GPUs via expert CPU offloading.\n",
26
+ "\n",
27
+ "Uses our vLLM fork with `--moe-expert-cache-size` (same as Nemotron).\n",
28
+ "\n",
29
+ "| Component | Location | Size |\n",
30
+ "|---|---|---|\n",
31
+ "| Non-expert weights | GPU | ~5-8 GB |\n",
32
+ "| Expert cache (8 slots) | GPU | ~2-3 GB |\n",
33
+ "| Expert weights (pinned) | CPU | ~42 GB |\n",
34
+ "| **Total GPU** | | **~8-11 GB** |\n"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "metadata": {},
40
+ "execution_count": null,
41
+ "outputs": [],
42
+ "source": [
43
+ "# Install our vLLM fork from source (includes expert offloading + Gemma 4 support).\n",
44
+ "# The build takes ~5 min. Re-running the cell reuses an existing checkout.\n",
45
+ "\n",
46
+ "!pip install git+https://github.com/huggingface/transformers.git --force-reinstall -q\n",
47
+ "\n",
48
+ "# Build vLLM from our fork. `git clone` fails on an existing non-empty target, so clone only once.\n",
49
+ "!test -d /content/vllm-fork/.git || git clone --depth 1 -b nemotron-expert-offload https://github.com/caiovicentino/vllm-expert-offload.git /content/vllm-fork\n",
50
+ "!cd /content/vllm-fork && pip install -e . 2>&1 | tail -5\n",
51
+ "print('\\nDone!')\n"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "metadata": {},
57
+ "execution_count": null,
58
+ "outputs": [],
59
+ "source": [
60
+ "import inspect\n",
61
+ "import vllm\n",
62
+ "print(f'vLLM version: {vllm.__version__}')\n",
63
+ "print(f'vLLM path: {vllm.__path__[0]}')\n",
64
+ "\n",
65
+ "# Expert offload check: look for `moe_expert_cache_size` in the text of LLM.__init__'s signature.\n",
66
+ "from vllm import LLM\n",
67
+ "init_signature = str(inspect.signature(LLM.__init__))\n",
68
+ "has_cache = 'moe_expert_cache_size' in init_signature\n",
69
+ "print(f'moe_expert_cache_size: {\"YES\" if has_cache else \"NO\"}')\n",
70
+ "\n",
71
+ "# Gemma 4 check: any registry key containing 'Gemma4' (falls back to a hint if _MODELS has no keys()).\n",
72
+ "from vllm.model_executor.models.registry import _MODELS\n",
73
+ "has_gemma4 = any('Gemma4' in arch for arch in _MODELS.keys()) if hasattr(_MODELS, 'keys') else 'check manually'\n",
74
+ "print(f'Gemma4 support: {has_gemma4}')\n"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "metadata": {},
80
+ "execution_count": null,
81
+ "outputs": [],
82
+ "source": [
83
+ "import os\n",
84
+ "import subprocess\n",
85
+ "\n",
86
+ "# Set before the vLLM import below; the name suggests it turns off FlashInfer's version check.\n",
87
+ "os.environ['FLASHINFER_DISABLE_VERSION_CHECK'] = '1'\n",
88
+ "from vllm import LLM, SamplingParams\n",
89
+ "from transformers import AutoTokenizer\n",
90
+ "\n",
91
+ "MODEL = 'google/gemma-4-26B-A4B-it'\n",
92
+ "CACHE_SIZE = 8  # passed as moe_expert_cache_size: experts cached on GPU\n",
93
+ "\n",
94
+ "llm = LLM(\n",
95
+ "    model=MODEL,\n",
96
+ "    trust_remote_code=True,\n",
97
+ "    dtype='bfloat16',\n",
98
+ "    max_model_len=4096,\n",
99
+ "    enforce_eager=True,\n",
100
+ "    moe_expert_cache_size=CACHE_SIZE,\n",
101
+ "    kernel_config={'moe_backend': 'triton'},\n",
102
+ "    gpu_memory_utilization=0.95,\n",
103
+ ")\n",
104
+ "print('MODEL LOADED!')\n",
105
+ "# nvidia-smi prints one value per GPU (one per line); int() on the whole output fails with several GPUs.\n",
106
+ "smi = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], capture_output=True, text=True)\n",
107
+ "print(f'VRAM: {max(map(int, smi.stdout.split()))/1024:.1f} GB')\n"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "metadata": {},
113
+ "execution_count": null,
114
+ "outputs": [],
115
+ "source": [
116
+ "# Smoke test: run a few single-turn chat prompts through the loaded model.\n",
117
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
118
+ "sampling = SamplingParams(max_tokens=200, temperature=0)\n",
119
+ "\n",
120
+ "prompts = [\n",
121
+ "    'What is 2+3? Think step by step.',\n",
122
+ "    'Explain quantum entanglement in simple terms.',\n",
123
+ "    'Write a Python function to check if a number is prime.',\n",
124
+ "]\n",
125
+ "\n",
126
+ "for prompt in prompts:\n",
127
+ "    messages = [{'role': 'user', 'content': prompt}]\n",
128
+ "    chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
129
+ "    answer = llm.generate([chat_text], sampling)[0].outputs[0].text\n",
130
+ "    print(f'\\n{\"=\"*60}')\n",
131
+ "    print(f'Q: {prompt}')\n",
132
+ "    print(f'A: {answer[:300]}')  # only the first 300 characters are shown\n"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "metadata": {},
138
+ "execution_count": null,
139
+ "outputs": [],
140
+ "source": [
141
+ "import subprocess\n",
142
+ "import time\n",
143
+ "\n",
144
+ "# Throughput benchmark: one fixed prompt, 3 timed runs of up to 200 new tokens each.\n",
145
+ "p = tokenizer.apply_chat_template(\n",
146
+ "    [{'role': 'user', 'content': 'Write a detailed essay about artificial intelligence.'}],\n",
147
+ "    tokenize=False, add_generation_prompt=True)\n",
148
+ "_ = llm.generate([p], SamplingParams(max_tokens=10, temperature=0))  # warmup\n",
149
+ "\n",
150
+ "speeds = []\n",
151
+ "for run in range(3):\n",
152
+ "    t0 = time.perf_counter()  # monotonic clock; time.time() can jump if the system clock is adjusted\n",
153
+ "    out = llm.generate([p], SamplingParams(max_tokens=200, temperature=0))\n",
154
+ "    n = len(out[0].outputs[0].token_ids)\n",
155
+ "    tps = n / (time.perf_counter() - t0)  # generated tokens / wall time of the whole generate() call\n",
156
+ "    speeds.append(tps)\n",
157
+ "    print(f'Run {run+1}: {tps:.1f} tok/s ({n} tokens)')\n",
158
+ "\n",
159
+ "# nvidia-smi prints one value per GPU (one per line); int() on the whole output fails with several GPUs.\n",
160
+ "smi = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], capture_output=True, text=True)\n",
161
+ "avg_speed = sum(speeds)/len(speeds)\n",
162
+ "print(f'\\nAverage: {avg_speed:.1f} tok/s')\n",
163
+ "print(f'VRAM: {max(map(int, smi.stdout.split()))/1024:.1f} GB')\n"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "metadata": {},
169
+ "execution_count": null,
170
+ "outputs": [],
171
+ "source": [
172
+ "import os\n",
173
+ "import subprocess\n",
174
+ "from getpass import getpass\n",
175
+ "from huggingface_hub import HfApi, login\n",
176
+ "# Never hard-code the token: read HF_TOKEN from the environment, else prompt for it (input is not echoed).\n",
177
+ "login(token=os.environ.get('HF_TOKEN') or getpass('Hugging Face write token: '))\n",
178
+ "api = HfApi()\n",
179
+ "REPO = 'caiovicentino1/Gemma-4-26B-A4B-it-PolarQuant-Q5'\n",
180
+ "api.create_repo(REPO, exist_ok=True)\n",
181
+ "# nvidia-smi prints one value per GPU (one per line); int() on the whole output fails with several GPUs.\n",
182
+ "smi = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'], capture_output=True, text=True)\n",
183
+ "vram = max(map(int, smi.stdout.split()))/1024\n",
184
+ "card = f\"\"\"---\n",
185
+ "license: apache-2.0\n",
186
+ "tags:\n",
187
+ "- polarquant\n",
188
+ "- gemma4\n",
189
+ "- moe\n",
190
+ "- expert-offloading\n",
191
+ "base_model: google/gemma-4-26B-A4B-it\n",
192
+ "pipeline_tag: text-generation\n",
193
+ "---\n",
194
+ "\n",
195
+ "# \ud83e\uddca Gemma-4-26B-A4B-it \u2014 PolarQuant Expert Offloading\n",
196
+ "\n",
197
+ "**25.2B MoE (3.8B active)** on consumer GPUs via expert CPU offloading.\n",
198
+ "\n",
199
+ "| Metric | Value |\n",
200
+ "|---|---|\n",
201
+ "| **VRAM** | {vram:.1f} GB |\n",
202
+ "| **Speed** | {avg_speed:.1f} tok/s |\n",
203
+ "| **Architecture** | 30 layers, 128 experts (top-8) |\n",
204
+ "| **Cache size** | {CACHE_SIZE} experts |\n",
205
+ "\n",
206
+ "## Quick Start\n",
207
+ "\n",
208
+ "```bash\n",
209
+ "pip install git+https://github.com/caiovicentino/vllm-expert-offload.git@nemotron-expert-offload\n",
210
+ "```\n",
211
+ "\n",
212
+ "```python\n",
213
+ "from vllm import LLM, SamplingParams\n",
214
+ "llm = LLM('google/gemma-4-26B-A4B-it', dtype='bfloat16',\n",
215
+ " moe_expert_cache_size={CACHE_SIZE}, enforce_eager=True,\n",
216
+ " kernel_config={{'moe_backend': 'triton'}})\n",
217
+ "out = llm.generate(['Hello!'], SamplingParams(max_tokens=100))\n",
218
+ "```\n",
219
+ "\n",
220
+ "\ud83d\udcc4 [Paper](https://arxiv.org/abs/2603.29078) \u00b7 \ud83d\udcbb [GitHub](https://github.com/caiovicentino/vllm-expert-offload) \u00b7 \ud83d\udce6 [pip install polarquant](https://pypi.org/project/polarquant/)\n",
221
+ "\"\"\"\n",
222
+ "\n",
223
+ "api.upload_file(path_or_fileobj=card.encode(), path_in_repo='README.md', repo_id=REPO, repo_type='model')\n",
224
+ "# Collections are best-effort: try each one independently and report failures instead of hiding them.\n",
225
+ "for slug in ('caiovicentino1/polarquant-models-69cbc96292c5174df2088b08',\n",
226
+ "             'caiovicentino1/polarquant-gemma-models-69ceedd4896e4cd587972c0c'):\n",
227
+ "    try:\n",
228
+ "        api.add_collection_item(collection_slug=slug, item_id=REPO, item_type='model', exists_ok=True)\n",
229
+ "    except Exception as err:\n",
230
+ "        print(f'Collection update skipped for {slug}: {err}')\n",
231
+ "print(f'\\n\u2705 https://huggingface.co/{REPO}')\n",
232
+ ""
233
+ ]
234
+ }
235
+ ]
236
+ }