{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# \ud83e\uddca Gemma 4 26B-A4B-it \u2014 Expert Offloading (PolarQuant Fork)\n",
        "\n",
        "**25.2B MoE (3.8B active)** on consumer GPUs via expert CPU offloading.\n",
        "\n",
        "Uses our vLLM fork with `--moe-expert-cache-size` (same as Nemotron).\n",
        "\n",
        "| Component | Location | Size |\n",
        "|---|---|---|\n",
        "| Non-expert weights | GPU | ~5-8 GB |\n",
        "| Expert cache (8 slots) | GPU | ~2-3 GB |\n",
        "| Expert weights (pinned) | CPU | ~42 GB |\n",
        "| **Total GPU** | | **~8-11 GB** |\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "execution_count": null,
      "outputs": [],
      "source": [
        "# Install our vLLM fork from source (includes expert offloading + Gemma 4 support).\n",
        "# The build takes ~5 min.\n",
        "\n",
        "!pip install git+https://github.com/huggingface/transformers.git --force-reinstall -q\n",
        "\n",
        "# Clone only when the checkout is missing so this cell can be re-run:\n",
        "# `git clone` aborts when the target directory already exists.\n",
        "![ -d /content/vllm-fork/.git ] || git clone --depth 1 -b nemotron-expert-offload https://github.com/caiovicentino/vllm-expert-offload.git /content/vllm-fork\n",
        "\n",
        "# Editable install of the fork; only the last lines of the build log are shown.\n",
        "!cd /content/vllm-fork && pip install -e . 2>&1 | tail -5\n",
        "print('\\nDone!')\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "execution_count": null,
      "outputs": [],
      "source": [
        "# Sanity-check the installed build before loading the model.\n",
        "import inspect\n",
        "\n",
        "import vllm\n",
        "from vllm import LLM\n",
        "from vllm.model_executor.models.registry import _MODELS\n",
        "\n",
        "print(f'vLLM version: {vllm.__version__}')\n",
        "print(f'vLLM path: {vllm.__path__[0]}')\n",
        "\n",
        "# Verify expert offload exists: look for the option in the LLM constructor signature.\n",
        "sig = inspect.signature(LLM.__init__)\n",
        "has_cache = 'moe_expert_cache_size' in str(sig)\n",
        "print(f'moe_expert_cache_size: {\"YES\" if has_cache else \"NO\"}')\n",
        "\n",
        "# Verify Gemma 4 support: look for a Gemma4 entry in the model registry.\n",
        "# _MODELS is a private name, so fall back to a hint if it is not dict-like.\n",
        "has_gemma4 = any('Gemma4' in k for k in _MODELS.keys()) if hasattr(_MODELS, 'keys') else 'check manually'\n",
        "print(f'Gemma4 support: {has_gemma4}')\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "execution_count": null,
      "outputs": [],
      "source": [
        "import os\n",
        "import subprocess\n",
        "import time\n",
        "\n",
        "# Disable the FlashInfer version check.\n",
        "os.environ['FLASHINFER_DISABLE_VERSION_CHECK'] = '1'\n",
        "\n",
        "from vllm import LLM, SamplingParams\n",
        "from transformers import AutoTokenizer\n",
        "\n",
        "MODEL = 'google/gemma-4-26B-A4B-it'\n",
        "CACHE_SIZE = 8  # number of experts cached on GPU\n",
        "\n",
        "\n",
        "def gpu_memory_used_gb() -> float:\n",
        "    \"\"\"Return the GPU memory currently in use, in GB, as reported by nvidia-smi.\n",
        "\n",
        "    nvidia-smi prints one value (in MiB) per GPU. The values are summed so the\n",
        "    call also works when several GPUs are listed instead of failing to parse.\n",
        "\n",
        "    Raises:\n",
        "        subprocess.CalledProcessError: if nvidia-smi exits with an error.\n",
        "    \"\"\"\n",
        "    smi = subprocess.run(\n",
        "        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'],\n",
        "        capture_output=True, text=True, check=True)\n",
        "    used_mib = sum(int(value) for value in smi.stdout.split())\n",
        "    return used_mib / 1024\n",
        "\n",
        "\n",
        "llm = LLM(\n",
        "    model=MODEL,\n",
        "    trust_remote_code=True,\n",
        "    dtype='bfloat16',\n",
        "    max_model_len=4096,\n",
        "    enforce_eager=True,\n",
        "    moe_expert_cache_size=CACHE_SIZE,\n",
        "    kernel_config={'moe_backend': 'triton'},\n",
        "    gpu_memory_utilization=0.95,\n",
        ")\n",
        "print('MODEL LOADED!')\n",
        "print(f'VRAM: {gpu_memory_used_gb():.1f} GB')\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "execution_count": null,
      "outputs": [],
      "source": [
        "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
        "\n",
        "\n",
        "def build_chat_prompt(user_message: str) -> str:\n",
        "    \"\"\"Wrap a single user message in the model's chat template (as text).\"\"\"\n",
        "    return tokenizer.apply_chat_template(\n",
        "        [{'role': 'user', 'content': user_message}],\n",
        "        tokenize=False, add_generation_prompt=True)\n",
        "\n",
        "\n",
        "prompts = [\n",
        "    'What is 2+3? Think step by step.',\n",
        "    'Explain quantum entanglement in simple terms.',\n",
        "    'Write a Python function to check if a number is prime.',\n",
        "]\n",
        "\n",
        "# Greedy decoding (temperature=0); the same settings are reused for every prompt.\n",
        "smoke_params = SamplingParams(max_tokens=200, temperature=0)\n",
        "\n",
        "for prompt in prompts:\n",
        "    out = llm.generate([build_chat_prompt(prompt)], smoke_params)\n",
        "    text = out[0].outputs[0].text\n",
        "    print(f'\\n{\"=\"*60}')\n",
        "    print(f'Q: {prompt}')\n",
        "    print(f'A: {text[:300]}')  # show only the first 300 characters\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "execution_count": null,
      "outputs": [],
      "source": [
        "# Throughput benchmark: generated tokens per second for a single request.\n",
        "N_RUNS = 3\n",
        "\n",
        "bench_prompt = build_chat_prompt('Write a detailed essay about artificial intelligence.')\n",
        "bench_params = SamplingParams(max_tokens=200, temperature=0)\n",
        "\n",
        "# Short warm-up generation so it is excluded from the timed runs.\n",
        "_ = llm.generate([bench_prompt], SamplingParams(max_tokens=10, temperature=0))\n",
        "\n",
        "speeds = []\n",
        "for run in range(N_RUNS):\n",
        "    # perf_counter is monotonic, unlike time.time(), so system clock\n",
        "    # adjustments cannot distort the measurement.\n",
        "    t0 = time.perf_counter()\n",
        "    out = llm.generate([bench_prompt], bench_params)\n",
        "    elapsed = time.perf_counter() - t0\n",
        "    n = len(out[0].outputs[0].token_ids)\n",
        "    tps = n / elapsed\n",
        "    speeds.append(tps)\n",
        "    print(f'Run {run+1}: {tps:.1f} tok/s ({n} tokens)')\n",
        "\n",
        "# avg_speed is reused by the model card in the publishing cell.\n",
        "avg_speed = sum(speeds)/len(speeds)\n",
        "print(f'\\nAverage: {avg_speed:.1f} tok/s')\n",
        "print(f'VRAM: {gpu_memory_used_gb():.1f} GB')\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "execution_count": null,
      "outputs": [],
      "source": [
        "# Publish a model card with the measured numbers to the Hugging Face Hub.\n",
        "import getpass\n",
        "\n",
        "from huggingface_hub import HfApi, login\n",
        "\n",
        "# Never hardcode the token in the notebook: read it from the HF_TOKEN\n",
        "# environment variable, or prompt for it without echoing.\n",
        "hf_token = os.environ.get('HF_TOKEN') or getpass.getpass('Hugging Face token: ')\n",
        "login(token=hf_token)\n",
        "api = HfApi()\n",
        "\n",
        "REPO = 'caiovicentino1/Gemma-4-26B-A4B-it-PolarQuant-Q5'\n",
        "api.create_repo(REPO, exist_ok=True)\n",
        "\n",
        "vram = gpu_memory_used_gb()\n",
        "\n",
        "# Doubled braces ({{ }}) are literal braces inside the f-string.\n",
        "card = f\"\"\"---\n",
        "license: apache-2.0\n",
        "tags:\n",
        "- polarquant\n",
        "- gemma4\n",
        "- moe\n",
        "- expert-offloading\n",
        "base_model: google/gemma-4-26B-A4B-it\n",
        "pipeline_tag: text-generation\n",
        "---\n",
        "\n",
        "# \ud83e\uddca Gemma-4-26B-A4B-it \u2014 PolarQuant Expert Offloading\n",
        "\n",
        "**25.2B MoE (3.8B active)** on consumer GPUs via expert CPU offloading.\n",
        "\n",
        "| Metric | Value |\n",
        "|---|---|\n",
        "| **VRAM** | {vram:.1f} GB |\n",
        "| **Speed** | {avg_speed:.1f} tok/s |\n",
        "| **Architecture** | 30 layers, 128 experts (top-8) |\n",
        "| **Cache size** | {CACHE_SIZE} experts |\n",
        "\n",
        "## Quick Start\n",
        "\n",
        "```bash\n",
        "pip install git+https://github.com/caiovicentino/vllm-expert-offload.git@nemotron-expert-offload\n",
        "```\n",
        "\n",
        "```python\n",
        "from vllm import LLM, SamplingParams\n",
        "llm = LLM('google/gemma-4-26B-A4B-it', dtype='bfloat16',\n",
        "          moe_expert_cache_size={CACHE_SIZE}, enforce_eager=True,\n",
        "          kernel_config={{'moe_backend': 'triton'}})\n",
        "out = llm.generate(['Hello!'], SamplingParams(max_tokens=100))\n",
        "```\n",
        "\n",
        "\ud83d\udcc4 [Paper](https://arxiv.org/abs/2603.29078) \u00b7 \ud83d\udcbb [GitHub](https://github.com/caiovicentino/vllm-expert-offload) \u00b7 \ud83d\udce6 [pip install polarquant](https://pypi.org/project/polarquant/)\n",
        "\"\"\"\n",
        "\n",
        "api.upload_file(path_or_fileobj=card.encode(), path_in_repo='README.md',\n",
        "                repo_id=REPO, repo_type='model')\n",
        "\n",
        "COLLECTION_SLUGS = [\n",
        "    'caiovicentino1/polarquant-models-69cbc96292c5174df2088b08',\n",
        "    'caiovicentino1/polarquant-gemma-models-69ceedd4896e4cd587972c0c',\n",
        "]\n",
        "\n",
        "# Adding to collections is best-effort: report a failure instead of silently\n",
        "# swallowing it, and try each collection independently so one failure does not\n",
        "# skip the other. exists_ok=True makes re-running the cell a no-op.\n",
        "for slug in COLLECTION_SLUGS:\n",
        "    try:\n",
        "        api.add_collection_item(collection_slug=slug, item_id=REPO,\n",
        "                                item_type='model', exists_ok=True)\n",
        "    except Exception as err:\n",
        "        print(f'Could not add {REPO} to collection {slug}: {err}')\n",
        "\n",
        "print(f'\\n\u2705 https://huggingface.co/{REPO}')\n"
      ]
    }
  ]
}