caiovicentino1
/

Gemma-4-26B-A4B-it-HLWQ-Q5

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# \ud83e\uddca Gemma 4 26B-A4B-it \u2014 Expert Offloading (PolarQuant Fork)\n",
+        "\n",
+        "**25.2B MoE (3.8B active)** on consumer GPUs via expert CPU offloading.\n",
+        "\n",
+        "Uses our vLLM fork, which adds the `--moe-expert-cache-size` option (`moe_expert_cache_size` in the Python API) \u2014 the same fork and option used for Nemotron.\n",
+        "\n",
+        "| Component | Location | Size |\n",
+        "|---|---|---|\n",
+        "| Non-expert weights | GPU | ~5-8 GB |\n",
+        "| Expert cache (8 slots) | GPU | ~2-3 GB |\n",
+        "| Expert weights (pinned) | CPU | ~42 GB |\n",
+        "| **Total GPU** | | **~8-11 GB** |\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "# Install our vLLM fork from source (includes expert offloading + Gemma 4 support).\n",
+        "# The build takes ~5 min.\n",
+        "#\n",
+        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
+        "%pip install -q --force-reinstall git+https://github.com/huggingface/transformers.git\n",
+        "\n",
+        "# Clone our fork only when it is not already checked out, so re-running this cell\n",
+        "# does not fail on the existing directory.\n",
+        "![ -d /content/vllm-fork/.git ] || git clone --depth 1 -b nemotron-expert-offload https://github.com/caiovicentino/vllm-expert-offload.git /content/vllm-fork\n",
+        "\n",
+        "# Editable install of the fork. No `| tail` pipe here: the pipe would replace pip's\n",
+        "# exit status with tail's, so a failed build would go unnoticed until the next cell.\n",
+        "%pip install -q -e /content/vllm-fork\n",
+        "print('\\nDone!')\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "# Sanity-check the installed fork before loading any weights.\n",
+        "import inspect\n",
+        "\n",
+        "import vllm\n",
+        "\n",
+        "print(f'vLLM version: {vllm.__version__}')\n",
+        "print(f'vLLM path: {vllm.__path__[0]}')\n",
+        "\n",
+        "# 1) Expert offloading: look for the option name in the rendered LLM.__init__ signature.\n",
+        "from vllm import LLM\n",
+        "\n",
+        "init_signature_text = str(inspect.signature(LLM.__init__))\n",
+        "if 'moe_expert_cache_size' in init_signature_text:\n",
+        "    offload_status = 'YES'\n",
+        "else:\n",
+        "    offload_status = 'NO'\n",
+        "print(f'moe_expert_cache_size: {offload_status}')\n",
+        "\n",
+        "# 2) Gemma 4: scan the model registry for a key containing 'Gemma4'.\n",
+        "from vllm.model_executor.models.registry import _MODELS\n",
+        "\n",
+        "if hasattr(_MODELS, 'keys'):\n",
+        "    gemma4_status = any('Gemma4' in key for key in _MODELS.keys())\n",
+        "else:\n",
+        "    # The registry has no keys() here, so it cannot be scanned automatically.\n",
+        "    gemma4_status = 'check manually'\n",
+        "print(f'Gemma4 support: {gemma4_status}')\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import subprocess\n",
+        "\n",
+        "# Set before vLLM is imported.\n",
+        "os.environ['FLASHINFER_DISABLE_VERSION_CHECK'] = '1'\n",
+        "\n",
+        "from vllm import LLM, SamplingParams\n",
+        "from transformers import AutoTokenizer\n",
+        "\n",
+        "MODEL = 'google/gemma-4-26B-A4B-it'\n",
+        "CACHE_SIZE = 8  # 8 experts cached on GPU\n",
+        "\n",
+        "# Load the model with expert offloading enabled through moe_expert_cache_size.\n",
+        "llm = LLM(\n",
+        "    model=MODEL,\n",
+        "    trust_remote_code=True,\n",
+        "    dtype='bfloat16',\n",
+        "    max_model_len=4096,\n",
+        "    enforce_eager=True,\n",
+        "    moe_expert_cache_size=CACHE_SIZE,\n",
+        "    kernel_config={'moe_backend': 'triton'},\n",
+        "    gpu_memory_utilization=0.95,\n",
+        ")\n",
+        "print('MODEL LOADED!')\n",
+        "\n",
+        "# Report GPU memory in use. check=True raises if nvidia-smi itself fails instead of\n",
+        "# letting an empty stdout blow up in int() below.\n",
+        "smi = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'],\n",
+        "                     capture_output=True, text=True, check=True)\n",
+        "# nvidia-smi prints one value (MiB) per GPU; summing keeps the single-GPU result\n",
+        "# unchanged and avoids a ValueError on multi-GPU hosts.\n",
+        "vram_used_mib = sum(int(value) for value in smi.stdout.split())\n",
+        "# NOTE(review): vLLM reserves GPU memory up to gpu_memory_utilization, so this figure\n",
+        "# presumably reflects the reservation rather than the model's own footprint - confirm.\n",
+        "print(f'VRAM: {vram_used_mib/1024:.1f} GB')\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "# Smoke test: answer a few prompts one request at a time with temperature 0.\n",
+        "tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
+        "\n",
+        "prompts = [\n",
+        "    'What is 2+3? Think step by step.',\n",
+        "    'Explain quantum entanglement in simple terms.',\n",
+        "    'Write a Python function to check if a number is prime.',\n",
+        "]\n",
+        "\n",
+        "separator = '=' * 60\n",
+        "\n",
+        "for question in prompts:\n",
+        "    # Wrap the question as a single user turn and render it with the chat template.\n",
+        "    chat = [{'role': 'user', 'content': question}]\n",
+        "    rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)\n",
+        "    sampling = SamplingParams(max_tokens=200, temperature=0)\n",
+        "    completion = llm.generate([rendered], sampling)[0].outputs[0]\n",
+        "    print(f'\\n{separator}')\n",
+        "    print(f'Q: {question}')\n",
+        "    # Only the first 300 characters of the answer are shown.\n",
+        "    print(f'A: {completion.text[:300]}')\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "import subprocess\n",
+        "import time\n",
+        "\n",
+        "# Benchmark: three timed 200-token generations of one prompt after a short warmup.\n",
+        "p = tokenizer.apply_chat_template(\n",
+        "    [{'role': 'user', 'content': 'Write a detailed essay about artificial intelligence.'}],\n",
+        "    tokenize=False, add_generation_prompt=True)\n",
+        "\n",
+        "_ = llm.generate([p], SamplingParams(max_tokens=10, temperature=0))  # warmup\n",
+        "\n",
+        "speeds = []\n",
+        "for run in range(3):\n",
+        "    # perf_counter is monotonic and high-resolution, unlike time.time(), which can\n",
+        "    # jump when the system clock is adjusted.\n",
+        "    t0 = time.perf_counter()\n",
+        "    out = llm.generate([p], SamplingParams(max_tokens=200, temperature=0))\n",
+        "    # Stop the clock right after generate() so only that call is timed.\n",
+        "    elapsed = time.perf_counter() - t0\n",
+        "    n = len(out[0].outputs[0].token_ids)\n",
+        "    # Generated tokens divided by the wall time of the whole generate() call.\n",
+        "    tps = n / elapsed\n",
+        "    speeds.append(tps)\n",
+        "    print(f'Run {run+1}: {tps:.1f} tok/s ({n} tokens)')\n",
+        "\n",
+        "# check=True raises if nvidia-smi itself fails instead of crashing in int() below.\n",
+        "smi = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'],\n",
+        "                     capture_output=True, text=True, check=True)\n",
+        "# nvidia-smi prints one value (MiB) per GPU; summing keeps the single-GPU result\n",
+        "# unchanged and avoids a ValueError on multi-GPU hosts.\n",
+        "vram_used_mib = sum(int(value) for value in smi.stdout.split())\n",
+        "avg_speed = sum(speeds)/len(speeds)\n",
+        "print(f'\\nAverage: {avg_speed:.1f} tok/s')\n",
+        "print(f'VRAM: {vram_used_mib/1024:.1f} GB')\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import subprocess\n",
+        "\n",
+        "from huggingface_hub import HfApi, login\n",
+        "\n",
+        "# Do not paste a token into the notebook. Read it from the HF_TOKEN environment\n",
+        "# variable; when it is unset, token=None makes login() ask for it interactively.\n",
+        "login(token=os.environ.get('HF_TOKEN'))\n",
+        "api = HfApi()\n",
+        "\n",
+        "REPO = 'caiovicentino1/Gemma-4-26B-A4B-it-PolarQuant-Q5'\n",
+        "api.create_repo(REPO, exist_ok=True)\n",
+        "\n",
+        "# check=True raises if nvidia-smi itself fails instead of crashing in int() below.\n",
+        "smi = subprocess.run(['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'],\n",
+        "                     capture_output=True, text=True, check=True)\n",
+        "# nvidia-smi prints one value (MiB) per GPU; summing keeps the single-GPU result\n",
+        "# unchanged and avoids a ValueError on multi-GPU hosts.\n",
+        "vram = sum(int(value) for value in smi.stdout.split())/1024\n",
+        "\n",
+        "# Model card. vram, avg_speed and CACHE_SIZE come from the measurements and settings\n",
+        "# of the earlier cells, so the card always matches what was actually run.\n",
+        "card = f\"\"\"---\n",
+        "license: apache-2.0\n",
+        "tags:\n",
+        "- polarquant\n",
+        "- gemma4\n",
+        "- moe\n",
+        "- expert-offloading\n",
+        "base_model: google/gemma-4-26B-A4B-it\n",
+        "pipeline_tag: text-generation\n",
+        "---\n",
+        "\n",
+        "# \ud83e\uddca Gemma-4-26B-A4B-it \u2014 PolarQuant Expert Offloading\n",
+        "\n",
+        "**25.2B MoE (3.8B active)** on consumer GPUs via expert CPU offloading.\n",
+        "\n",
+        "| Metric | Value |\n",
+        "|---|---|\n",
+        "| **VRAM** | {vram:.1f} GB |\n",
+        "| **Speed** | {avg_speed:.1f} tok/s |\n",
+        "| **Architecture** | 30 layers, 128 experts (top-8) |\n",
+        "| **Cache size** | {CACHE_SIZE} experts |\n",
+        "\n",
+        "## Quick Start\n",
+        "\n",
+        "```bash\n",
+        "pip install git+https://github.com/caiovicentino/vllm-expert-offload.git@nemotron-expert-offload\n",
+        "```\n",
+        "\n",
+        "```python\n",
+        "from vllm import LLM, SamplingParams\n",
+        "llm = LLM('google/gemma-4-26B-A4B-it', dtype='bfloat16',\n",
+        "          moe_expert_cache_size={CACHE_SIZE}, enforce_eager=True,\n",
+        "          kernel_config={{'moe_backend': 'triton'}})\n",
+        "out = llm.generate(['Hello!'], SamplingParams(max_tokens=100))\n",
+        "```\n",
+        "\n",
+        "\ud83d\udcc4 [Paper](https://arxiv.org/abs/2603.29078) \u00b7 \ud83d\udcbb [GitHub](https://github.com/caiovicentino/vllm-expert-offload) \u00b7 \ud83d\udce6 [pip install polarquant](https://pypi.org/project/polarquant/)\n",
+        "\"\"\"\n",
+        "\n",
+        "api.upload_file(path_or_fileobj=card.encode(), path_in_repo='README.md',\n",
+        "                repo_id=REPO, repo_type='model')\n",
+        "\n",
+        "# Adding the repo to collections stays best-effort, but each collection is tried on\n",
+        "# its own and failures are reported instead of being silently swallowed.\n",
+        "collection_slugs = (\n",
+        "    'caiovicentino1/polarquant-models-69cbc96292c5174df2088b08',\n",
+        "    'caiovicentino1/polarquant-gemma-models-69ceedd4896e4cd587972c0c',\n",
+        ")\n",
+        "for slug in collection_slugs:\n",
+        "    try:\n",
+        "        api.add_collection_item(collection_slug=slug, item_id=REPO, item_type='model')\n",
+        "    except Exception as err:\n",
+        "        print(f'Could not add {REPO} to collection {slug}: {err}')\n",
+        "print(f'\\n\u2705 https://huggingface.co/{REPO}')\n",
+        ""
+      ]
+    }
+  ]
+}