Spaces:

caiovicentino1
/

PolarQuant-Nemotron-30B-Chat

Configuration error

App Files Files Community

caiovicentino1 commited on Apr 3

Commit

af86bf8

verified ·

1 Parent(s): 076bc9f

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +102 -0

app.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import spaces
+import torch, torch.nn as nn, torch.nn.functional as F, math, gc
+from scipy.stats import norm as sp_norm
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from torchao.quantization import quantize_, Int4WeightOnlyConfig
+import torchao.quantization.utils as _tao_utils
+from threading import Thread
+import gradio as gr
+MODEL = "nvidia/Nemotron-Cascade-2-30B-A3B"
+BS = 128
+_C = {}
+def get_centroids(bits):
+    if bits in _C: return _C[bits]
+    n = 1 << bits; bd = torch.linspace(-4.0, 4.0, n + 1); ct = torch.zeros(n)
+    for _ in range(100):
+        for i in range(n):
+            a, b = bd[i].item(), bd[i+1].item(); pa, pb = sp_norm.cdf(a), sp_norm.cdf(b)
+            ct[i] = (sp_norm.pdf(a) - sp_norm.pdf(b)) / (pb - pa) if pb - pa > 1e-12 else (a + b) / 2
+        for i in range(1, n): bd[i] = (ct[i-1] + ct[i]) / 2
+    _C[bits] = ct; return ct
+def _build_H(n):
+    if n == 1: return torch.tensor([[1.0]])
+    h = _build_H(n // 2)
+    return torch.cat([torch.cat([h,h],1), torch.cat([h,-h],1)], 0) / math.sqrt(2)
+for b in [2,3,4,5,6]: get_centroids(b)
+H_W = _build_H(BS)
+def should_quantize(name, param):
+    if param.ndim < 2 or param.numel() < 256: return False
+    if any(k in name for k in ["norm","layernorm","rmsnorm"]): return False
+    if any(k in name for k in ["A_log",".D","dt_bias","conv1d"]): return False
+    if "bias" in name and param.ndim == 1: return False
+    if name.endswith(".gate.weight") or "router" in name: return False
+    return True
+_orig = _tao_utils.guard_dtype_size
+def _patched(t, n, dtype=None, size=None):
+    if dtype is not None and t.dtype != dtype: t.data = t.data.to(dtype)
+    if size is not None and t.size() != size: raise ValueError(f"{size} vs {t.size()}")
+_tao_utils.guard_dtype_size = _patched
+print("Loading Nemotron on CPU...")
+model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, device_map="cpu",
+    attn_implementation="sdpa", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+_loaded = False
+@spaces.GPU(duration=300)
+def respond(message, history):
+    global _loaded, model
+    if not _loaded:
+        H_dev = H_W.to("cuda"); ct5 = get_centroids(5).to("cuda")
+        int4_cfg = Int4WeightOnlyConfig(group_size=128); n = 0
+        for name, child in list(model.named_modules()):
+            if not isinstance(child, nn.Linear) or child.weight.device.type == "meta": continue
+            if not should_quantize(name, child.weight): continue
+            w = child.weight.data.float().to("cuda"); out_f, in_f = w.shape
+            pad = (BS - in_f % BS) % BS
+            if pad > 0: w = F.pad(w, (0, pad))
+            nb = w.shape[1] // BS; w = w.reshape(out_f, nb, BS)
+            for i in range(0, out_f, 64):
+                e = min(i+64, out_f); w[i:e] = (w[i:e].reshape(-1, BS) @ H_dev).reshape(e-i, nb, BS)
+            norms = w.norm(dim=2, keepdim=True).clamp(min=1e-10); w.div_(norms).mul_(math.sqrt(BS))
+            codes = torch.empty(out_f, nb, BS, dtype=torch.int8, device="cuda")
+            for ci in range(0, out_f, 256):
+                ce = min(ci+256, out_f); codes[ci:ce] = (w[ci:ce].unsqueeze(-1) - ct5.view(1,1,1,-1)).abs().argmin(-1).to(torch.int8)
+            del w; vals = torch.empty(out_f, nb, BS, dtype=torch.float32, device="cuda")
+            for ci in range(0, out_f, 256):
+                ce = min(ci+256, out_f); vals[ci:ce] = ct5[codes[ci:ce].long()] / math.sqrt(BS)
+            del codes; torch.cuda.empty_cache()
+            for i in range(0, out_f, 64):
+                e = min(i+64, out_f); vals[i:e] = (vals[i:e].reshape(-1, BS) @ H_dev).reshape(e-i, nb, BS)
+            vals *= norms; del norms; bf16_w = vals.reshape(out_f, -1)[:, :in_f].to(torch.bfloat16); del vals; torch.cuda.empty_cache()
+            try:
+                with torch.device("meta"): dummy = nn.Sequential(nn.Linear(in_f, out_f, bias=False))
+                dummy[0].weight = nn.Parameter(bf16_w); quantize_(dummy, int4_cfg); child.weight = dummy[0].weight; del dummy
+            except: child.weight.data = bf16_w.cpu()
+            del bf16_w; torch.cuda.empty_cache(); n += 1
+        _tao_utils.guard_dtype_size = _orig
+        for _, p in model.named_parameters():
+            if p.device.type == "cpu": p.data = p.data.to("cuda")
+        for _, b in model.named_buffers():
+            if b.device.type == "cpu": b.data = b.data.to("cuda")
+        gc.collect(); torch.cuda.empty_cache(); _loaded = True
+        print(f"Ready! {n} layers, {torch.cuda.memory_allocated()/1e9:.1f} GB")
+    messages = list(history) + [{"role": "user", "content": message}]
+    chat_out = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
+    input_ids = chat_out["input_ids"].to("cuda") if hasattr(chat_out, "input_ids") else chat_out.to("cuda")
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    Thread(target=model.generate, kwargs=dict(input_ids=input_ids, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.9, repetition_penalty=1.3, streamer=streamer)).start()
+    partial = ""
+    for text in streamer: partial += text; yield partial
+demo = gr.ChatInterface(respond, title="🧊 Nemotron Cascade 30B-A3B — PolarQuant Q5+INT4",
+    description="30B MoE (3B active) | 735 downloads | Most popular PolarQuant model | [Paper](https://arxiv.org/abs/2603.29078)",
+    examples=["Explain quantum computing simply.", "Write a Python binary search.", "Compare TCP vs UDP."], type="messages")
+demo.launch()