Spaces:

Napron
/

small_object_detection

Running

App Files Files Community

Napron committed on Mar 7

Commit

2e3c33d

verified ·

1 Parent(s): 09b81c6

Update nomic_fewshot.py

Browse files

Files changed (1) hide show

nomic_fewshot.py +258 -81

nomic_fewshot.py CHANGED Viewed

@@ -1,147 +1,324 @@
 """
-Few-shot object classification using Nomic embed-vision-v1.5 + embed-text-v1.5.
-Same treatment as Jina: image refs + text prompts, combined with text_weight (default 0.3).
-Used by dfine_jina_pipeline.py and tune_thresholds.py for Nomic crop classification.
 """
 import time
 from pathlib import Path
 import numpy as np
-import torch
-import torch.nn.functional as F
 from PIL import Image
-from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
-from transformers import modeling_utils
 from jina_fewshot import CLASS_PROMPTS, IMAGE_EXTS
-def _patch_tied_weights_for_nomic():
-    """NomicVisionModel has _tied_weights_keys but newer transformers expect all_tied_weights_keys.
-    Only patch when this method exists (newer transformers); older versions don't need it."""
-    if not hasattr(modeling_utils.PreTrainedModel, "mark_tied_weights_as_initialized"):
-        return
-    _orig = modeling_utils.PreTrainedModel.mark_tied_weights_as_initialized
-    def _patched(self, loading_info):
-        if not hasattr(self, "all_tied_weights_keys"):
-            self.all_tied_weights_keys = getattr(self, "_tied_weights_keys", None) or {}
-        return _orig(self, loading_info)
-    modeling_utils.PreTrainedModel.mark_tied_weights_as_initialized = _patched
-def _nomic_mean_pool(last_hidden_state, attention_mask):
-    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
-    return torch.sum(last_hidden_state * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
-class NomicTextEncoder:
-    """Nomic embed-text-v1.5: text → normalized embedding (aligned to vision space)."""
-    def __init__(self, device="cuda"):
         self.device = device
-        print("[*] Loading nomic-embed-text-v1.5...")
         t0 = time.perf_counter()
-        self.tokenizer = AutoTokenizer.from_pretrained("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
-        if hasattr(torch, "set_default_device"):
-            torch.set_default_device("cpu")
-        try:
-            self.model = AutoModel.from_pretrained(
-                "nomic-ai/nomic-embed-text-v1.5",
-                trust_remote_code=True,
-                low_cpu_mem_usage=False,
-            )
-        finally:
-            if hasattr(torch, "set_default_device"):
-                torch.set_default_device("cpu")
-        self.model = self.model.to(device).eval()
         print(f"[*] Loaded in {time.perf_counter() - t0:.1f}s\n")
     def encode_texts(self, texts: list[str]) -> np.ndarray:
         prefixed = [f"classification: {t}" for t in texts]
-        inputs = self.tokenizer(prefixed, padding=True, truncation=True, return_tensors="pt", max_length=512)
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        with torch.no_grad():
-            out = self.model(**inputs)
-        embs = _nomic_mean_pool(out.last_hidden_state, inputs["attention_mask"])
-        embs = F.normalize(embs, p=2, dim=1)
-        return embs.cpu().float().numpy()
-class NomicVisionEncoder:
-    """Nomic embed-vision-v1.5: image → normalized CLS embedding."""
-    def __init__(self, device="cuda"):
         self.device = device
-        print("[*] Loading nomic-embed-vision-v1.5...")
         t0 = time.perf_counter()
-        self.processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
-        _patch_tied_weights_for_nomic()
-        if hasattr(torch, "set_default_device"):
-            torch.set_default_device("cpu")
-        try:
-            self.model = AutoModel.from_pretrained(
-                "nomic-ai/nomic-embed-vision-v1.5",
-                trust_remote_code=True,
-                low_cpu_mem_usage=False,
-            )
-        finally:
-            if hasattr(torch, "set_default_device"):
-                torch.set_default_device("cpu")
-        self.model = self.model.to(device).eval()
         print(f"[*] Loaded in {time.perf_counter() - t0:.1f}s\n")
-    def encode_images(self, images: list) -> np.ndarray:
-        """Encode images to L2-normalized embeddings (CLS token)."""
-        inputs = self.processor(images=images, return_tensors="pt")
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        with torch.no_grad():
-            out = self.model(**inputs).last_hidden_state
-        # CLS token, then normalize
-        embs = F.normalize(out[:, 0], p=2, dim=1)
-        return embs.cpu().float().numpy()
 def build_refs_nomic(
-    encoder: NomicVisionEncoder,
     refs_dir: Path,
     batch_size: int = 16,
-    text_encoder: NomicTextEncoder | None = None,
     text_weight: float = 0.3,
 ):
-    """Build one ref embedding per class. Same treatment as Jina: image refs + text prompts, combined with text_weight (default 0.3)."""
     class_dirs = sorted(d for d in refs_dir.iterdir() if d.is_dir())
     if not class_dirs:
         raise ValueError(f"No subfolders in {refs_dir}")
     labels = []
     embeddings = []
     if text_encoder is not None:
         print(f"  Text weight: {text_weight:.1f} | Image weight: {1 - text_weight:.1f}\n")
     for d in class_dirs:
         name = d.name
         paths = sorted(str(p) for p in d.iterdir() if p.suffix.lower() in IMAGE_EXTS)
         if not paths:
             continue
         all_embs = []
         for i in range(0, len(paths), batch_size):
-            batch = [Image.open(p).convert("RGB") for p in paths[i : i + batch_size]]
             all_embs.append(encoder.encode_images(batch))
         img_embs = np.concatenate(all_embs, axis=0)
-        img_avg = img_embs.mean(axis=0)
         if text_encoder is not None:
             prompts = CLASS_PROMPTS.get(name, [f"a {name}", f"a person holding a {name}"])
             text_embs = text_encoder.encode_texts(prompts)
-            text_avg = text_embs.mean(axis=0)
             combined = (1.0 - text_weight) * img_avg + text_weight * text_avg
             combined = combined / (np.linalg.norm(combined) + 1e-12)
             labels.append(name)
             embeddings.append(combined)
-            print(f"  {name:<14}: {len(paths)} imgs + {len(prompts)} prompts")
         else:
-            img_avg = img_avg / (np.linalg.norm(img_avg) + 1e-12)
             labels.append(name)
             embeddings.append(img_avg)
             print(f"  {name:<14}: {len(paths)} imgs")
-    return labels, np.stack(embeddings)

 """
+Few-shot object classification using Nomic embed-vision-v1.5 + embed-text-v1.5 via ONNX Runtime.
+Same treatment as current PyTorch version:
+- vision refs -> average image embeddings
+- text prompts -> average text embeddings
+- combine with text_weight
+This version uses:
+- nomic-ai/nomic-embed-text-v1.5   -> ONNX
+- nomic-ai/nomic-embed-vision-v1.5 -> ONNX
+Transformers is used only for preprocessing:
+- AutoTokenizer
+- AutoImageProcessor
 """
 import time
 from pathlib import Path
 import numpy as np
+import onnxruntime as ort
 from PIL import Image
+from huggingface_hub import hf_hub_download
+from transformers import AutoImageProcessor, AutoTokenizer
 from jina_fewshot import CLASS_PROMPTS, IMAGE_EXTS
+def _l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
+    x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
+    norms = np.linalg.norm(x, axis=axis, keepdims=True)
+    norms = np.maximum(norms, eps)
+    return (x / norms).astype(np.float32)
+def _mean_pool(last_hidden_state: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
+    """
+    last_hidden_state: [B, T, D]
+    attention_mask:    [B, T]
+    """
+    mask = attention_mask.astype(np.float32)[..., None]  # [B, T, 1]
+    summed = np.sum(last_hidden_state * mask, axis=1)
+    denom = np.clip(np.sum(mask, axis=1), 1e-9, None)
+    return summed / denom
+def _pick_output(outputs: list[np.ndarray], output_names: list[str], kind: str) -> np.ndarray:
+    """
+    Try to find the main embedding tensor robustly.
+    For both text and vision Nomic ONNX exports, we expect a 3D tensor [B, T, D]
+    or sometimes a 2D tensor [B, D].
+    """
+    # Prefer names that look like hidden states / embeddings
+    preferred_keywords = [
+        "last_hidden_state",
+        "hidden_state",
+        "sentence_embedding",
+        "embedding",
+        "embeddings",
+    ]
+    for kw in preferred_keywords:
+        for i, name in enumerate(output_names):
+            if kw in name.lower():
+                arr = outputs[i]
+                if arr.ndim in (2, 3):
+                    return arr
+    # Fallback: first 3D output, then first 2D output
+    for arr in outputs:
+        if arr.ndim == 3:
+            return arr
+    for arr in outputs:
+        if arr.ndim == 2:
+            return arr
+    raise RuntimeError(
+        f"Could not identify a usable {kind} ONNX output. "
+        f"Output names={output_names}, shapes={[getattr(o, 'shape', None) for o in outputs]}"
+    )
def _download_onnx_model(repo_id: str, filename: str = "onnx/model.onnx") -> str:
    """Fetch ``filename`` from the Hugging Face repo ``repo_id``.

    Progress is printed before and after the call to ``hf_hub_download``.

    Returns:
        Whatever ``hf_hub_download`` returns, which is used by callers as the
        local path of the ONNX file.
    """
    print(f"  Downloading ONNX model from {repo_id} ...")
    local_path = hf_hub_download(repo_id=repo_id, filename=filename)
    print(f"  Downloaded: {local_path}")
    return local_path
class NomicTextEncoderONNX:
    """Text encoder running the nomic-embed-text-v1.5 ONNX export.

    Pipeline: tokenize -> ONNX session -> mean-pool token states over the
    attention mask (when the selected output is 3-D) -> L2-normalize.
    """

    def __init__(self, device: str = "cuda"):
        """Download the model, build the session and tokenizer, then self-test.

        Args:
            device: "cuda" selects the CUDA execution provider (with CPU as a
                fallback) when ONNX Runtime lists it as available. Any other
                value, or a missing CUDA provider, selects CPU only.

        Raises:
            RuntimeError: if no session input name contains "input_ids".
        """
        self.device = device
        self.repo_id = "nomic-ai/nomic-embed-text-v1.5"
        print("[*] Loading nomic-embed-text-v1.5 (ONNX)...")
        t0 = time.perf_counter()
        onnx_path = _download_onnx_model(self.repo_id)

        available = ort.get_available_providers()
        if "CUDAExecutionProvider" in available and device == "cuda":
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]
        print(f"  ONNX providers: {providers}")

        self.session = ort.InferenceSession(onnx_path, providers=providers)
        self.tokenizer = AutoTokenizer.from_pretrained(self.repo_id, trust_remote_code=True)
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]
        print(f"  ONNX inputs: {self.input_names}")
        print(f"  ONNX outputs: {self.output_names}")

        # Map the session's actual input names onto the three tensors this
        # class knows how to supply. Matching is by substring because the
        # exported names are read from the session rather than assumed.
        self._ids_name = None
        self._mask_name = None
        self._token_type_name = None
        for name in self.input_names:
            lowered = name.lower()
            if "input_ids" in lowered:
                self._ids_name = name
            elif "attention" in lowered:
                self._mask_name = name
            elif "token_type" in lowered:
                self._token_type_name = name
        print(
            f"  Mapped: input_ids={self._ids_name}, "
            f"attention_mask={self._mask_name}, token_type_ids={self._token_type_name}"
        )

        # Fail fast with a readable message. Without a token-id input the feed
        # dict would be incomplete and the error would surface inside
        # session.run instead.
        if self._ids_name is None:
            raise RuntimeError(
                f"Could not find input_ids input in ONNX inputs: {self.input_names}"
            )

        # Sanity check: one real forward pass so a broken export shows up at
        # load time. The norm is only printed, not enforced.
        test = self.encode_texts(["a red square"])
        nrm = float(np.linalg.norm(test[0]))
        print(f"  [SANITY] text embed norm={nrm:.4f}")
        print(f"[*] Loaded in {time.perf_counter() - t0:.1f}s\n")

    def encode_texts(self, texts: list[str]) -> np.ndarray:
        """Embed ``texts`` and return L2-normalized float32 vectors, one row per text.

        Every text is prefixed with "classification: " before tokenization
        (presumably the Nomic task prefix; confirm against the model card).
        Inputs are padded to the longest item and truncated to 512 tokens.

        Raises:
            RuntimeError: if the selected ONNX output is neither 2-D nor 3-D.
        """
        prefixed = [f"classification: {t}" for t in texts]
        tokens = self.tokenizer(
            prefixed,
            padding=True,
            truncation=True,
            return_tensors="np",
            max_length=512,
        )
        input_ids = np.asarray(tokens["input_ids"], dtype=np.int64)
        attention_mask = np.asarray(tokens["attention_mask"], dtype=np.int64)

        # Only feed the inputs the exported graph actually declares.
        feeds = {self._ids_name: input_ids}
        if self._mask_name is not None:
            feeds[self._mask_name] = attention_mask
        if self._token_type_name is not None:
            # Zeros are always fed here, whatever the tokenizer returned.
            feeds[self._token_type_name] = np.zeros_like(input_ids, dtype=np.int64)

        outputs = self.session.run(self.output_names, feeds)
        main_out = _pick_output(outputs, self.output_names, kind="text")

        # A 3-D output holds per-token states and is mean-pooled over the
        # mask; a 2-D output is already one vector per text.
        if main_out.ndim == 3:
            embs = _mean_pool(main_out, attention_mask)
        elif main_out.ndim == 2:
            embs = main_out
        else:
            raise RuntimeError(f"Unexpected text output rank: {main_out.ndim}")
        return _l2_normalize(embs, axis=1)
class NomicVisionEncoderONNX:
    """Image encoder running the nomic-embed-vision-v1.5 ONNX export.

    Pipeline: preprocess -> ONNX session -> take the first token of a 3-D
    output (or use a 2-D output as-is) -> L2-normalize.
    """

    def __init__(self, device: str = "cuda"):
        """Download the model, build the session and processor, then self-test.

        Args:
            device: "cuda" selects the CUDA execution provider (with CPU as a
                fallback) when ONNX Runtime lists it as available. Any other
                value, or a missing CUDA provider, selects CPU only.
        """
        self.device = device
        self.repo_id = "nomic-ai/nomic-embed-vision-v1.5"
        print("[*] Loading nomic-embed-vision-v1.5 (ONNX)...")
        started = time.perf_counter()
        model_file = _download_onnx_model(self.repo_id)

        # Start from CPU and put CUDA in front only when it was requested and
        # ONNX Runtime reports it.
        runtime_providers = ort.get_available_providers()
        providers = ["CPUExecutionProvider"]
        if device == "cuda" and "CUDAExecutionProvider" in runtime_providers:
            providers.insert(0, "CUDAExecutionProvider")
        print(f"  ONNX providers: {providers}")

        self.session = ort.InferenceSession(model_file, providers=providers)
        self.processor = AutoImageProcessor.from_pretrained(self.repo_id, trust_remote_code=True)
        self.input_names = [node.name for node in self.session.get_inputs()]
        self.output_names = [node.name for node in self.session.get_outputs()]
        print(f"  ONNX inputs: {self.input_names}")
        print(f"  ONNX outputs: {self.output_names}")

        # First input whose name mentions "pixel"; None when there is no match.
        self._pixel_name = next(
            (candidate for candidate in self.input_names if "pixel" in candidate.lower()),
            None,
        )
        print(f"  Mapped: pixel_values={self._pixel_name}")

        # Sanity check: one forward pass on a solid red 224x224 image. The
        # norm is only printed, not enforced.
        probe = Image.new("RGB", (224, 224), color=(255, 0, 0))
        probe_emb = self.encode_images([probe])
        nrm = float(np.linalg.norm(probe_emb[0]))
        print(f"  [SANITY] image embed norm={nrm:.4f}")
        print(f"[*] Loaded in {time.perf_counter() - started:.1f}s\n")

    def encode_images(self, images: list[Image.Image]) -> np.ndarray:
        """Embed ``images`` and return L2-normalized float32 vectors, one row per image.

        Raises:
            RuntimeError: if the processor returns no ``pixel_values``, if no
                pixel input was mapped from the session, or if the selected
                ONNX output is neither 2-D nor 3-D.
        """
        rgb_images = [im.convert("RGB") for im in images]
        batch = self.processor(images=rgb_images, return_tensors="np")
        if "pixel_values" not in batch:
            raise RuntimeError(f"Processor did not return pixel_values. Keys={list(batch.keys())}")

        raw = batch["pixel_values"]
        # Accept either a tensor-like object exposing .numpy() or anything
        # numpy can convert directly.
        if hasattr(raw, "numpy"):
            pixels = raw.numpy().astype(np.float32)
        else:
            pixels = np.asarray(raw, dtype=np.float32)

        if self._pixel_name is None:
            raise RuntimeError(f"Could not find pixel input in ONNX inputs: {self.input_names}")

        outputs = self.session.run(self.output_names, {self._pixel_name: pixels})
        main_out = _pick_output(outputs, self.output_names, kind="vision")

        if main_out.ndim not in (2, 3):
            raise RuntimeError(f"Unexpected vision output rank: {main_out.ndim}")
        # For a 3-D output, index 0 on the token axis is used as the CLS token.
        embs = main_out[:, 0, :] if main_out.ndim == 3 else main_out
        return _l2_normalize(embs, axis=1)
 def build_refs_nomic(
+    encoder: NomicVisionEncoderONNX,
     refs_dir: Path,
     batch_size: int = 16,
+    text_encoder: NomicTextEncoderONNX | None = None,
     text_weight: float = 0.3,
 ):
+    """
+    Build one ref embedding per class.
+    Same treatment as Jina:
+    - average reference image embeddings
+    - average class prompt text embeddings
+    - combine with text_weight
+    """
     class_dirs = sorted(d for d in refs_dir.iterdir() if d.is_dir())
     if not class_dirs:
         raise ValueError(f"No subfolders in {refs_dir}")
     labels = []
     embeddings = []
     if text_encoder is not None:
         print(f"  Text weight: {text_weight:.1f} | Image weight: {1 - text_weight:.1f}\n")
     for d in class_dirs:
         name = d.name
         paths = sorted(str(p) for p in d.iterdir() if p.suffix.lower() in IMAGE_EXTS)
         if not paths:
             continue
         all_embs = []
         for i in range(0, len(paths), batch_size):
+            batch = [Image.open(p).convert("RGB") for p in paths[i:i + batch_size]]
             all_embs.append(encoder.encode_images(batch))
         img_embs = np.concatenate(all_embs, axis=0)
+        img_avg = np.nan_to_num(img_embs.mean(axis=0), nan=0.0, posinf=0.0, neginf=0.0)
+        img_avg = img_avg / (np.linalg.norm(img_avg) + 1e-12)
         if text_encoder is not None:
             prompts = CLASS_PROMPTS.get(name, [f"a {name}", f"a person holding a {name}"])
             text_embs = text_encoder.encode_texts(prompts)
+            text_avg = np.nan_to_num(text_embs.mean(axis=0), nan=0.0, posinf=0.0, neginf=0.0)
+            text_avg = text_avg / (np.linalg.norm(text_avg) + 1e-12)
             combined = (1.0 - text_weight) * img_avg + text_weight * text_avg
+            combined = np.nan_to_num(combined, nan=0.0, posinf=0.0, neginf=0.0)
             combined = combined / (np.linalg.norm(combined) + 1e-12)
             labels.append(name)
             embeddings.append(combined)
+            sim = float(np.dot(img_avg, text_avg))
+            print(
+                f"  {name:<14}: {len(paths)} imgs + {len(prompts)} prompts | "
+                f"img-text sim: {sim:.4f}"
+            )
         else:
             labels.append(name)
             embeddings.append(img_avg)
             print(f"  {name:<14}: {len(paths)} imgs")
+    return labels, np.stack(embeddings).astype(np.float32)