""" Few-shot object classification using Nomic embed-vision-v1.5 + embed-text-v1.5 via ONNX Runtime. Same treatment as current PyTorch version: - vision refs -> average image embeddings - text prompts -> average text embeddings - combine with text_weight This version uses: - nomic-ai/nomic-embed-text-v1.5 -> ONNX - nomic-ai/nomic-embed-vision-v1.5 -> ONNX Transformers is used only for preprocessing: - AutoTokenizer - AutoImageProcessor """ import time from pathlib import Path import numpy as np import onnxruntime as ort from PIL import Image from huggingface_hub import hf_hub_download from transformers import AutoImageProcessor, AutoTokenizer from jina_fewshot import CLASS_PROMPTS, IMAGE_EXTS def _l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray: x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0) norms = np.linalg.norm(x, axis=axis, keepdims=True) norms = np.maximum(norms, eps) return (x / norms).astype(np.float32) def _mean_pool(last_hidden_state: np.ndarray, attention_mask: np.ndarray) -> np.ndarray: """ last_hidden_state: [B, T, D] attention_mask: [B, T] """ mask = attention_mask.astype(np.float32)[..., None] # [B, T, 1] summed = np.sum(last_hidden_state * mask, axis=1) denom = np.clip(np.sum(mask, axis=1), 1e-9, None) return summed / denom def _pick_output(outputs: list[np.ndarray], output_names: list[str], kind: str) -> np.ndarray: """ Try to find the main embedding tensor robustly. For both text and vision Nomic ONNX exports, we expect a 3D tensor [B, T, D] or sometimes a 2D tensor [B, D]. """ # Prefer names that look like hidden states / embeddings preferred_keywords = [ "last_hidden_state", "hidden_state", "sentence_embedding", "embedding", "embeddings", ] for kw in preferred_keywords: for i, name in enumerate(output_names): if kw in name.lower(): arr = outputs[i] if arr.ndim in (2, 3): return arr # Fallback: first 3D output, then first 2D output for arr in outputs: if arr.ndim == 3: return arr for arr in outputs: if arr.ndim == 2: return arr raise RuntimeError( f"Could not identify a usable {kind} ONNX output. " f"Output names={output_names}, shapes={[getattr(o, 'shape', None) for o in outputs]}" ) def _download_onnx_model(repo_id: str, filename: str = "onnx/model.onnx") -> str: print(f" Downloading ONNX model from {repo_id} ...") onnx_path = hf_hub_download( repo_id=repo_id, filename=filename, ) print(f" Downloaded: {onnx_path}") return onnx_path class NomicTextEncoderONNX: """ Nomic embed-text-v1.5 ONNX: text -> token embeddings / hidden states -> mean pool -> L2 normalize """ def __init__(self, device: str = "cuda"): self.device = device self.repo_id = "nomic-ai/nomic-embed-text-v1.5" print("[*] Loading nomic-embed-text-v1.5 (ONNX)...") t0 = time.perf_counter() onnx_path = _download_onnx_model(self.repo_id) available = ort.get_available_providers() if "CUDAExecutionProvider" in available and device == "cuda": providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] else: providers = ["CPUExecutionProvider"] print(f" ONNX providers: {providers}") self.session = ort.InferenceSession(onnx_path, providers=providers) self.tokenizer = AutoTokenizer.from_pretrained(self.repo_id, trust_remote_code=True) self.input_names = [inp.name for inp in self.session.get_inputs()] self.output_names = [out.name for out in self.session.get_outputs()] print(f" ONNX inputs: {self.input_names}") print(f" ONNX outputs: {self.output_names}") self._ids_name = None self._mask_name = None self._token_type_name = None for name in self.input_names: nl = name.lower() if nl == "input_ids" or "input_ids" in nl: self._ids_name = name elif nl == "attention_mask" or "attention" in nl: self._mask_name = name elif nl == "token_type_ids" or "token_type" in nl: self._token_type_name = name print( f" Mapped: input_ids={self._ids_name}, " f"attention_mask={self._mask_name}, token_type_ids={self._token_type_name}" ) # Sanity check test = self.encode_texts(["a red square"]) nrm = float(np.linalg.norm(test[0])) print(f" [SANITY] text embed norm={nrm:.4f}") print(f"[*] Loaded in {time.perf_counter() - t0:.1f}s\n") def encode_texts(self, texts: list[str]) -> np.ndarray: prefixed = [f"classification: {t}" for t in texts] tokens = self.tokenizer( prefixed, padding=True, truncation=True, return_tensors="np", max_length=512, ) input_ids = np.asarray(tokens["input_ids"], dtype=np.int64) attention_mask = np.asarray(tokens["attention_mask"], dtype=np.int64) feeds = {} if self._ids_name is not None: feeds[self._ids_name] = input_ids if self._mask_name is not None: feeds[self._mask_name] = attention_mask if self._token_type_name is not None: feeds[self._token_type_name] = np.zeros_like(input_ids, dtype=np.int64) outputs = self.session.run(self.output_names, feeds) main_out = _pick_output(outputs, self.output_names, kind="text") # Current PyTorch behavior: mean-pool last_hidden_state if main_out.ndim == 3: embs = _mean_pool(main_out, attention_mask) elif main_out.ndim == 2: embs = main_out else: raise RuntimeError(f"Unexpected text output rank: {main_out.ndim}") return _l2_normalize(embs, axis=1) class NomicVisionEncoderONNX: """ Nomic embed-vision-v1.5 ONNX: image -> hidden states -> CLS token -> L2 normalize """ def __init__(self, device: str = "cuda"): self.device = device self.repo_id = "nomic-ai/nomic-embed-vision-v1.5" print("[*] Loading nomic-embed-vision-v1.5 (ONNX)...") t0 = time.perf_counter() onnx_path = _download_onnx_model(self.repo_id) available = ort.get_available_providers() if "CUDAExecutionProvider" in available and device == "cuda": providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] else: providers = ["CPUExecutionProvider"] print(f" ONNX providers: {providers}") self.session = ort.InferenceSession(onnx_path, providers=providers) self.processor = AutoImageProcessor.from_pretrained(self.repo_id, trust_remote_code=True) self.input_names = [inp.name for inp in self.session.get_inputs()] self.output_names = [out.name for out in self.session.get_outputs()] print(f" ONNX inputs: {self.input_names}") print(f" ONNX outputs: {self.output_names}") self._pixel_name = None for name in self.input_names: if "pixel" in name.lower(): self._pixel_name = name break print(f" Mapped: pixel_values={self._pixel_name}") # Sanity check dummy = Image.new("RGB", (224, 224), color=(255, 0, 0)) test = self.encode_images([dummy]) nrm = float(np.linalg.norm(test[0])) print(f" [SANITY] image embed norm={nrm:.4f}") print(f"[*] Loaded in {time.perf_counter() - t0:.1f}s\n") def encode_images(self, images: list[Image.Image]) -> np.ndarray: rgb = [img.convert("RGB") for img in images] processed = self.processor(images=rgb, return_tensors="pt") if "pixel_values" not in processed: raise RuntimeError(f"Processor did not return pixel_values. Keys={list(processed.keys())}") pixel_values = processed["pixel_values"].detach().cpu().numpy().astype(np.float32) if self._pixel_name is None: raise RuntimeError(f"Could not find pixel input in ONNX inputs: {self.input_names}") outputs = self.session.run(self.output_names, {self._pixel_name: pixel_values}) main_out = _pick_output(outputs, self.output_names, kind="vision") if main_out.ndim == 3: embs = main_out[:, 0, :] elif main_out.ndim == 2: embs = main_out else: raise RuntimeError(f"Unexpected vision output rank: {main_out.ndim}") return _l2_normalize(embs, axis=1) def build_refs_nomic( encoder: NomicVisionEncoderONNX, refs_dir: Path, batch_size: int = 16, text_encoder: NomicTextEncoderONNX | None = None, text_weight: float = 0.3, ): """ Build one ref embedding per class. Same treatment as Jina: - average reference image embeddings - average class prompt text embeddings - combine with text_weight """ class_dirs = sorted(d for d in refs_dir.iterdir() if d.is_dir()) if not class_dirs: raise ValueError(f"No subfolders in {refs_dir}") labels = [] embeddings = [] if text_encoder is not None: print(f" Text weight: {text_weight:.1f} | Image weight: {1 - text_weight:.1f}\n") for d in class_dirs: name = d.name paths = sorted(str(p) for p in d.iterdir() if p.suffix.lower() in IMAGE_EXTS) if not paths: continue all_embs = [] for i in range(0, len(paths), batch_size): batch = [Image.open(p).convert("RGB") for p in paths[i:i + batch_size]] all_embs.append(encoder.encode_images(batch)) img_embs = np.concatenate(all_embs, axis=0) img_avg = np.nan_to_num(img_embs.mean(axis=0), nan=0.0, posinf=0.0, neginf=0.0) img_avg = img_avg / (np.linalg.norm(img_avg) + 1e-12) if text_encoder is not None: prompts = CLASS_PROMPTS.get(name, [f"a {name}", f"a person holding a {name}"]) text_embs = text_encoder.encode_texts(prompts) text_avg = np.nan_to_num(text_embs.mean(axis=0), nan=0.0, posinf=0.0, neginf=0.0) text_avg = text_avg / (np.linalg.norm(text_avg) + 1e-12) combined = (1.0 - text_weight) * img_avg + text_weight * text_avg combined = np.nan_to_num(combined, nan=0.0, posinf=0.0, neginf=0.0) combined = combined / (np.linalg.norm(combined) + 1e-12) labels.append(name) embeddings.append(combined) sim = float(np.dot(img_avg, text_avg)) print( f" {name:<14}: {len(paths)} imgs + {len(prompts)} prompts | " f"img-text sim: {sim:.4f}" ) else: labels.append(name) embeddings.append(img_avg) print(f" {name:<14}: {len(paths)} imgs") return labels, np.stack(embeddings).astype(np.float32) NomicTextEncoder = NomicTextEncoderONNX NomicVisionEncoder = NomicVisionEncoderONNX