Spaces:

Napron
/

small_object_detection

Running

App Files Files Community

orik-ss committed on Mar 13

Commit

81de9b1

1 Parent(s): cc88e05

Added siglip and siglip2 for classification

Browse files

Files changed (4) hide show

app.py +22 -5
dfine_jina_pipeline.py +46 -11
siglip2_onnx_zeroshot.py +196 -0
siglip_zeroshot.py +100 -0

app.py CHANGED Viewed

@@ -108,8 +108,15 @@ def run_detection(image, model):
     return out_img, det_json
-def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, min_display_conf=0.703, gap_threshold=0.005):
-    """Tab 2: D-FINE first, then classify crops with Jina.
     Returns (group_crop_gallery, known_crop_gallery, status_message).
     """
     if image is None:
@@ -121,6 +128,7 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, mi
         return [], [], f"Refs folder not found: {refs}"
     dfine_model = "large" if dfine_model_choice.strip().lower() == "large" else "medium"
     group_crops, known_crops, status = run_single_image(
         image,
         refs_dir=refs,
@@ -131,6 +139,7 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, mi
         min_side=24,
         crop_dedup_iou=0.4,
         min_display_conf=float(min_display_conf),
     )
     if status is not None:
@@ -229,10 +238,12 @@ with gr.Blocks(title="Small Object Detection") as app:
         with gr.TabItem("D-FINE + Classify"):
             gr.Markdown(
-                "**D-FINE** runs first (person/car grouping), then small-object crops are classified with **Jina**. "
                 "Choose D-FINE model size (Medium or Large). "
                 "Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) "
-                "with reference images.\n\n"
                 "**Gap** = how much the top class (e.g. gun) must beat the next-best class (e.g. phone). "
                 "Bigger gap means the model is more sure; we only accept the label if both confidence and gap are high enough."
             )
@@ -247,6 +258,12 @@ with gr.Blocks(title="Small Object Detection") as app:
                         height=IMG_HEIGHT
                     )
                     dfine_model_radio = gr.Radio(
                         choices=["Medium", "Large"],
                         value="Large",
@@ -322,7 +339,7 @@ with gr.Blocks(title="Small Object Detection") as app:
             btn_dfine.click(
                 fn=run_dfine_classify,
-                inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio, threshold_slider, gap_slider],
                 outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
                 concurrency_limit=1,
             )

     return out_img, det_json
+CLASSIFIER_MAP = {  # "Classifier" radio label -> internal classifier name passed to run_single_image(classifier=...)
+    "Jina-CLIP-v2 (few-shot)": "jina",  # "jina" is also the fallback when the selected label is not a key of this map
+    "SigLIP (zero-shot)": "siglip",
+    "SigLIP2 ONNX (zero-shot)": "siglip2_onnx",
+}
+def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, min_display_conf=0.703, gap_threshold=0.005, classifier_choice="Jina-CLIP-v2 (few-shot)"):
+    """Tab 2: D-FINE first, then classify crops.
     Returns (group_crop_gallery, known_crop_gallery, status_message).
     """
     if image is None:
         return [], [], f"Refs folder not found: {refs}"
     dfine_model = "large" if dfine_model_choice.strip().lower() == "large" else "medium"
+    classifier = CLASSIFIER_MAP.get(classifier_choice, "jina")
     group_crops, known_crops, status = run_single_image(
         image,
         refs_dir=refs,
         min_side=24,
         crop_dedup_iou=0.4,
         min_display_conf=float(min_display_conf),
+        classifier=classifier,
     )
     if status is not None:
         with gr.TabItem("D-FINE + Classify"):
             gr.Markdown(
+                "**D-FINE** runs first (person/car grouping), then small-object crops are classified. "
+                "Choose a **classifier**: Jina-CLIP-v2 (few-shot, uses reference images), "
+                "SigLIP (zero-shot, PyTorch), or SigLIP2 ONNX (zero-shot, larger model). "
                 "Choose D-FINE model size (Medium or Large). "
                 "Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) "
+                "— Jina uses reference images; SigLIP models use only the folder names as class labels.\n\n"
                 "**Gap** = how much the top class (e.g. gun) must beat the next-best class (e.g. phone). "
                 "Bigger gap means the model is more sure; we only accept the label if both confidence and gap are high enough."
             )
                         height=IMG_HEIGHT
                     )
+                    classifier_radio = gr.Radio(
+                        choices=list(CLASSIFIER_MAP.keys()),
+                        value="Jina-CLIP-v2 (few-shot)",
+                        label="Classifier",
+                    )
                     dfine_model_radio = gr.Radio(
                         choices=["Medium", "Large"],
                         value="Large",
             btn_dfine.click(
                 fn=run_dfine_classify,
+                inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio, threshold_slider, gap_slider, classifier_radio],
                 outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
                 concurrency_limit=1,
             )

dfine_jina_pipeline.py CHANGED Viewed

@@ -499,11 +499,46 @@ def main():
 # -----------------------------------------------------------------------------
 _APP_DFINE = None  # (model_id, image_processor, dfine_model, person_car_ids)
-_APP_JINA = None
-_APP_REFS_JINA = None
 DFINE_MODEL_IDS = {"medium": "ustc-community/dfine-medium-obj365", "large": "ustc-community/dfine-large-obj365"}
 def run_single_image(
     pil_image,
@@ -517,6 +552,7 @@ def run_single_image(
     crop_dedup_iou=0.35,
     squarify=True,
     min_display_conf=None,
 ):
     """
     Run D-FINE on one image, then classify small-object crops with Jina.
@@ -571,14 +607,14 @@ def run_single_image(
     grouped.sort(key=lambda x: x["conf"], reverse=True)
     top_groups = grouped[:10]
-    # Load Jina encoder + refs (needed for classification)
-    if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
-        jina_encoder = JinaCLIPv2Encoder(device)
-        ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
-        _APP_JINA = (jina_encoder, ref_labels, ref_embs)
-        _APP_REFS_JINA = str(refs_dir)
-    jina_encoder, ref_labels, ref_embs = _APP_JINA
     results_per_crop = []
     group_crop_images = []
@@ -660,8 +696,7 @@ def run_single_image(
             if squarify:
                 bx1, by1, bx2, by2 = squarify_crop_box(bx1, by1, bx2, by2, crop_w, crop_h)
             small_crop = crop_pil.crop((bx1, by1, bx2, by2))
-            q = jina_encoder.encode_images([small_crop], TRUNCATE_DIM)
-            result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
             pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
             conf = result["confidence"]
             results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))

 # -----------------------------------------------------------------------------
 _APP_DFINE = None  # (model_id, image_processor, dfine_model, person_car_ids)
+_APP_CLASSIFIERS = {}  # cache: {classifier_name: (classifier_instance, refs_dir_str)}; an entry is rebuilt when refs_dir differs
 DFINE_MODEL_IDS = {"medium": "ustc-community/dfine-medium-obj365", "large": "ustc-community/dfine-large-obj365"}  # D-FINE size -> model id (presumably Hugging Face Hub repos — confirm)
+CLASSIFIER_CHOICES = ["jina", "siglip", "siglip2_onnx"]  # names handled by _load_classifier; listed in its "Unknown classifier" error
+def _load_classifier(classifier_name, device, refs_dir):
+    """Build a ready-to-use classifier for ``classifier_name``.
+
+    ``"jina"`` yields the tuple ``("jina_wrapped", encoder, ref_labels, ref_embs)``.
+    ``"siglip"`` and ``"siglip2_onnx"`` yield a classifier object on which
+    ``build_refs(refs_dir)`` has already been called.
+
+    Raises:
+        ValueError: if ``classifier_name`` is none of the names above.
+    """
+    refs_root = Path(refs_dir)
+
+    if classifier_name == "jina":
+        encoder = JinaCLIPv2Encoder(device)
+        labels, embeddings = build_refs(encoder, refs_root, TRUNCATE_DIM, 0.3, batch_size=16)
+        return ("jina_wrapped", encoder, labels, embeddings)
+
+    # The SigLIP back-ends are imported lazily, only when one is actually selected.
+    if classifier_name == "siglip":
+        from siglip_zeroshot import SigLIPClassifier as zero_shot_cls
+    elif classifier_name == "siglip2_onnx":
+        from siglip2_onnx_zeroshot import SigLIP2ONNXClassifier as zero_shot_cls
+    else:
+        raise ValueError(f"Unknown classifier: {classifier_name}. Choose from {CLASSIFIER_CHOICES}")
+
+    zero_shot = zero_shot_cls(device)
+    zero_shot.build_refs(refs_root)
+    return zero_shot
+def _classify_crop(classifier, crop, conf_threshold, gap_threshold):
+    """Classify one crop with whichever back-end ``classifier`` represents.
+
+    A tuple tagged ``"jina_wrapped"`` is unpacked into (encoder, ref_labels,
+    ref_embs) and routed through ``jina_classify``; anything else is expected
+    to expose ``classify_crop(crop, conf_threshold, gap_threshold)``.
+    Returns whatever the selected back-end returns.
+    """
+    is_jina_bundle = isinstance(classifier, tuple) and classifier[0] == "jina_wrapped"
+    if not is_jina_bundle:
+        return classifier.classify_crop(crop, conf_threshold, gap_threshold)
+
+    encoder, labels, embeddings = classifier[1:]
+    query = encoder.encode_images([crop], TRUNCATE_DIM)
+    return jina_classify(query, labels, embeddings, conf_threshold, gap_threshold)
 def run_single_image(
     pil_image,
     crop_dedup_iou=0.35,
     squarify=True,
     min_display_conf=None,
+    classifier="jina",
 ):
     """
     Run D-FINE on one image, then classify small-object crops with Jina.
     grouped.sort(key=lambda x: x["conf"], reverse=True)
     top_groups = grouped[:10]
+    # Load classifier (Jina, SigLIP, or SigLIP2 ONNX)
+    global _APP_CLASSIFIERS
+    clf_key = classifier
+    if clf_key not in _APP_CLASSIFIERS or _APP_CLASSIFIERS[clf_key][1] != str(refs_dir):
+        clf_instance = _load_classifier(classifier, device, refs_dir)
+        _APP_CLASSIFIERS[clf_key] = (clf_instance, str(refs_dir))
+    clf_instance = _APP_CLASSIFIERS[clf_key][0]
     results_per_crop = []
     group_crop_images = []
             if squarify:
                 bx1, by1, bx2, by2 = squarify_crop_box(bx1, by1, bx2, by2, crop_w, crop_h)
             small_crop = crop_pil.crop((bx1, by1, bx2, by2))
+            result = _classify_crop(clf_instance, small_crop, conf_threshold, gap_threshold)
             pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
             conf = result["confidence"]
             results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))

siglip2_onnx_zeroshot.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""
+SigLIP2 zero-shot classifier using ONNX Runtime.
+Uses onnx-community/siglip2-large-patch16-256-ONNX (separate vision + text models).
+Zero-shot: text prompts only, no reference images needed (folder names used for class labels).
+"""
+import time
+from pathlib import Path
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
+from huggingface_hub import hf_hub_download
+from transformers import AutoProcessor
+from jina_fewshot import CLASS_PROMPTS, IMAGE_EXTS
+REPO_ID = "onnx-community/siglip2-large-patch16-256-ONNX"  # repo both ONNX files and the processor are loaded from
+# Use quantized models to save memory; the full fp32 text_model is 2.3GB
+VISION_ONNX = "onnx/vision_model_quantized.onnx"  # vision encoder file name within REPO_ID
+TEXT_ONNX = "onnx/text_model_quantized.onnx"  # text encoder file name within REPO_ID
+def _download(repo_id, filename):
+    """Fetch ``filename`` from ``repo_id`` with ``hf_hub_download`` and return the local path.
+
+    Progress is printed before and after the download call.
+    """
+    print(f"  Downloading {filename} from {repo_id}...")
+    local_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    print(f"  Downloaded: {local_path}")
+    return local_path
+def _make_session(onnx_path, device):
+    """Open an ONNX Runtime session for ``onnx_path``.
+
+    CUDA is placed ahead of CPU only when ``device == "cuda"`` and onnxruntime
+    lists ``CUDAExecutionProvider`` as available; otherwise the session is CPU-only.
+    """
+    supported = ort.get_available_providers()
+    wants_cuda = device == "cuda"
+    providers = ["CPUExecutionProvider"]
+    if wants_cuda and "CUDAExecutionProvider" in supported:
+        providers.insert(0, "CUDAExecutionProvider")
+    print(f"  ONNX providers: {providers}")
+    return ort.InferenceSession(onnx_path, providers=providers)
+class SigLIP2ONNXClassifier:
+    """Zero-shot crop classifier using SigLIP2 ONNX (separate vision + text encoders).
+
+    Usage: construct, call ``build_refs(refs_dir)`` once to register the class
+    labels and cache their text embeddings, then call ``classify_crop`` per crop.
+    """
+
+    def __init__(self, device="cuda"):
+        """Download both ONNX encoders, open their sessions and load the processor.
+
+        Args:
+            device: ``"cuda"`` requests the CUDA execution provider (used only
+                when onnxruntime reports it as available); otherwise CPU is used.
+        """
+        print("[*] Loading SigLIP2 ONNX (siglip2-large-patch16-256)...")
+        t0 = time.perf_counter()
+        self.device = device
+        # Download and load vision model
+        vision_path = _download(REPO_ID, VISION_ONNX)
+        self.vision_session = _make_session(vision_path, device)
+        # Download and load text model
+        text_path = _download(REPO_ID, TEXT_ONNX)
+        self.text_session = _make_session(text_path, device)
+        # Processor handles both image preprocessing and tokenization
+        self.processor = AutoProcessor.from_pretrained(REPO_ID)
+        # Map I/O names so feeds can be built from whatever the exported graphs call their inputs
+        self._vision_input_names = [i.name for i in self.vision_session.get_inputs()]
+        self._vision_output_names = [o.name for o in self.vision_session.get_outputs()]
+        self._text_input_names = [i.name for i in self.text_session.get_inputs()]
+        self._text_output_names = [o.name for o in self.text_session.get_outputs()]
+        print(f"  Vision inputs: {self._vision_input_names}")
+        print(f"  Vision outputs: {self._vision_output_names}")
+        print(f"  Text inputs: {self._text_input_names}")
+        print(f"  Text outputs: {self._text_output_names}")
+        self.labels = []
+        self._text_embeds = None
+        # Sanity check: push one dummy image and one prompt through both encoders
+        dummy = Image.new("RGB", (256, 256), color=(255, 0, 0))
+        v_emb = self._encode_image(dummy)
+        print(f"  [SANITY] vision embed shape={v_emb.shape}, norm={np.linalg.norm(v_emb):.4f}")
+        t_emb = self._encode_texts(["a red square"])
+        print(f"  [SANITY] text embed shape={t_emb.shape}, norm={np.linalg.norm(t_emb):.4f}")
+        print(f"[*] SigLIP2 ONNX loaded in {time.perf_counter() - t0:.1f}s")
+
+    @staticmethod
+    def _pick_embedding(outputs, kind):
+        """Return the first 2-D output, else token 0 of the first 3-D output.
+
+        ``kind`` ("vision" or "text") only labels the error message.
+
+        Raises:
+            RuntimeError: if no output is 2-D or 3-D.
+        """
+        # Presumably the 2-D output is pooler_output and the 3-D one last_hidden_state — confirm against the export.
+        for out in outputs:
+            if out.ndim == 2:
+                return out
+        for out in outputs:
+            if out.ndim == 3:
+                return out[:, 0, :]
+        raise RuntimeError(f"No usable {kind} output. Shapes: {[o.shape for o in outputs]}")
+
+    def _encode_image(self, image):
+        """Encode a single PIL image, return [1, D] embedding.
+
+        Raises:
+            RuntimeError: if the vision graph has no input whose name contains
+                "pixel", or if it produces no usable output.
+        """
+        processed = self.processor(images=image, return_tensors="np")
+        pixel_values = processed["pixel_values"].astype(np.float32)
+        feeds = {name: pixel_values for name in self._vision_input_names if "pixel" in name.lower()}
+        if not feeds:
+            # Fail here with the input names rather than inside onnxruntime with an empty feed.
+            raise RuntimeError(f"No pixel input found among vision inputs: {self._vision_input_names}")
+        outputs = self.vision_session.run(self._vision_output_names, feeds)
+        return self._pick_embedding(outputs, "vision")
+
+    def _encode_texts(self, texts):
+        """Encode text strings, return [N, D] embeddings.
+
+        Raises:
+            RuntimeError: if none of the text graph's inputs could be matched to
+                a processor output, or if the graph produces no usable output.
+        """
+        processed = self.processor(text=texts, return_tensors="np", padding=True, truncation=True)
+        feeds = {}
+        for name in self._text_input_names:
+            nl = name.lower()
+            if "input_id" in nl and "input_ids" in processed:
+                feeds[name] = processed["input_ids"].astype(np.int64)
+            elif ("attention" in nl or "mask" in nl) and "attention_mask" in processed:
+                feeds[name] = processed["attention_mask"].astype(np.int64)
+        if not feeds:
+            raise RuntimeError(f"No text input could be fed. Model inputs: {self._text_input_names}")
+        outputs = self.text_session.run(self._text_output_names, feeds)
+        return self._pick_embedding(outputs, "text")
+
+    def build_refs(self, refs_dir, **kwargs):
+        """Extract class names from refs_dir subfolders and precompute text embeddings.
+
+        One prompt per class is used: the first entry of ``CLASS_PROMPTS[name]``
+        when that entry exists and is non-empty, otherwise ``"a {name}"``.
+        Extra keyword arguments are accepted and ignored.
+
+        Raises:
+            ValueError: if ``refs_dir`` contains no subfolders.
+        """
+        refs_dir = Path(refs_dir)
+        labels = sorted(d.name for d in refs_dir.iterdir() if d.is_dir())
+        if not labels:
+            raise ValueError(f"No subfolders in {refs_dir}")
+        # `or` also covers a CLASS_PROMPTS entry that is present but empty.
+        text_prompts = [(CLASS_PROMPTS.get(name) or [f"a {name}"])[0] for name in labels]
+        text_embeds = self._encode_texts(text_prompts)
+        # Publish labels and embeddings together so a failed encode cannot leave them out of sync.
+        self.labels = labels
+        self._text_embeds = text_embeds
+        print(f"  SigLIP2 ONNX classes: {self.labels}")
+        print(f"  Text prompts: {text_prompts}")
+        print(f"  Text embeds shape: {self._text_embeds.shape}")
+
+    @staticmethod
+    def _decide(labels, probs, conf_threshold, gap_threshold):
+        """Turn per-class scores into the accept/reject result dict.
+
+        The top class is accepted only when its score is >= ``conf_threshold``
+        and its lead over the runner-up is >= ``gap_threshold``. With a single
+        class there is no runner-up: ``second_best`` is None, ``second_conf``
+        is 0.0 and the gap equals the confidence.
+        """
+        order = np.argsort(probs)[::-1]
+        best_idx = int(order[0])
+        conf = float(probs[best_idx])
+        if len(order) > 1:
+            second_idx = int(order[1])
+            second_best = labels[second_idx]
+            second_conf = float(probs[second_idx])
+        else:
+            second_best = None
+            second_conf = 0.0
+        gap = conf - second_conf
+        conf_ok = conf >= conf_threshold
+        gap_ok = gap >= gap_threshold
+        if conf_ok and gap_ok:
+            prediction = labels[best_idx]
+            status = "accepted"
+        else:
+            prediction = "unknown"
+            reasons = []
+            if not conf_ok:
+                reasons.append(f"conf {conf:.4f} < {conf_threshold}")
+            if not gap_ok:
+                reasons.append(f"gap {gap:.4f} < {gap_threshold}")
+            status = "rejected: " + ", ".join(reasons)
+        return {
+            "prediction": prediction,
+            "raw_prediction": labels[best_idx],
+            "confidence": conf,
+            "gap": gap,
+            "second_best": second_best,
+            "second_conf": second_conf,
+            "status": status,
+            "all_sims": {label: float(p) for label, p in zip(labels, probs)},
+        }
+
+    def classify_crop(self, crop, conf_threshold, gap_threshold):
+        """
+        Classify a single crop image using zero-shot SigLIP2.
+        Scores are sigmoid(image_embedding . text_embedding) per class.
+        Returns a dict with keys prediction, raw_prediction, confidence, gap,
+        second_best, second_conf, status and all_sims (intended to match the
+        jina_fewshot.classify() format).
+
+        Raises:
+            RuntimeError: if build_refs() has not been called successfully.
+        """
+        if self._text_embeds is None or not self.labels:
+            raise RuntimeError("build_refs() must be called before classify_crop()")
+        image_emb = self._encode_image(crop)  # [1, D]
+        text_emb = self._text_embeds  # [N, D]
+        # NOTE(review): the reference SigLIP head L2-normalizes both embeddings and applies a
+        # learned logit scale and bias before the sigmoid; none of that is done here. Confirm
+        # whether the exported graphs already include it — otherwise these scores are not
+        # calibrated probabilities and thresholds tuned for the other classifiers will not transfer.
+        logits = (image_emb @ text_emb.T).squeeze(0).astype(np.float64)
+        with np.errstate(over="ignore"):
+            # exp() overflowing to inf still yields the correct limit (probability 0.0).
+            probs = 1.0 / (1.0 + np.exp(-logits))  # sigmoid
+        probs = np.nan_to_num(probs, nan=0.0)
+        return self._decide(self.labels, probs, conf_threshold, gap_threshold)

siglip_zeroshot.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""
+SigLIP zero-shot classifier for crop classification.
+Uses google/siglip-base-patch16-224 via PyTorch.
+Zero-shot: text prompts only, no reference images needed (folder names used for class labels).
+"""
+import time
+from pathlib import Path
+import numpy as np
+import torch
+from transformers import SiglipModel, AutoProcessor
+from jina_fewshot import CLASS_PROMPTS
+class SigLIPClassifier:
+    """Zero-shot crop classifier using SigLIP (PyTorch).
+
+    Usage: construct, call ``build_refs(refs_dir)`` once to register the class
+    labels and their prompts, then call ``classify_crop`` per crop.
+    """
+
+    def __init__(self, device="cuda"):
+        """Load google/siglip-base-patch16-224 and its processor, moving the model to ``device`` in eval mode."""
+        print("[*] Loading SigLIP (google/siglip-base-patch16-224)...")
+        t0 = time.perf_counter()
+        self.device = device
+        self.model = SiglipModel.from_pretrained("google/siglip-base-patch16-224")
+        self.model = self.model.to(device).eval()
+        self.processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
+        self.labels = []
+        self._text_prompts = []
+        print(f"[*] SigLIP loaded in {time.perf_counter() - t0:.1f}s (device={device})")
+
+    def build_refs(self, refs_dir, **kwargs):
+        """Extract class names from refs_dir subfolders. No images needed.
+
+        One prompt per class is used: the first entry of ``CLASS_PROMPTS[name]``
+        when that entry exists and is non-empty, otherwise ``"a {name}"``.
+        Extra keyword arguments are accepted and ignored.
+
+        Raises:
+            ValueError: if ``refs_dir`` contains no subfolders.
+        """
+        refs_dir = Path(refs_dir)
+        labels = sorted(d.name for d in refs_dir.iterdir() if d.is_dir())
+        if not labels:
+            raise ValueError(f"No subfolders in {refs_dir}")
+        # `or` also covers a CLASS_PROMPTS entry that is present but empty.
+        prompts = [(CLASS_PROMPTS.get(name) or [f"a {name}"])[0] for name in labels]
+        self.labels = labels
+        self._text_prompts = prompts
+        print(f"  SigLIP classes: {self.labels}")
+        print(f"  Text prompts: {self._text_prompts}")
+
+    @staticmethod
+    def _decide(labels, probs, conf_threshold, gap_threshold):
+        """Turn per-class scores into the accept/reject result dict.
+
+        The top class is accepted only when its score is >= ``conf_threshold``
+        and its lead over the runner-up is >= ``gap_threshold``. With a single
+        class there is no runner-up: ``second_best`` is None, ``second_conf``
+        is 0.0 and the gap equals the confidence.
+        """
+        order = np.argsort(probs)[::-1]
+        best_idx = int(order[0])
+        conf = float(probs[best_idx])
+        if len(order) > 1:
+            second_idx = int(order[1])
+            second_best = labels[second_idx]
+            second_conf = float(probs[second_idx])
+        else:
+            second_best = None
+            second_conf = 0.0
+        gap = conf - second_conf
+        conf_ok = conf >= conf_threshold
+        gap_ok = gap >= gap_threshold
+        if conf_ok and gap_ok:
+            prediction = labels[best_idx]
+            status = "accepted"
+        else:
+            prediction = "unknown"
+            reasons = []
+            if not conf_ok:
+                reasons.append(f"conf {conf:.4f} < {conf_threshold}")
+            if not gap_ok:
+                reasons.append(f"gap {gap:.4f} < {gap_threshold}")
+            status = "rejected: " + ", ".join(reasons)
+        return {
+            "prediction": prediction,
+            "raw_prediction": labels[best_idx],
+            "confidence": conf,
+            "gap": gap,
+            "second_best": second_best,
+            "second_conf": second_conf,
+            "status": status,
+            "all_sims": {label: float(p) for label, p in zip(labels, probs)},
+        }
+
+    def classify_crop(self, crop, conf_threshold, gap_threshold):
+        """
+        Classify a single crop image using zero-shot SigLIP.
+        Scores are sigmoid(logits_per_image) per class.
+        Returns a dict with keys prediction, raw_prediction, confidence, gap,
+        second_best, second_conf, status and all_sims (intended to match the
+        jina_fewshot.classify() format).
+
+        Raises:
+            RuntimeError: if build_refs() has not been called successfully.
+        """
+        if not self.labels or not self._text_prompts:
+            raise RuntimeError("build_refs() must be called before classify_crop()")
+        # The prompts are passed through the processor and the full model on every call.
+        inputs = self.processor(
+            text=self._text_prompts,
+            images=crop,
+            return_tensors="pt",
+            padding="max_length",
+        )
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            logits = outputs.logits_per_image
+            probs = torch.sigmoid(logits).cpu().numpy().squeeze(0)
+        probs = np.nan_to_num(probs.astype(np.float64), nan=0.0)
+        return self._decide(self.labels, probs, conf_threshold, gap_threshold)