Spaces:

Napron
/

small_object_detection

Running

App Files Community

orik-ss committed on Mar 13

Commit

80cacd4

1 Parent(s): 2455309

Removed object detection tab

Browse files

Files changed (3) hide show

app.py +88 -269
dfine_jina_pipeline.py +26 -18
siglip_zeroshot.py +14 -6

app.py CHANGED Viewed

@@ -1,131 +1,32 @@
-""" Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + SigLIP Classify. """
 import os
-os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
-import json
-import numpy as np
 import gradio as gr
-from ultralytics import YOLO
 from pathlib import Path
-# Tab 2: D-FINE runs first, then SigLIP for crop classification
 from dfine_jina_pipeline import run_single_image
-# --- Object Detection (Tab 1) ---
-PERSON_CLASS = 0
-CAR_CLASS = 2
-KNIFE_CLASS = 80
-WEAPON_CLASS = 81
-DRAW_CLASSES = [PERSON_CLASS, CAR_CLASS, KNIFE_CLASS, WEAPON_CLASS]
-CLASS_NAMES = {
-    PERSON_CLASS: "person",
-    CAR_CLASS: "car",
-    KNIFE_CLASS: "knife",
-    WEAPON_CLASS: "weapon",
-}
-CONF = 0.25
-IMGSZ = 640
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-MODELS_DIR = os.path.join(BASE_DIR, "models")
-REFS_DIR = os.path.join(BASE_DIR, "refs")
-def _load_model(version: str):
-    path = os.path.join(MODELS_DIR, version, "best.pt")
-    if not os.path.isfile(path):
-        raise FileNotFoundError(f"Model not found: {path}")
-    return YOLO(path)
-MODELS = {"v1": _load_model("v1")}
-MODEL_CLASSES = {
-    "v1": ["person", "car", "knife", "weapon"]
-}
-def run_detection(image, model):
-    if image is None:
-        return None, "{}"
-    img = image if isinstance(image, np.ndarray) else np.array(image)
-    if img.ndim == 2:
-        img = np.stack([img] * 3, axis=-1)
-    results = model.predict(
-        source=img,
-        imgsz=IMGSZ,
-        conf=CONF,
-        device="cpu",
-        verbose=False,
-    )
-    r = results[0]
-    if r.boxes is None or len(r.boxes) == 0:
-        return image, json.dumps({"detections": []}, indent=2)
-    clss = r.boxes.cls.cpu().numpy()
-    confs = r.boxes.conf.cpu().numpy()
-    keep = [
-        i for i in range(len(r.boxes))
-        if int(clss[i]) in DRAW_CLASSES
-    ]
-    if not keep:
-        return image, json.dumps({"detections": []}, indent=2)
-    detections = []
-    for i in keep:
-        cls_id = int(clss[i])
-        detections.append({
-            "class": CLASS_NAMES.get(cls_id, str(cls_id)),
-            "confidence": round(float(confs[i]), 3),
-            "bbox": r.boxes.xyxy[i].cpu().numpy().tolist(),
-        })
-    r.boxes = r.boxes[keep]
-    out_img = r.plot()
-    det_json = json.dumps(
-        {"detections": detections},
-        indent=2
-    )
-    return out_img, det_json
-def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, siglip_threshold):
-    """Tab 2: D-FINE first, then classify crops with SigLIP.
     Returns (group_crop_gallery, known_crop_gallery, status_message).
     """
     if image is None:
         return [], [], "Upload an image."
-    refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR)
-    if not refs.is_dir():
-        return [], [], f"Refs folder not found: {refs}"
-    dfine_model = dfine_model_choice.strip().lower() if dfine_model_choice else "large-obj365"
     conf_thresh = float(siglip_threshold)
     group_crops, known_crops, status = run_single_image(
         image,
-        refs_dir=refs,
         dfine_model=dfine_model,
         det_threshold=float(dfine_threshold),
         conf_threshold=conf_thresh,
@@ -134,6 +35,7 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, si
         crop_dedup_iou=0.4,
         min_display_conf=conf_thresh,
         classifier="siglip",
     )
     return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status or ""
@@ -142,187 +44,104 @@ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, si
 IMG_HEIGHT = 400
-TAB_STYLE = """
-<style>
-[data-testid="tabs"] > div:first-child,
-.gr-tabs > div:first-child,
-div[class*="tabs"] > div:first-child {
-    display: flex !important;
-    width: 100% !important;
-}
-[data-testid="tabs"] button,
-.gr-tabs button,
-div[class*="tabs"] > div:first-child button {
-    flex: 1 !important;
-    min-width: 0 !important;
-    min-height: 40px !important;
-    color: white !important;
-    font-weight: 700 !important;
-    font-size: 1rem !important;
-    text-align: center !important;
-    justify-content: center !important;
-}
-[data-testid="tabs"] button:not([aria-selected="true"]),
-.gr-tabs button:not([aria-selected="true"]),
-div[class*="tabs"] > div:first-child button:not([aria-selected="true"]) {
-    background: #6b7280 !important;
-    border-color: #6b7280 !important;
-}
-[data-testid="tabs"] button[aria-selected="true"],
-.gr-tabs button[aria-selected="true"],
-div[class*="tabs"] > div:first-child button[aria-selected="true"] {
-    background: var(--primary-500, #f97316) !important;
-    border-color: var(--primary-500, #f97316) !important;
-}
-</style>
-"""
 with gr.Blocks(title="Small Object Detection") as app:
-    gr.HTML(TAB_STYLE)
     gr.Markdown("# Small Object Detection")
-    with gr.Tabs():
-        with gr.TabItem("Object Detection"):
-            gr.Markdown(
-                "**Classes:** " + ", ".join(MODEL_CLASSES["v1"])
-            )
-            with gr.Row():
-                with gr.Column(scale=1):
-                    inp_det = gr.Image(
-                        label="Input image",
-                        height=IMG_HEIGHT
-                    )
-                    btn_det = gr.Button(
-                        "Detect",
-                        variant="primary"
-                    )
-                    out_img_det = gr.Image(
-                        label="Output",
-                        height=IMG_HEIGHT
-                    )
-                    det_output = gr.JSON(
-                        label="Detections"
-                    )
-            btn_det.click(
-                fn=lambda img: run_detection(img, MODELS["v1"]),
-                inputs=inp_det,
-                outputs=[out_img_det, det_output],
             )
-        with gr.TabItem("D-FINE + Classify"):
-            gr.Markdown(
-                "**D-FINE** runs first (person/car grouping), then small-object crops are classified with **SigLIP** (zero-shot). "
-                "Choose a D-FINE model (obj365, coco, or obj2coco variants in small/medium/large). "
-                "Uses the **refs** folder names as class labels (e.g. refs/phone/, refs/cigarette/)."
             )
-            with gr.Row():
-                with gr.Column(scale=1):
-                    inp_dfine = gr.Image(
-                        type="pil",
-                        label="Input image",
-                        height=IMG_HEIGHT
-                    )
-                    dfine_model_radio = gr.Dropdown(
-                        choices=[
-                            "small-obj365", "medium-obj365", "large-obj365",
-                            "small-coco", "medium-coco", "large-coco",
-                            "small-obj2coco", "medium-obj2coco", "large-obj2coco",
-                        ],
-                        value="large-obj365",
-                        label="D-FINE model",
-                    )
-                    dfine_threshold_slider = gr.Slider(
-                        minimum=0.05,
-                        maximum=0.5,
-                        value=0.2,
-                        step=0.05,
-                        label="D-FINE detection threshold (applied to chosen model)",
-                    )
-                    def update_dfine_threshold_default(choice):
-                        if not choice:
-                            return gr.update(value=0.15)
-                        size = choice.strip().lower().split("-")[0]
-                        defaults = {"large": 0.2, "medium": 0.15, "small": 0.1}
-                        return gr.update(value=defaults.get(size, 0.15))
-                    dfine_model_radio.change(
-                        fn=update_dfine_threshold_default,
-                        inputs=[dfine_model_radio],
-                        outputs=[dfine_threshold_slider],
-                    )
-                    siglip_threshold_slider = gr.Slider(
-                        minimum=0.001,
-                        maximum=0.1,
-                        value=0.01,
-                        step=0.001,
-                        label="SigLIP: min confidence threshold",
-                    )
-                    refs_path = gr.Textbox(
-                        label="Refs folder path",
-                        value=REFS_DIR,
-                        placeholder="e.g. refs or /path/to/refs",
-                    )
-                    btn_dfine = gr.Button(
-                        "Run D-FINE + Classify",
-                        variant="primary"
-                    )
-                with gr.Column(scale=1):
-                    out_gallery_dfine = gr.Gallery(
-                        label="Person/car crops (all D-FINE objects inside drawn with label + score)",
-                        height=IMG_HEIGHT,
-                        columns=2,
-                        object_fit="contain",
-                    )
-                    out_gallery_known = gr.Gallery(
-                        label="Known objects (class + score above each crop)",
-                        height=IMG_HEIGHT,
-                        columns=4,
-                        object_fit="contain",
-                    )
-                    out_status_dfine = gr.Textbox(
-                        label="Classification details",
-                        lines=8,
-                        interactive=False,
-                    )
-            btn_dfine.click(
-                fn=run_dfine_classify,
-                inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio, siglip_threshold_slider],
-                outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
-                concurrency_limit=1,
             )
 app.launch(
     server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
@@ -332,4 +151,4 @@ app.launch(
             os.environ.get("GRADIO_SERVER_PORT", 7860)
         )
     ),
-)

+""" Gradio app: D-FINE + SigLIP Classify. """
 import os
 import gradio as gr
 from pathlib import Path
 from dfine_jina_pipeline import run_single_image
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+DEFAULT_LABELS = "gun, knife, cigarette, phone"
+def run_dfine_classify(image, dfine_threshold, dfine_model_choice, siglip_threshold, labels_text):
+    """D-FINE first, then classify crops with SigLIP.
     Returns (group_crop_gallery, known_crop_gallery, status_message).
     """
     if image is None:
         return [], [], "Upload an image."
+    labels = [l.strip() for l in labels_text.split(",") if l.strip()]
+    if not labels:
+        return [], [], "Enter at least one label."
+    dfine_model = dfine_model_choice.strip().lower() if dfine_model_choice else "medium-obj2coco"
     conf_thresh = float(siglip_threshold)
     group_crops, known_crops, status = run_single_image(
         image,
         dfine_model=dfine_model,
         det_threshold=float(dfine_threshold),
         conf_threshold=conf_thresh,
         crop_dedup_iou=0.4,
         min_display_conf=conf_thresh,
         classifier="siglip",
+        labels=labels,
     )
     return [(g, None) for g in (group_crops or [])], [(k, None) for k in (known_crops or [])], status or ""
 IMG_HEIGHT = 400
 with gr.Blocks(title="Small Object Detection") as app:
     gr.Markdown("# Small Object Detection")
+    gr.Markdown(
+        "**D-FINE** detects persons/cars, then small-object crops are classified with **SigLIP** (zero-shot). "
+        "Choose a D-FINE model and enter comma-separated class labels for SigLIP."
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            inp_dfine = gr.Image(
+                type="pil",
+                label="Input image",
+                height=IMG_HEIGHT
+            )
+            dfine_model_radio = gr.Dropdown(
+                choices=[
+                    "small-obj365", "medium-obj365", "large-obj365",
+                    "small-coco", "medium-coco", "large-coco",
+                    "small-obj2coco", "medium-obj2coco", "large-obj2coco",
+                ],
+                value="medium-obj2coco",
+                label="D-FINE model",
+            )
+            dfine_threshold_slider = gr.Slider(
+                minimum=0.05,
+                maximum=0.5,
+                value=0.15,
+                step=0.05,
+                label="D-FINE detection threshold",
+            )
+            def update_dfine_threshold_default(choice):
+                if not choice:
+                    return gr.update(value=0.15)
+                size = choice.strip().lower().split("-")[0]
+                defaults = {"large": 0.2, "medium": 0.15, "small": 0.1}
+                return gr.update(value=defaults.get(size, 0.15))
+            dfine_model_radio.change(
+                fn=update_dfine_threshold_default,
+                inputs=[dfine_model_radio],
+                outputs=[dfine_threshold_slider],
+            )
+            siglip_threshold_slider = gr.Slider(
+                minimum=0.001,
+                maximum=0.1,
+                value=0.005,
+                step=0.001,
+                label="SigLIP: min confidence threshold",
+            )
+            labels_input = gr.Textbox(
+                label="Labels (comma-separated)",
+                value=DEFAULT_LABELS,
+                placeholder="e.g. gun, knife, cigarette, phone",
+            )
+            btn_dfine = gr.Button(
+                "Run D-FINE + Classify",
+                variant="primary"
             )
+        with gr.Column(scale=1):
+            out_gallery_dfine = gr.Gallery(
+                label="Person/car crops (all D-FINE objects inside drawn with label + score)",
+                height=IMG_HEIGHT,
+                columns=2,
+                object_fit="contain",
+            )
+            out_gallery_known = gr.Gallery(
+                label="Known objects (class + score above each crop)",
+                height=IMG_HEIGHT,
+                columns=4,
+                object_fit="contain",
             )
+            out_status_dfine = gr.Textbox(
+                label="Classification details",
+                lines=8,
+                interactive=False,
             )
+    btn_dfine.click(
+        fn=run_dfine_classify,
+        inputs=[inp_dfine, dfine_threshold_slider, dfine_model_radio, siglip_threshold_slider, labels_input],
+        outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
+        concurrency_limit=1,
+    )
 app.launch(
     server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"),
             os.environ.get("GRADIO_SERVER_PORT", 7860)
         )
     ),
+)

dfine_jina_pipeline.py CHANGED Viewed

@@ -519,9 +519,10 @@ DFINE_MODEL_IDS = {
 CLASSIFIER_CHOICES = ["jina", "siglip", "siglip2_onnx"]
-def _load_classifier(classifier_name, device, refs_dir):
     """Factory: load and initialize a classifier by name."""
-    refs_dir = Path(refs_dir)
     if classifier_name == "jina":
         jina_encoder = JinaCLIPv2Encoder(device)
@@ -531,13 +532,13 @@ def _load_classifier(classifier_name, device, refs_dir):
     if classifier_name == "siglip":
         from siglip_zeroshot import SigLIPClassifier
         clf = SigLIPClassifier(device)
-        clf.build_refs(refs_dir)
         return clf
     if classifier_name == "siglip2_onnx":
         from siglip2_onnx_zeroshot import SigLIP2ONNXClassifier
         clf = SigLIP2ONNXClassifier(device)
-        clf.build_refs(refs_dir)
         return clf
     raise ValueError(f"Unknown classifier: {classifier_name}. Choose from {CLASSIFIER_CHOICES}")
@@ -555,7 +556,7 @@ def _classify_crop(classifier, crop, conf_threshold, gap_threshold):
 def run_single_image(
     pil_image,
-    refs_dir,
     device=None,
     dfine_model="large",
     det_threshold=0.3,
@@ -565,13 +566,15 @@ def run_single_image(
     crop_dedup_iou=0.35,
     squarify=True,
     min_display_conf=None,
-    classifier="jina",
 ):
     """
-    Run D-FINE on one image, then classify small-object crops with Jina.
-    refs_dir: path to refs folder (str or Path).
-    dfine_model: "medium" or "large".
     Returns (group_crop_images, known_crop_composites, status_message).
     """
@@ -581,11 +584,15 @@ def run_single_image(
         min_display_conf = MIN_DISPLAY_CONF
     from PIL import Image
-    global _APP_DFINE, _APP_JINA, _APP_REFS_JINA
-    refs_dir = Path(refs_dir)
-    if not refs_dir.is_dir():
-        return [], [], f"Refs folder not found: {refs_dir}"
     dfine_model = (dfine_model or "large-obj365").strip().lower()
     if dfine_model not in DFINE_MODEL_IDS:
@@ -620,12 +627,13 @@ def run_single_image(
     grouped.sort(key=lambda x: x["conf"], reverse=True)
     top_groups = grouped[:10]
-    # Load classifier (Jina, SigLIP, or SigLIP2 ONNX)
     global _APP_CLASSIFIERS
     clf_key = classifier
-    if clf_key not in _APP_CLASSIFIERS or _APP_CLASSIFIERS[clf_key][1] != str(refs_dir):
-        clf_instance = _load_classifier(classifier, device, refs_dir)
-        _APP_CLASSIFIERS[clf_key] = (clf_instance, str(refs_dir))
     clf_instance = _APP_CLASSIFIERS[clf_key][0]
@@ -745,7 +753,7 @@ def run_single_image(
     # Build known-only gallery: only objects with conf >= min_display_conf
     known_crop_composites = []
     for (_gidx, _box, crop_pil, pred, conf) in results_per_crop:
-        if pred not in KNOWN_DISPLAY_CLASSES or conf < min_display_conf:
             continue
         composite = draw_label_on_image(crop_pil, pred, conf)
         known_crop_composites.append(np.array(composite))

 CLASSIFIER_CHOICES = ["jina", "siglip", "siglip2_onnx"]
+def _load_classifier(classifier_name, device, refs_dir=None, labels=None):
     """Factory: load and initialize a classifier by name."""
+    if refs_dir:
+        refs_dir = Path(refs_dir)
     if classifier_name == "jina":
         jina_encoder = JinaCLIPv2Encoder(device)
     if classifier_name == "siglip":
         from siglip_zeroshot import SigLIPClassifier
         clf = SigLIPClassifier(device)
+        clf.build_refs(refs_dir=refs_dir, labels=labels)
         return clf
     if classifier_name == "siglip2_onnx":
         from siglip2_onnx_zeroshot import SigLIP2ONNXClassifier
         clf = SigLIP2ONNXClassifier(device)
+        clf.build_refs(refs_dir=refs_dir, labels=labels)
         return clf
     raise ValueError(f"Unknown classifier: {classifier_name}. Choose from {CLASSIFIER_CHOICES}")
 def run_single_image(
     pil_image,
+    refs_dir=None,
     device=None,
     dfine_model="large",
     det_threshold=0.3,
     crop_dedup_iou=0.35,
     squarify=True,
     min_display_conf=None,
+    classifier="siglip",
+    labels=None,
 ):
     """
+    Run D-FINE on one image, then classify small-object crops.
+    refs_dir: path to refs folder (str or Path), optional if labels provided.
+    labels: list of class label strings for zero-shot classifiers.
+    dfine_model: key from DFINE_MODEL_IDS.
     Returns (group_crop_images, known_crop_composites, status_message).
     """
         min_display_conf = MIN_DISPLAY_CONF
     from PIL import Image
+    global _APP_DFINE
+    if refs_dir:
+        refs_dir = Path(refs_dir)
+        if not refs_dir.is_dir():
+            return [], [], f"Refs folder not found: {refs_dir}"
+    if not refs_dir and not labels:
+        return [], [], "Provide either refs_dir or labels."
     dfine_model = (dfine_model or "large-obj365").strip().lower()
     if dfine_model not in DFINE_MODEL_IDS:
     grouped.sort(key=lambda x: x["conf"], reverse=True)
     top_groups = grouped[:10]
+    # Load classifier
     global _APP_CLASSIFIERS
+    cache_key = str(labels) if labels else str(refs_dir)
     clf_key = classifier
+    if clf_key not in _APP_CLASSIFIERS or _APP_CLASSIFIERS[clf_key][1] != cache_key:
+        clf_instance = _load_classifier(classifier, device, refs_dir=refs_dir, labels=labels)
+        _APP_CLASSIFIERS[clf_key] = (clf_instance, cache_key)
     clf_instance = _APP_CLASSIFIERS[clf_key][0]
     # Build known-only gallery: only objects with conf >= min_display_conf
     known_crop_composites = []
     for (_gidx, _box, crop_pil, pred, conf) in results_per_crop:
+        if pred.startswith("unknown") or conf < min_display_conf:
             continue
         composite = draw_label_on_image(crop_pil, pred, conf)
         known_crop_composites.append(np.array(composite))

siglip_zeroshot.py CHANGED Viewed

@@ -27,15 +27,23 @@ class SigLIPClassifier:
         print(f"[*] SigLIP loaded in {time.perf_counter() - t0:.1f}s (device={device})")
-    def build_refs(self, refs_dir, **kwargs):
-        """Extract class names from refs_dir subfolders as plain labels."""
-        refs_dir = Path(refs_dir)
-        self.labels = sorted(d.name for d in refs_dir.iterdir() if d.is_dir())
         if not self.labels:
-            raise ValueError(f"No subfolders in {refs_dir}")
         print(f"  SigLIP labels: {self.labels}")
     def classify_crop(self, crop, conf_threshold, gap_threshold):
         """
         Classify a single crop image using zero-shot SigLIP.

         print(f"[*] SigLIP loaded in {time.perf_counter() - t0:.1f}s (device={device})")
+    def set_labels(self, labels):
+        """Set class labels directly from a list of strings."""
+        self.labels = list(labels)
         if not self.labels:
+            raise ValueError("No labels provided")
         print(f"  SigLIP labels: {self.labels}")
+    def build_refs(self, refs_dir=None, labels=None, **kwargs):
+        """Set labels from a list or extract from refs_dir subfolders."""
+        if labels:
+            self.set_labels(labels)
+        elif refs_dir:
+            refs_dir = Path(refs_dir)
+            self.set_labels(sorted(d.name for d in refs_dir.iterdir() if d.is_dir()))
+        else:
+            raise ValueError("Provide either labels or refs_dir")
     def classify_crop(self, crop, conf_threshold, gap_threshold):
         """
         Classify a single crop image using zero-shot SigLIP.