Spaces:

Napron
/

small_object_detection

Running

App Files Community

Orkhan Hasanli committed on Mar 11

Commit

cc2035e

1 Parent(s): 3ecb1be

Modified the pipeline to run DFINE on small crops again

Browse files

Files changed (1) hide show

dfine_jina_pipeline.py +91 -101

dfine_jina_pipeline.py CHANGED Viewed

@@ -123,8 +123,10 @@ def expand_box_by_margin(box, margin_ratio, img_w, img_h):
     return [x1, y1, x2, y2]
-# 10% margin on person/car group box when testing if object center is inside (avoids missing boundary objects)
 PERSON_CAR_GROUP_MARGIN = 0.10
 def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
@@ -569,127 +571,115 @@ def run_single_image(
     grouped.sort(key=lambda x: x["conf"], reverse=True)
     top_groups = grouped[:10]
-    candidates = []
     for gidx, grp in enumerate(top_groups):
-        x1, y1, x2, y2 = grp["box"]
-        group_box = [x1, y1, x2, y2]
-        group_box_with_margin = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
-        inside = [
-            d for d in detections
-            if box_center_inside(d["box"], group_box_with_margin) and d["cls"] not in person_car_ids
-        ]
         inside = deduplicate_by_iou(inside, iou_threshold=0.9)
-        for crop_idx, d in enumerate(inside):
             bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
             obj_w, obj_h = bx2 - bx1, by2 - by1
             if obj_w <= 0 or obj_h <= 0:
                 continue
-            # Small objects (min side < 24 px): expand by 60%; larger: 30%
             min_side_obj = min(obj_w, obj_h)
             pad_ratio = 0.6 if min_side_obj < 24 else 0.3
             pad_x = obj_w * pad_ratio
             pad_y = obj_h * pad_ratio
-            bx1 = max(0, int(bx1 - pad_x))
-            by1 = max(0, int(by1 - pad_y))
-            bx2 = min(img_w, int(bx2 + pad_x))
-            by2 = min(img_h, int(by2 + pad_y))
             if bx2 <= bx1 or by2 <= by1:
                 continue
-            if min(bx2 - bx1, by2 - by1) < min_side:
                 continue
-            expanded_box = [bx1, by1, bx2, by2]
-            candidates.append((expanded_box, d, gidx, crop_idx))
-    def crop_area(box):
-        return (box[2] - box[0]) * (box[3] - box[1])
-    candidates.sort(key=lambda c: -crop_area(c[0]))
-    kept = []
-    for c in candidates:
-        def is_same_object(box_a, box_b):
-            if box_iou(box_a, box_b) >= crop_dedup_iou:
-                return True
-            if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
-                return True
-            return False
-        if not any(is_same_object(c[0], k[0]) for k in kept):
-            kept.append(c)
-    if not kept:
-        if not candidates:
-            return [], [], "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
-        return [], [], "No small-object crops (after dedup)."
-    # Load Jina encoder + refs
-    if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
-        jina_encoder = JinaCLIPv2Encoder(device)
-        ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
-        _APP_JINA = (jina_encoder, ref_labels, ref_embs)
-        _APP_REFS_JINA = str(refs_dir)
-    jina_encoder, ref_labels, ref_embs = _APP_JINA
-    # Classify each kept crop and store (gidx, box_in_full_image, crop_pil, pred, conf)
-    results_per_crop = []
-    for expanded_box, d, gidx, crop_idx in kept:
-        if squarify:
-            bx1, by1, bx2, by2 = squarify_crop_box(
-                expanded_box[0],
-                expanded_box[1],
-                expanded_box[2],
-                expanded_box[3],
-                img_w,
-                img_h,
-            )
         else:
-            bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
-        crop_pil = pil.crop((bx1, by1, bx2, by2))
-        q = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
-        result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
-        pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
-        conf = result["confidence"]
-        results_per_crop.append((gidx, (bx1, by1, bx2, by2), crop_pil, pred, conf))
-    # Build group crop images: always show every person/car crop; draw only bboxes of crops we sent to classifier (no person/car bbox)
-    group_crop_images = []
-    for gidx, grp in enumerate(top_groups):
-        gx1, gy1, gx2, gy2 = grp["box"]
-        gx1, gy1 = int(gx1), int(gy1)
-        gx2, gy2 = int(gx2), int(gy2)
-        gx1, gy1 = max(0, gx1), max(0, gy1)
-        gx2, gy2 = min(img_w, gx2), min(img_h, gy2)
-        if gx2 <= gx1 or gy2 <= gy1:
-            continue
-        group_crop = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
-        crop_w, crop_h = group_crop.size
-        # Only bboxes that were actually cropped and sent to classification (same shape as crop we sent)
-        boxes_to_draw = []
-        for (gidx2, (bx1, by1, bx2, by2), _crop_pil, pred, conf) in results_per_crop:
-            if gidx2 != gidx:
-                continue
-            rx1 = max(0, min(crop_w, bx1 - gx1))
-            ry1 = max(0, min(crop_h, by1 - gy1))
-            rx2 = max(0, min(crop_w, bx2 - gx1))
-            ry2 = max(0, min(crop_h, by2 - gy1))
-            if rx2 > rx1 and ry2 > ry1:
-                boxes_to_draw.append((rx1, ry1, rx2, ry2, pred, conf))
-        if boxes_to_draw:
-            group_crop = draw_bboxes_on_image(group_crop, boxes_to_draw)
-        group_crop_images.append(np.array(group_crop))
     # Build known-only gallery: only objects with conf >= min_display_conf
     known_crop_composites = []

     return [x1, y1, x2, y2]
+# 10% margin on person/car group crop (expand crop before running D-FINE on it)
 PERSON_CAR_GROUP_MARGIN = 0.10
+# Min side (px) for object crops extracted from person/car crop before sending to classifier (objects in crop are larger)
+MIN_OBJECT_CROP_SIDE = 112
 def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
     grouped.sort(key=lambda x: x["conf"], reverse=True)
     top_groups = grouped[:10]
+    # Load Jina encoder + refs (needed for classification)
+    if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
+        jina_encoder = JinaCLIPv2Encoder(device)
+        ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
+        _APP_JINA = (jina_encoder, ref_labels, ref_embs)
+        _APP_REFS_JINA = str(refs_dir)
+    jina_encoder, ref_labels, ref_embs = _APP_JINA
+    results_per_crop = []
+    group_crop_images = []
+    # For each person/car group: crop (with 10% margin), run D-FINE on crop, detect objects, then classify each
     for gidx, grp in enumerate(top_groups):
+        group_box = [grp["box"][0], grp["box"][1], grp["box"][2], grp["box"][3]]
+        crop_box = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
+        gx1 = max(0, int(crop_box[0]))
+        gy1 = max(0, int(crop_box[1]))
+        gx2 = min(img_w, int(crop_box[2]))
+        gy2 = min(img_h, int(crop_box[3]))
+        if gx2 <= gx1 or gy2 <= gy1:
+            continue
+        crop_pil = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
+        crop_w, crop_h = crop_pil.size
+        # Run D-FINE on person/car crop to detect objects inside
+        detections_crop = run_dfine(crop_pil, image_processor, dfine_model_obj, device, det_threshold)
+        inside = [d for d in detections_crop if d["cls"] not in person_car_ids]
         inside = deduplicate_by_iou(inside, iou_threshold=0.9)
+        candidates = []
+        for d in inside:
             bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
             obj_w, obj_h = bx2 - bx1, by2 - by1
             if obj_w <= 0 or obj_h <= 0:
                 continue
             min_side_obj = min(obj_w, obj_h)
             pad_ratio = 0.6 if min_side_obj < 24 else 0.3
             pad_x = obj_w * pad_ratio
             pad_y = obj_h * pad_ratio
+            bx1 = max(0.0, bx1 - pad_x)
+            by1 = max(0.0, by1 - pad_y)
+            bx2 = min(crop_w, bx2 + pad_x)
+            by2 = min(crop_h, by2 + pad_y)
             if bx2 <= bx1 or by2 <= by1:
                 continue
+            w, h = bx2 - bx1, by2 - by1
+            if min(w, h) < MIN_OBJECT_CROP_SIDE:
+                need = MIN_OBJECT_CROP_SIDE - min(w, h)
+                half = need / 2.0
+                if w < h:
+                    bx1 = max(0, bx1 - half)
+                    bx2 = min(crop_w, bx2 + half)
+                else:
+                    by1 = max(0, by1 - half)
+                    by2 = min(crop_h, by2 + half)
+                w, h = bx2 - bx1, by2 - by1
+                if w < MIN_OBJECT_CROP_SIDE:
+                    add = (MIN_OBJECT_CROP_SIDE - w) / 2
+                    bx1 = max(0, bx1 - add)
+                    bx2 = min(crop_w, bx2 + add)
+                if h < MIN_OBJECT_CROP_SIDE:
+                    add = (MIN_OBJECT_CROP_SIDE - h) / 2
+                    by1 = max(0, by1 - add)
+                    by2 = min(crop_h, by2 + add)
+            bx1, by1, bx2, by2 = int(bx1), int(by1), int(bx2), int(by2)
+            if bx2 <= bx1 or by2 <= by1:
                 continue
+            candidates.append(([bx1, by1, bx2, by2], d, gidx))
+        def crop_area(box):
+            return (box[2] - box[0]) * (box[3] - box[1])
+        candidates.sort(key=lambda c: -crop_area(c[0]))
+        kept = []
+        for c in candidates:
+            expanded_box = c[0]
+            if not any(
+                box_iou(expanded_box, k[0]) >= crop_dedup_iou
+                or box_center_inside(expanded_box, k[0])
+                or box_center_inside(k[0], expanded_box)
+                for k in kept
+            ):
+                kept.append(c)
+        for (bx1, by1, bx2, by2), d, _ in kept:
+            if squarify:
+                bx1, by1, bx2, by2 = squarify_crop_box(bx1, by1, bx2, by2, crop_w, crop_h)
+            small_crop = crop_pil.crop((bx1, by1, bx2, by2))
+            q = jina_encoder.encode_images([small_crop], TRUNCATE_DIM)
+            result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
+            pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
+            conf = result["confidence"]
+            results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))
+        # Draw bboxes on this group crop (bboxes already in crop coords)
+        boxes_to_draw = [
+            (bx1, by1, bx2, by2, pred, conf)
+            for (gidx2, (bx1, by1, bx2, by2), _sc, pred, conf) in results_per_crop
+            if gidx2 == gidx
+        ]
+        if boxes_to_draw:
+            crop_pil_drawn = draw_bboxes_on_image(crop_pil.copy(), boxes_to_draw)
         else:
+            crop_pil_drawn = crop_pil
+        group_crop_images.append(np.array(crop_pil_drawn))
+    if not results_per_crop:
+        return group_crop_images if group_crop_images else [], [], "No small-object crops: D-FINE on person/car crops did not detect any object (gun/phone/etc.), or all were below min size."
     # Build known-only gallery: only objects with conf >= min_display_conf
     known_crop_composites = []