Orkhan Hasanli committed on
Commit
cc2035e
·
1 Parent(s): 3ecb1be

Modified the pipeline to run D-FINE again on each person/car crop to detect small objects

Browse files
Files changed (1) hide show
  1. dfine_jina_pipeline.py +91 -101
dfine_jina_pipeline.py CHANGED
@@ -123,8 +123,10 @@ def expand_box_by_margin(box, margin_ratio, img_w, img_h):
123
  return [x1, y1, x2, y2]
124
 
125
 
126
- # 10% margin on person/car group box when testing if object center is inside (avoids missing boundary objects)
127
  PERSON_CAR_GROUP_MARGIN = 0.10
 
 
128
 
129
 
130
  def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
@@ -569,127 +571,115 @@ def run_single_image(
569
  grouped.sort(key=lambda x: x["conf"], reverse=True)
570
  top_groups = grouped[:10]
571
 
572
- candidates = []
 
 
 
 
 
 
 
573
 
 
 
 
 
574
  for gidx, grp in enumerate(top_groups):
575
- x1, y1, x2, y2 = grp["box"]
576
- group_box = [x1, y1, x2, y2]
577
- group_box_with_margin = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
 
 
 
 
 
 
 
578
 
579
- inside = [
580
- d for d in detections
581
- if box_center_inside(d["box"], group_box_with_margin) and d["cls"] not in person_car_ids
582
- ]
583
  inside = deduplicate_by_iou(inside, iou_threshold=0.9)
584
 
585
- for crop_idx, d in enumerate(inside):
 
586
  bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
587
  obj_w, obj_h = bx2 - bx1, by2 - by1
588
  if obj_w <= 0 or obj_h <= 0:
589
  continue
590
-
591
- # Small objects (min side < 24 px): expand by 60%; larger: 30%
592
  min_side_obj = min(obj_w, obj_h)
593
  pad_ratio = 0.6 if min_side_obj < 24 else 0.3
594
  pad_x = obj_w * pad_ratio
595
  pad_y = obj_h * pad_ratio
596
- bx1 = max(0, int(bx1 - pad_x))
597
- by1 = max(0, int(by1 - pad_y))
598
- bx2 = min(img_w, int(bx2 + pad_x))
599
- by2 = min(img_h, int(by2 + pad_y))
600
-
601
  if bx2 <= bx1 or by2 <= by1:
602
  continue
603
-
604
- if min(bx2 - bx1, by2 - by1) < min_side:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
  continue
 
606
 
607
- expanded_box = [bx1, by1, bx2, by2]
608
- candidates.append((expanded_box, d, gidx, crop_idx))
609
-
610
- def crop_area(box):
611
- return (box[2] - box[0]) * (box[3] - box[1])
612
-
613
- candidates.sort(key=lambda c: -crop_area(c[0]))
614
- kept = []
615
-
616
- for c in candidates:
617
- def is_same_object(box_a, box_b):
618
- if box_iou(box_a, box_b) >= crop_dedup_iou:
619
- return True
620
- if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
621
- return True
622
- return False
623
-
624
- if not any(is_same_object(c[0], k[0]) for k in kept):
625
- kept.append(c)
626
-
627
- if not kept:
628
- if not candidates:
629
- return [], [], "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
630
- return [], [], "No small-object crops (after dedup)."
631
-
632
- # Load Jina encoder + refs
633
- if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
634
- jina_encoder = JinaCLIPv2Encoder(device)
635
- ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
636
- _APP_JINA = (jina_encoder, ref_labels, ref_embs)
637
- _APP_REFS_JINA = str(refs_dir)
638
 
639
- jina_encoder, ref_labels, ref_embs = _APP_JINA
 
 
 
 
 
 
 
 
 
 
640
 
641
- # Classify each kept crop and store (gidx, box_in_full_image, crop_pil, pred, conf)
642
- results_per_crop = []
643
- for expanded_box, d, gidx, crop_idx in kept:
644
- if squarify:
645
- bx1, by1, bx2, by2 = squarify_crop_box(
646
- expanded_box[0],
647
- expanded_box[1],
648
- expanded_box[2],
649
- expanded_box[3],
650
- img_w,
651
- img_h,
652
- )
 
 
 
 
 
 
653
  else:
654
- bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
655
-
656
- crop_pil = pil.crop((bx1, by1, bx2, by2))
657
-
658
- q = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
659
- result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
660
 
661
- pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
662
- conf = result["confidence"]
663
- results_per_crop.append((gidx, (bx1, by1, bx2, by2), crop_pil, pred, conf))
664
-
665
- # Build group crop images: always show every person/car crop; draw only bboxes of crops we sent to classifier (no person/car bbox)
666
- group_crop_images = []
667
- for gidx, grp in enumerate(top_groups):
668
- gx1, gy1, gx2, gy2 = grp["box"]
669
- gx1, gy1 = int(gx1), int(gy1)
670
- gx2, gy2 = int(gx2), int(gy2)
671
- gx1, gy1 = max(0, gx1), max(0, gy1)
672
- gx2, gy2 = min(img_w, gx2), min(img_h, gy2)
673
- if gx2 <= gx1 or gy2 <= gy1:
674
- continue
675
- group_crop = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
676
- crop_w, crop_h = group_crop.size
677
-
678
- # Only bboxes that were actually cropped and sent to classification (same shape as crop we sent)
679
- boxes_to_draw = []
680
- for (gidx2, (bx1, by1, bx2, by2), _crop_pil, pred, conf) in results_per_crop:
681
- if gidx2 != gidx:
682
- continue
683
- rx1 = max(0, min(crop_w, bx1 - gx1))
684
- ry1 = max(0, min(crop_h, by1 - gy1))
685
- rx2 = max(0, min(crop_w, bx2 - gx1))
686
- ry2 = max(0, min(crop_h, by2 - gy1))
687
- if rx2 > rx1 and ry2 > ry1:
688
- boxes_to_draw.append((rx1, ry1, rx2, ry2, pred, conf))
689
-
690
- if boxes_to_draw:
691
- group_crop = draw_bboxes_on_image(group_crop, boxes_to_draw)
692
- group_crop_images.append(np.array(group_crop))
693
 
694
  # Build known-only gallery: only objects with conf >= min_display_conf
695
  known_crop_composites = []
 
123
  return [x1, y1, x2, y2]
124
 
125
 
126
+ # 10% margin on person/car group crop (expand crop before running D-FINE on it)
127
  PERSON_CAR_GROUP_MARGIN = 0.10
128
+ # Min side (px) for object crops extracted from person/car crop before sending to classifier (objects in crop are larger)
129
+ MIN_OBJECT_CROP_SIDE = 112
130
 
131
 
132
  def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
 
571
  grouped.sort(key=lambda x: x["conf"], reverse=True)
572
  top_groups = grouped[:10]
573
 
574
+ # Load Jina encoder + refs (needed for classification)
575
+ if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
576
+ jina_encoder = JinaCLIPv2Encoder(device)
577
+ ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
578
+ _APP_JINA = (jina_encoder, ref_labels, ref_embs)
579
+ _APP_REFS_JINA = str(refs_dir)
580
+
581
+ jina_encoder, ref_labels, ref_embs = _APP_JINA
582
 
583
+ results_per_crop = []
584
+ group_crop_images = []
585
+
586
+ # For each person/car group: crop (with 10% margin), run D-FINE on crop, detect objects, then classify each
587
  for gidx, grp in enumerate(top_groups):
588
+ group_box = [grp["box"][0], grp["box"][1], grp["box"][2], grp["box"][3]]
589
+ crop_box = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
590
+ gx1 = max(0, int(crop_box[0]))
591
+ gy1 = max(0, int(crop_box[1]))
592
+ gx2 = min(img_w, int(crop_box[2]))
593
+ gy2 = min(img_h, int(crop_box[3]))
594
+ if gx2 <= gx1 or gy2 <= gy1:
595
+ continue
596
+ crop_pil = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
597
+ crop_w, crop_h = crop_pil.size
598
 
599
+ # Run D-FINE on person/car crop to detect objects inside
600
+ detections_crop = run_dfine(crop_pil, image_processor, dfine_model_obj, device, det_threshold)
601
+ inside = [d for d in detections_crop if d["cls"] not in person_car_ids]
 
602
  inside = deduplicate_by_iou(inside, iou_threshold=0.9)
603
 
604
+ candidates = []
605
+ for d in inside:
606
  bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
607
  obj_w, obj_h = bx2 - bx1, by2 - by1
608
  if obj_w <= 0 or obj_h <= 0:
609
  continue
 
 
610
  min_side_obj = min(obj_w, obj_h)
611
  pad_ratio = 0.6 if min_side_obj < 24 else 0.3
612
  pad_x = obj_w * pad_ratio
613
  pad_y = obj_h * pad_ratio
614
+ bx1 = max(0.0, bx1 - pad_x)
615
+ by1 = max(0.0, by1 - pad_y)
616
+ bx2 = min(crop_w, bx2 + pad_x)
617
+ by2 = min(crop_h, by2 + pad_y)
 
618
  if bx2 <= bx1 or by2 <= by1:
619
  continue
620
+ w, h = bx2 - bx1, by2 - by1
621
+ if min(w, h) < MIN_OBJECT_CROP_SIDE:
622
+ need = MIN_OBJECT_CROP_SIDE - min(w, h)
623
+ half = need / 2.0
624
+ if w < h:
625
+ bx1 = max(0, bx1 - half)
626
+ bx2 = min(crop_w, bx2 + half)
627
+ else:
628
+ by1 = max(0, by1 - half)
629
+ by2 = min(crop_h, by2 + half)
630
+ w, h = bx2 - bx1, by2 - by1
631
+ if w < MIN_OBJECT_CROP_SIDE:
632
+ add = (MIN_OBJECT_CROP_SIDE - w) / 2
633
+ bx1 = max(0, bx1 - add)
634
+ bx2 = min(crop_w, bx2 + add)
635
+ if h < MIN_OBJECT_CROP_SIDE:
636
+ add = (MIN_OBJECT_CROP_SIDE - h) / 2
637
+ by1 = max(0, by1 - add)
638
+ by2 = min(crop_h, by2 + add)
639
+ bx1, by1, bx2, by2 = int(bx1), int(by1), int(bx2), int(by2)
640
+ if bx2 <= bx1 or by2 <= by1:
641
  continue
642
+ candidates.append(([bx1, by1, bx2, by2], d, gidx))
643
 
644
+ def crop_area(box):
645
+ return (box[2] - box[0]) * (box[3] - box[1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
 
647
+ candidates.sort(key=lambda c: -crop_area(c[0]))
648
+ kept = []
649
+ for c in candidates:
650
+ expanded_box = c[0]
651
+ if not any(
652
+ box_iou(expanded_box, k[0]) >= crop_dedup_iou
653
+ or box_center_inside(expanded_box, k[0])
654
+ or box_center_inside(k[0], expanded_box)
655
+ for k in kept
656
+ ):
657
+ kept.append(c)
658
 
659
+ for (bx1, by1, bx2, by2), d, _ in kept:
660
+ if squarify:
661
+ bx1, by1, bx2, by2 = squarify_crop_box(bx1, by1, bx2, by2, crop_w, crop_h)
662
+ small_crop = crop_pil.crop((bx1, by1, bx2, by2))
663
+ q = jina_encoder.encode_images([small_crop], TRUNCATE_DIM)
664
+ result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
665
+ pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
666
+ conf = result["confidence"]
667
+ results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))
668
+
669
+ # Draw bboxes on this group crop (bboxes already in crop coords)
670
+ boxes_to_draw = [
671
+ (bx1, by1, bx2, by2, pred, conf)
672
+ for (gidx2, (bx1, by1, bx2, by2), _sc, pred, conf) in results_per_crop
673
+ if gidx2 == gidx
674
+ ]
675
+ if boxes_to_draw:
676
+ crop_pil_drawn = draw_bboxes_on_image(crop_pil.copy(), boxes_to_draw)
677
  else:
678
+ crop_pil_drawn = crop_pil
679
+ group_crop_images.append(np.array(crop_pil_drawn))
 
 
 
 
680
 
681
+ if not results_per_crop:
682
+ return group_crop_images if group_crop_images else [], [], "No small-object crops: D-FINE on person/car crops did not detect any object (gun/phone/etc.), or all were below min size."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
 
684
  # Build known-only gallery: only objects with conf >= min_display_conf
685
  known_crop_composites = []