Spaces:
Running
Running
Orkhan Hasanli committed on
Commit ·
cc2035e
1
Parent(s): 3ecb1be
Modified the pipeline to run DFINE on small crops again
Browse files- dfine_jina_pipeline.py +91 -101
dfine_jina_pipeline.py
CHANGED
|
@@ -123,8 +123,10 @@ def expand_box_by_margin(box, margin_ratio, img_w, img_h):
|
|
| 123 |
return [x1, y1, x2, y2]
|
| 124 |
|
| 125 |
|
| 126 |
-
# 10% margin on person/car group
|
| 127 |
PERSON_CAR_GROUP_MARGIN = 0.10
|
|
|
|
|
|
|
| 128 |
|
| 129 |
|
| 130 |
def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
|
|
@@ -569,127 +571,115 @@ def run_single_image(
|
|
| 569 |
grouped.sort(key=lambda x: x["conf"], reverse=True)
|
| 570 |
top_groups = grouped[:10]
|
| 571 |
|
| 572 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
for gidx, grp in enumerate(top_groups):
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
]
|
| 583 |
inside = deduplicate_by_iou(inside, iou_threshold=0.9)
|
| 584 |
|
| 585 |
-
|
|
|
|
| 586 |
bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
|
| 587 |
obj_w, obj_h = bx2 - bx1, by2 - by1
|
| 588 |
if obj_w <= 0 or obj_h <= 0:
|
| 589 |
continue
|
| 590 |
-
|
| 591 |
-
# Small objects (min side < 24 px): expand by 60%; larger: 30%
|
| 592 |
min_side_obj = min(obj_w, obj_h)
|
| 593 |
pad_ratio = 0.6 if min_side_obj < 24 else 0.3
|
| 594 |
pad_x = obj_w * pad_ratio
|
| 595 |
pad_y = obj_h * pad_ratio
|
| 596 |
-
bx1 = max(0,
|
| 597 |
-
by1 = max(0,
|
| 598 |
-
bx2 = min(
|
| 599 |
-
by2 = min(
|
| 600 |
-
|
| 601 |
if bx2 <= bx1 or by2 <= by1:
|
| 602 |
continue
|
| 603 |
-
|
| 604 |
-
if min(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
continue
|
|
|
|
| 606 |
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
def crop_area(box):
|
| 611 |
-
return (box[2] - box[0]) * (box[3] - box[1])
|
| 612 |
-
|
| 613 |
-
candidates.sort(key=lambda c: -crop_area(c[0]))
|
| 614 |
-
kept = []
|
| 615 |
-
|
| 616 |
-
for c in candidates:
|
| 617 |
-
def is_same_object(box_a, box_b):
|
| 618 |
-
if box_iou(box_a, box_b) >= crop_dedup_iou:
|
| 619 |
-
return True
|
| 620 |
-
if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
|
| 621 |
-
return True
|
| 622 |
-
return False
|
| 623 |
-
|
| 624 |
-
if not any(is_same_object(c[0], k[0]) for k in kept):
|
| 625 |
-
kept.append(c)
|
| 626 |
-
|
| 627 |
-
if not kept:
|
| 628 |
-
if not candidates:
|
| 629 |
-
return [], [], "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
|
| 630 |
-
return [], [], "No small-object crops (after dedup)."
|
| 631 |
-
|
| 632 |
-
# Load Jina encoder + refs
|
| 633 |
-
if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
|
| 634 |
-
jina_encoder = JinaCLIPv2Encoder(device)
|
| 635 |
-
ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
|
| 636 |
-
_APP_JINA = (jina_encoder, ref_labels, ref_embs)
|
| 637 |
-
_APP_REFS_JINA = str(refs_dir)
|
| 638 |
|
| 639 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
else:
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
crop_pil = pil.crop((bx1, by1, bx2, by2))
|
| 657 |
-
|
| 658 |
-
q = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
|
| 659 |
-
result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
|
| 660 |
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
results_per_crop.append((gidx, (bx1, by1, bx2, by2), crop_pil, pred, conf))
|
| 664 |
-
|
| 665 |
-
# Build group crop images: always show every person/car crop; draw only bboxes of crops we sent to classifier (no person/car bbox)
|
| 666 |
-
group_crop_images = []
|
| 667 |
-
for gidx, grp in enumerate(top_groups):
|
| 668 |
-
gx1, gy1, gx2, gy2 = grp["box"]
|
| 669 |
-
gx1, gy1 = int(gx1), int(gy1)
|
| 670 |
-
gx2, gy2 = int(gx2), int(gy2)
|
| 671 |
-
gx1, gy1 = max(0, gx1), max(0, gy1)
|
| 672 |
-
gx2, gy2 = min(img_w, gx2), min(img_h, gy2)
|
| 673 |
-
if gx2 <= gx1 or gy2 <= gy1:
|
| 674 |
-
continue
|
| 675 |
-
group_crop = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
|
| 676 |
-
crop_w, crop_h = group_crop.size
|
| 677 |
-
|
| 678 |
-
# Only bboxes that were actually cropped and sent to classification (same shape as crop we sent)
|
| 679 |
-
boxes_to_draw = []
|
| 680 |
-
for (gidx2, (bx1, by1, bx2, by2), _crop_pil, pred, conf) in results_per_crop:
|
| 681 |
-
if gidx2 != gidx:
|
| 682 |
-
continue
|
| 683 |
-
rx1 = max(0, min(crop_w, bx1 - gx1))
|
| 684 |
-
ry1 = max(0, min(crop_h, by1 - gy1))
|
| 685 |
-
rx2 = max(0, min(crop_w, bx2 - gx1))
|
| 686 |
-
ry2 = max(0, min(crop_h, by2 - gy1))
|
| 687 |
-
if rx2 > rx1 and ry2 > ry1:
|
| 688 |
-
boxes_to_draw.append((rx1, ry1, rx2, ry2, pred, conf))
|
| 689 |
-
|
| 690 |
-
if boxes_to_draw:
|
| 691 |
-
group_crop = draw_bboxes_on_image(group_crop, boxes_to_draw)
|
| 692 |
-
group_crop_images.append(np.array(group_crop))
|
| 693 |
|
| 694 |
# Build known-only gallery: only objects with conf >= min_display_conf
|
| 695 |
known_crop_composites = []
|
|
|
|
| 123 |
return [x1, y1, x2, y2]
|
| 124 |
|
| 125 |
|
| 126 |
+
# 10% margin on person/car group crop (expand crop before running D-FINE on it)
|
| 127 |
PERSON_CAR_GROUP_MARGIN = 0.10
|
| 128 |
+
# Minimum side (px) for object crops extracted from a person/car crop: smaller crops are expanded toward this size (clamped to the crop bounds) before being sent to the classifier (objects in crop are larger)
|
| 129 |
+
MIN_OBJECT_CROP_SIDE = 112
|
| 130 |
|
| 131 |
|
| 132 |
def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
|
|
|
|
| 571 |
grouped.sort(key=lambda x: x["conf"], reverse=True)
|
| 572 |
top_groups = grouped[:10]
|
| 573 |
|
| 574 |
+
# Load Jina encoder + refs (needed for classification)
|
| 575 |
+
if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
|
| 576 |
+
jina_encoder = JinaCLIPv2Encoder(device)
|
| 577 |
+
ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
|
| 578 |
+
_APP_JINA = (jina_encoder, ref_labels, ref_embs)
|
| 579 |
+
_APP_REFS_JINA = str(refs_dir)
|
| 580 |
+
|
| 581 |
+
jina_encoder, ref_labels, ref_embs = _APP_JINA
|
| 582 |
|
| 583 |
+
results_per_crop = []
|
| 584 |
+
group_crop_images = []
|
| 585 |
+
|
| 586 |
+
# For each person/car group: crop (with 10% margin), run D-FINE on crop, detect objects, then classify each
|
| 587 |
for gidx, grp in enumerate(top_groups):
|
| 588 |
+
group_box = [grp["box"][0], grp["box"][1], grp["box"][2], grp["box"][3]]
|
| 589 |
+
crop_box = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
|
| 590 |
+
gx1 = max(0, int(crop_box[0]))
|
| 591 |
+
gy1 = max(0, int(crop_box[1]))
|
| 592 |
+
gx2 = min(img_w, int(crop_box[2]))
|
| 593 |
+
gy2 = min(img_h, int(crop_box[3]))
|
| 594 |
+
if gx2 <= gx1 or gy2 <= gy1:
|
| 595 |
+
continue
|
| 596 |
+
crop_pil = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
|
| 597 |
+
crop_w, crop_h = crop_pil.size
|
| 598 |
|
| 599 |
+
# Run D-FINE on person/car crop to detect objects inside
|
| 600 |
+
detections_crop = run_dfine(crop_pil, image_processor, dfine_model_obj, device, det_threshold)
|
| 601 |
+
inside = [d for d in detections_crop if d["cls"] not in person_car_ids]
|
|
|
|
| 602 |
inside = deduplicate_by_iou(inside, iou_threshold=0.9)
|
| 603 |
|
| 604 |
+
candidates = []
|
| 605 |
+
for d in inside:
|
| 606 |
bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
|
| 607 |
obj_w, obj_h = bx2 - bx1, by2 - by1
|
| 608 |
if obj_w <= 0 or obj_h <= 0:
|
| 609 |
continue
|
|
|
|
|
|
|
| 610 |
min_side_obj = min(obj_w, obj_h)
|
| 611 |
pad_ratio = 0.6 if min_side_obj < 24 else 0.3
|
| 612 |
pad_x = obj_w * pad_ratio
|
| 613 |
pad_y = obj_h * pad_ratio
|
| 614 |
+
bx1 = max(0.0, bx1 - pad_x)
|
| 615 |
+
by1 = max(0.0, by1 - pad_y)
|
| 616 |
+
bx2 = min(crop_w, bx2 + pad_x)
|
| 617 |
+
by2 = min(crop_h, by2 + pad_y)
|
|
|
|
| 618 |
if bx2 <= bx1 or by2 <= by1:
|
| 619 |
continue
|
| 620 |
+
w, h = bx2 - bx1, by2 - by1
|
| 621 |
+
if min(w, h) < MIN_OBJECT_CROP_SIDE:
|
| 622 |
+
need = MIN_OBJECT_CROP_SIDE - min(w, h)
|
| 623 |
+
half = need / 2.0
|
| 624 |
+
if w < h:
|
| 625 |
+
bx1 = max(0, bx1 - half)
|
| 626 |
+
bx2 = min(crop_w, bx2 + half)
|
| 627 |
+
else:
|
| 628 |
+
by1 = max(0, by1 - half)
|
| 629 |
+
by2 = min(crop_h, by2 + half)
|
| 630 |
+
w, h = bx2 - bx1, by2 - by1
|
| 631 |
+
if w < MIN_OBJECT_CROP_SIDE:
|
| 632 |
+
add = (MIN_OBJECT_CROP_SIDE - w) / 2
|
| 633 |
+
bx1 = max(0, bx1 - add)
|
| 634 |
+
bx2 = min(crop_w, bx2 + add)
|
| 635 |
+
if h < MIN_OBJECT_CROP_SIDE:
|
| 636 |
+
add = (MIN_OBJECT_CROP_SIDE - h) / 2
|
| 637 |
+
by1 = max(0, by1 - add)
|
| 638 |
+
by2 = min(crop_h, by2 + add)
|
| 639 |
+
bx1, by1, bx2, by2 = int(bx1), int(by1), int(bx2), int(by2)
|
| 640 |
+
if bx2 <= bx1 or by2 <= by1:
|
| 641 |
continue
|
| 642 |
+
candidates.append(([bx1, by1, bx2, by2], d, gidx))
|
| 643 |
|
| 644 |
+
def crop_area(box):
|
| 645 |
+
return (box[2] - box[0]) * (box[3] - box[1])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 646 |
|
| 647 |
+
candidates.sort(key=lambda c: -crop_area(c[0]))
|
| 648 |
+
kept = []
|
| 649 |
+
for c in candidates:
|
| 650 |
+
expanded_box = c[0]
|
| 651 |
+
if not any(
|
| 652 |
+
box_iou(expanded_box, k[0]) >= crop_dedup_iou
|
| 653 |
+
or box_center_inside(expanded_box, k[0])
|
| 654 |
+
or box_center_inside(k[0], expanded_box)
|
| 655 |
+
for k in kept
|
| 656 |
+
):
|
| 657 |
+
kept.append(c)
|
| 658 |
|
| 659 |
+
for (bx1, by1, bx2, by2), d, _ in kept:
|
| 660 |
+
if squarify:
|
| 661 |
+
bx1, by1, bx2, by2 = squarify_crop_box(bx1, by1, bx2, by2, crop_w, crop_h)
|
| 662 |
+
small_crop = crop_pil.crop((bx1, by1, bx2, by2))
|
| 663 |
+
q = jina_encoder.encode_images([small_crop], TRUNCATE_DIM)
|
| 664 |
+
result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
|
| 665 |
+
pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
|
| 666 |
+
conf = result["confidence"]
|
| 667 |
+
results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))
|
| 668 |
+
|
| 669 |
+
# Draw bboxes on this group crop (bboxes already in crop coords)
|
| 670 |
+
boxes_to_draw = [
|
| 671 |
+
(bx1, by1, bx2, by2, pred, conf)
|
| 672 |
+
for (gidx2, (bx1, by1, bx2, by2), _sc, pred, conf) in results_per_crop
|
| 673 |
+
if gidx2 == gidx
|
| 674 |
+
]
|
| 675 |
+
if boxes_to_draw:
|
| 676 |
+
crop_pil_drawn = draw_bboxes_on_image(crop_pil.copy(), boxes_to_draw)
|
| 677 |
else:
|
| 678 |
+
crop_pil_drawn = crop_pil
|
| 679 |
+
group_crop_images.append(np.array(crop_pil_drawn))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 680 |
|
| 681 |
+
if not results_per_crop:
|
| 682 |
+
return group_crop_images if group_crop_images else [], [], "No small-object crops: D-FINE on person/car crops did not detect any object (gun/phone/etc.), or all were below min size."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
|
| 684 |
# Build known-only gallery: only objects with conf >= min_display_conf
|
| 685 |
known_crop_composites = []
|