Spaces:

Napron
/

small_object_detection

Running

App Files Files Community

orik-ss committed on Mar 13

Commit

cc88e05

2 Parent(s): bb55c4d cc2035e

Merge branch 'main' of https://huggingface.co/spaces/Napron/small_object_detection

Browse files

Files changed (2) hide show

app.py +47 -20
dfine_jina_pipeline.py +157 -199

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-""" Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + Classify (Jina or Nomic). """
 import os
 os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
@@ -9,7 +9,7 @@ import gradio as gr
 from ultralytics import YOLO
 from pathlib import Path
-# Tab 2: D-FINE runs first, then user chooses Jina or Nomic for crop classification
 from dfine_jina_pipeline import run_single_image
@@ -108,8 +108,8 @@ def run_detection(image, model):
     return out_img, det_json
-def run_dfine_classify(image, encoder_choice, refs_path, min_display_conf=0.7):
-    """Tab 2: D-FINE first, then classify crops with Jina or Nomic.
     Returns (group_crop_gallery, known_crop_gallery, status_message).
     """
     if image is None:
@@ -120,15 +120,14 @@ def run_dfine_classify(image, encoder_choice, refs_path, min_display_conf=0.7):
     if not refs.is_dir():
         return [], [], f"Refs folder not found: {refs}"
-    # Tuned on COCO GT: conf=0.5, gap=0.02.
-    # Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
     group_crops, known_crops, status = run_single_image(
         image,
         refs_dir=refs,
-        encoder_choice=encoder_choice.lower(),
-        det_threshold=0.15,
         conf_threshold=0.5,
-        gap_threshold=0.02,
         min_side=24,
         crop_dedup_iou=0.4,
         min_display_conf=float(min_display_conf),
@@ -230,10 +229,12 @@ with gr.Blocks(title="Small Object Detection") as app:
         with gr.TabItem("D-FINE + Classify"):
             gr.Markdown(
-                "**D-FINE** runs first (person/car grouping), then small-object crops are classified. "
-                "Choose **Jina** or **Nomic** for the embedding/classification model. "
                 "Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) "
-                "with reference images."
             )
             with gr.Row():
@@ -246,10 +247,28 @@ with gr.Blocks(title="Small Object Detection") as app:
                         height=IMG_HEIGHT
                     )
-                    encoder_choice = gr.Radio(
-                        choices=["Jina", "Nomic"],
-                        value="Jina",
-                        label="Embedding / classification model",
                     )
                     refs_path = gr.Textbox(
@@ -268,13 +287,21 @@ with gr.Blocks(title="Small Object Detection") as app:
                     threshold_slider = gr.Slider(
                         minimum=0.0,
                         maximum=1.0,
-                        value=0.7,
-                        step=0.05,
                         label="Threshold (min display confidence)",
                     )
                     out_gallery_dfine = gr.Gallery(
-                        label="Person/car crops (bboxes: gun, knife, cigarette, phone only)",
                         height=IMG_HEIGHT,
                         columns=2,
                         object_fit="contain",
@@ -295,7 +322,7 @@ with gr.Blocks(title="Small Object Detection") as app:
             btn_dfine.click(
                 fn=run_dfine_classify,
-                inputs=[inp_dfine, encoder_choice, refs_path, threshold_slider],
                 outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
                 concurrency_limit=1,
             )

+""" Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + Classify (Jina). """
 import os
 os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
 from ultralytics import YOLO
 from pathlib import Path
+# Tab 2: D-FINE runs first, then Jina for crop classification
 from dfine_jina_pipeline import run_single_image
     return out_img, det_json
+def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, min_display_conf=0.703, gap_threshold=0.005):
+    """Tab 2: D-FINE first, then classify crops with Jina.
     Returns (group_crop_gallery, known_crop_gallery, status_message).
     """
     if image is None:
     if not refs.is_dir():
         return [], [], f"Refs folder not found: {refs}"
+    dfine_model = "large" if dfine_model_choice.strip().lower() == "large" else "medium"
     group_crops, known_crops, status = run_single_image(
         image,
         refs_dir=refs,
+        dfine_model=dfine_model,
+        det_threshold=float(dfine_threshold),
         conf_threshold=0.5,
+        gap_threshold=float(gap_threshold),
         min_side=24,
         crop_dedup_iou=0.4,
         min_display_conf=float(min_display_conf),
         with gr.TabItem("D-FINE + Classify"):
             gr.Markdown(
+                "**D-FINE** runs first (person/car grouping), then small-object crops are classified with **Jina**. "
+                "Choose D-FINE model size (Medium or Large). "
                 "Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) "
+                "with reference images.\n\n"
+                "**Gap** = how much the top class (e.g. gun) must beat the next-best class (e.g. phone). "
+                "Bigger gap means the model is more sure; we only accept the label if both confidence and gap are high enough."
             )
             with gr.Row():
                         height=IMG_HEIGHT
                     )
+                    dfine_model_radio = gr.Radio(
+                        choices=["Medium", "Large"],
+                        value="Large",
+                        label="D-FINE model",
+                    )
+                    # Default threshold: Large=0.2, Medium=0.15 (slider updates when model changes)
+                    dfine_threshold_slider = gr.Slider(
+                        minimum=0.05,
+                        maximum=0.5,
+                        value=0.2,
+                        step=0.05,
+                        label="D-FINE detection threshold (applied to chosen model)",
+                    )
+                    def update_dfine_threshold_default(choice):
+                        # Callback for the model radio: returns a gr.update that sets the slider value
+                        # to 0.2 when the choice is "large" (ignoring case and surrounding whitespace)
+                        # and to 0.15 for any other value, including an empty or None choice.
+                        return gr.update(value=0.2 if (choice and choice.strip().lower() == "large") else 0.15)
+                    dfine_model_radio.change(
+                        fn=update_dfine_threshold_default,
+                        inputs=[dfine_model_radio],
+                        outputs=[dfine_threshold_slider],
                     )
                     refs_path = gr.Textbox(
                     threshold_slider = gr.Slider(
                         minimum=0.0,
                         maximum=1.0,
+                        value=0.703,
+                        step=0.005,
                         label="Threshold (min display confidence)",
                     )
+                    gap_slider = gr.Slider(
+                        minimum=0.0,
+                        maximum=0.02,
+                        value=0.005,
+                        step=0.001,
+                        label="Gap: how much the top guess must beat the runner-up (higher = stricter, fewer accepted)",
+                    )
                     out_gallery_dfine = gr.Gallery(
+                        label="Person/car crops (all D-FINE objects inside drawn with label + score)",
                         height=IMG_HEIGHT,
                         columns=2,
                         object_fit="contain",
             btn_dfine.click(
                 fn=run_dfine_classify,
+                inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio, threshold_slider, gap_slider],
                 outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
                 concurrency_limit=1,
             )

dfine_jina_pipeline.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """ Pipeline: D-FINE (person/car only) → group detections → crop regions →
-find all bboxes inside each crop → Jina-CLIP-v2 and Nomic embeddings on those crops.
-Outputs separate crop folders per model (jina_crops, nomic_crops) for visual comparison.
 """
 import argparse
@@ -29,9 +29,8 @@ from jina_fewshot import (
 KNOWN_DISPLAY_CLASSES = {"gun", "knife", "cigarette", "phone"}
 # Only show objects (and group crops) with confidence >= this
 MIN_DISPLAY_CONF = 0.7
-from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
 # -----------------------------------------------------------------------------
 # Detection + grouping (from reference_detection.py)
@@ -109,6 +108,27 @@ def box_center_inside(box, crop_box):
     )
 def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
     """
     Expand the shorter side to match the longer (same ratio / square), centered, clamped to image.
@@ -177,7 +197,7 @@ def parse_args():
     p = argparse.ArgumentParser(
         description="D-FINE (person/car) → group → Jina-CLIP-v2 on crops inside groups"
     )
-    p.add_argument("--refs", required=True, help="Reference images folder for Jina and Nomic (e.g. refs/)")
     p.add_argument("--input", required=True, help="Full-frame images folder")
     p.add_argument("--output", default="pipeline_results", help="Output folder (CSV, etc.)")
     p.add_argument("--det-threshold", type=float, default=0.13, help="D-FINE score threshold")
@@ -191,6 +211,7 @@ def parse_args():
     p.add_argument("--text-weight", type=float, default=0.3)
     p.add_argument("--max-images", type=int, default=None)
     p.add_argument("--device", default=None)
     return p.parse_args()
@@ -282,10 +303,11 @@ def main():
         raise SystemExit(f"No images in {input_dir}")
     # Load D-FINE
-    print("[*] Loading D-FINE (dfine-medium-obj365)...")
     t0 = time.perf_counter()
-    image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-medium-obj365")
-    dfine_model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-medium-obj365")
     dfine_model = dfine_model.to(device).eval()
     person_car_ids = get_person_car_label_ids(dfine_model)
     print(f"  Person/car label IDs: {person_car_ids} ({time.perf_counter()-t0:.1f}s)")
@@ -303,25 +325,8 @@ def main():
     )
     print(f"  Jina refs: {ref_labels} ({time.perf_counter()-t0:.1f}s)\n")
-    # Load Nomic vision + text, build refs (same as Jina: image + text prompts, text_weight 0.3)
-    print("[*] Loading Nomic embed-vision + embed-text and building refs...")
-    t0 = time.perf_counter()
-    nomic_encoder = NomicVisionEncoder(device)
-    nomic_text_encoder = NomicTextEncoder(device)
-    ref_labels_nomic, ref_embs_nomic = build_refs_nomic(
-        nomic_encoder,
-        refs_dir,
-        batch_size=16,
-        text_encoder=nomic_text_encoder,
-        text_weight=args.text_weight,
-    )
-    print(f"  Nomic refs: {ref_labels_nomic} ({time.perf_counter()-t0:.1f}s)\n")
-    # Separate output folders per model for visual comparison
     jina_crops_dir = output_dir / "jina_crops"
-    nomic_crops_dir = output_dir / "nomic_crops"
     jina_crops_dir.mkdir(parents=True, exist_ok=True)
-    nomic_crops_dir.mkdir(parents=True, exist_ok=True)
     # CSV
     csv_path = output_dir / "results.csv"
@@ -344,9 +349,6 @@ def main():
         "jina_prediction",
         "jina_confidence",
         "jina_status",
-        "nomic_prediction",
-        "nomic_confidence",
-        "nomic_status",
     ])
     for img_path in paths:
@@ -363,7 +365,7 @@ def main():
             args.det_threshold
         )
-        person_car = [d for d in detections if d["cls"] in person_car_ids]
         if not person_car:
             continue
@@ -379,10 +381,11 @@ def main():
         for gidx, grp in enumerate(top_groups):
             x1, y1, x2, y2 = grp["box"]
             group_box = [x1, y1, x2, y2]
             inside = [
                 d for d in detections
-                if box_center_inside(d["box"], group_box) and d["cls"] not in person_car_ids
             ]
             inside = deduplicate_by_iou(inside, iou_threshold=0.9)
@@ -392,8 +395,11 @@ def main():
                 if obj_w <= 0 or obj_h <= 0:
                     continue
-                pad_x = obj_w * 0.3
-                pad_y = obj_h * 0.3
                 bx1 = max(0, int(bx1 - pad_x))
                 by1 = max(0, int(by1 - pad_y))
                 bx2 = min(img_w, int(bx2 + pad_x))
@@ -428,7 +434,7 @@ def main():
             if not any(is_same_object(expanded_box, k[0]) for k in kept):
                 kept.append(c)
-        # 5) Optionally squarify, then run Jina and Nomic only on kept crops
         for i, (expanded_box, d, gidx, crop_idx, x1, y1, x2, y2) in enumerate(kept):
             if not args.no_squarify:
                 bx1, by1, bx2, by2 = squarify_crop_box(
@@ -464,25 +470,6 @@ def main():
             ann_jina = draw_label_on_image(crop_pil, label_jina, conf_jina)
             ann_jina.save(jina_crops_dir / crop_name)
-            q_nomic = nomic_encoder.encode_images([crop_pil])
-            result_nomic = jina_classify(
-                q_nomic,
-                ref_labels_nomic,
-                ref_embs_nomic,
-                args.conf_threshold,
-                args.gap_threshold
-            )
-            if result_nomic["prediction"] in ref_labels_nomic:
-                label_nomic = result_nomic["prediction"]
-                conf_nomic = result_nomic["confidence"]
-            else:
-                label_nomic = f"unnamed (dfine: {d['label']})"
-                conf_nomic = 0.0
-            ann_nomic = draw_label_on_image(crop_pil, label_nomic, conf_nomic)
-            ann_nomic.save(nomic_crops_dir / crop_name)
             w.writerow([
                 img_path.name,
                 crop_name,
@@ -500,33 +487,29 @@ def main():
                 result_jina["prediction"],
                 f"{result_jina['confidence']:.4f}",
                 result_jina["status"],
-                result_nomic["prediction"],
-                f"{result_nomic['confidence']:.4f}",
-                result_nomic["status"],
             ])
     f.close()
     print(f"[*] Wrote {csv_path}")
     print(f"[*] Jina crops: {jina_crops_dir}")
-    print(f"[*] Nomic crops: {nomic_crops_dir}")
 # -----------------------------------------------------------------------------
-# Single-image runner for Gradio app: D-FINE first, then Jina or Nomic (user choice)
 # -----------------------------------------------------------------------------
-_APP_DFINE = None
 _APP_JINA = None
-_APP_NOMIC = None
 _APP_REFS_JINA = None
-_APP_REFS_NOMIC = None
 def run_single_image(
     pil_image,
     refs_dir,
     device=None,
-    encoder_choice="jina",
     det_threshold=0.3,
     conf_threshold=0.75,
     gap_threshold=0.05,
@@ -536,15 +519,12 @@ def run_single_image(
     min_display_conf=None,
 ):
     """
-    Run D-FINE on one image, then classify small-object crops with Jina or Nomic.
     refs_dir: path to refs folder (str or Path).
-    encoder_choice: "jina" or "nomic".
     Returns (group_crop_images, known_crop_composites, status_message).
-    - group_crop_images: list of PIL/numpy (one per person/car group, with bboxes for known objects only).
-    - known_crop_composites: list of PIL/numpy (label+score above + crop) for known classes only.
-    - status_message: None on success, or error/empty-state string.
     """
     import numpy as np
@@ -552,12 +532,17 @@ def run_single_image(
         min_display_conf = MIN_DISPLAY_CONF
     from PIL import Image
-    global _APP_DFINE, _APP_JINA, _APP_NOMIC, _APP_REFS_JINA, _APP_REFS_NOMIC
     refs_dir = Path(refs_dir)
     if not refs_dir.is_dir():
         return [], [], f"Refs folder not found: {refs_dir}"
     device = device or ("cuda" if torch.cuda.is_available() else "cpu")
     print(f"[*] Device: {device}")
@@ -565,163 +550,136 @@ def run_single_image(
     img_w, img_h = pil.size
     group_dist = 0.1 * max(img_h, img_w)
-    # Load D-FINE once
-    if _APP_DFINE is None:
-        image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-medium-obj365")
-        dfine_model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-medium-obj365")
-        dfine_model = dfine_model.to(device).eval()
-        person_car_ids = get_person_car_label_ids(dfine_model)
-        _APP_DFINE = (image_processor, dfine_model, person_car_ids)
-    image_processor, dfine_model, person_car_ids = _APP_DFINE
-    detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
-    person_car = [d for d in detections if d["cls"] in person_car_ids]
     if not person_car:
-        return [], [], "No person/car detected. No small-object crops."
     grouped = group_detections(person_car, group_dist)
     grouped.sort(key=lambda x: x["conf"], reverse=True)
     top_groups = grouped[:10]
-    candidates = []
     for gidx, grp in enumerate(top_groups):
-        x1, y1, x2, y2 = grp["box"]
-        group_box = [x1, y1, x2, y2]
-        inside = [
-            d for d in detections
-            if box_center_inside(d["box"], group_box) and d["cls"] not in person_car_ids
-        ]
         inside = deduplicate_by_iou(inside, iou_threshold=0.9)
-        for crop_idx, d in enumerate(inside):
             bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
             obj_w, obj_h = bx2 - bx1, by2 - by1
             if obj_w <= 0 or obj_h <= 0:
                 continue
-            pad_x, pad_y = obj_w * 0.3, obj_h * 0.3
-            bx1 = max(0, int(bx1 - pad_x))
-            by1 = max(0, int(by1 - pad_y))
-            bx2 = min(img_w, int(bx2 + pad_x))
-            by2 = min(img_h, int(by2 + pad_y))
             if bx2 <= bx1 or by2 <= by1:
                 continue
-            if min(bx2 - bx1, by2 - by1) < min_side:
                 continue
-            expanded_box = [bx1, by1, bx2, by2]
-            candidates.append((expanded_box, d, gidx, crop_idx))
-    def crop_area(box):
-        return (box[2] - box[0]) * (box[3] - box[1])
-    candidates.sort(key=lambda c: -crop_area(c[0]))
-    kept = []
-    for c in candidates:
-        def is_same_object(box_a, box_b):
-            if box_iou(box_a, box_b) >= crop_dedup_iou:
-                return True
-            if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
-                return True
-            return False
-        if not any(is_same_object(c[0], k[0]) for k in kept):
-            kept.append(c)
-    if not kept:
-        if not candidates:
-            return [], [], "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
-        return [], [], "No small-object crops (after dedup)."
-    # Load encoder + refs for chosen model
-    if encoder_choice == "jina":
-        if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
-            jina_encoder = JinaCLIPv2Encoder(device)
-            ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
-            _APP_JINA = (jina_encoder, ref_labels, ref_embs)
-            _APP_REFS_JINA = str(refs_dir)
-        jina_encoder, ref_labels, ref_embs = _APP_JINA
-    else:
-        if _APP_NOMIC is None or _APP_REFS_NOMIC != str(refs_dir):
-            nomic_encoder = NomicVisionEncoder(device)
-            nomic_text_encoder = NomicTextEncoder(device)
-            ref_labels, ref_embs = build_refs_nomic(
-                nomic_encoder,
-                refs_dir,
-                batch_size=16,
-                text_encoder=nomic_text_encoder,
-                text_weight=0.3,
-            )
-            _APP_NOMIC = (nomic_encoder, ref_labels, ref_embs)
-            _APP_REFS_NOMIC = str(refs_dir)
-        nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
-    # Classify each kept crop and store (gidx, box_in_full_image, crop_pil, pred, conf)
-    results_per_crop = []
-    for expanded_box, d, gidx, crop_idx in kept:
-        if squarify:
-            bx1, by1, bx2, by2 = squarify_crop_box(
-                expanded_box[0],
-                expanded_box[1],
-                expanded_box[2],
-                expanded_box[3],
-                img_w,
-                img_h,
-            )
-        else:
-            bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
-        crop_pil = pil.crop((bx1, by1, bx2, by2))
-        if encoder_choice == "jina":
-            q = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
             result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
         else:
-            q = nomic_encoder.encode_images([crop_pil])
-            result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
-        pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
-        conf = result["confidence"]
-        results_per_crop.append((gidx, (bx1, by1, bx2, by2), crop_pil, pred, conf))
-    # Build group crop images: only groups that contain at least one known object with conf >= MIN_DISPLAY_CONF
-    group_crop_images = []
-    for gidx, grp in enumerate(top_groups):
-        gx1, gy1, gx2, gy2 = grp["box"]
-        gx1, gy1 = int(gx1), int(gy1)
-        gx2, gy2 = int(gx2), int(gy2)
-        gx1, gy1 = max(0, gx1), max(0, gy1)
-        gx2, gy2 = min(img_w, gx2), min(img_h, gy2)
-        if gx2 <= gx1 or gy2 <= gy1:
-            continue
-        group_crop = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
-        crop_w, crop_h = group_crop.size
-        boxes_to_draw = []
-        for (gidx2, (bx1, by1, bx2, by2), _crop_pil, pred, conf) in results_per_crop:
-            if gidx2 != gidx or pred not in KNOWN_DISPLAY_CLASSES or conf < min_display_conf:
-                continue
-            # Convert to group-crop-relative coords and clamp
-            rx1 = max(0, min(crop_w, bx1 - gx1))
-            ry1 = max(0, min(crop_h, by1 - gy1))
-            rx2 = max(0, min(crop_w, bx2 - gx1))
-            ry2 = max(0, min(crop_h, by2 - gy1))
-            if rx2 > rx1 and ry2 > ry1:
-                boxes_to_draw.append((rx1, ry1, rx2, ry2, pred, conf))
-        # Only show this group crop if it has at least one known object >= min_display_conf
-        if not boxes_to_draw:
-            continue
-        group_crop = draw_bboxes_on_image(group_crop, boxes_to_draw)
-        group_crop_images.append(np.array(group_crop))
     # Build known-only gallery: only objects with conf >= min_display_conf
     known_crop_composites = []

 """ Pipeline: D-FINE (person/car only) → group detections → crop regions →
+find all bboxes inside each crop → Jina-CLIP-v2 embeddings and classification.
+Outputs jina_crops folder and results CSV.
 """
 import argparse
 KNOWN_DISPLAY_CLASSES = {"gun", "knife", "cigarette", "phone"}
 # Only show objects (and group crops) with confidence >= this
 MIN_DISPLAY_CONF = 0.7
+# Person/car detections must have confidence > this to be used for grouping
+PERSON_CAR_MIN_CONF = 0.9
 # -----------------------------------------------------------------------------
 # Detection + grouping (from reference_detection.py)
     )
+def expand_box_by_margin(box, margin_ratio, img_w, img_h):
+    """Expand box [x1,y1,x2,y2] by margin_ratio (e.g. 0.1 = 10%) on all sides, clamped to image.
+
+    The horizontal margin is margin_ratio times the box width and the vertical margin is
+    margin_ratio times the box height. A box whose width or height is not positive is
+    returned as-is (the same object, with no clamping applied). Otherwise a new list
+    [x1, y1, x2, y2] is returned with x limited to [0, img_w] and y limited to [0, img_h].
+    """
+    x1, y1, x2, y2 = box
+    w, h = x2 - x1, y2 - y1
+    # Degenerate box: nothing sensible to scale, so hand the input back untouched.
+    if w <= 0 or h <= 0:
+        return box
+    # Margins are relative to the box's own size, not to the image size.
+    mx = w * margin_ratio
+    my = h * margin_ratio
+    # Grow each side, then clamp so the result never leaves the image bounds.
+    x1 = max(0, x1 - mx)
+    y1 = max(0, y1 - my)
+    x2 = min(img_w, x2 + mx)
+    y2 = min(img_h, y2 + my)
+    return [x1, y1, x2, y2]
+# 10% margin on person/car group crop (expand crop before running D-FINE on it)
+PERSON_CAR_GROUP_MARGIN = 0.10
+# Min side (px) for object crops extracted from person/car crop before sending to classifier (objects in crop are larger)
+MIN_OBJECT_CROP_SIDE = 112
 def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
     """
     Expand the shorter side to match the longer (same ratio / square), centered, clamped to image.
     p = argparse.ArgumentParser(
         description="D-FINE (person/car) → group → Jina-CLIP-v2 on crops inside groups"
     )
+    p.add_argument("--refs", required=True, help="Reference images folder for Jina (e.g. refs/)")
     p.add_argument("--input", required=True, help="Full-frame images folder")
     p.add_argument("--output", default="pipeline_results", help="Output folder (CSV, etc.)")
     p.add_argument("--det-threshold", type=float, default=0.13, help="D-FINE score threshold")
     p.add_argument("--text-weight", type=float, default=0.3)
     p.add_argument("--max-images", type=int, default=None)
     p.add_argument("--device", default=None)
+    p.add_argument("--dfine-model", choices=["medium", "large"], default="large", help="D-FINE model size")
     return p.parse_args()
         raise SystemExit(f"No images in {input_dir}")
     # Load D-FINE
+    dfine_model_id = DFINE_MODEL_IDS.get(args.dfine_model, DFINE_MODEL_IDS["large"])
+    print(f"[*] Loading D-FINE ({dfine_model_id})...")
     t0 = time.perf_counter()
+    image_processor = AutoImageProcessor.from_pretrained(dfine_model_id)
+    dfine_model = DFineForObjectDetection.from_pretrained(dfine_model_id)
     dfine_model = dfine_model.to(device).eval()
     person_car_ids = get_person_car_label_ids(dfine_model)
     print(f"  Person/car label IDs: {person_car_ids} ({time.perf_counter()-t0:.1f}s)")
     )
     print(f"  Jina refs: {ref_labels} ({time.perf_counter()-t0:.1f}s)\n")
     jina_crops_dir = output_dir / "jina_crops"
     jina_crops_dir.mkdir(parents=True, exist_ok=True)
     # CSV
     csv_path = output_dir / "results.csv"
         "jina_prediction",
         "jina_confidence",
         "jina_status",
     ])
     for img_path in paths:
             args.det_threshold
         )
+        person_car = [d for d in detections if d["cls"] in person_car_ids and d["conf"] > PERSON_CAR_MIN_CONF]
         if not person_car:
             continue
         for gidx, grp in enumerate(top_groups):
             x1, y1, x2, y2 = grp["box"]
             group_box = [x1, y1, x2, y2]
+            group_box_with_margin = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
             inside = [
                 d for d in detections
+                if box_center_inside(d["box"], group_box_with_margin) and d["cls"] not in person_car_ids
             ]
             inside = deduplicate_by_iou(inside, iou_threshold=0.9)
                 if obj_w <= 0 or obj_h <= 0:
                     continue
+                # Small objects (min side < 24 px): expand by 60%; larger: 30%
+                min_side_obj = min(obj_w, obj_h)
+                pad_ratio = 0.6 if min_side_obj < 24 else 0.3
+                pad_x = obj_w * pad_ratio
+                pad_y = obj_h * pad_ratio
                 bx1 = max(0, int(bx1 - pad_x))
                 by1 = max(0, int(by1 - pad_y))
                 bx2 = min(img_w, int(bx2 + pad_x))
             if not any(is_same_object(expanded_box, k[0]) for k in kept):
                 kept.append(c)
+        # 5) Optionally squarify, then run Jina on kept crops
         for i, (expanded_box, d, gidx, crop_idx, x1, y1, x2, y2) in enumerate(kept):
             if not args.no_squarify:
                 bx1, by1, bx2, by2 = squarify_crop_box(
             ann_jina = draw_label_on_image(crop_pil, label_jina, conf_jina)
             ann_jina.save(jina_crops_dir / crop_name)
             w.writerow([
                 img_path.name,
                 crop_name,
                 result_jina["prediction"],
                 f"{result_jina['confidence']:.4f}",
                 result_jina["status"],
             ])
     f.close()
     print(f"[*] Wrote {csv_path}")
     print(f"[*] Jina crops: {jina_crops_dir}")
 # -----------------------------------------------------------------------------
+# Single-image runner for Gradio app: D-FINE first, then Jina
 # -----------------------------------------------------------------------------
+_APP_DFINE = None  # (model_id, image_processor, dfine_model, person_car_ids)
 _APP_JINA = None
 _APP_REFS_JINA = None
+DFINE_MODEL_IDS = {"medium": "ustc-community/dfine-medium-obj365", "large": "ustc-community/dfine-large-obj365"}
 def run_single_image(
     pil_image,
     refs_dir,
     device=None,
+    dfine_model="large",
     det_threshold=0.3,
     conf_threshold=0.75,
     gap_threshold=0.05,
     min_display_conf=None,
 ):
     """
+    Run D-FINE on one image, then classify small-object crops with Jina.
     refs_dir: path to refs folder (str or Path).
+    dfine_model: "medium" or "large".
     Returns (group_crop_images, known_crop_composites, status_message).
     """
     import numpy as np
         min_display_conf = MIN_DISPLAY_CONF
     from PIL import Image
+    global _APP_DFINE, _APP_JINA, _APP_REFS_JINA
     refs_dir = Path(refs_dir)
     if not refs_dir.is_dir():
         return [], [], f"Refs folder not found: {refs_dir}"
+    dfine_model = (dfine_model or "large").strip().lower()
+    if dfine_model not in DFINE_MODEL_IDS:
+        dfine_model = "large"
+    model_id = DFINE_MODEL_IDS[dfine_model]
     device = device or ("cuda" if torch.cuda.is_available() else "cpu")
     print(f"[*] Device: {device}")
     img_w, img_h = pil.size
     group_dist = 0.1 * max(img_h, img_w)
+    # Load D-FINE (reload if user switched model)
+    if _APP_DFINE is None or _APP_DFINE[0] != dfine_model:
+        print(f"[*] Loading D-FINE ({model_id})...")
+        image_processor = AutoImageProcessor.from_pretrained(model_id)
+        dfine_model_obj = DFineForObjectDetection.from_pretrained(model_id)
+        dfine_model_obj = dfine_model_obj.to(device).eval()
+        person_car_ids = get_person_car_label_ids(dfine_model_obj)
+        _APP_DFINE = (dfine_model, image_processor, dfine_model_obj, person_car_ids)
+    _model_id, image_processor, dfine_model_obj, person_car_ids = _APP_DFINE
+    # Apply user's D-FINE detection threshold to the chosen model (medium or large)
+    detections = run_dfine(pil, image_processor, dfine_model_obj, device, det_threshold)
+    person_car = [d for d in detections if d["cls"] in person_car_ids and d["conf"] > PERSON_CAR_MIN_CONF]
     if not person_car:
+        return [], [], "No person/car detected (or none with confidence > 0.9). No small-object crops."
     grouped = group_detections(person_car, group_dist)
     grouped.sort(key=lambda x: x["conf"], reverse=True)
     top_groups = grouped[:10]
+    # Load Jina encoder + refs (needed for classification)
+    if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
+        jina_encoder = JinaCLIPv2Encoder(device)
+        ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
+        _APP_JINA = (jina_encoder, ref_labels, ref_embs)
+        _APP_REFS_JINA = str(refs_dir)
+    jina_encoder, ref_labels, ref_embs = _APP_JINA
+    results_per_crop = []
+    group_crop_images = []
+    # For each person/car group: crop (with 10% margin), run D-FINE on crop, detect objects, then classify each
     for gidx, grp in enumerate(top_groups):
+        group_box = [grp["box"][0], grp["box"][1], grp["box"][2], grp["box"][3]]
+        crop_box = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
+        gx1 = max(0, int(crop_box[0]))
+        gy1 = max(0, int(crop_box[1]))
+        gx2 = min(img_w, int(crop_box[2]))
+        gy2 = min(img_h, int(crop_box[3]))
+        if gx2 <= gx1 or gy2 <= gy1:
+            continue
+        crop_pil = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
+        crop_w, crop_h = crop_pil.size
+        # Run D-FINE on person/car crop to detect objects inside
+        detections_crop = run_dfine(crop_pil, image_processor, dfine_model_obj, device, det_threshold)
+        inside = [d for d in detections_crop if d["cls"] not in person_car_ids]
         inside = deduplicate_by_iou(inside, iou_threshold=0.9)
+        candidates = []
+        for d in inside:
             bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
             obj_w, obj_h = bx2 - bx1, by2 - by1
             if obj_w <= 0 or obj_h <= 0:
                 continue
+            min_side_obj = min(obj_w, obj_h)
+            pad_ratio = 0.6 if min_side_obj < 24 else 0.3
+            pad_x = obj_w * pad_ratio
+            pad_y = obj_h * pad_ratio
+            bx1 = max(0.0, bx1 - pad_x)
+            by1 = max(0.0, by1 - pad_y)
+            bx2 = min(crop_w, bx2 + pad_x)
+            by2 = min(crop_h, by2 + pad_y)
             if bx2 <= bx1 or by2 <= by1:
                 continue
+            w, h = bx2 - bx1, by2 - by1
+            if min(w, h) < MIN_OBJECT_CROP_SIDE:
+                need = MIN_OBJECT_CROP_SIDE - min(w, h)
+                half = need / 2.0
+                if w < h:
+                    bx1 = max(0, bx1 - half)
+                    bx2 = min(crop_w, bx2 + half)
+                else:
+                    by1 = max(0, by1 - half)
+                    by2 = min(crop_h, by2 + half)
+                w, h = bx2 - bx1, by2 - by1
+                if w < MIN_OBJECT_CROP_SIDE:
+                    add = (MIN_OBJECT_CROP_SIDE - w) / 2
+                    bx1 = max(0, bx1 - add)
+                    bx2 = min(crop_w, bx2 + add)
+                if h < MIN_OBJECT_CROP_SIDE:
+                    add = (MIN_OBJECT_CROP_SIDE - h) / 2
+                    by1 = max(0, by1 - add)
+                    by2 = min(crop_h, by2 + add)
+            bx1, by1, bx2, by2 = int(bx1), int(by1), int(bx2), int(by2)
+            if bx2 <= bx1 or by2 <= by1:
                 continue
+            candidates.append(([bx1, by1, bx2, by2], d, gidx))
+        def crop_area(box):
+            return (box[2] - box[0]) * (box[3] - box[1])
+        candidates.sort(key=lambda c: -crop_area(c[0]))
+        kept = []
+        for c in candidates:
+            expanded_box = c[0]
+            if not any(
+                box_iou(expanded_box, k[0]) >= crop_dedup_iou
+                or box_center_inside(expanded_box, k[0])
+                or box_center_inside(k[0], expanded_box)
+                for k in kept
+            ):
+                kept.append(c)
+        for (bx1, by1, bx2, by2), d, _ in kept:
+            if squarify:
+                bx1, by1, bx2, by2 = squarify_crop_box(bx1, by1, bx2, by2, crop_w, crop_h)
+            small_crop = crop_pil.crop((bx1, by1, bx2, by2))
+            q = jina_encoder.encode_images([small_crop], TRUNCATE_DIM)
             result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
+            pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
+            conf = result["confidence"]
+            results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))
+        # Draw bboxes on this group crop (bboxes already in crop coords)
+        boxes_to_draw = [
+            (bx1, by1, bx2, by2, pred, conf)
+            for (gidx2, (bx1, by1, bx2, by2), _sc, pred, conf) in results_per_crop
+            if gidx2 == gidx
+        ]
+        if boxes_to_draw:
+            crop_pil_drawn = draw_bboxes_on_image(crop_pil.copy(), boxes_to_draw)
         else:
+            crop_pil_drawn = crop_pil
+        group_crop_images.append(np.array(crop_pil_drawn))
+    if not results_per_crop:
+        return group_crop_images if group_crop_images else [], [], "No small-object crops: D-FINE on person/car crops did not detect any object (gun/phone/etc.), or all were below min size."
     # Build known-only gallery: only objects with conf >= min_display_conf
     known_crop_composites = []