orik-ss committed on
Commit
cc88e05
·
2 Parent(s): bb55c4dcc2035e

Merge branch 'main' of https://huggingface.co/spaces/Napron/small_object_detection

Browse files
Files changed (2) hide show
  1. app.py +47 -20
  2. dfine_jina_pipeline.py +157 -199
app.py CHANGED
@@ -1,4 +1,4 @@
1
- """ Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + Classify (Jina or Nomic). """
2
 
3
  import os
4
  os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
@@ -9,7 +9,7 @@ import gradio as gr
9
  from ultralytics import YOLO
10
  from pathlib import Path
11
 
12
- # Tab 2: D-FINE runs first, then user chooses Jina or Nomic for crop classification
13
  from dfine_jina_pipeline import run_single_image
14
 
15
 
@@ -108,8 +108,8 @@ def run_detection(image, model):
108
  return out_img, det_json
109
 
110
 
111
- def run_dfine_classify(image, encoder_choice, refs_path, min_display_conf=0.7):
112
- """Tab 2: D-FINE first, then classify crops with Jina or Nomic.
113
  Returns (group_crop_gallery, known_crop_gallery, status_message).
114
  """
115
  if image is None:
@@ -120,15 +120,14 @@ def run_dfine_classify(image, encoder_choice, refs_path, min_display_conf=0.7):
120
  if not refs.is_dir():
121
  return [], [], f"Refs folder not found: {refs}"
122
 
123
- # Tuned on COCO GT: conf=0.5, gap=0.02.
124
- # Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local.
125
  group_crops, known_crops, status = run_single_image(
126
  image,
127
  refs_dir=refs,
128
- encoder_choice=encoder_choice.lower(),
129
- det_threshold=0.15,
130
  conf_threshold=0.5,
131
- gap_threshold=0.02,
132
  min_side=24,
133
  crop_dedup_iou=0.4,
134
  min_display_conf=float(min_display_conf),
@@ -230,10 +229,12 @@ with gr.Blocks(title="Small Object Detection") as app:
230
  with gr.TabItem("D-FINE + Classify"):
231
 
232
  gr.Markdown(
233
- "**D-FINE** runs first (person/car grouping), then small-object crops are classified. "
234
- "Choose **Jina** or **Nomic** for the embedding/classification model. "
235
  "Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) "
236
- "with reference images."
 
 
237
  )
238
 
239
  with gr.Row():
@@ -246,10 +247,28 @@ with gr.Blocks(title="Small Object Detection") as app:
246
  height=IMG_HEIGHT
247
  )
248
 
249
- encoder_choice = gr.Radio(
250
- choices=["Jina", "Nomic"],
251
- value="Jina",
252
- label="Embedding / classification model",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  )
254
 
255
  refs_path = gr.Textbox(
@@ -268,13 +287,21 @@ with gr.Blocks(title="Small Object Detection") as app:
268
  threshold_slider = gr.Slider(
269
  minimum=0.0,
270
  maximum=1.0,
271
- value=0.7,
272
- step=0.05,
273
  label="Threshold (min display confidence)",
274
  )
275
 
 
 
 
 
 
 
 
 
276
  out_gallery_dfine = gr.Gallery(
277
- label="Person/car crops (bboxes: gun, knife, cigarette, phone only)",
278
  height=IMG_HEIGHT,
279
  columns=2,
280
  object_fit="contain",
@@ -295,7 +322,7 @@ with gr.Blocks(title="Small Object Detection") as app:
295
 
296
  btn_dfine.click(
297
  fn=run_dfine_classify,
298
- inputs=[inp_dfine, encoder_choice, refs_path, threshold_slider],
299
  outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
300
  concurrency_limit=1,
301
  )
 
1
+ """ Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + Classify (Jina). """
2
 
3
  import os
4
  os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp")
 
9
  from ultralytics import YOLO
10
  from pathlib import Path
11
 
12
+ # Tab 2: D-FINE runs first, then Jina for crop classification
13
  from dfine_jina_pipeline import run_single_image
14
 
15
 
 
108
  return out_img, det_json
109
 
110
 
111
+ def run_dfine_classify(image, refs_path, dfine_threshold, dfine_model_choice, min_display_conf=0.703, gap_threshold=0.005):
112
+ """Tab 2: D-FINE first, then classify crops with Jina.
113
  Returns (group_crop_gallery, known_crop_gallery, status_message).
114
  """
115
  if image is None:
 
120
  if not refs.is_dir():
121
  return [], [], f"Refs folder not found: {refs}"
122
 
123
+ dfine_model = "large" if dfine_model_choice.strip().lower() == "large" else "medium"
 
124
  group_crops, known_crops, status = run_single_image(
125
  image,
126
  refs_dir=refs,
127
+ dfine_model=dfine_model,
128
+ det_threshold=float(dfine_threshold),
129
  conf_threshold=0.5,
130
+ gap_threshold=float(gap_threshold),
131
  min_side=24,
132
  crop_dedup_iou=0.4,
133
  min_display_conf=float(min_display_conf),
 
229
  with gr.TabItem("D-FINE + Classify"):
230
 
231
  gr.Markdown(
232
+ "**D-FINE** runs first (person/car grouping), then small-object crops are classified with **Jina**. "
233
+ "Choose D-FINE model size (Medium or Large). "
234
  "Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) "
235
+ "with reference images.\n\n"
236
+ "**Gap** = how much the top class (e.g. gun) must beat the next-best class (e.g. phone). "
237
+ "Bigger gap means the model is more sure; we only accept the label if both confidence and gap are high enough."
238
  )
239
 
240
  with gr.Row():
 
247
  height=IMG_HEIGHT
248
  )
249
 
250
+ dfine_model_radio = gr.Radio(
251
+ choices=["Medium", "Large"],
252
+ value="Large",
253
+ label="D-FINE model",
254
+ )
255
+
256
+ # Default threshold: Large=0.2, Medium=0.15 (slider updates when model changes)
257
+ dfine_threshold_slider = gr.Slider(
258
+ minimum=0.05,
259
+ maximum=0.5,
260
+ value=0.2,
261
+ step=0.05,
262
+ label="D-FINE detection threshold (applied to chosen model)",
263
+ )
264
+
265
def update_dfine_threshold_default(choice):
    """Return a slider update with the default detection threshold for the chosen model.

    "Large" (case-insensitive, surrounding whitespace ignored) maps to 0.2;
    any other value, including an empty or missing choice, maps to 0.15.
    """
    picked_large = bool(choice) and choice.strip().lower() == "large"
    default_threshold = 0.2 if picked_large else 0.15
    return gr.update(value=default_threshold)
267
+
268
+ dfine_model_radio.change(
269
+ fn=update_dfine_threshold_default,
270
+ inputs=[dfine_model_radio],
271
+ outputs=[dfine_threshold_slider],
272
  )
273
 
274
  refs_path = gr.Textbox(
 
287
  threshold_slider = gr.Slider(
288
  minimum=0.0,
289
  maximum=1.0,
290
+ value=0.703,
291
+ step=0.005,
292
  label="Threshold (min display confidence)",
293
  )
294
 
295
+ gap_slider = gr.Slider(
296
+ minimum=0.0,
297
+ maximum=0.02,
298
+ value=0.005,
299
+ step=0.001,
300
+ label="Gap: how much the top guess must beat the runner-up (higher = stricter, fewer accepted)",
301
+ )
302
+
303
  out_gallery_dfine = gr.Gallery(
304
+ label="Person/car crops (all D-FINE objects inside drawn with label + score)",
305
  height=IMG_HEIGHT,
306
  columns=2,
307
  object_fit="contain",
 
322
 
323
  btn_dfine.click(
324
  fn=run_dfine_classify,
325
+ inputs=[inp_dfine, refs_path, dfine_threshold_slider, dfine_model_radio, threshold_slider, gap_slider],
326
  outputs=[out_gallery_dfine, out_gallery_known, out_status_dfine],
327
  concurrency_limit=1,
328
  )
dfine_jina_pipeline.py CHANGED
@@ -1,6 +1,6 @@
1
  """ Pipeline: D-FINE (person/car only) → group detections → crop regions →
2
- find all bboxes inside each crop → Jina-CLIP-v2 and Nomic embeddings on those crops.
3
- Outputs separate crop folders per model (jina_crops, nomic_crops) for visual comparison.
4
  """
5
 
6
  import argparse
@@ -29,9 +29,8 @@ from jina_fewshot import (
29
  KNOWN_DISPLAY_CLASSES = {"gun", "knife", "cigarette", "phone"}
30
  # Only show objects (and group crops) with confidence >= this
31
  MIN_DISPLAY_CONF = 0.7
32
-
33
- from nomic_fewshot import NomicTextEncoder, NomicVisionEncoder, build_refs_nomic
34
-
35
 
36
  # -----------------------------------------------------------------------------
37
  # Detection + grouping (from reference_detection.py)
@@ -109,6 +108,27 @@ def box_center_inside(box, crop_box):
109
  )
110
 
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
113
  """
114
  Expand the shorter side to match the longer (same ratio / square), centered, clamped to image.
@@ -177,7 +197,7 @@ def parse_args():
177
  p = argparse.ArgumentParser(
178
  description="D-FINE (person/car) → group → Jina-CLIP-v2 on crops inside groups"
179
  )
180
- p.add_argument("--refs", required=True, help="Reference images folder for Jina and Nomic (e.g. refs/)")
181
  p.add_argument("--input", required=True, help="Full-frame images folder")
182
  p.add_argument("--output", default="pipeline_results", help="Output folder (CSV, etc.)")
183
  p.add_argument("--det-threshold", type=float, default=0.13, help="D-FINE score threshold")
@@ -191,6 +211,7 @@ def parse_args():
191
  p.add_argument("--text-weight", type=float, default=0.3)
192
  p.add_argument("--max-images", type=int, default=None)
193
  p.add_argument("--device", default=None)
 
194
  return p.parse_args()
195
 
196
 
@@ -282,10 +303,11 @@ def main():
282
  raise SystemExit(f"No images in {input_dir}")
283
 
284
  # Load D-FINE
285
- print("[*] Loading D-FINE (dfine-medium-obj365)...")
 
286
  t0 = time.perf_counter()
287
- image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-medium-obj365")
288
- dfine_model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-medium-obj365")
289
  dfine_model = dfine_model.to(device).eval()
290
  person_car_ids = get_person_car_label_ids(dfine_model)
291
  print(f" Person/car label IDs: {person_car_ids} ({time.perf_counter()-t0:.1f}s)")
@@ -303,25 +325,8 @@ def main():
303
  )
304
  print(f" Jina refs: {ref_labels} ({time.perf_counter()-t0:.1f}s)\n")
305
 
306
- # Load Nomic vision + text, build refs (same as Jina: image + text prompts, text_weight 0.3)
307
- print("[*] Loading Nomic embed-vision + embed-text and building refs...")
308
- t0 = time.perf_counter()
309
- nomic_encoder = NomicVisionEncoder(device)
310
- nomic_text_encoder = NomicTextEncoder(device)
311
- ref_labels_nomic, ref_embs_nomic = build_refs_nomic(
312
- nomic_encoder,
313
- refs_dir,
314
- batch_size=16,
315
- text_encoder=nomic_text_encoder,
316
- text_weight=args.text_weight,
317
- )
318
- print(f" Nomic refs: {ref_labels_nomic} ({time.perf_counter()-t0:.1f}s)\n")
319
-
320
- # Separate output folders per model for visual comparison
321
  jina_crops_dir = output_dir / "jina_crops"
322
- nomic_crops_dir = output_dir / "nomic_crops"
323
  jina_crops_dir.mkdir(parents=True, exist_ok=True)
324
- nomic_crops_dir.mkdir(parents=True, exist_ok=True)
325
 
326
  # CSV
327
  csv_path = output_dir / "results.csv"
@@ -344,9 +349,6 @@ def main():
344
  "jina_prediction",
345
  "jina_confidence",
346
  "jina_status",
347
- "nomic_prediction",
348
- "nomic_confidence",
349
- "nomic_status",
350
  ])
351
 
352
  for img_path in paths:
@@ -363,7 +365,7 @@ def main():
363
  args.det_threshold
364
  )
365
 
366
- person_car = [d for d in detections if d["cls"] in person_car_ids]
367
  if not person_car:
368
  continue
369
 
@@ -379,10 +381,11 @@ def main():
379
  for gidx, grp in enumerate(top_groups):
380
  x1, y1, x2, y2 = grp["box"]
381
  group_box = [x1, y1, x2, y2]
 
382
 
383
  inside = [
384
  d for d in detections
385
- if box_center_inside(d["box"], group_box) and d["cls"] not in person_car_ids
386
  ]
387
  inside = deduplicate_by_iou(inside, iou_threshold=0.9)
388
 
@@ -392,8 +395,11 @@ def main():
392
  if obj_w <= 0 or obj_h <= 0:
393
  continue
394
 
395
- pad_x = obj_w * 0.3
396
- pad_y = obj_h * 0.3
 
 
 
397
  bx1 = max(0, int(bx1 - pad_x))
398
  by1 = max(0, int(by1 - pad_y))
399
  bx2 = min(img_w, int(bx2 + pad_x))
@@ -428,7 +434,7 @@ def main():
428
  if not any(is_same_object(expanded_box, k[0]) for k in kept):
429
  kept.append(c)
430
 
431
- # 5) Optionally squarify, then run Jina and Nomic only on kept crops
432
  for i, (expanded_box, d, gidx, crop_idx, x1, y1, x2, y2) in enumerate(kept):
433
  if not args.no_squarify:
434
  bx1, by1, bx2, by2 = squarify_crop_box(
@@ -464,25 +470,6 @@ def main():
464
  ann_jina = draw_label_on_image(crop_pil, label_jina, conf_jina)
465
  ann_jina.save(jina_crops_dir / crop_name)
466
 
467
- q_nomic = nomic_encoder.encode_images([crop_pil])
468
- result_nomic = jina_classify(
469
- q_nomic,
470
- ref_labels_nomic,
471
- ref_embs_nomic,
472
- args.conf_threshold,
473
- args.gap_threshold
474
- )
475
-
476
- if result_nomic["prediction"] in ref_labels_nomic:
477
- label_nomic = result_nomic["prediction"]
478
- conf_nomic = result_nomic["confidence"]
479
- else:
480
- label_nomic = f"unnamed (dfine: {d['label']})"
481
- conf_nomic = 0.0
482
-
483
- ann_nomic = draw_label_on_image(crop_pil, label_nomic, conf_nomic)
484
- ann_nomic.save(nomic_crops_dir / crop_name)
485
-
486
  w.writerow([
487
  img_path.name,
488
  crop_name,
@@ -500,33 +487,29 @@ def main():
500
  result_jina["prediction"],
501
  f"{result_jina['confidence']:.4f}",
502
  result_jina["status"],
503
- result_nomic["prediction"],
504
- f"{result_nomic['confidence']:.4f}",
505
- result_nomic["status"],
506
  ])
507
 
508
  f.close()
509
  print(f"[*] Wrote {csv_path}")
510
  print(f"[*] Jina crops: {jina_crops_dir}")
511
- print(f"[*] Nomic crops: {nomic_crops_dir}")
512
 
513
 
514
  # -----------------------------------------------------------------------------
515
- # Single-image runner for Gradio app: D-FINE first, then Jina or Nomic (user choice)
516
  # -----------------------------------------------------------------------------
517
 
518
- _APP_DFINE = None
519
  _APP_JINA = None
520
- _APP_NOMIC = None
521
  _APP_REFS_JINA = None
522
- _APP_REFS_NOMIC = None
 
523
 
524
 
525
  def run_single_image(
526
  pil_image,
527
  refs_dir,
528
  device=None,
529
- encoder_choice="jina",
530
  det_threshold=0.3,
531
  conf_threshold=0.75,
532
  gap_threshold=0.05,
@@ -536,15 +519,12 @@ def run_single_image(
536
  min_display_conf=None,
537
  ):
538
  """
539
- Run D-FINE on one image, then classify small-object crops with Jina or Nomic.
540
 
541
  refs_dir: path to refs folder (str or Path).
542
- encoder_choice: "jina" or "nomic".
543
 
544
  Returns (group_crop_images, known_crop_composites, status_message).
545
- - group_crop_images: list of PIL/numpy (one per person/car group, with bboxes for known objects only).
546
- - known_crop_composites: list of PIL/numpy (label+score above + crop) for known classes only.
547
- - status_message: None on success, or error/empty-state string.
548
  """
549
  import numpy as np
550
 
@@ -552,12 +532,17 @@ def run_single_image(
552
  min_display_conf = MIN_DISPLAY_CONF
553
  from PIL import Image
554
 
555
- global _APP_DFINE, _APP_JINA, _APP_NOMIC, _APP_REFS_JINA, _APP_REFS_NOMIC
556
 
557
  refs_dir = Path(refs_dir)
558
  if not refs_dir.is_dir():
559
  return [], [], f"Refs folder not found: {refs_dir}"
560
 
 
 
 
 
 
561
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
562
  print(f"[*] Device: {device}")
563
 
@@ -565,163 +550,136 @@ def run_single_image(
565
  img_w, img_h = pil.size
566
  group_dist = 0.1 * max(img_h, img_w)
567
 
568
- # Load D-FINE once
569
- if _APP_DFINE is None:
570
- image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine-medium-obj365")
571
- dfine_model = DFineForObjectDetection.from_pretrained("ustc-community/dfine-medium-obj365")
572
- dfine_model = dfine_model.to(device).eval()
573
- person_car_ids = get_person_car_label_ids(dfine_model)
574
- _APP_DFINE = (image_processor, dfine_model, person_car_ids)
 
575
 
576
- image_processor, dfine_model, person_car_ids = _APP_DFINE
577
 
578
- detections = run_dfine(pil, image_processor, dfine_model, device, det_threshold)
579
- person_car = [d for d in detections if d["cls"] in person_car_ids]
 
580
  if not person_car:
581
- return [], [], "No person/car detected. No small-object crops."
582
 
583
  grouped = group_detections(person_car, group_dist)
584
  grouped.sort(key=lambda x: x["conf"], reverse=True)
585
  top_groups = grouped[:10]
586
 
587
- candidates = []
 
 
 
 
 
 
 
588
 
 
 
 
 
589
  for gidx, grp in enumerate(top_groups):
590
- x1, y1, x2, y2 = grp["box"]
591
- group_box = [x1, y1, x2, y2]
 
 
 
 
 
 
 
 
592
 
593
- inside = [
594
- d for d in detections
595
- if box_center_inside(d["box"], group_box) and d["cls"] not in person_car_ids
596
- ]
597
  inside = deduplicate_by_iou(inside, iou_threshold=0.9)
598
 
599
- for crop_idx, d in enumerate(inside):
 
600
  bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
601
  obj_w, obj_h = bx2 - bx1, by2 - by1
602
  if obj_w <= 0 or obj_h <= 0:
603
  continue
604
-
605
- pad_x, pad_y = obj_w * 0.3, obj_h * 0.3
606
- bx1 = max(0, int(bx1 - pad_x))
607
- by1 = max(0, int(by1 - pad_y))
608
- bx2 = min(img_w, int(bx2 + pad_x))
609
- by2 = min(img_h, int(by2 + pad_y))
610
-
 
611
  if bx2 <= bx1 or by2 <= by1:
612
  continue
613
-
614
- if min(bx2 - bx1, by2 - by1) < min_side:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  continue
 
616
 
617
- expanded_box = [bx1, by1, bx2, by2]
618
- candidates.append((expanded_box, d, gidx, crop_idx))
619
-
620
- def crop_area(box):
621
- return (box[2] - box[0]) * (box[3] - box[1])
622
-
623
- candidates.sort(key=lambda c: -crop_area(c[0]))
624
- kept = []
625
-
626
- for c in candidates:
627
- def is_same_object(box_a, box_b):
628
- if box_iou(box_a, box_b) >= crop_dedup_iou:
629
- return True
630
- if box_center_inside(box_a, box_b) or box_center_inside(box_b, box_a):
631
- return True
632
- return False
633
-
634
- if not any(is_same_object(c[0], k[0]) for k in kept):
635
- kept.append(c)
636
-
637
- if not kept:
638
- if not candidates:
639
- return [], [], "No small-object crops: D-FINE did not detect any object (gun/phone/etc.) inside person/car areas, or all were below min size. Try a higher-resolution image."
640
- return [], [], "No small-object crops (after dedup)."
641
-
642
- # Load encoder + refs for chosen model
643
- if encoder_choice == "jina":
644
- if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
645
- jina_encoder = JinaCLIPv2Encoder(device)
646
- ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
647
- _APP_JINA = (jina_encoder, ref_labels, ref_embs)
648
- _APP_REFS_JINA = str(refs_dir)
649
-
650
- jina_encoder, ref_labels, ref_embs = _APP_JINA
651
- else:
652
- if _APP_NOMIC is None or _APP_REFS_NOMIC != str(refs_dir):
653
- nomic_encoder = NomicVisionEncoder(device)
654
- nomic_text_encoder = NomicTextEncoder(device)
655
- ref_labels, ref_embs = build_refs_nomic(
656
- nomic_encoder,
657
- refs_dir,
658
- batch_size=16,
659
- text_encoder=nomic_text_encoder,
660
- text_weight=0.3,
661
- )
662
- _APP_NOMIC = (nomic_encoder, ref_labels, ref_embs)
663
- _APP_REFS_NOMIC = str(refs_dir)
664
-
665
- nomic_encoder, ref_labels, ref_embs = _APP_NOMIC
666
-
667
- # Classify each kept crop and store (gidx, box_in_full_image, crop_pil, pred, conf)
668
- results_per_crop = []
669
- for expanded_box, d, gidx, crop_idx in kept:
670
- if squarify:
671
- bx1, by1, bx2, by2 = squarify_crop_box(
672
- expanded_box[0],
673
- expanded_box[1],
674
- expanded_box[2],
675
- expanded_box[3],
676
- img_w,
677
- img_h,
678
- )
679
- else:
680
- bx1, by1, bx2, by2 = expanded_box[0], expanded_box[1], expanded_box[2], expanded_box[3]
681
 
682
- crop_pil = pil.crop((bx1, by1, bx2, by2))
 
 
 
 
 
 
 
 
 
 
683
 
684
- if encoder_choice == "jina":
685
- q = jina_encoder.encode_images([crop_pil], TRUNCATE_DIM)
 
 
 
686
  result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
 
 
 
 
 
 
 
 
 
 
 
 
687
  else:
688
- q = nomic_encoder.encode_images([crop_pil])
689
- result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
690
-
691
- pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
692
- conf = result["confidence"]
693
- results_per_crop.append((gidx, (bx1, by1, bx2, by2), crop_pil, pred, conf))
694
 
695
- # Build group crop images: only groups that contain at least one known object with conf >= MIN_DISPLAY_CONF
696
- group_crop_images = []
697
- for gidx, grp in enumerate(top_groups):
698
- gx1, gy1, gx2, gy2 = grp["box"]
699
- gx1, gy1 = int(gx1), int(gy1)
700
- gx2, gy2 = int(gx2), int(gy2)
701
- gx1, gy1 = max(0, gx1), max(0, gy1)
702
- gx2, gy2 = min(img_w, gx2), min(img_h, gy2)
703
- if gx2 <= gx1 or gy2 <= gy1:
704
- continue
705
- group_crop = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
706
- crop_w, crop_h = group_crop.size
707
-
708
- boxes_to_draw = []
709
- for (gidx2, (bx1, by1, bx2, by2), _crop_pil, pred, conf) in results_per_crop:
710
- if gidx2 != gidx or pred not in KNOWN_DISPLAY_CLASSES or conf < min_display_conf:
711
- continue
712
- # Convert to group-crop-relative coords and clamp
713
- rx1 = max(0, min(crop_w, bx1 - gx1))
714
- ry1 = max(0, min(crop_h, by1 - gy1))
715
- rx2 = max(0, min(crop_w, bx2 - gx1))
716
- ry2 = max(0, min(crop_h, by2 - gy1))
717
- if rx2 > rx1 and ry2 > ry1:
718
- boxes_to_draw.append((rx1, ry1, rx2, ry2, pred, conf))
719
-
720
- # Only show this group crop if it has at least one known object >= min_display_conf
721
- if not boxes_to_draw:
722
- continue
723
- group_crop = draw_bboxes_on_image(group_crop, boxes_to_draw)
724
- group_crop_images.append(np.array(group_crop))
725
 
726
  # Build known-only gallery: only objects with conf >= min_display_conf
727
  known_crop_composites = []
 
1
  """ Pipeline: D-FINE (person/car only) → group detections → crop regions →
2
+ find all bboxes inside each crop → Jina-CLIP-v2 embeddings and classification.
3
+ Outputs jina_crops folder and results CSV.
4
  """
5
 
6
  import argparse
 
29
  KNOWN_DISPLAY_CLASSES = {"gun", "knife", "cigarette", "phone"}
30
  # Only show objects (and group crops) with confidence >= this
31
  MIN_DISPLAY_CONF = 0.7
32
+ # Person/car detections must have confidence > this to be used for grouping
33
+ PERSON_CAR_MIN_CONF = 0.9
 
34
 
35
  # -----------------------------------------------------------------------------
36
  # Detection + grouping (from reference_detection.py)
 
108
  )
109
 
110
 
111
def expand_box_by_margin(box, margin_ratio, img_w, img_h):
    """Expand a box by a fraction of its own size on every side, clamped to the image.

    Args:
        box: Sequence ``[x1, y1, x2, y2]`` in pixel coordinates.
        margin_ratio: Fraction of the box width/height added on each side
            (e.g. 0.1 adds 10% of the width left and right, and 10% of the
            height top and bottom).
        img_w: Image width; the right edge is clamped to this value.
        img_h: Image height; the bottom edge is clamped to this value.

    Returns:
        A new list ``[x1, y1, x2, y2]``. A degenerate box (zero or negative
        width or height) is returned with its coordinates unchanged, but
        always as a fresh list so the result type is the same on every path
        and the caller's object is never aliased.
    """
    x1, y1, x2, y2 = box
    width = x2 - x1
    height = y2 - y1
    if width <= 0 or height <= 0:
        # Nothing sensible to expand; hand back a copy rather than the input itself.
        return [x1, y1, x2, y2]

    margin_x = width * margin_ratio
    margin_y = height * margin_ratio
    return [
        max(0, x1 - margin_x),
        max(0, y1 - margin_y),
        min(img_w, x2 + margin_x),
        min(img_h, y2 + margin_y),
    ]
124
+
125
+
126
+ # 10% margin on person/car group crop (expand crop before running D-FINE on it)
127
+ PERSON_CAR_GROUP_MARGIN = 0.10
128
+ # Min side (px) for object crops extracted from person/car crop before sending to classifier (objects in crop are larger)
129
+ MIN_OBJECT_CROP_SIDE = 112
130
+
131
+
132
  def squarify_crop_box(bx1, by1, bx2, by2, img_w, img_h):
133
  """
134
  Expand the shorter side to match the longer (same ratio / square), centered, clamped to image.
 
197
  p = argparse.ArgumentParser(
198
  description="D-FINE (person/car) → group → Jina-CLIP-v2 on crops inside groups"
199
  )
200
+ p.add_argument("--refs", required=True, help="Reference images folder for Jina (e.g. refs/)")
201
  p.add_argument("--input", required=True, help="Full-frame images folder")
202
  p.add_argument("--output", default="pipeline_results", help="Output folder (CSV, etc.)")
203
  p.add_argument("--det-threshold", type=float, default=0.13, help="D-FINE score threshold")
 
211
  p.add_argument("--text-weight", type=float, default=0.3)
212
  p.add_argument("--max-images", type=int, default=None)
213
  p.add_argument("--device", default=None)
214
+ p.add_argument("--dfine-model", choices=["medium", "large"], default="large", help="D-FINE model size")
215
  return p.parse_args()
216
 
217
 
 
303
  raise SystemExit(f"No images in {input_dir}")
304
 
305
  # Load D-FINE
306
+ dfine_model_id = DFINE_MODEL_IDS.get(args.dfine_model, DFINE_MODEL_IDS["large"])
307
+ print(f"[*] Loading D-FINE ({dfine_model_id})...")
308
  t0 = time.perf_counter()
309
+ image_processor = AutoImageProcessor.from_pretrained(dfine_model_id)
310
+ dfine_model = DFineForObjectDetection.from_pretrained(dfine_model_id)
311
  dfine_model = dfine_model.to(device).eval()
312
  person_car_ids = get_person_car_label_ids(dfine_model)
313
  print(f" Person/car label IDs: {person_car_ids} ({time.perf_counter()-t0:.1f}s)")
 
325
  )
326
  print(f" Jina refs: {ref_labels} ({time.perf_counter()-t0:.1f}s)\n")
327
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  jina_crops_dir = output_dir / "jina_crops"
 
329
  jina_crops_dir.mkdir(parents=True, exist_ok=True)
 
330
 
331
  # CSV
332
  csv_path = output_dir / "results.csv"
 
349
  "jina_prediction",
350
  "jina_confidence",
351
  "jina_status",
 
 
 
352
  ])
353
 
354
  for img_path in paths:
 
365
  args.det_threshold
366
  )
367
 
368
+ person_car = [d for d in detections if d["cls"] in person_car_ids and d["conf"] > PERSON_CAR_MIN_CONF]
369
  if not person_car:
370
  continue
371
 
 
381
  for gidx, grp in enumerate(top_groups):
382
  x1, y1, x2, y2 = grp["box"]
383
  group_box = [x1, y1, x2, y2]
384
+ group_box_with_margin = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
385
 
386
  inside = [
387
  d for d in detections
388
+ if box_center_inside(d["box"], group_box_with_margin) and d["cls"] not in person_car_ids
389
  ]
390
  inside = deduplicate_by_iou(inside, iou_threshold=0.9)
391
 
 
395
  if obj_w <= 0 or obj_h <= 0:
396
  continue
397
 
398
+ # Small objects (min side < 24 px): expand by 60%; larger: 30%
399
+ min_side_obj = min(obj_w, obj_h)
400
+ pad_ratio = 0.6 if min_side_obj < 24 else 0.3
401
+ pad_x = obj_w * pad_ratio
402
+ pad_y = obj_h * pad_ratio
403
  bx1 = max(0, int(bx1 - pad_x))
404
  by1 = max(0, int(by1 - pad_y))
405
  bx2 = min(img_w, int(bx2 + pad_x))
 
434
  if not any(is_same_object(expanded_box, k[0]) for k in kept):
435
  kept.append(c)
436
 
437
+ # 5) Optionally squarify, then run Jina on kept crops
438
  for i, (expanded_box, d, gidx, crop_idx, x1, y1, x2, y2) in enumerate(kept):
439
  if not args.no_squarify:
440
  bx1, by1, bx2, by2 = squarify_crop_box(
 
470
  ann_jina = draw_label_on_image(crop_pil, label_jina, conf_jina)
471
  ann_jina.save(jina_crops_dir / crop_name)
472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  w.writerow([
474
  img_path.name,
475
  crop_name,
 
487
  result_jina["prediction"],
488
  f"{result_jina['confidence']:.4f}",
489
  result_jina["status"],
 
 
 
490
  ])
491
 
492
  f.close()
493
  print(f"[*] Wrote {csv_path}")
494
  print(f"[*] Jina crops: {jina_crops_dir}")
 
495
 
496
 
497
  # -----------------------------------------------------------------------------
498
+ # Single-image runner for Gradio app: D-FINE first, then Jina
499
  # -----------------------------------------------------------------------------
500
 
501
+ _APP_DFINE = None # (model_id, image_processor, dfine_model, person_car_ids)
502
  _APP_JINA = None
 
503
  _APP_REFS_JINA = None
504
+
505
+ DFINE_MODEL_IDS = {"medium": "ustc-community/dfine-medium-obj365", "large": "ustc-community/dfine-large-obj365"}
506
 
507
 
508
  def run_single_image(
509
  pil_image,
510
  refs_dir,
511
  device=None,
512
+ dfine_model="large",
513
  det_threshold=0.3,
514
  conf_threshold=0.75,
515
  gap_threshold=0.05,
 
519
  min_display_conf=None,
520
  ):
521
  """
522
+ Run D-FINE on one image, then classify small-object crops with Jina.
523
 
524
  refs_dir: path to refs folder (str or Path).
525
+ dfine_model: "medium" or "large".
526
 
527
  Returns (group_crop_images, known_crop_composites, status_message).
 
 
 
528
  """
529
  import numpy as np
530
 
 
532
  min_display_conf = MIN_DISPLAY_CONF
533
  from PIL import Image
534
 
535
+ global _APP_DFINE, _APP_JINA, _APP_REFS_JINA
536
 
537
  refs_dir = Path(refs_dir)
538
  if not refs_dir.is_dir():
539
  return [], [], f"Refs folder not found: {refs_dir}"
540
 
541
+ dfine_model = (dfine_model or "large").strip().lower()
542
+ if dfine_model not in DFINE_MODEL_IDS:
543
+ dfine_model = "large"
544
+ model_id = DFINE_MODEL_IDS[dfine_model]
545
+
546
  device = device or ("cuda" if torch.cuda.is_available() else "cpu")
547
  print(f"[*] Device: {device}")
548
 
 
550
  img_w, img_h = pil.size
551
  group_dist = 0.1 * max(img_h, img_w)
552
 
553
+ # Load D-FINE (reload if user switched model)
554
+ if _APP_DFINE is None or _APP_DFINE[0] != dfine_model:
555
+ print(f"[*] Loading D-FINE ({model_id})...")
556
+ image_processor = AutoImageProcessor.from_pretrained(model_id)
557
+ dfine_model_obj = DFineForObjectDetection.from_pretrained(model_id)
558
+ dfine_model_obj = dfine_model_obj.to(device).eval()
559
+ person_car_ids = get_person_car_label_ids(dfine_model_obj)
560
+ _APP_DFINE = (dfine_model, image_processor, dfine_model_obj, person_car_ids)
561
 
562
+ _model_id, image_processor, dfine_model_obj, person_car_ids = _APP_DFINE
563
 
564
+ # Apply user's D-FINE detection threshold to the chosen model (medium or large)
565
+ detections = run_dfine(pil, image_processor, dfine_model_obj, device, det_threshold)
566
+ person_car = [d for d in detections if d["cls"] in person_car_ids and d["conf"] > PERSON_CAR_MIN_CONF]
567
  if not person_car:
568
+ return [], [], "No person/car detected (or none with confidence > 0.9). No small-object crops."
569
 
570
  grouped = group_detections(person_car, group_dist)
571
  grouped.sort(key=lambda x: x["conf"], reverse=True)
572
  top_groups = grouped[:10]
573
 
574
+ # Load Jina encoder + refs (needed for classification)
575
+ if _APP_JINA is None or _APP_REFS_JINA != str(refs_dir):
576
+ jina_encoder = JinaCLIPv2Encoder(device)
577
+ ref_labels, ref_embs = build_refs(jina_encoder, refs_dir, TRUNCATE_DIM, 0.3, batch_size=16)
578
+ _APP_JINA = (jina_encoder, ref_labels, ref_embs)
579
+ _APP_REFS_JINA = str(refs_dir)
580
+
581
+ jina_encoder, ref_labels, ref_embs = _APP_JINA
582
 
583
+ results_per_crop = []
584
+ group_crop_images = []
585
+
586
+ # For each person/car group: crop (with 10% margin), run D-FINE on crop, detect objects, then classify each
587
  for gidx, grp in enumerate(top_groups):
588
+ group_box = [grp["box"][0], grp["box"][1], grp["box"][2], grp["box"][3]]
589
+ crop_box = expand_box_by_margin(group_box, PERSON_CAR_GROUP_MARGIN, img_w, img_h)
590
+ gx1 = max(0, int(crop_box[0]))
591
+ gy1 = max(0, int(crop_box[1]))
592
+ gx2 = min(img_w, int(crop_box[2]))
593
+ gy2 = min(img_h, int(crop_box[3]))
594
+ if gx2 <= gx1 or gy2 <= gy1:
595
+ continue
596
+ crop_pil = pil.crop((gx1, gy1, gx2, gy2)).copy().convert("RGB")
597
+ crop_w, crop_h = crop_pil.size
598
 
599
+ # Run D-FINE on person/car crop to detect objects inside
600
+ detections_crop = run_dfine(crop_pil, image_processor, dfine_model_obj, device, det_threshold)
601
+ inside = [d for d in detections_crop if d["cls"] not in person_car_ids]
 
602
  inside = deduplicate_by_iou(inside, iou_threshold=0.9)
603
 
604
+ candidates = []
605
+ for d in inside:
606
  bx1, by1, bx2, by2 = [float(x) for x in d["box"]]
607
  obj_w, obj_h = bx2 - bx1, by2 - by1
608
  if obj_w <= 0 or obj_h <= 0:
609
  continue
610
+ min_side_obj = min(obj_w, obj_h)
611
+ pad_ratio = 0.6 if min_side_obj < 24 else 0.3
612
+ pad_x = obj_w * pad_ratio
613
+ pad_y = obj_h * pad_ratio
614
+ bx1 = max(0.0, bx1 - pad_x)
615
+ by1 = max(0.0, by1 - pad_y)
616
+ bx2 = min(crop_w, bx2 + pad_x)
617
+ by2 = min(crop_h, by2 + pad_y)
618
  if bx2 <= bx1 or by2 <= by1:
619
  continue
620
+ w, h = bx2 - bx1, by2 - by1
621
+ if min(w, h) < MIN_OBJECT_CROP_SIDE:
622
+ need = MIN_OBJECT_CROP_SIDE - min(w, h)
623
+ half = need / 2.0
624
+ if w < h:
625
+ bx1 = max(0, bx1 - half)
626
+ bx2 = min(crop_w, bx2 + half)
627
+ else:
628
+ by1 = max(0, by1 - half)
629
+ by2 = min(crop_h, by2 + half)
630
+ w, h = bx2 - bx1, by2 - by1
631
+ if w < MIN_OBJECT_CROP_SIDE:
632
+ add = (MIN_OBJECT_CROP_SIDE - w) / 2
633
+ bx1 = max(0, bx1 - add)
634
+ bx2 = min(crop_w, bx2 + add)
635
+ if h < MIN_OBJECT_CROP_SIDE:
636
+ add = (MIN_OBJECT_CROP_SIDE - h) / 2
637
+ by1 = max(0, by1 - add)
638
+ by2 = min(crop_h, by2 + add)
639
+ bx1, by1, bx2, by2 = int(bx1), int(by1), int(bx2), int(by2)
640
+ if bx2 <= bx1 or by2 <= by1:
641
  continue
642
+ candidates.append(([bx1, by1, bx2, by2], d, gidx))
643
 
644
+ def crop_area(box):
645
+ return (box[2] - box[0]) * (box[3] - box[1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
 
647
+ candidates.sort(key=lambda c: -crop_area(c[0]))
648
+ kept = []
649
+ for c in candidates:
650
+ expanded_box = c[0]
651
+ if not any(
652
+ box_iou(expanded_box, k[0]) >= crop_dedup_iou
653
+ or box_center_inside(expanded_box, k[0])
654
+ or box_center_inside(k[0], expanded_box)
655
+ for k in kept
656
+ ):
657
+ kept.append(c)
658
 
659
+ for (bx1, by1, bx2, by2), d, _ in kept:
660
+ if squarify:
661
+ bx1, by1, bx2, by2 = squarify_crop_box(bx1, by1, bx2, by2, crop_w, crop_h)
662
+ small_crop = crop_pil.crop((bx1, by1, bx2, by2))
663
+ q = jina_encoder.encode_images([small_crop], TRUNCATE_DIM)
664
  result = jina_classify(q, ref_labels, ref_embs, conf_threshold, gap_threshold)
665
+ pred = result["prediction"] if result["prediction"] in ref_labels else f"unknown ({d['label']})"
666
+ conf = result["confidence"]
667
+ results_per_crop.append((gidx, (bx1, by1, bx2, by2), small_crop, pred, conf))
668
+
669
+ # Draw bboxes on this group crop (bboxes already in crop coords)
670
+ boxes_to_draw = [
671
+ (bx1, by1, bx2, by2, pred, conf)
672
+ for (gidx2, (bx1, by1, bx2, by2), _sc, pred, conf) in results_per_crop
673
+ if gidx2 == gidx
674
+ ]
675
+ if boxes_to_draw:
676
+ crop_pil_drawn = draw_bboxes_on_image(crop_pil.copy(), boxes_to_draw)
677
  else:
678
+ crop_pil_drawn = crop_pil
679
+ group_crop_images.append(np.array(crop_pil_drawn))
 
 
 
 
680
 
681
+ if not results_per_crop:
682
+ return group_crop_images if group_crop_images else [], [], "No small-object crops: D-FINE on person/car crops did not detect any object (gun/phone/etc.), or all were below min size."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
 
684
  # Build known-only gallery: only objects with conf >= min_display_conf
685
  known_crop_composites = []