""" Gradio app: Tab 1 = Object Detection (YOLO models/v1), Tab 2 = D-FINE + Classify (Jina or Nomic). """ import os os.environ["YOLO_CONFIG_DIR"] = os.environ.get("YOLO_CONFIG_DIR", "/tmp") import json import numpy as np import gradio as gr from ultralytics import YOLO from pathlib import Path # Tab 2: D-FINE runs first, then user chooses Jina or Nomic for crop classification from dfine_jina_pipeline import run_single_image # --- Object Detection (Tab 1) --- PERSON_CLASS = 0 CAR_CLASS = 2 KNIFE_CLASS = 80 WEAPON_CLASS = 81 DRAW_CLASSES = [PERSON_CLASS, CAR_CLASS, KNIFE_CLASS, WEAPON_CLASS] CLASS_NAMES = { PERSON_CLASS: "person", CAR_CLASS: "car", KNIFE_CLASS: "knife", WEAPON_CLASS: "weapon", } CONF = 0.25 IMGSZ = 640 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) MODELS_DIR = os.path.join(BASE_DIR, "models") REFS_DIR = os.path.join(BASE_DIR, "refs") def _load_model(version: str): path = os.path.join(MODELS_DIR, version, "best.pt") if not os.path.isfile(path): raise FileNotFoundError(f"Model not found: {path}") return YOLO(path) MODELS = {"v1": _load_model("v1")} MODEL_CLASSES = {"v1": ["person", "car", "knife", "weapon"]} def run_detection(image, model): if image is None: return None, "{}" img = image if isinstance(image, np.ndarray) else np.array(image) if img.ndim == 2: img = np.stack([img] * 3, axis=-1) results = model.predict( source=img, imgsz=IMGSZ, conf=CONF, device="cpu", verbose=False, ) r = results[0] if r.boxes is None or len(r.boxes) == 0: return image, json.dumps({"detections": []}, indent=2) clss = r.boxes.cls.cpu().numpy() confs = r.boxes.conf.cpu().numpy() keep = [i for i in range(len(r.boxes)) if int(clss[i]) in DRAW_CLASSES] if not keep: return image, json.dumps({"detections": []}, indent=2) detections = [] for i in keep: cls_id = int(clss[i]) detections.append({ "class": CLASS_NAMES.get(cls_id, str(cls_id)), "confidence": round(float(confs[i]), 3), "bbox": r.boxes.xyxy[i].cpu().numpy().tolist(), }) r.boxes = r.boxes[keep] out_img = r.plot() det_json = json.dumps({"detections": detections}, indent=2) return out_img, det_json def run_dfine_classify(image, encoder_choice, refs_path): """Tab 2: D-FINE first, then classify crops with Jina or Nomic.""" if image is None: return None, "Upload an image." refs = Path(refs_path.strip()) if refs_path and refs_path.strip() else Path(REFS_DIR) if not refs.is_dir(): return None, f"Refs folder not found: {refs}" # Tuned on COCO GT: conf=0.5, gap=0.02. Lower det_threshold/min_side so D-FINE picks up more objects (gun, phone, etc.) like local. out_img, text = run_single_image( image, refs_dir=refs, encoder_choice=encoder_choice.lower(), det_threshold=0.15, conf_threshold=0.5, gap_threshold=0.02, min_side=24, crop_dedup_iou=0.4, ) if out_img is None: return None, text return out_img, text IMG_HEIGHT = 400 TAB_STYLE = """ """ with gr.Blocks(title="Small Object Detection") as app: gr.HTML(TAB_STYLE) gr.Markdown("# Small Object Detection") with gr.Tabs(): with gr.TabItem("Object Detection"): gr.Markdown("**Classes:** " + ", ".join(MODEL_CLASSES["v1"])) with gr.Row(): with gr.Column(scale=1): inp_det = gr.Image(label="Input image", height=IMG_HEIGHT) btn_det = gr.Button("Detect", variant="primary") out_img_det = gr.Image(label="Output", height=IMG_HEIGHT) det_output = gr.JSON(label="Detections") btn_det.click( fn=lambda img: run_detection(img, MODELS["v1"]), inputs=inp_det, outputs=[out_img_det, det_output], ) with gr.TabItem("D-FINE + Classify"): gr.Markdown( "**D-FINE** runs first (person/car grouping), then small-object crops are classified. " "Choose **Jina** or **Nomic** for the embedding/classification model. " "Uses the **refs** folder (one subfolder per class, e.g. refs/phone/, refs/cigarette/) with reference images." ) with gr.Row(): with gr.Column(scale=1): inp_dfine = gr.Image(type="pil", label="Input image", height=IMG_HEIGHT) encoder_choice = gr.Radio( choices=["Jina", "Nomic"], value="Jina", label="Embedding / classification model", ) refs_path = gr.Textbox( label="Refs folder path", value=REFS_DIR, placeholder="e.g. refs or /path/to/refs", ) btn_dfine = gr.Button("Run D-FINE + Classify", variant="primary") with gr.Column(scale=1): out_img_dfine = gr.Image(label="Output (crops with labels)", height=IMG_HEIGHT) out_text_dfine = gr.Textbox(label="Crop predictions", lines=10, interactive=False) btn_dfine.click( fn=run_dfine_classify, inputs=[inp_dfine, encoder_choice, refs_path], outputs=[out_img_dfine, out_text_dfine], concurrency_limit=1, ) app.launch( server_name=os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0"), server_port=int(os.environ.get("PORT", os.environ.get("GRADIO_SERVER_PORT", 7860))), )