""" models/detector.py ─────────────────── Open-vocabulary object detector using OWL-ViT (via HuggingFace transformers). Replaces GroundingDINO which requires custom CUDA C++ ops that fail to compile on HuggingFace Spaces and many cloud environments. OWL-ViT: - Pure Python / PyTorch — no custom ops, works on CPU and CUDA - Open-vocabulary: detects any object described in natural language - Comparable accuracy to GroundingDINO for common objects - Auto-downloaded from HuggingFace Hub on first use (~600 MB) """ import os import sys from typing import List, Tuple, Dict, Any import torch from PIL import Image sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from config import DEVICE, GDINO_BOX_THRESHOLD OWLVIT_MODEL = "google/owlvit-base-patch32" # smaller/faster # Alternative: "google/owlvit-large-patch14" # more accurate, ~1.2 GB class GroundingDINODetector: """ Drop-in replacement for the original GroundingDINODetector. Internally uses OWL-ViT — same public API, no CUDA custom ops. """ def __init__(self) -> None: self._model = None self._processor = None def _load(self) -> None: if self._model is not None: return print(" Loading OWL-ViT detector …") try: from transformers import OwlViTProcessor, OwlViTForObjectDetection except ImportError: raise RuntimeError("OWL-ViT requires 'transformers'. Run: pip install transformers") self._processor = OwlViTProcessor.from_pretrained(OWLVIT_MODEL) self._model = OwlViTForObjectDetection.from_pretrained(OWLVIT_MODEL) self._model = self._model.to(DEVICE) self._model.eval() print(" OWL-ViT ready.") @torch.no_grad() def detect( self, image_pil: Image.Image, text_prompt: str, box_threshold: float = GDINO_BOX_THRESHOLD, text_threshold: float = 0.0, # unused, kept for API compatibility ) -> List[Dict[str, Any]]: """ Detect objects matching `text_prompt` in `image_pil`. Returns list of dicts: { "box": (x1,y1,x2,y2), "score": float, "label": str } Coordinates are absolute pixels. """ self._load() # OWL-ViT accepts a list of text queries per image # Wrap single prompt as a list-of-lists (batch_size=1) texts = [[text_prompt]] inputs = self._processor( text=texts, images=image_pil, return_tensors="pt", truncation=True, ).to(DEVICE) outputs = self._model(**inputs) # Post-process → absolute bounding boxes w, h = image_pil.size target_sizes = torch.tensor([[h, w]], device=DEVICE) results = self._processor.post_process_object_detection( outputs=outputs, target_sizes=target_sizes, threshold=box_threshold, )[0] # index 0 = first (only) image in batch boxes = results["boxes"].cpu() # N × 4 (x1 y1 x2 y2, absolute) scores = results["scores"].cpu() # N labels = results["labels"].cpu() # N (index into texts[0]) detections = [] for box, score, label_idx in zip(boxes, scores, labels): x1, y1, x2, y2 = box.tolist() x1, y1 = max(0, int(x1)), max(0, int(y1)) x2, y2 = min(w, int(x2)), min(h, int(y2)) detections.append({ "box": (x1, y1, x2, y2), "score": float(score), "label": texts[0][int(label_idx)] if int(label_idx) < len(texts[0]) else text_prompt, }) print(f" OWL-ViT found {len(detections)} candidate(s) " f"for prompt '{text_prompt}'") return detections @torch.no_grad() def detect_from_image( self, image_pil: Image.Image, query_pil: Image.Image, box_threshold: float = GDINO_BOX_THRESHOLD, ) -> List[Dict[str, Any]]: """ Detect objects in `image_pil` that visually match `query_pil`. Returns list of dicts: { "box": (x1,y1,x2,y2), "score": float, "label": str } Coordinates are absolute pixels. """ self._load() inputs = self._processor( images=image_pil, query_images=query_pil, return_tensors="pt", ).to(DEVICE) outputs = self._model.image_guided_detection(**inputs) # Post-process → absolute bounding boxes w, h = image_pil.size target_sizes = torch.tensor([[h, w]], device=DEVICE) results = self._processor.post_process_image_guided_detection( outputs=outputs, target_sizes=target_sizes, threshold=0.05, # Use very low internal threshold to find any small matches )[0] # index 0 = first (only) image in batch boxes = results["boxes"].cpu() # N × 4 (x1 y1 x2 y2, absolute) scores = results["scores"].cpu() # N detections = [] for box, score in zip(boxes, scores): x1, y1, x2, y2 = box.tolist() x1, y1 = max(0, int(x1)), max(0, int(y1)) x2, y2 = min(w, int(x2)), min(h, int(y2)) # Filter out boxes that cover too much of the image (likely hallucinations) # Most objects to be removed are small. 15% is a generous limit. box_area = (x2 - x1) * (y2 - y1) image_area = w * h ratio = box_area / image_area if ratio > 0.15: # Still log it so we know it was there if score > 0.05: print(f" [!] Skipping suspicious large area: {box} (score {score:.3f}, ratio {ratio:.2f})") continue # Only accept boxes that meet the actual user threshold if score < box_threshold: continue detections.append({ "box": (x1, y1, x2, y2), "score": float(score), "label": "Image Match", }) # Sort by score descending detections.sort(key=lambda x: x["score"], reverse=True) print(f" OWL-ViT found {len(detections)} candidate(s) via image query") return detections def boxes_only( self, image_pil: Image.Image, text_prompt: str, **kwargs, ) -> List[Tuple[int, int, int, int]]: """Convenience wrapper — returns only list of (x1,y1,x2,y2) boxes.""" return [d["box"] for d in self.detect(image_pil, text_prompt, **kwargs)]