"""
models/clip_matcher.py
-----------------------
Uses CLIP to compute similarity between a reference-object crop and
candidate regions in the scene, and to generate text prompts for
GroundingDINO.
"""

import os
import sys
from typing import List, Tuple, Optional

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Put the parent of this file's directory on sys.path so that the
# top-level ``config`` module can be imported from here.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import CLIP_MODEL, DEVICE, CLIP_SIMILARITY_THRESHOLD


class CLIPMatcher:
    """CLIP-based helper for labelling a reference image and scoring boxes.

    Loads the CLIP model/processor named by ``CLIP_MODEL`` onto ``DEVICE``
    and offers:

    * ``predict_label``  - zero-shot label from ``CANDIDATE_LABELS``.
    * ``image_embedding`` - normalised image embedding.
    * ``score_crops`` / ``filter_boxes_by_similarity`` - cosine similarity
      between a reference image and crops of a scene image.
    """

    # Text labels used for zero-shot classification. ``predict_label``
    # returns one of these strings verbatim, so order and spelling matter.
    CANDIDATE_LABELS = [
        "a person", "a car", "a chair", "a table", "a dog", "a cat",
        "a bottle", "a cup", "a book", "a laptop", "a phone", "a bag",
        "a bicycle", "a motorcycle", "a bus", "a truck", "a tree",
        "a flower", "a ball", "a clock", "a lamp", "a vase", "a backpack",
        "furniture", "an electronic device", "a toy", "a plant",
        "a statue", "a sign", "a box", "an object",
        "a tap", "a faucet", "a pipe", "a bucket", "a hose", "a valve",
        "trash", "garbage", "waste", "litter", "rubbish",
        "a plastic bag", "a polythene bag", "a leaf", "dry leaves",
        "crumpled paper", "a wrapper", "debris",
    ]

    def __init__(self) -> None:
        """Load the CLIP model and processor and switch the model to eval."""
        print(" Loading CLIP ...")
        self.model = CLIPModel.from_pretrained(CLIP_MODEL).to(DEVICE)
        self.processor = CLIPProcessor.from_pretrained(CLIP_MODEL)
        self.model.eval()

    def _encode_text(self, texts: List[str]) -> torch.Tensor:
        """Encode a list of text strings into L2-normalised embeddings.

        Returns one row per input string (N x D).
        """
        inputs = self.processor(
            text=texts,
            return_tensors="pt",
            padding=True,      # text-only: OK
            truncation=True,   # text-only: OK
            max_length=77,     # text-only: OK
        ).to(DEVICE)
        features = self.model.get_text_features(**inputs)
        return F.normalize(features, dim=-1)

    def _encode_image(self, image: Image.Image) -> torch.Tensor:
        """Encode a single PIL image into an L2-normalised embedding (1 x D)."""
        # NOTE: padding/truncation look like tokenizer options, but the
        # original author reports that the Transformers build on Hugging
        # Face Spaces requires them even for single images, so they are
        # kept deliberately. Do not remove without testing there.
        inputs = self.processor(
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(DEVICE)
        features = self.model.get_image_features(**inputs)
        return F.normalize(features, dim=-1)

    @torch.no_grad()
    def predict_label(self, ref_image: Image.Image) -> str:
        """Zero-shot classify the reference object and return the best label.

        The image embedding is compared against every entry of
        ``CANDIDATE_LABELS``; similarities are scaled by 100 and passed
        through a softmax. The winning label and its probability are
        printed, and the label string is returned.
        """
        text_embs = self._encode_text(self.CANDIDATE_LABELS)   # N x D
        image_emb = self._encode_image(ref_image)               # 1 x D

        logits = (image_emb @ text_embs.T) * 100.0
        probs = logits.softmax(dim=-1)[0]

        best_idx = probs.argmax().item()
        best_label = self.CANDIDATE_LABELS[best_idx]
        best_prob = probs[best_idx].item()
        print(f" CLIP label: '{best_label}' (conf={best_prob:.2f})")
        return best_label

    @torch.no_grad()
    def image_embedding(self, image: Image.Image) -> torch.Tensor:
        """Public alias - normalised CLIP image embedding (1 x D)."""
        return self._encode_image(image)

    @torch.no_grad()
    def score_crops(
        self,
        ref_image: Image.Image,
        scene_image: Image.Image,
        boxes: List[Tuple[int, int, int, int]],
    ) -> List[float]:
        """Score each box by cosine similarity to the reference image.

        For every ``(x1, y1, x2, y2)`` box the scene is cropped and the
        crop's embedding is compared with the reference embedding. Because
        both embeddings are unit-normalised, their dot product is the
        cosine similarity.

        Boxes narrower or shorter than 2 pixels (including inverted boxes
        where ``x2 < x1`` or ``y2 < y1``) are not cropped and score 0.0.

        Returns one float per box, in the same order as ``boxes``; an empty
        list when ``boxes`` is empty.
        """
        if not boxes:
            return []

        ref_emb = self._encode_image(ref_image)   # 1 x D

        scores: List[float] = []
        for x1, y1, x2, y2 in boxes:
            # Round up front so the size check sees exactly the box that
            # is handed to ``crop``.
            left, upper, right, lower = (
                int(round(v)) for v in (x1, y1, x2, y2)
            )

            # Skip degenerate boxes BEFORE cropping: recent Pillow releases
            # raise ValueError for inverted boxes, which previously aborted
            # the whole scoring pass instead of yielding a 0.0 score.
            if right - left < 2 or lower - upper < 2:
                scores.append(0.0)
                continue

            crop = scene_image.crop((left, upper, right, lower))
            crop_emb = self._encode_image(crop)   # 1 x D
            sim = (ref_emb @ crop_emb.T).squeeze()
            scores.append(
                float(sim.item() if sim.dim() == 0 else sim[0].item())
            )
        return scores

    def filter_boxes_by_similarity(
        self,
        ref_image: Image.Image,
        scene_image: Image.Image,
        boxes: List[Tuple[int, int, int, int]],
        threshold: float = CLIP_SIMILARITY_THRESHOLD,
    ) -> List[Tuple[int, int, int, int]]:
        """Keep only the boxes whose crop similarity reaches ``threshold``.

        Scores come from ``score_crops``. Each box is printed with its
        score and a ``[v]`` (kept) or ``[x]`` (dropped) marker. The
        surviving boxes are returned in their original order.
        """
        scores = self.score_crops(ref_image, scene_image, boxes)
        filtered = []
        for box, score in zip(boxes, scores):
            print(f" box {box} sim={score:.3f}", end="")
            if score >= threshold:
                print(" [v]")
                filtered.append(box)
            else:
                print(" [x]")
        return filtered