"""
models/clip_matcher.py
-----------------------
Uses CLIP to compute similarity between a reference-object crop
and candidate regions in the scene, and to generate text prompts
for GroundingDINO.
"""
 
import os
from typing import List, Tuple, Optional
 
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
 
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import CLIP_MODEL, DEVICE, CLIP_SIMILARITY_THRESHOLD
 
 
class CLIPMatcher:
    """CLIP-based zero-shot labeller and crop-similarity scorer.

    Loads the CLIP checkpoint named by ``CLIP_MODEL`` onto ``DEVICE`` and
    offers two services:

    * ``predict_label`` picks the entry of ``CANDIDATE_LABELS`` whose text
      embedding best matches a reference image.
    * ``score_crops`` / ``filter_boxes_by_similarity`` compare crops of a
      scene image against a reference image using their CLIP image
      embeddings.

    Every embedding produced here is L2-normalised, so the dot product of
    two embeddings is their cosine similarity.
    """

    # Text prompts used for zero-shot classification in predict_label().
    CANDIDATE_LABELS = [
        "a person", "a car", "a chair", "a table", "a dog", "a cat",
        "a bottle", "a cup", "a book", "a laptop", "a phone", "a bag",
        "a bicycle", "a motorcycle", "a bus", "a truck", "a tree",
        "a flower", "a ball", "a clock", "a lamp", "a vase",
        "a backpack", "furniture", "an electronic device", "a toy",
        "a plant", "a statue", "a sign", "a box", "an object",
        "a tap", "a faucet", "a pipe", "a bucket", "a hose", "a valve",
        "trash", "garbage", "waste", "litter", "rubbish",
        "a plastic bag", "a polythene bag", "a leaf", "dry leaves",
        "crumpled paper", "a wrapper", "debris",
    ]

    def __init__(self) -> None:
        """Load the CLIP model and processor and put the model in eval mode."""
        print("  Loading CLIP ...")
        self.model = CLIPModel.from_pretrained(CLIP_MODEL).to(DEVICE)
        self.processor = CLIPProcessor.from_pretrained(CLIP_MODEL)
        self.model.eval()
        # (labels, embeddings) pair, filled lazily by _label_embeddings().
        self._label_cache: Optional[Tuple[Tuple[str, ...], torch.Tensor]] = None

    @torch.no_grad()
    def _encode_text(self, texts: List[str]) -> torch.Tensor:
        """Encode a list of text strings → normalised embeddings (N × D)."""
        inputs = self.processor(
            text=texts,
            return_tensors="pt",
            padding=True,       # pad to the longest prompt in the batch
            truncation=True,    # cut prompts longer than max_length
            max_length=77,
        ).to(DEVICE)
        features = self.model.get_text_features(**inputs)
        return F.normalize(features, dim=-1)

    @torch.no_grad()
    def _encode_image(self, image: Image.Image) -> torch.Tensor:
        """Encode a single PIL image → normalised embedding (1 × D)."""
        # NOTE: padding/truncation look like text-only options, but the
        # original author found they were required on Hugging Face Spaces
        # even for a single image. Do not drop them without re-testing there.
        inputs = self.processor(
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(DEVICE)
        features = self.model.get_image_features(**inputs)
        return F.normalize(features, dim=-1)

    def _label_embeddings(self) -> torch.Tensor:
        """Return text embeddings for ``CANDIDATE_LABELS``, encoding them once.

        The cache is keyed on the label tuple, so changing
        ``CANDIDATE_LABELS`` triggers a re-encode on the next call.
        """
        labels = tuple(self.CANDIDATE_LABELS)
        # getattr keeps this safe for instances created without __init__.
        cached = getattr(self, "_label_cache", None)
        if cached is None or cached[0] != labels:
            cached = (labels, self._encode_text(list(labels)))
            self._label_cache = cached
        return cached[1]

    @torch.no_grad()
    def predict_label(self, ref_image: Image.Image) -> str:
        """Zero-shot classify the reference object → best text label.

        Args:
            ref_image: Image of the reference object.

        Returns:
            The entry of ``CANDIDATE_LABELS`` with the highest softmax
            probability. The label and its probability are also printed.
        """
        text_embs = self._label_embeddings()            # N × D (cached)
        image_emb = self._encode_image(ref_image)       # 1 × D

        # Cosine similarities scaled by a fixed factor of 100 before the
        # softmax (presumably mirroring CLIP's logit scale — TODO confirm).
        logits = (image_emb @ text_embs.T) * 100.0
        probs = logits.softmax(dim=-1)[0]

        best_idx = probs.argmax().item()
        best_label = self.CANDIDATE_LABELS[best_idx]
        best_prob = probs[best_idx].item()

        print(f"    CLIP label: '{best_label}' (conf={best_prob:.2f})")
        return best_label

    @torch.no_grad()
    def image_embedding(self, image: Image.Image) -> torch.Tensor:
        """Public alias — normalised CLIP image embedding (1 × D)."""
        return self._encode_image(image)

    @torch.no_grad()
    def score_crops(
        self,
        ref_image: Image.Image,
        scene_image: Image.Image,
        boxes: List[Tuple[int, int, int, int]],
    ) -> List[float]:
        """Score each box of the scene against the reference image.

        Args:
            ref_image: Image of the reference object.
            scene_image: Image the boxes refer to.
            boxes: ``(x1, y1, x2, y2)`` crop rectangles in scene pixels.

        Returns:
            One cosine-similarity score per box, in input order. Boxes that
            are inverted, empty, or smaller than 2 px on either side score
            ``0.0``. An empty ``boxes`` list yields an empty result.
        """
        if not boxes:
            return []

        ref_emb = self._encode_image(ref_image)   # 1 × D
        scores: List[float] = []

        for x1, y1, x2, y2 in boxes:
            # Reject inverted/empty boxes before cropping: recent Pillow
            # releases raise ValueError from crop() when right < left or
            # lower < upper, which would abort scoring of all other boxes.
            if x2 <= x1 or y2 <= y1:
                scores.append(0.0)
                continue

            crop = scene_image.crop((x1, y1, x2, y2))

            # skip crops too small to carry any useful content
            if crop.width < 2 or crop.height < 2:
                scores.append(0.0)
                continue

            crop_emb = self._encode_image(crop)    # 1 × D
            # (1 × D) @ (D × 1) → single-element tensor holding the cosine.
            scores.append(float((ref_emb @ crop_emb.T).item()))

        return scores

    def filter_boxes_by_similarity(
        self,
        ref_image: Image.Image,
        scene_image: Image.Image,
        boxes: List[Tuple[int, int, int, int]],
        threshold: float = CLIP_SIMILARITY_THRESHOLD,
    ) -> List[Tuple[int, int, int, int]]:
        """Keep only the boxes whose crop is similar enough to the reference.

        Args:
            ref_image: Image of the reference object.
            scene_image: Image the boxes refer to.
            boxes: ``(x1, y1, x2, y2)`` crop rectangles in scene pixels.
            threshold: Minimum score (inclusive) for a box to be kept.

        Returns:
            The boxes scoring at least ``threshold``, in input order. Each
            box and its score are printed with a ``[v]`` / ``[x]`` marker.
        """
        scores = self.score_crops(ref_image, scene_image, boxes)
        filtered = []
        for box, score in zip(boxes, scores):
            print(f"    box {box}  sim={score:.3f}", end="")
            if score >= threshold:
                print(" [v]")
                filtered.append(box)
            else:
                print(" [x]")
        return filtered