"""
models/detector.py
───────────────────
Open-vocabulary object detector using OWL-ViT (via HuggingFace transformers).

Replaces GroundingDINO which requires custom CUDA C++ ops that fail to compile
on HuggingFace Spaces and many cloud environments.

OWL-ViT:
  - Pure Python / PyTorch — no custom ops, works on CPU and CUDA
  - Open-vocabulary: detects any object described in natural language
  - Comparable accuracy to GroundingDINO for common objects
  - Auto-downloaded from HuggingFace Hub on first use (~600 MB)
"""

import os
import sys
from typing import List, Tuple, Dict, Any

import torch
from PIL import Image

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import DEVICE, GDINO_BOX_THRESHOLD


# Hugging Face Hub checkpoint id; `GroundingDINODetector._load` passes it to
# `from_pretrained` for both the processor and the detection model.
OWLVIT_MODEL = "google/owlvit-base-patch32"   # smaller/faster
# Alternative: "google/owlvit-large-patch14"  # more accurate, ~1.2 GB


class GroundingDINODetector:
    """
    Drop-in replacement for the original GroundingDINODetector.
    Internally uses OWL-ViT — same public API, no CUDA custom ops.

    The processor and model are loaded lazily by the first detection call,
    so constructing the detector itself is cheap.
    """

    # Low score floor handed to the image-guided post-processor so that small
    # matches survive until the caller's own threshold is applied.
    _IMAGE_QUERY_SCORE_FLOOR = 0.05

    # Image-guided boxes covering more than this fraction of the image are
    # discarded. Most objects to be removed are small; 15% is a generous limit.
    _MAX_BOX_AREA_RATIO = 0.15

    def __init__(self) -> None:
        """Create an unloaded detector; weights are fetched on first use."""
        self._model     = None
        self._processor = None

    def _load(self) -> None:
        """
        Load the OWL-ViT processor and model on first use (no-op afterwards).

        Raises:
            RuntimeError: if the `transformers` package cannot be imported.
        """
        if self._model is not None:
            return

        print("  Loading OWL-ViT detector …")
        try:
            from transformers import OwlViTProcessor, OwlViTForObjectDetection
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise RuntimeError(
                "OWL-ViT requires 'transformers'. Run: pip install transformers"
            ) from err

        # Build everything in locals and publish only after every step
        # succeeded. Assigning `self._model` before `.to(DEVICE)` would let the
        # early return above hand out a half-initialised model (wrong device,
        # not in eval mode) if the device move raised.
        processor = OwlViTProcessor.from_pretrained(OWLVIT_MODEL)
        model = OwlViTForObjectDetection.from_pretrained(OWLVIT_MODEL)
        model = model.to(DEVICE)
        model.eval()

        self._processor = processor
        self._model     = model
        print("  OWL-ViT ready.")

    @staticmethod
    def _clamp_box(box, width: int, height: int) -> Tuple[int, int, int, int]:
        """Truncate a float (x1, y1, x2, y2) box to ints inside the image."""
        x1, y1, x2, y2 = box.tolist()
        # Clamp every coordinate on BOTH sides so a prediction that lies partly
        # or fully outside the image can never produce an inverted box.
        x1 = min(max(0, int(x1)), width)
        y1 = min(max(0, int(y1)), height)
        x2 = min(max(0, int(x2)), width)
        y2 = min(max(0, int(y2)), height)
        return x1, y1, x2, y2

    @torch.no_grad()
    def detect(
        self,
        image_pil:      Image.Image,
        text_prompt:    str,
        box_threshold:  float = GDINO_BOX_THRESHOLD,
        text_threshold: float = 0.0,   # unused, kept for API compatibility
    ) -> List[Dict[str, Any]]:
        """
        Detect objects matching `text_prompt` in `image_pil`.

        Args:
            image_pil: Image to search.
            text_prompt: Natural-language description; sent to OWL-ViT as a
                single text query.
            box_threshold: Score threshold passed to the post-processor.
            text_threshold: Unused; kept for API compatibility.

        Returns:
            List of dicts: { "box": (x1,y1,x2,y2), "score": float, "label": str },
            sorted by descending score. Coordinates are absolute pixels clamped
            to the image. An empty or whitespace-only prompt yields [].
        """
        # Nothing to look for — skip loading and running the model.
        if not text_prompt or not text_prompt.strip():
            return []

        self._load()

        # OWL-ViT accepts a list of text queries per image
        # Wrap single prompt as a list-of-lists (batch_size=1)
        texts = [[text_prompt]]
        queries = texts[0]

        inputs = self._processor(
            text=texts,
            images=image_pil,
            return_tensors="pt",
            truncation=True,
        ).to(DEVICE)

        outputs = self._model(**inputs)

        # Post-process → absolute bounding boxes (target size is height, width)
        w, h = image_pil.size
        target_sizes = torch.tensor([[h, w]], device=DEVICE)

        results = self._processor.post_process_object_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=box_threshold,
        )[0]   # index 0 = first (only) image in batch

        boxes  = results["boxes"].cpu()    # N × 4  (x1 y1 x2 y2, absolute)
        scores = results["scores"].cpu()   # N
        labels = results["labels"].cpu()   # N  (index into texts[0])

        detections = []
        for box, score, label_idx in zip(boxes, scores, labels):
            query_idx = int(label_idx)
            detections.append({
                "box":   self._clamp_box(box, w, h),
                "score": float(score),
                # Fall back to the raw prompt if the index is out of range.
                "label": queries[query_idx] if query_idx < len(queries) else text_prompt,
            })

        # Best match first, consistent with detect_from_image().
        detections.sort(key=lambda d: d["score"], reverse=True)

        print(f"    OWL-ViT found {len(detections)} candidate(s) "
              f"for prompt '{text_prompt}'")
        return detections

    @torch.no_grad()
    def detect_from_image(
        self,
        image_pil:      Image.Image,
        query_pil:      Image.Image,
        box_threshold:  float = GDINO_BOX_THRESHOLD,
    ) -> List[Dict[str, Any]]:
        """
        Detect objects in `image_pil` that visually match `query_pil`.

        Args:
            image_pil: Image to search.
            query_pil: Example image of the object to look for.
            box_threshold: Minimum score a box must reach to be returned.

        Returns:
            List of dicts: { "box": (x1,y1,x2,y2), "score": float, "label": str },
            sorted by descending score; "label" is always "Image Match".
            Coordinates are absolute pixels clamped to the image. Boxes larger
            than 15% of the image area are dropped.
        """
        self._load()

        inputs = self._processor(
            images=image_pil,
            query_images=query_pil,
            return_tensors="pt",
        ).to(DEVICE)

        outputs = self._model.image_guided_detection(**inputs)

        # Post-process → absolute bounding boxes (target size is height, width)
        w, h = image_pil.size
        target_sizes = torch.tensor([[h, w]], device=DEVICE)

        # Use a very low internal threshold to find any small matches, but never
        # one stricter than what the caller asked for — otherwise a
        # `box_threshold` below the floor would be silently overridden.
        per_image = self._processor.post_process_image_guided_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=min(self._IMAGE_QUERY_SCORE_FLOOR, box_threshold),
        )
        if not per_image:
            # Defensive: no result entry for the image means no detections.
            print("    OWL-ViT found 0 candidate(s) via image query")
            return []
        results = per_image[0]   # index 0 = first (only) image in batch

        boxes  = results["boxes"].cpu()    # N × 4  (x1 y1 x2 y2, absolute)
        scores = results["scores"].cpu()   # N
        # NOTE(review): confirm the scale of the scores returned by the
        # image-guided post-processor matches what `box_threshold` expects.

        image_area = w * h   # loop-invariant

        detections = []
        for box, score in zip(boxes, scores):
            x1, y1, x2, y2 = self._clamp_box(box, w, h)

            # Filter out boxes that cover too much of the image (likely hallucinations)
            ratio = (x2 - x1) * (y2 - y1) / image_area
            if ratio > self._MAX_BOX_AREA_RATIO:
                # Still log it so we know it was there
                if score > 0.05:
                    print(f"    [!] Skipping suspicious large area: {box} (score {score:.3f}, ratio {ratio:.2f})")
                continue

            # Only accept boxes that meet the actual user threshold
            if score < box_threshold:
                continue

            detections.append({
                "box":   (x1, y1, x2, y2),
                "score": float(score),
                "label": "Image Match",
            })

        # Sort by score descending
        detections.sort(key=lambda d: d["score"], reverse=True)

        print(f"    OWL-ViT found {len(detections)} candidate(s) via image query")
        return detections

    def boxes_only(
        self,
        image_pil:   Image.Image,
        text_prompt: str,
        **kwargs,
    ) -> List[Tuple[int, int, int, int]]:
        """
        Convenience wrapper — returns only list of (x1,y1,x2,y2) boxes.

        Extra keyword arguments are forwarded unchanged to `detect`.
        """
        return [d["box"] for d in self.detect(image_pil, text_prompt, **kwargs)]