"""
models/inpainter_sd.py
-----------------------
Stable Diffusion inpainting via diffusers.

Produces high-quality, context-aware fills for complex backgrounds.
Requires a CUDA GPU with at least 8 GB VRAM for reasonable speed.
CPU inference is supported but slow (~5 min per image).
"""

import os
import sys
from typing import Optional

import numpy as np
import torch
from PIL import Image

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import (
    DEVICE, DTYPE,
    SD_INPAINT_MODEL, SD_INPAINT_STEPS,
    SD_INPAINT_GUIDANCE, SD_INPAINT_PROMPT,
)


# Target length in pixels of the image's short side when rescaling for SD.
# The rescaled width and height are then rounded up to multiples of 64.
_SD_SIZE = 512


def _fit_to_multiple(value: int, multiple: int = 64) -> int:
    return ((value + multiple - 1) // multiple) * multiple


class SDInpainter:
    """
    Stable Diffusion inpainting wrapper using diffusers.

    The pipeline is loaded lazily on the first call to ``inpaint`` and is
    then cached on the instance for subsequent calls.
    """

    def __init__(self) -> None:
        # Set by _load(); None means the pipeline has not been loaded yet.
        self._pipe = None

    def _load(self) -> None:
        """
        Load and cache the inpainting pipeline if it is not loaded yet.

        Raises:
            RuntimeError: If diffusers (or a package it needs, such as
                accelerate) cannot be imported.
        """
        if self._pipe is not None:
            return

        print("  Loading Stable Diffusion inpainting pipeline ...")
        try:
            from diffusers import StableDiffusionInpaintPipeline, AutoPipelineForInpainting

            # SD 2.x uses a different pipeline; try auto-detect first
            try:
                pipe = AutoPipelineForInpainting.from_pretrained(
                    SD_INPAINT_MODEL,
                    torch_dtype=DTYPE,
                )
            except Exception as auto_err:
                # Best-effort fallback, but say why the first attempt failed
                # instead of hiding it.
                print(f"  Auto pipeline failed ({auto_err}); "
                      "falling back to StableDiffusionInpaintPipeline ...")
                pipe = StableDiffusionInpaintPipeline.from_pretrained(
                    SD_INPAINT_MODEL,
                    torch_dtype=DTYPE,
                    safety_checker=None,
                )

            # Enable optimisations.
            # Model CPU offload manages device placement itself, so the
            # pipeline is NOT moved to the GPU first (that would only raise
            # peak VRAM). Offload needs a CUDA device; otherwise place the
            # pipeline on the configured device directly.
            if str(DEVICE).startswith("cuda") and torch.cuda.is_available():
                pipe.enable_model_cpu_offload()
            else:
                pipe = pipe.to(DEVICE)
            pipe.enable_attention_slicing(1)

        except ImportError as e:
            raise RuntimeError(
                "diffusers is not installed.\n"
                "Run: pip install diffusers accelerate\n"
                f"Original error: {e}"
            ) from e

        # Cache only a fully configured pipeline, so a failure above does not
        # leave a half-initialised object behind for the next call.
        self._pipe = pipe

    def inpaint(
        self,
        image_pil:   Image.Image,
        mask:        np.ndarray,
        prompt:      str = SD_INPAINT_PROMPT,
        num_steps:   int = SD_INPAINT_STEPS,
        guidance:    float = SD_INPAINT_GUIDANCE,
    ) -> Image.Image:
        """
        Fill in the masked region using Stable Diffusion inpainting.

        Args:
            image_pil:  PIL image; converted to RGB before processing.
            mask:       Mask array (HW, any non-zero value = inpaint here;
                        conventionally uint8 with 255). It is resized to the
                        image size if the shapes differ.
            prompt:     Text prompt guiding the fill. An empty prompt falls
                        back to "background, clean, seamless".
            num_steps:  Diffusion steps.
            guidance:   Classifier-free guidance scale.

        Returns:
            Inpainted RGB PIL image at the original resolution.
        """
        import gc
        # Imported before the diffusion run so a missing OpenCV fails fast.
        import cv2

        # Free cached memory before the memory-hungry diffusion run.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self._load()

        # The blend below assumes three channels.
        image_rgb = image_pil.convert("RGB")
        orig_w, orig_h = image_rgb.size  # PIL size is (W, H)

        # Build ONE binary mask and use it for both the SD call and the final
        # blend, so the two stages always agree on what is masked.
        mask_arr = np.asarray(mask)
        if mask_arr.ndim == 3:
            # Collapse a channel axis: masked if any channel is non-zero.
            mask_arr = mask_arr.max(axis=2)
        hard_mask = (mask_arr > 0).astype(np.uint8)
        if hard_mask.shape != (orig_h, orig_w):
            # cv2.resize takes (width, height)
            hard_mask = cv2.resize(
                hard_mask, (orig_w, orig_h), interpolation=cv2.INTER_NEAREST
            )

        # SD needs H, W to be multiples of 8; work at _SD_SIZE on the short side
        scale = _SD_SIZE / min(orig_w, orig_h)
        proc_w = _fit_to_multiple(int(orig_w * scale))
        proc_h = _fit_to_multiple(int(orig_h * scale))

        image_resized = image_rgb.resize((proc_w, proc_h), Image.LANCZOS)
        mask_pil = Image.fromarray(hard_mask * 255).resize(
            (proc_w, proc_h), Image.NEAREST
        ).convert("L")

        print(f"  SD inpainting at {proc_w}x{proc_h} ({num_steps} steps) ...")

        out = self._pipe(
            prompt=prompt or "background, clean, seamless",
            image=image_resized,
            mask_image=mask_pil,
            num_inference_steps=num_steps,
            guidance_scale=guidance,
            height=proc_h,
            width=proc_w,
        ).images[0]

        # Restore to original resolution
        result = out.convert("RGB").resize((orig_w, orig_h), Image.LANCZOS)
        result_np = np.asarray(result, dtype=np.float32)
        orig_np   = np.asarray(image_rgb, dtype=np.float32)

        # Alpha for compositing: the Gaussian-blurred mask gives a soft edge,
        # and adding the eroded mask forces alpha to 1 inside the region.
        feather_size = 3
        kernel_side  = feather_size * 2 + 1
        feathered    = cv2.GaussianBlur(
            hard_mask.astype(np.float32), (kernel_side, kernel_side), 0
        )
        interior = cv2.erode(hard_mask, np.ones((5, 5), np.uint8)).astype(np.float32)
        alpha    = np.clip(feathered + interior, 0, 1)
        alpha_3  = alpha[..., np.newaxis]  # broadcast over the RGB channels

        blended = result_np * alpha_3 + orig_np * (1 - alpha_3)
        # Round rather than truncate, so blended pixels are not biased darker.
        blended = np.clip(np.rint(blended), 0, 255).astype(np.uint8)
        return Image.fromarray(blended)