# Copyright 2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import torch

from diffusers.image_processor import VaeImageProcessor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from diffusers.utils.torch_utils import randn_tensor

# Fallback output height/width (in pixels) used by `PixNerdPipeline.__call__` when none is given, and the
# reference resolution against which the decoder patch scaling factors are computed.
DEFAULT_NATIVE_RESOLUTION = 512

# Usage example for the pipeline, kept as a module-level string.
# NOTE(review): nothing in the visible part of this file references this constant — confirm whether it is
# meant to be attached to `PixNerdPipeline.__call__` (e.g. via a docstring decorator).
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> from pathlib import Path
        >>> from diffusers import DiffusionPipeline
        >>> import torch

        >>> model_dir = Path("./PixNerd-XL-16-512").resolve()
        >>> pipe = DiffusionPipeline.from_pretrained(
        ...     str(model_dir),
        ...     local_files_only=True,
        ...     custom_pipeline=str(model_dir / "pipeline.py"),
        ...     trust_remote_code=True,
        ...     torch_dtype=torch.bfloat16,
        ... )
        >>> pipe.to("cuda")

        >>> print(pipe.id2label[207])
        >>> print(pipe.get_label_ids("golden retriever"))

        >>> generator = torch.Generator(device="cuda").manual_seed(42)
        >>> # timeshift=3.0 and order=2 are defaults in scheduler/scheduler_config.json
        >>> image = pipe(
        ...     class_labels="golden retriever",
        ...     height=512,
        ...     width=512,
        ...     num_inference_steps=25,
        ...     guidance_scale=4.0,
        ...     generator=generator,
        ... ).images[0]
        >>> image.save("demo.png")
        ```
"""

# Accepted forms of class conditioning: a single class id or label string, a list of ids and/or label
# strings, or a tensor of class ids.
ConditioningInput = Union[int, str, List[Union[int, str]], torch.LongTensor]


class PixNerdPipeline(DiffusionPipeline):
    r"""
    Pipeline for class-conditional PixNerd pixel-space image generation.

    Parameters:
        transformer ([`PixNerdTransformer2DModel`]):
            Class-conditional PixNerd denoiser operating in pixel space.
        scheduler ([`PixNerdFlowMatchScheduler`]):
            Flow-matching scheduler with AdamLM multi-step coefficients.
        vae ([`PixNerdPixelVAE`], *optional*):
            Identity pixel autoencoder. May also be attached to `transformer.vae`.
        conditioner ([`PixNerdLabelConditioner`], *optional*):
            ImageNet class-label conditioner. May also be attached to `transformer.conditioner`.
        id2label (`dict[int, str]`, *optional*):
            ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
    """

    model_cpu_offload_seq = "conditioner->transformer->vae"
    _callback_tensor_inputs = ["latents"]
    _optional_components = ["vae", "conditioner"]

    def __init__(
        self,
        transformer,
        scheduler,
        vae=None,
        conditioner=None,
        id2label: Optional[Dict[Union[int, str], str]] = None,
    ):
        super().__init__()
        if vae is None:
            vae = getattr(transformer, "vae", None)
        if conditioner is None:
            conditioner = getattr(transformer, "conditioner", None)
        if vae is None or conditioner is None:
            raise ValueError("Pipeline requires `vae` and `conditioner` either explicitly or from `transformer`.")
        self.register_modules(
            vae=vae,
            conditioner=conditioner,
            transformer=transformer,
            scheduler=scheduler,
        )
        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
        if id2label is None:
            id2label = self._read_id2label_from_model_index(
                getattr(getattr(self, "config", None), "_name_or_path", None)
            )
        self._id2label = self._normalize_id2label(id2label)
        self.labels = self._build_label2id(self._id2label)
        self._labels_loaded_from_model_index = bool(self._id2label)

    def _get_device(self) -> torch.device:
        try:
            return self._execution_device
        except AttributeError:
            pass
        for name in ("transformer", "vae", "scheduler"):
            module = getattr(self, name, None)
            if isinstance(module, torch.nn.Module):
                parameter = next(module.parameters(), None)
                if parameter is not None:
                    return parameter.device
        return torch.device("cpu")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path=None, *args, **kwargs):
        id2label_override = kwargs.pop("id2label", None)
        pipe = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        id2label = id2label_override or cls._read_id2label_from_model_index(pretrained_model_name_or_path)
        if id2label:
            pipe._id2label = cls._normalize_id2label(id2label)
            pipe.labels = cls._build_label2id(pipe._id2label)
            pipe._labels_loaded_from_model_index = True
        return pipe

    def _ensure_labels_loaded(self) -> None:
        if self._labels_loaded_from_model_index:
            return
        loaded = self._read_id2label_from_model_index(getattr(self.config, "_name_or_path", None))
        if loaded:
            self._id2label = loaded
            self.labels = self._build_label2id(self._id2label)
        self._labels_loaded_from_model_index = True

    @staticmethod
    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
        if not id2label:
            return {}
        return {int(key): value for key, value in id2label.items()}

    @staticmethod
    def _read_id2label_from_model_index(variant_path: Optional[Union[str, Path]]) -> Dict[int, str]:
        if not variant_path:
            return {}
        model_index_path = Path(variant_path).resolve() / "model_index.json"
        if not model_index_path.exists():
            return {}
        raw = json.loads(model_index_path.read_text(encoding="utf-8"))
        id2label = raw.get("id2label")
        if not isinstance(id2label, dict):
            return {}
        return {int(key): value for key, value in id2label.items()}

    @staticmethod
    def _build_label2id(id2label: Dict[int, str]) -> Dict[str, int]:
        label2id: Dict[str, int] = {}
        for class_id, value in id2label.items():
            for synonym in value.split(","):
                synonym = synonym.strip()
                if synonym:
                    label2id[synonym] = int(class_id)
        return dict(sorted(label2id.items()))

    @property
    def id2label(self) -> Dict[int, str]:
        r"""ImageNet class id to English label string (comma-separated synonyms)."""
        self._ensure_labels_loaded()
        return self._id2label

    def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
        r"""
        Map ImageNet label strings to class ids.

        Args:
            label (`str` or `list[str]`):
                One or more English label strings. Each string must match a synonym in `id2label`.
        """
        self._ensure_labels_loaded()
        if isinstance(label, str):
            label = [label]
        if not self.labels:
            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
        missing = [item for item in label if item not in self.labels]
        if missing:
            preview = ", ".join(list(self.labels.keys())[:8])
            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
        return [self.labels[item] for item in label]

    def _normalize_class_labels(
        self,
        class_labels: ConditioningInput,
        num_images_per_prompt: int = 1,
    ) -> List[int]:
        if torch.is_tensor(class_labels):
            values = class_labels.to(dtype=torch.long).reshape(-1).tolist()
        elif isinstance(class_labels, int):
            values = [class_labels]
        elif isinstance(class_labels, str):
            values = self.get_label_ids(class_labels)
        elif class_labels and isinstance(class_labels[0], str):
            values = self.get_label_ids(list(class_labels))
        else:
            values = [int(entry) for entry in class_labels]

        if num_images_per_prompt == 1:
            return values
        expanded: List[int] = []
        for value in values:
            expanded.extend([value] * num_images_per_prompt)
        return expanded

    def _get_patch_size(self) -> int:
        patch_size = getattr(self.transformer, "patch_size", None)
        if patch_size is not None:
            return int(patch_size)
        return int(getattr(self.transformer.config, "patch_size", 16))

    def _get_in_channels(self) -> int:
        in_channels = getattr(self.transformer, "in_channels", None)
        if in_channels is not None:
            return int(in_channels)
        return int(getattr(self.transformer.config, "in_channels", 3))

    def check_inputs(
        self,
        height: int,
        width: int,
        num_inference_steps: int,
        output_type: str,
    ) -> None:
        if num_inference_steps < 1:
            raise ValueError("num_inference_steps must be >= 1.")
        if output_type not in {"pil", "np", "pt", "latent"}:
            raise ValueError("output_type must be one of: 'pil', 'np', 'pt', 'latent'.")
        order = int(getattr(self.scheduler.config, "order", getattr(self.scheduler, "order", 2)))
        if order < 1:
            raise ValueError("scheduler.config.order must be >= 1.")

        patch_size = self._get_patch_size()
        if height % patch_size != 0 or width % patch_size != 0:
            raise ValueError(f"height and width must be divisible by patch_size={patch_size}.")

    def encode_condition(
        self,
        class_label_ids: List[int],
        negative_class_label_ids: Optional[List[int]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        metadata = {"device": self._get_device()}
        with torch.no_grad():
            cond, default_uncond = self.conditioner(class_label_ids, metadata)
            if negative_class_label_ids is not None:
                _, uncond = self.conditioner(negative_class_label_ids, metadata)
            else:
                uncond = default_uncond
        return cond, uncond

    def prepare_latents(
        self,
        batch_size: int,
        num_channels: int,
        height: int,
        width: int,
        dtype: torch.dtype,
        device: torch.device,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if latents is not None:
            return latents.to(device=device, dtype=dtype)
        return randn_tensor(
            (batch_size, num_channels, height, width),
            generator=generator,
            device=device,
            dtype=dtype,
        )

    @staticmethod
    def _fp_to_uint8(image: torch.Tensor) -> torch.Tensor:
        return torch.clip_((image + 1) * 127.5 + 0.5, 0, 255).to(torch.uint8)

    def decode_latents(self, latents: torch.Tensor, output_type: str = "pil"):
        if output_type == "latent":
            return latents

        image = self.vae.decode(latents)
        if output_type == "pt":
            return image
        images_uint8 = self._fp_to_uint8(image).permute(0, 2, 3, 1).cpu().numpy()
        if output_type == "np":
            return images_uint8
        if output_type == "pil":
            from PIL import Image

            return [Image.fromarray(img) for img in images_uint8]
        raise ValueError(f"Unsupported output_type: {output_type}")

    def _apply_decoder_patch_scaling(self, height: int, width: int) -> None:
        denoiser = getattr(self.transformer, "denoiser", self.transformer)
        if hasattr(denoiser, "decoder_patch_scaling_h"):
            denoiser.decoder_patch_scaling_h = height / DEFAULT_NATIVE_RESOLUTION
            denoiser.decoder_patch_scaling_w = width / DEFAULT_NATIVE_RESOLUTION

    @torch.inference_mode()
    def __call__(
        self,
        class_labels: Optional[ConditioningInput] = None,
        negative_class_labels: Optional[ConditioningInput] = None,
        num_images_per_prompt: int = 1,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 25,
        guidance_scale: float = 4.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: str = "pil",
        return_dict: bool = True,
        prompt: Optional[ConditioningInput] = None,
        negative_prompt: Optional[ConditioningInput] = None,
    ) -> Union[ImagePipelineOutput, Tuple]:
        r"""
        Generate class-conditional images with PixNerd.

        Args:
            class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.LongTensor`):
                ImageNet class indices or human-readable English label strings.
            negative_class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.LongTensor`, *optional*):
                Optional negative class labels for classifier-free guidance.
            num_images_per_prompt (`int`, defaults to `1`):
                Number of images to generate per class label.
            height (`int`, *optional*):
                Output image height in pixels. Defaults to `512`.
            width (`int`, *optional*):
                Output image width in pixels. Defaults to `512`.
            num_inference_steps (`int`, defaults to `25`):
                Number of denoising steps.
            guidance_scale (`float`, defaults to `4.0`):
                Classifier-free guidance scale applied by the scheduler.
            generator (`torch.Generator`, *optional*):
                RNG for reproducibility.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy pixel tensor.
            output_type (`str`, defaults to `"pil"`):
                `"pil"`, `"np"`, `"pt"`, or `"latent"`.
            return_dict (`bool`, defaults to `True`):
                Return [`ImagePipelineOutput`] if True.
            prompt (`int`, `str`, `list`, *optional*):
                Deprecated alias for `class_labels`.
            negative_prompt (`int`, `str`, `list`, *optional*):
                Deprecated alias for `negative_class_labels`.

        Returns:
            [`ImagePipelineOutput`] when `return_dict` is True, otherwise a one-element tuple holding the
            generated images in the format selected by `output_type`.

        Raises:
            ValueError: If neither `class_labels` nor `prompt` is given, or if `check_inputs` rejects the
                arguments.
        """
        # The deprecated aliases are only consulted when the primary arguments are missing.
        if class_labels is None:
            class_labels = prompt
        if negative_class_labels is None:
            negative_class_labels = negative_prompt
        if class_labels is None:
            raise ValueError("`class_labels` (or deprecated `prompt`) must be provided.")

        # `None` (and 0) fall back to the native resolution.
        height = int(height or DEFAULT_NATIVE_RESOLUTION)
        width = int(width or DEFAULT_NATIVE_RESOLUTION)
        self.check_inputs(height, width, num_inference_steps, output_type)

        # `check_inputs` already enforces divisibility by the patch size, so this rounding leaves
        # validated sizes unchanged.
        patch_size = self._get_patch_size()
        height = (height // patch_size) * patch_size
        width = (width // patch_size) * patch_size
        self._apply_decoder_patch_scaling(height, width)

        class_label_ids = self._normalize_class_labels(class_labels, num_images_per_prompt)
        negative_label_ids = None
        if negative_class_labels is not None:
            negative_label_ids = self._normalize_class_labels(negative_class_labels, num_images_per_prompt)

        device = self._get_device()
        model_dtype = next(self.transformer.parameters()).dtype
        # One image per (expanded) class id.
        batch_size = len(class_label_ids)

        cond, uncond = self.encode_condition(class_label_ids, negative_label_ids)
        latents = self.prepare_latents(
            batch_size=batch_size,
            num_channels=self._get_in_channels(),
            height=height,
            width=width,
            dtype=model_dtype,
            device=device,
            generator=generator,
            latents=latents,
        )

        self.scheduler.set_timesteps(
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            device=device,
        )

        for timestep in self.progress_bar(self.scheduler.timesteps):
            # Run both guidance branches in a single forward pass: the sample is duplicated and paired
            # with the unconditional inputs first, then the conditional ones.
            cfg_latents = torch.cat([latents, latents], dim=0)
            cfg_t = timestep.repeat(cfg_latents.shape[0]).to(device=device, dtype=latents.dtype)
            cfg_condition = torch.cat([uncond, cond], dim=0)
            model_output = self.transformer(
                sample=cfg_latents.to(dtype=model_dtype),
                timestep=cfg_t,
                encoder_hidden_states=cfg_condition,
            ).sample
            # NOTE(review): presumably merges the two halves back into one batch using the guidance scale
            # passed to `set_timesteps` — confirm against the scheduler implementation.
            model_output = self.scheduler.classifier_free_guidance(model_output)
            latents = self.scheduler.step(
                model_output=model_output,
                timestep=timestep,
                sample=latents,
            ).prev_sample

        image = self.decode_latents(latents, output_type=output_type)

        self.maybe_free_model_hooks()
        if not return_dict:
            return (image,)
        return ImagePipelineOutput(images=image)


# Alias exposing the generic image output class under a pipeline-specific name.
PixNerdPipelineOutput = ImagePipelineOutput