"""
Cosmos Transfer2.5 Inference
World-to-world translation with multiple control inputs
"""
import torch
import time
from typing import Optional, Union, List, Dict
from PIL import Image
from pathlib import Path
import numpy as np

from .loaders import load_transfer_pipeline, get_device_info
from .utils_video import save_video, load_video_frames, extract_edges, extract_depth_map


# Supported control types
CONTROL_TYPES = ["blur", "edge", "depth", "segmentation"]


def transfer_video(
    input_video: Union[str, List[Image.Image]],
    prompt: str,
    control_type: str = "blur",
    negative_prompt: str = "",
    num_inference_steps: int = 30,
    guidance_scale: float = 7.0,
    controlnet_conditioning_scale: float = 1.0,
    seed: int = 42,
    output_path: Optional[str] = None
) -> dict:
    """
    Transfer video to new domain/style (World-to-world translation)

    Args:
        input_video: Input video path or frames
        prompt: Text description of target domain/style
        control_type: Type of control signal ("blur", "edge", "depth", "segmentation")
        negative_prompt: What to avoid
        num_inference_steps: Diffusion steps
        guidance_scale: CFG scale
        controlnet_conditioning_scale: How strongly to follow control signal
        seed: Random seed
        output_path: Path to save output

    Returns:
        dict with video_path, frames, metadata
    """
    start_time = time.time()

    if control_type not in CONTROL_TYPES:
        raise ValueError(f"control_type must be one of {CONTROL_TYPES}")

    # Load video frames
    if isinstance(input_video, str):
        input_frames = load_video_frames(input_video)
    else:
        input_frames = input_video

    # Extract control signal based on type
    control_frames = prepare_control_frames(input_frames, control_type)

    pipe = load_transfer_pipeline()
    generator = torch.Generator(device="cuda").manual_seed(seed)

    device_info = get_device_info()
    print(f"\n=== Transfer2.5 Video Transfer ===")
    print(f"Control type: {control_type}")
    print(f"Input frames: {len(input_frames)}")
    print(f"Prompt: {prompt[:100]}...")
    print(f"Steps: {num_inference_steps}, CFG: {guidance_scale}, Seed: {seed}")
    print(f"Device: {device_info['name']}")

    output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        control_video=control_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        generator=generator
    )

    frames = output.frames[0]
    inference_time = time.time() - start_time

    if output_path is None:
        output_path = f"output_transfer_{control_type}_seed{seed}.mp4"

    save_video(frames, output_path, fps=16)

    result = {
        "video_path": output_path,
        "control_type": control_type,
        "input_frames": len(input_frames),
        "output_frames": len(frames),
        "inference_time_s": round(inference_time, 2),
        "seed": seed,
        "prompt": prompt,
        "device": device_info['name']
    }

    print(f"Video saved: {output_path} ({len(frames)} frames in {inference_time:.2f}s)")
    return result


def prepare_control_frames(
    frames: List[Image.Image],
    control_type: str
) -> List[Image.Image]:
    """
    Prepare control signal frames from input video

    Args:
        frames: Input video frames
        control_type: Type of control to extract

    Returns:
        List of control frames
    """
    print(f"Extracting {control_type} control signal from {len(frames)} frames...")

    if control_type == "blur":
        # Gaussian blur for structure preservation
        from PIL import ImageFilter
        return [f.filter(ImageFilter.GaussianBlur(radius=10)) for f in frames]

    elif control_type == "edge":
        # Canny edge detection
        return [extract_edges(f) for f in frames]

    elif control_type == "depth":
        # Depth estimation (simplified - uses gradient as proxy)
        return [extract_depth_map(f) for f in frames]

    elif control_type == "segmentation":
        # For segmentation, we'd need a separate model
        # Using edge detection as fallback
        print("Note: Using edge detection as segmentation proxy")
        return [extract_edges(f) for f in frames]

    else:
        raise ValueError(f"Unknown control type: {control_type}")


def transfer_style(
    input_video: Union[str, List[Image.Image]],
    source_style: str,
    target_style: str,
    control_type: str = "blur",
    num_inference_steps: int = 30,
    seed: int = 42,
    output_path: Optional[str] = None
) -> dict:
    """
    Transfer video from source style/domain to target style/domain

    Common style transfers:
    - "day" -> "night"
    - "sunny" -> "rainy"
    - "clear" -> "foggy"
    - "urban" -> "rural"

    Args:
        input_video: Input video
        source_style: Description of source style
        target_style: Description of target style
        control_type: Control signal type
        num_inference_steps: Diffusion steps
        seed: Random seed
        output_path: Output path

    Returns:
        dict with video_path and metadata
    """
    prompt = f"Transform from {source_style} to {target_style}. High quality, realistic, smooth motion."
    negative_prompt = f"{source_style}, low quality, blurry, artifacts, flickering"

    return transfer_video(
        input_video=input_video,
        prompt=prompt,
        control_type=control_type,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        seed=seed,
        output_path=output_path
    )


def batch_transfer_styles(
    input_video: Union[str, List[Image.Image]],
    style_pairs: List[tuple],
    control_type: str = "blur",
    num_inference_steps: int = 30,
    seed: int = 42
) -> List[dict]:
    """
    Apply multiple style transfers to the same input video

    Args:
        input_video: Input video
        style_pairs: List of (source_style, target_style) tuples
        control_type: Control signal type
        num_inference_steps: Diffusion steps
        seed: Base random seed

    Returns:
        List of result dicts
    """
    results = []

    # Load frames once
    if isinstance(input_video, str):
        frames = load_video_frames(input_video)
    else:
        frames = input_video

    for i, (source, target) in enumerate(style_pairs):
        print(f"\n=== Style Transfer {i+1}/{len(style_pairs)}: {source} -> {target} ===")

        output_path = f"output_transfer_{source}_to_{target}_seed{seed}.mp4"

        result = transfer_style(
            input_video=frames,
            source_style=source,
            target_style=target,
            control_type=control_type,
            num_inference_steps=num_inference_steps,
            seed=seed + i,  # Different seed for each
            output_path=output_path
        )

        results.append(result)

    return results