# NVIDIA Cosmos Predict2.5 & Transfer2.5 Demo
# HuggingFace Space with ZeroGPU (H200 70GB)

import gradio as gr
import torch
import time
import os
import tempfile
from PIL import Image
from pathlib import Path

# Optional dependency: `spaces` supplies the ZeroGPU decorator used below.
# HAS_SPACES records whether the import succeeded so the app can still start
# without it.
try:
    import spaces
except ImportError:
    HAS_SPACES = False
    print("Warning: spaces module not available. Running without ZeroGPU.")
else:
    HAS_SPACES = True

# Global model tracking: None until a pipeline has been loaded; afterwards a
# dict of the form {"type": "predict" | "transfer", "pipe": <pipeline>} that the
# inference functions in this file read and rebind to reuse or evict the model.
current_model = None


def get_device_info():
    """Describe the active compute device and its memory.

    Returns:
        A dict with keys ``device``, ``name``, ``total_vram_gb`` and
        ``free_vram_gb``. When CUDA is unavailable a CPU placeholder with
        zeroed memory figures is returned; otherwise the figures come from
        ``torch.cuda.mem_get_info`` converted to GiB and rounded to 2 places.
    """
    if not torch.cuda.is_available():
        return {"device": "cpu", "name": "CPU", "total_vram_gb": 0, "free_vram_gb": 0}

    bytes_per_gib = 1024**3
    index = torch.cuda.current_device()
    free_bytes, total_bytes = torch.cuda.mem_get_info(index)
    gpu_name = torch.cuda.get_device_properties(index).name

    return {
        "device": f"cuda:{index}",
        "name": gpu_name,
        "total_vram_gb": round(total_bytes / bytes_per_gib, 2),
        "free_vram_gb": round(free_bytes / bytes_per_gib, 2),
    }


# Wrapper for ZeroGPU compatibility
def gpu_decorator(duration=300):
    """Return a decorator for GPU-bound functions.

    Args:
        duration: Value forwarded as ``duration`` to ``spaces.GPU``.

    Returns:
        ``spaces.GPU(duration=duration)`` when the ``spaces`` module was
        imported, otherwise an identity decorator that returns the function
        unchanged.
    """
    if not HAS_SPACES:
        return lambda f: f
    return spaces.GPU(duration=duration)


@gpu_decorator(duration=300)
def run_predict_text2world(
    prompt: str,
    negative_prompt: str,
    num_frames: int,
    height: int,
    width: int,
    num_inference_steps: int,
    guidance_scale: float,
    seed: int
):
    """Run Cosmos Predict2.5 Text2World inference.

    Loads (or reuses from ``current_model``) the ``nvidia/Cosmos-Predict2.5-2B``
    pipeline, runs it with the given settings and saves the frames to a
    temporary ``.mp4`` file at 16 fps.

    Args:
        prompt: Text prompt forwarded to the pipeline.
        negative_prompt: Negative prompt forwarded to the pipeline.
        num_frames: Number of frames requested from the pipeline.
        height: Output height forwarded to the pipeline.
        width: Output width forwarded to the pipeline.
        num_inference_steps: Number of inference steps.
        guidance_scale: Guidance (CFG) scale.
        seed: Seed for the CUDA random generator.

    Returns:
        A ``(video_path, log_text)`` tuple. On any exception ``video_path`` is
        ``None`` and ``log_text`` ends with the error message and traceback.
    """
    global current_model

    start_time = time.time()
    device_info = get_device_info()

    log_lines = [
        "=" * 50,
        "COSMOS PREDICT2.5 - TEXT2WORLD",
        "=" * 50,
        f"Device: {device_info['name']}",
        f"VRAM: {device_info['free_vram_gb']}/{device_info['total_vram_gb']} GB free",
        f"Prompt: {prompt[:100]}...",
        f"Resolution: {width}x{height}, Frames: {num_frames}",
        f"Steps: {num_inference_steps}, CFG: {guidance_scale}, Seed: {seed}",
        "",
        "Loading model..."
    ]

    try:
        # Imported lazily so a missing dependency is reported in the log
        # instead of breaking module import.
        from diffusers import DiffusionPipeline
        import gc

        # The cache is reusable only if it holds a pipeline of the right type.
        cache_ok = (
            current_model is not None
            and current_model.get("type") == "predict"
            and "pipe" in current_model
        )

        if not cache_ok:
            if current_model is not None:
                # pop() rather than del: an entry whose load failed after
                # eviction may no longer contain "pipe", and a KeyError here
                # would make every later call fail the same way.
                current_model.pop("pipe", None)
                current_model = None
                gc.collect()
                torch.cuda.empty_cache()

            log_lines.append("Loading Cosmos-Predict2.5-2B from HuggingFace...")
            pipe = DiffusionPipeline.from_pretrained(
                "nvidia/Cosmos-Predict2.5-2B",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )
            pipe.to("cuda")
            current_model = {"type": "predict", "pipe": pipe}
            log_lines.append("Model loaded successfully!")
        else:
            pipe = current_model["pipe"]
            log_lines.append("Using cached model")

        log_lines.append("")
        log_lines.append("Running inference...")

        # manual_seed() requires an integer; coerce in case the UI sends a float.
        generator = torch.Generator(device="cuda").manual_seed(int(seed))

        # Run inference
        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_frames=num_frames,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=generator
        )

        frames = output.frames[0]
        # Measured from function entry, so this includes any model load time.
        inference_time = time.time() - start_time

        # Reserve the output file atomically; tempfile.mktemp() only returns a
        # name and is deprecated because another process can claim it first.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
            output_path = tmp_file.name
        from cosmos.utils_video import save_video
        save_video(frames, output_path, fps=16)

        log_lines.extend([
            "",
            f"Inference completed in {inference_time:.2f}s",
            f"Output: {len(frames)} frames at 16 fps",
            f"Video saved: {output_path}",
            "=" * 50
        ])

        return output_path, "\n".join(log_lines)

    except Exception as e:
        # UI boundary: surface the failure in the log box instead of raising.
        log_lines.append(f"\nERROR: {str(e)}")
        import traceback
        log_lines.append(traceback.format_exc())
        return None, "\n".join(log_lines)


@gpu_decorator(duration=300)
def run_predict_image2world(
    image,
    prompt: str,
    negative_prompt: str,
    num_frames: int,
    num_inference_steps: int,
    guidance_scale: float,
    seed: int
):
    """Run Cosmos Predict2.5 Image2World inference.

    Resizes the input image, loads (or reuses from ``current_model``) the
    ``nvidia/Cosmos-Predict2.5-2B`` pipeline, runs it conditioned on the image
    and saves the frames to a temporary ``.mp4`` file at 16 fps.

    Args:
        image: Input image; a PIL image, or an array accepted by
            ``Image.fromarray``. ``None`` yields an error message.
        prompt: Text prompt forwarded to the pipeline.
        negative_prompt: Negative prompt forwarded to the pipeline.
        num_frames: Number of frames requested from the pipeline.
        num_inference_steps: Number of inference steps.
        guidance_scale: Guidance (CFG) scale.
        seed: Seed for the CUDA random generator.

    Returns:
        A ``(video_path, log_text)`` tuple. ``video_path`` is ``None`` when no
        image was supplied or when any exception occurred; in the latter case
        ``log_text`` ends with the error message and traceback.
    """
    global current_model

    if image is None:
        return None, "Error: Please upload an image"

    start_time = time.time()
    device_info = get_device_info()

    log_lines = [
        "=" * 50,
        "COSMOS PREDICT2.5 - IMAGE2WORLD",
        "=" * 50,
        f"Device: {device_info['name']}",
        f"VRAM: {device_info['free_vram_gb']}/{device_info['total_vram_gb']} GB free",
        f"Input image: {image.size if hasattr(image, 'size') else 'uploaded'}",
        f"Prompt: {prompt[:100]}...",
        f"Frames: {num_frames}, Steps: {num_inference_steps}, Seed: {seed}",
        "",
        "Loading model..."
    ]

    try:
        # Imported lazily so a missing dependency is reported in the log
        # instead of breaking module import.
        from diffusers import DiffusionPipeline
        import gc

        # Ensure image is PIL
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image).convert("RGB")

        # Round each side down to a multiple of 8 and cap at 1280x720. The
        # sides are clamped independently, so the aspect ratio is not
        # preserved for images exceeding the caps. The floor of 8 prevents a
        # zero-sized resize for inputs smaller than 8 px.
        width, height = image.size
        width = max(8, min(1280, (width // 8) * 8))
        height = max(8, min(720, (height // 8) * 8))
        image = image.resize((width, height))

        log_lines.append(f"Resized to: {width}x{height}")

        # The cache is reusable only if it holds a pipeline of the right type.
        cache_ok = (
            current_model is not None
            and current_model.get("type") == "predict"
            and "pipe" in current_model
        )

        if not cache_ok:
            if current_model is not None:
                # Drop the other pipeline AND reset the global: if the load
                # below fails, a leftover {"type": ...} entry without "pipe"
                # would make later calls raise KeyError.
                current_model.pop("pipe", None)
                current_model = None
                gc.collect()
                torch.cuda.empty_cache()

            log_lines.append("Loading Cosmos-Predict2.5-2B from HuggingFace...")
            pipe = DiffusionPipeline.from_pretrained(
                "nvidia/Cosmos-Predict2.5-2B",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )
            pipe.to("cuda")
            current_model = {"type": "predict", "pipe": pipe}
            log_lines.append("Model loaded successfully!")
        else:
            pipe = current_model["pipe"]
            log_lines.append("Using cached model")

        log_lines.append("")
        log_lines.append("Running inference...")

        # manual_seed() requires an integer; coerce in case the UI sends a float.
        generator = torch.Generator(device="cuda").manual_seed(int(seed))

        output = pipe(
            image=image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            generator=generator
        )

        frames = output.frames[0]
        # Measured from function entry, so this includes any model load time.
        inference_time = time.time() - start_time

        # Reserve the output file atomically; tempfile.mktemp() only returns a
        # name and is deprecated because another process can claim it first.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
            output_path = tmp_file.name
        from cosmos.utils_video import save_video
        save_video(frames, output_path, fps=16)

        log_lines.extend([
            "",
            f"Inference completed in {inference_time:.2f}s",
            f"Output: {len(frames)} frames at 16 fps",
            "=" * 50
        ])

        return output_path, "\n".join(log_lines)

    except Exception as e:
        # UI boundary: surface the failure in the log box instead of raising.
        log_lines.append(f"\nERROR: {str(e)}")
        import traceback
        log_lines.append(traceback.format_exc())
        return None, "\n".join(log_lines)


@gpu_decorator(duration=420)
def run_transfer_video(
    video,
    prompt: str,
    control_type: str,
    negative_prompt: str,
    num_inference_steps: int,
    guidance_scale: float,
    controlnet_scale: float,
    seed: int
):
    """Run Cosmos Transfer2.5 video transfer.

    Loads up to 49 frames from the input video, derives control frames of the
    requested type, loads (or reuses from ``current_model``) the
    ``nvidia/Cosmos-Transfer2.5-2B`` pipeline, runs it and saves the result to
    a temporary ``.mp4`` file at 16 fps.

    Args:
        video: Input video passed to ``load_video_frames``; ``None`` yields an
            error message.
        prompt: Text prompt forwarded to the pipeline.
        control_type: Control signal name passed to ``prepare_control_frames``.
        negative_prompt: Negative prompt forwarded to the pipeline.
        num_inference_steps: Number of inference steps.
        guidance_scale: Guidance (CFG) scale.
        controlnet_scale: Forwarded as ``controlnet_conditioning_scale``.
        seed: Seed for the CUDA random generator.

    Returns:
        A ``(video_path, log_text)`` tuple. ``video_path`` is ``None`` when no
        video was supplied or when any exception occurred; in the latter case
        ``log_text`` ends with the error message and traceback.
    """
    global current_model

    if video is None:
        return None, "Error: Please upload a video"

    start_time = time.time()
    device_info = get_device_info()

    log_lines = [
        "=" * 50,
        "COSMOS TRANSFER2.5 - VIDEO TRANSFER",
        "=" * 50,
        f"Device: {device_info['name']}",
        f"VRAM: {device_info['free_vram_gb']}/{device_info['total_vram_gb']} GB free",
        f"Control type: {control_type}",
        f"Prompt: {prompt[:100]}...",
        f"Steps: {num_inference_steps}, CFG: {guidance_scale}, Seed: {seed}",
        "",
    ]

    try:
        # Imported lazily so a missing dependency is reported in the log
        # instead of breaking module import.
        from diffusers import DiffusionPipeline
        from cosmos.utils_video import load_video_frames, save_video
        import gc

        # Load input video frames
        log_lines.append(f"Loading video: {video}")
        input_frames = load_video_frames(video, max_frames=49)
        log_lines.append(f"Loaded {len(input_frames)} frames")

        # Prepare control frames
        log_lines.append(f"Extracting {control_type} control signal...")
        from cosmos.infer_transfer import prepare_control_frames
        control_frames = prepare_control_frames(input_frames, control_type)
        log_lines.append(f"Control frames prepared: {len(control_frames)}")

        # The cache is reusable only if it holds a pipeline of the right type.
        # Checking for "pipe" guards against an entry left half-evicted by a
        # failed load, which would otherwise raise KeyError below.
        cache_ok = (
            current_model is not None
            and current_model.get("type") == "transfer"
            and "pipe" in current_model
        )

        if not cache_ok:
            if current_model is not None:
                current_model.pop("pipe", None)
                current_model = None
                gc.collect()
                torch.cuda.empty_cache()

            # Load model (requires ~65GB VRAM)
            log_lines.append("")
            log_lines.append("Loading Cosmos-Transfer2.5-2B (requires ~65GB VRAM)...")
            log_lines.append("This may take several minutes...")

            pipe = DiffusionPipeline.from_pretrained(
                "nvidia/Cosmos-Transfer2.5-2B",
                torch_dtype=torch.bfloat16,
                trust_remote_code=True
            )
            pipe.to("cuda")
            current_model = {"type": "transfer", "pipe": pipe}
            log_lines.append("Model loaded successfully!")
        else:
            pipe = current_model["pipe"]
            log_lines.append("Using cached model")

        log_lines.append("")
        log_lines.append("Running inference...")

        # manual_seed() requires an integer; coerce in case the UI sends a float.
        generator = torch.Generator(device="cuda").manual_seed(int(seed))

        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            control_video=control_frames,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=controlnet_scale,
            generator=generator
        )

        frames = output.frames[0]
        # Measured from function entry, so this includes video loading,
        # control extraction and any model load time.
        inference_time = time.time() - start_time

        # Reserve the output file atomically; tempfile.mktemp() only returns a
        # name and is deprecated because another process can claim it first.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
            output_path = tmp_file.name
        save_video(frames, output_path, fps=16)

        log_lines.extend([
            "",
            f"Inference completed in {inference_time:.2f}s",
            f"Output: {len(frames)} frames at 16 fps",
            "=" * 50
        ])

        return output_path, "\n".join(log_lines)

    except Exception as e:
        # UI boundary: surface the failure in the log box instead of raising.
        log_lines.append(f"\nERROR: {str(e)}")
        import traceback
        log_lines.append(traceback.format_exc())
        return None, "\n".join(log_lines)


# Build Gradio UI
# Layout: an intro banner, then two top-level tabs ("Predict 2.5" with nested
# Text2World / Image2World tabs, and "Transfer 2.5"), then a footer note.
# Each tab has an input column, an output column (video + log textbox) and a
# button whose click handler calls one of the run_* functions defined above.
with gr.Blocks(title="NVIDIA Cosmos World Foundation Models", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # NVIDIA Cosmos World Foundation Models Demo
    ## Predict2.5 & Transfer2.5

    Generate and transform videos using NVIDIA's World Foundation Models for Physical AI.

    **Models:**
    - **Cosmos Predict2.5-2B**: Text/Image/Video to World generation (32GB VRAM)
    - **Cosmos Transfer2.5-2B**: World-to-world translation with control inputs (65GB VRAM)

    **References:**
    - [Paper: arXiv 2511.00062](https://arxiv.org/abs/2511.00062)
    - [GitHub: cosmos-predict2.5](https://github.com/nvidia-cosmos/cosmos-predict2.5)
    - [GitHub: cosmos-transfer2.5](https://github.com/nvidia-cosmos/cosmos-transfer2.5)
    - License: [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license)
    """)

    with gr.Tabs():
        # Tab 1: Predict2.5
        with gr.TabItem("Predict 2.5"):
            gr.Markdown("### World Generation from Text, Image, or Video")

            with gr.Tabs():
                # Text2World
                with gr.TabItem("Text2World"):
                    with gr.Row():
                        # Left column: generation inputs
                        with gr.Column():
                            p_t2w_prompt = gr.Textbox(
                                label="Prompt",
                                placeholder="Describe the world you want to generate...",
                                lines=3,
                                value="A futuristic city street at sunset, with flying cars and neon signs, cinematic quality"
                            )
                            p_t2w_negative = gr.Textbox(
                                label="Negative Prompt",
                                placeholder="What to avoid...",
                                value="low quality, blurry, distorted, artifacts"
                            )
                            with gr.Row():
                                p_t2w_width = gr.Dropdown([720, 1280], label="Width", value=720)
                                p_t2w_height = gr.Dropdown([480, 720], label="Height", value=480)
                            with gr.Row():
                                p_t2w_frames = gr.Slider(17, 97, step=8, value=49, label="Frames (~3s at 16fps)")
                                p_t2w_steps = gr.Slider(10, 50, step=5, value=30, label="Inference Steps")
                            with gr.Row():
                                p_t2w_cfg = gr.Slider(1.0, 15.0, step=0.5, value=7.0, label="Guidance Scale")
                                p_t2w_seed = gr.Number(value=42, label="Seed", precision=0)
                            p_t2w_btn = gr.Button("Generate Video", variant="primary")

                        # Right column: generated video and run log
                        with gr.Column():
                            p_t2w_video = gr.Video(label="Generated Video")
                            p_t2w_log = gr.Textbox(label="Log", lines=15, max_lines=20)

                    # Inputs are positional: their order must match the
                    # parameter order of run_predict_text2world
                    # (note height comes before width).
                    p_t2w_btn.click(
                        fn=run_predict_text2world,
                        inputs=[p_t2w_prompt, p_t2w_negative, p_t2w_frames, p_t2w_height, p_t2w_width, p_t2w_steps, p_t2w_cfg, p_t2w_seed],
                        outputs=[p_t2w_video, p_t2w_log]
                    )

                # Image2World
                with gr.TabItem("Image2World"):
                    with gr.Row():
                        # Left column: conditioning image and generation inputs
                        with gr.Column():
                            p_i2w_image = gr.Image(label="Input Image", type="pil")
                            p_i2w_prompt = gr.Textbox(
                                label="Prompt",
                                placeholder="Describe how the world should evolve...",
                                lines=2,
                                value="The scene comes to life with movement, realistic motion, cinematic"
                            )
                            p_i2w_negative = gr.Textbox(
                                label="Negative Prompt",
                                value="static, frozen, low quality, blurry"
                            )
                            with gr.Row():
                                p_i2w_frames = gr.Slider(17, 97, step=8, value=49, label="Frames")
                                p_i2w_steps = gr.Slider(10, 50, step=5, value=30, label="Steps")
                            with gr.Row():
                                p_i2w_cfg = gr.Slider(1.0, 15.0, step=0.5, value=7.0, label="CFG")
                                p_i2w_seed = gr.Number(value=42, label="Seed", precision=0)
                            p_i2w_btn = gr.Button("Generate Video", variant="primary")

                        # Right column: generated video and run log
                        with gr.Column():
                            p_i2w_video = gr.Video(label="Generated Video")
                            p_i2w_log = gr.Textbox(label="Log", lines=15, max_lines=20)

                    # Inputs are positional: their order must match the
                    # parameter order of run_predict_image2world.
                    p_i2w_btn.click(
                        fn=run_predict_image2world,
                        inputs=[p_i2w_image, p_i2w_prompt, p_i2w_negative, p_i2w_frames, p_i2w_steps, p_i2w_cfg, p_i2w_seed],
                        outputs=[p_i2w_video, p_i2w_log]
                    )

        # Tab 2: Transfer2.5
        with gr.TabItem("Transfer 2.5"):
            gr.Markdown("""
            ### World-to-World Translation
            Transform videos between domains/styles while preserving structure.

            **Control Types:**
            - **blur**: Gaussian blur for general structure
            - **edge**: Canny edge detection
            - **depth**: Depth estimation
            - **segmentation**: Semantic segmentation

            **Common Transforms:** day→night, sunny→rainy, clear→foggy, urban→rural
            """)

            with gr.Row():
                # Left column: source video and transfer settings
                with gr.Column():
                    t_video = gr.Video(label="Input Video")
                    t_prompt = gr.Textbox(
                        label="Prompt",
                        placeholder="Describe the target domain/style...",
                        lines=2,
                        value="Transform to nighttime scene with city lights, photorealistic"
                    )
                    t_negative = gr.Textbox(
                        label="Negative Prompt",
                        value="daytime, bright sun, low quality, artifacts"
                    )
                    t_control = gr.Dropdown(
                        ["blur", "edge", "depth", "segmentation"],
                        label="Control Type",
                        value="blur"
                    )
                    with gr.Row():
                        t_steps = gr.Slider(10, 50, step=5, value=30, label="Steps")
                        t_cfg = gr.Slider(1.0, 15.0, step=0.5, value=7.0, label="CFG")
                    with gr.Row():
                        t_ctrl_scale = gr.Slider(0.5, 2.0, step=0.1, value=1.0, label="Control Scale")
                        t_seed = gr.Number(value=42, label="Seed", precision=0)
                    t_btn = gr.Button("Transform Video", variant="primary")

                # Right column: transformed video and run log
                with gr.Column():
                    t_output = gr.Video(label="Transformed Video")
                    t_log = gr.Textbox(label="Log", lines=15, max_lines=20)

            # Inputs are positional: their order must match the parameter
            # order of run_transfer_video (control type precedes the negative
            # prompt there, unlike the on-screen order).
            t_btn.click(
                fn=run_transfer_video,
                inputs=[t_video, t_prompt, t_control, t_negative, t_steps, t_cfg, t_ctrl_scale, t_seed],
                outputs=[t_output, t_log]
            )

    gr.Markdown("""
    ---
    **Hardware:** ZeroGPU (NVIDIA H200, 70GB VRAM) | **Precision:** BF16 only

    **Note:** First inference will download and load the model (~5-10 minutes).
    Subsequent inferences will be faster with cached models.
    """)


# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()