#!/usr/bin/env python3
"""Motif-Video 2B — Text-to-Video & Image-to-Video inference.

GPU requirements: ~24GB VRAM for 720p (1280x736, 121 frames).
Tested with: torch>=2.0, diffusers>=0.35.2, transformers>=5.0.0

Uses Adaptive Projected Guidance (APG) by default for best quality.
"""

import argparse
from pathlib import Path

import torch
from diffusers import AdaptiveProjectedGuidance, DiffusionPipeline
from diffusers.utils import export_to_video


def _positive_int(value):
    """Convert a command-line string to an int and require it to be > 0.

    Intended for use as an argparse ``type=`` callable so that nonsensical
    sizes/counts are rejected at parse time with a normal usage error.

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is <= 0.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number <= 0:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_args(argv=None):
    """Build the command-line parser and parse the inference options.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behavior of a plain ``parse_args()`` call.

    Returns:
        argparse.Namespace with the attributes ``model_path``, ``prompt``,
        ``image``, ``negative_prompt``, ``output``, ``num_frames``,
        ``height``, ``width``, ``guidance_scale``, ``num_inference_steps``,
        ``fps``, ``seed`` and ``dtype``.

    Raises:
        SystemExit: raised by argparse on ``--help`` or on invalid arguments,
            including non-positive values for the frame count, height, width,
            step count and fps options.
    """
    parser = argparse.ArgumentParser(description="Motif-Video 2B Inference (T2V / I2V)")
    parser.add_argument(
        "--model-path",
        type=str,
        default="Motif-Technologies/Motif-Video-2B",
        help="HuggingFace model ID or local checkpoint path (uses trust_remote_code=True)",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="A category-five hurricane, viewed from inside the eye, reveals a circular stadium of cloud walls rising to fifty thousand feet with an eerie disk of blue sky directly overhead. Shot from a NOAA reconnaissance aircraft mounted camera, the perspective looks outward toward the eyewall — a near-vertical curtain of rotating cloud and lightning that is simultaneously terrifying and transcendent. The inner surface of the eyewall catches the setting sun, painting it in improbable shades of peach and rose. The camera slowly pans 360 degrees to complete one full revolution, capturing the entire coliseum of the storm. Below, the ocean surface is a white blur of foam and spray. The documentary-style cinematography strips away all artifice to present the storm as an entity of pure elemental power.",
        help="Text prompt for video generation",
    )
    parser.add_argument(
        "--image",
        type=str,
        default=None,
        help="Path to input image for I2V mode (omit for T2V)",
    )
    parser.add_argument(
        "--negative-prompt",
        type=str,
        default=None,
        help="Negative prompt (default: built-in pipeline default)",
    )
    parser.add_argument("--output", type=str, default="output.mp4", help="Output video file path")
    # Counts and sizes use _positive_int so that zero/negative values are
    # rejected here instead of failing after the model has been loaded.
    parser.add_argument(
        "--num-frames",
        type=_positive_int,
        default=121,
        help="Number of frames to generate (121 = ~5s at 24fps)",
    )
    parser.add_argument("--height", type=_positive_int, default=736, help="Video height in pixels")
    parser.add_argument("--width", type=_positive_int, default=1280, help="Video width in pixels")
    parser.add_argument("--guidance-scale", type=float, default=8.0, help="Classifier-free guidance scale")
    parser.add_argument(
        "--num-inference-steps",
        type=_positive_int,
        default=50,
        help="Number of denoising steps",
    )
    parser.add_argument("--fps", type=_positive_int, default=24, help="Output video frame rate")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    parser.add_argument(
        "--dtype",
        type=str,
        default="bfloat16",
        choices=["float16", "bfloat16", "float32"],
        help="Model dtype",
    )
    return parser.parse_args(argv)


def main():
    """Run one Motif-Video generation from command-line options and save it.

    Parses the CLI arguments, validates the cheap preconditions (CUDA
    availability, readable input image, writable output directory), loads the
    pipeline with an Adaptive Projected Guidance guider, generates the video
    and writes it to ``args.output``.

    Image-to-video mode is selected when ``--image`` is given; otherwise the
    pipeline is called with ``image=None`` (text-to-video).

    Raises:
        RuntimeError: if no CUDA device is available.
        OSError: if the input image cannot be opened or the output directory
            cannot be created.
    """
    args = parse_args()

    dtype_map = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
    torch_dtype = dtype_map[args.dtype]

    mode = "I2V" if args.image else "T2V"

    # --- Fail fast: everything below is cheap, so do it before the slow
    # --- model load instead of discovering a problem afterwards.
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available; this script runs the pipeline on a CUDA device.")

    # Load image for I2V mode
    image = None
    if args.image:
        from PIL import Image

        # The context manager closes the underlying file; convert() returns a
        # separate, fully loaded image that stays valid afterwards.
        with Image.open(args.image) as source_image:
            image = source_image.convert("RGB")
        print(f"[I2V] Input image: {args.image} ({image.size[0]}x{image.size[1]})")

    # Ensure the destination directory exists so the export at the very end
    # cannot fail on a missing folder after the whole generation has run.
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)

    print(f"[{mode}] Loading model from: {args.model_path}")

    guider = AdaptiveProjectedGuidance(
        guidance_scale=args.guidance_scale,
        adaptive_projected_guidance_rescale=12.0,
        adaptive_projected_guidance_momentum=0.1,
        eta=0.0,
        use_original_formulation=True,
    )

    pipe = DiffusionPipeline.from_pretrained(
        args.model_path,
        custom_pipeline="pipeline_motif_video",
        trust_remote_code=True,
        torch_dtype=torch_dtype,
        guider=guider,
    )
    pipe = pipe.to("cuda")

    # Seeded generator on the same device as the pipeline for reproducibility.
    generator = torch.Generator(device="cuda").manual_seed(args.seed)

    print(f"Generating video: {args.width}x{args.height}, {args.num_frames} frames, {args.num_inference_steps} steps")
    pipe_kwargs = dict(
        prompt=args.prompt,
        image=image,
        height=args.height,
        width=args.width,
        num_frames=args.num_frames,
        num_inference_steps=args.num_inference_steps,
        generator=generator,
        frame_rate=args.fps,
    )
    # Only override the negative prompt when the user supplied one, so the
    # pipeline's own default is used otherwise.
    if args.negative_prompt is not None:
        pipe_kwargs["negative_prompt"] = args.negative_prompt

    output = pipe(**pipe_kwargs)

    # The first entry of output.frames is the video for the single prompt.
    video_frames = output.frames[0]
    export_to_video(video_frames, args.output, fps=args.fps)
    print(f"Video saved to: {args.output}")

# Run inference only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()