# NVIDIA Cosmos Predict2.5 & Transfer2.5 Demo
# HuggingFace Space with ZeroGPU (H200 70GB)
"""Gradio demo for NVIDIA Cosmos Predict2.5 and Transfer2.5.

The module exposes three GPU entry points (text-to-world, image-to-world and
video transfer), each returning ``(video_path_or_None, log_text)``, and builds
the ``demo`` Gradio Blocks app that wires them to the UI.
"""

import gc
import os
import tempfile
import time
import traceback
from pathlib import Path
from typing import List, Optional, Tuple

import gradio as gr
import torch
from PIL import Image

# Import spaces for ZeroGPU decorator
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False
    print("Warning: spaces module not available. Running without ZeroGPU.")

# Model repositories and shared generation settings.
PREDICT_REPO = "nvidia/Cosmos-Predict2.5-2B"
TRANSFER_REPO = "nvidia/Cosmos-Transfer2.5-2B"
OUTPUT_FPS = 16
MAX_TRANSFER_FRAMES = 49

# Global model tracking: either None or {"type": <str>, "pipe": <pipeline>}.
current_model = None


def get_device_info():
    """Return a dict describing the active compute device.

    Keys are ``device``, ``name``, ``total_vram_gb`` and ``free_vram_gb``.
    When CUDA is unavailable a CPU placeholder with zero VRAM is returned.
    """
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        props = torch.cuda.get_device_properties(device)
        free_mem, total_mem = torch.cuda.mem_get_info(device)
        return {
            "device": f"cuda:{device}",
            "name": props.name,
            "total_vram_gb": round(total_mem / (1024**3), 2),
            "free_vram_gb": round(free_mem / (1024**3), 2),
        }
    return {"device": "cpu", "name": "CPU", "total_vram_gb": 0, "free_vram_gb": 0}


# Wrapper for ZeroGPU compatibility
def gpu_decorator(duration=300):
    """Return ``spaces.GPU(duration=...)`` when available, else a no-op decorator."""
    if HAS_SPACES:
        return spaces.GPU(duration=duration)
    return lambda f: f


def _prompt_preview(prompt: Optional[str], limit: int = 100) -> str:
    """Return *prompt* shortened to *limit* chars, with an ellipsis only if cut."""
    text = prompt or ""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def _log_header(title: str) -> List[str]:
    """Build the banner and device lines that open every run log."""
    device_info = get_device_info()
    rule = "=" * 50
    return [
        rule,
        title,
        rule,
        f"Device: {device_info['name']}",
        f"VRAM: {device_info['free_vram_gb']}/{device_info['total_vram_gb']} GB free",
    ]


def _release_current_model() -> None:
    """Drop the cached pipeline (if any) and reset ``current_model`` to None."""
    global current_model
    stale, current_model = current_model, None
    if stale is None:
        return
    # Clear the slot *before* freeing so a later load failure can never leave
    # a cache entry that claims a type but has no "pipe" key.
    stale.pop("pipe", None)
    del stale
    gc.collect()
    torch.cuda.empty_cache()


def _get_pipeline(model_type: str, repo_id: str, log_lines: List[str],
                  loading_messages: List[str]):
    """Return the pipeline for *model_type*, reusing the cached one if it matches.

    A cached pipeline of a different type is released first. Progress messages
    are appended to *log_lines*. Exceptions from importing ``diffusers`` or
    loading the model propagate to the caller.
    """
    global current_model

    cached = current_model
    if cached is not None and cached.get("type") == model_type and "pipe" in cached:
        log_lines.append("Using cached model")
        return cached["pipe"]

    _release_current_model()

    # Imported lazily so a missing dependency is reported in the run log
    # instead of preventing the UI from starting.
    from diffusers import DiffusionPipeline

    log_lines.extend(loading_messages)
    # NOTE(review): trust_remote_code=True executes code shipped with the model
    # repository; keep the repo ids above pinned to trusted sources.
    pipe = DiffusionPipeline.from_pretrained(
        repo_id,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    pipe.to("cuda")
    current_model = {"type": model_type, "pipe": pipe}
    log_lines.append("Model loaded successfully!")
    return pipe


def _save_frames(frames) -> str:
    """Write *frames* to a fresh temporary ``.mp4`` and return its path."""
    from cosmos.utils_video import save_video

    # NamedTemporaryFile creates the file atomically, unlike the deprecated,
    # race-prone tempfile.mktemp(); delete=False keeps it for Gradio to serve.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
        output_path = handle.name
    try:
        save_video(frames, output_path, fps=OUTPUT_FPS)
    except Exception:
        # Do not leave an empty placeholder behind when encoding fails.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise
    return output_path


def _prepare_image(image, max_width: int = 1280, max_height: int = 720,
                   multiple: int = 8):
    """Return *image* as an RGB PIL image resized for the predict pipeline.

    Each side is rounded down to a multiple of *multiple*, capped at
    *max_width* x *max_height*, and never allowed to drop below *multiple*.
    """
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    image = image.convert("RGB")
    src_width, src_height = image.size
    width = max(multiple, min(max_width, (src_width // multiple) * multiple))
    height = max(multiple, min(max_height, (src_height // multiple) * multiple))
    return image.resize((width, height))


def _finish(log_lines: List[str], started: float, frames) -> Tuple[str, str]:
    """Save *frames*, append the completion summary and return the UI result."""
    elapsed = time.perf_counter() - started
    output_path = _save_frames(frames)
    log_lines.extend([
        "",
        f"Inference completed in {elapsed:.2f}s",
        f"Output: {len(frames)} frames at {OUTPUT_FPS} fps",
        f"Video saved: {output_path}",
        "=" * 50,
    ])
    return output_path, "\n".join(log_lines)


def _error_result(log_lines: List[str], exc: BaseException) -> Tuple[None, str]:
    """Append *exc* and the active traceback to the log; return the failure result.

    Must be called from inside the ``except`` block handling *exc* so that
    ``traceback.format_exc()`` still sees the active exception.
    """
    log_lines.append(f"\nERROR: {str(exc)}")
    log_lines.append(traceback.format_exc())
    return None, "\n".join(log_lines)


@gpu_decorator(duration=300)
def run_predict_text2world(
    prompt: str,
    negative_prompt: str,
    num_frames: int,
    height: int,
    width: int,
    num_inference_steps: int,
    guidance_scale: float,
    seed: int
):
    """Run Cosmos Predict2.5 Text2World inference.

    Returns ``(video_path, log_text)`` on success and ``(None, log_text)`` on
    failure; errors are captured into the log rather than raised.
    """
    start_time = time.perf_counter()
    log_lines = _log_header("COSMOS PREDICT2.5 - TEXT2WORLD") + [
        f"Prompt: {_prompt_preview(prompt)}",
        f"Resolution: {width}x{height}, Frames: {num_frames}",
        f"Steps: {num_inference_steps}, CFG: {guidance_scale}, Seed: {seed}",
        "",
        "Loading model...",
    ]

    try:
        pipe = _get_pipeline(
            "predict",
            PREDICT_REPO,
            log_lines,
            ["Loading Cosmos-Predict2.5-2B from HuggingFace..."],
        )
        log_lines.extend(["", "Running inference..."])

        # UI widgets may deliver numeric values as floats; manual_seed needs an int.
        generator = torch.Generator(device="cuda").manual_seed(int(seed))
        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_frames=int(num_frames),
            height=int(height),
            width=int(width),
            num_inference_steps=int(num_inference_steps),
            guidance_scale=guidance_scale,
            generator=generator,
        )
        return _finish(log_lines, start_time, output.frames[0])
    except Exception as exc:  # UI boundary: report every failure in the log
        return _error_result(log_lines, exc)


@gpu_decorator(duration=300)
def run_predict_image2world(
    image,
    prompt: str,
    negative_prompt: str,
    num_frames: int,
    num_inference_steps: int,
    guidance_scale: float,
    seed: int
):
    """Run Cosmos Predict2.5 Image2World inference.

    *image* may be a PIL image or an array accepted by ``Image.fromarray``.
    Returns ``(video_path, log_text)`` on success and ``(None, log_text)`` on
    failure; a missing image short-circuits with an error message.
    """
    if image is None:
        return None, "Error: Please upload an image"

    start_time = time.perf_counter()
    log_lines = _log_header("COSMOS PREDICT2.5 - IMAGE2WORLD") + [
        f"Input image: {image.size if hasattr(image, 'size') else 'uploaded'}",
        f"Prompt: {_prompt_preview(prompt)}",
        f"Frames: {num_frames}, Steps: {num_inference_steps}, Seed: {seed}",
        "",
        "Loading model...",
    ]

    try:
        image = _prepare_image(image)
        width, height = image.size
        log_lines.append(f"Resized to: {width}x{height}")

        pipe = _get_pipeline(
            "predict",
            PREDICT_REPO,
            log_lines,
            ["Loading Cosmos-Predict2.5-2B from HuggingFace..."],
        )
        log_lines.extend(["", "Running inference..."])

        generator = torch.Generator(device="cuda").manual_seed(int(seed))
        output = pipe(
            image=image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_frames=int(num_frames),
            num_inference_steps=int(num_inference_steps),
            guidance_scale=guidance_scale,
            generator=generator,
        )
        return _finish(log_lines, start_time, output.frames[0])
    except Exception as exc:  # UI boundary: report every failure in the log
        return _error_result(log_lines, exc)


@gpu_decorator(duration=420)
def run_transfer_video(
    video,
    prompt: str,
    control_type: str,
    negative_prompt: str,
    num_inference_steps: int,
    guidance_scale: float,
    controlnet_scale: float,
    seed: int
):
    """Run Cosmos Transfer2.5 video transfer.

    Loads up to ``MAX_TRANSFER_FRAMES`` frames from *video*, derives the
    *control_type* control frames and runs the transfer pipeline. Returns
    ``(video_path, log_text)`` on success and ``(None, log_text)`` on failure;
    a missing video short-circuits with an error message.
    """
    if video is None:
        return None, "Error: Please upload a video"

    start_time = time.perf_counter()
    log_lines = _log_header("COSMOS TRANSFER2.5 - VIDEO TRANSFER") + [
        f"Control type: {control_type}",
        f"Prompt: {_prompt_preview(prompt)}",
        f"Steps: {num_inference_steps}, CFG: {guidance_scale}, Seed: {seed}",
        "",
    ]

    try:
        from cosmos.utils_video import load_video_frames
        from cosmos.infer_transfer import prepare_control_frames

        # Load input video frames
        log_lines.append(f"Loading video: {video}")
        input_frames = load_video_frames(video, max_frames=MAX_TRANSFER_FRAMES)
        log_lines.append(f"Loaded {len(input_frames)} frames")

        # Prepare control frames
        log_lines.append(f"Extracting {control_type} control signal...")
        control_frames = prepare_control_frames(input_frames, control_type)
        log_lines.append(f"Control frames prepared: {len(control_frames)}")

        # Load model (requires ~65GB VRAM)
        pipe = _get_pipeline(
            "transfer",
            TRANSFER_REPO,
            log_lines,
            [
                "",
                "Loading Cosmos-Transfer2.5-2B (requires ~65GB VRAM)...",
                "This may take several minutes...",
            ],
        )
        log_lines.extend(["", "Running inference..."])

        generator = torch.Generator(device="cuda").manual_seed(int(seed))
        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            control_video=control_frames,
            num_inference_steps=int(num_inference_steps),
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=controlnet_scale,
            generator=generator,
        )
        return _finish(log_lines, start_time, output.frames[0])
    except Exception as exc:  # UI boundary: report every failure in the log
        return _error_result(log_lines, exc)


# Build Gradio UI
with gr.Blocks(title="NVIDIA Cosmos World Foundation Models", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # NVIDIA Cosmos World Foundation Models Demo
    ## Predict2.5 & Transfer2.5

    Generate and transform videos using NVIDIA's World Foundation Models for Physical AI.

    **Models:**
    - **Cosmos Predict2.5-2B**: Text/Image/Video to World generation (32GB VRAM)
    - **Cosmos Transfer2.5-2B**: World-to-world translation with control inputs (65GB VRAM)

    **References:**
    - [Paper: arXiv 2511.00062](https://arxiv.org/abs/2511.00062)
    - [GitHub: cosmos-predict2.5](https://github.com/nvidia-cosmos/cosmos-predict2.5)
    - [GitHub: cosmos-transfer2.5](https://github.com/nvidia-cosmos/cosmos-transfer2.5)
    - License: [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license)
    """)

    with gr.Tabs():
        # Tab 1: Predict2.5
        with gr.TabItem("Predict 2.5"):
            gr.Markdown("### World Generation from Text, Image, or Video")

            with gr.Tabs():
                # Text2World
                with gr.TabItem("Text2World"):
                    with gr.Row():
                        with gr.Column():
                            p_t2w_prompt = gr.Textbox(
                                label="Prompt",
                                placeholder="Describe the world you want to generate...",
                                lines=3,
                                value="A futuristic city street at sunset, with flying cars and neon signs, cinematic quality"
                            )
                            p_t2w_negative = gr.Textbox(
                                label="Negative Prompt",
                                placeholder="What to avoid...",
                                value="low quality, blurry, distorted, artifacts"
                            )
                            with gr.Row():
                                p_t2w_width = gr.Dropdown([720, 1280], label="Width", value=720)
                                p_t2w_height = gr.Dropdown([480, 720], label="Height", value=480)
                            with gr.Row():
                                p_t2w_frames = gr.Slider(17, 97, step=8, value=49, label="Frames (~3s at 16fps)")
                                p_t2w_steps = gr.Slider(10, 50, step=5, value=30, label="Inference Steps")
                            with gr.Row():
                                p_t2w_cfg = gr.Slider(1.0, 15.0, step=0.5, value=7.0, label="Guidance Scale")
                                p_t2w_seed = gr.Number(value=42, label="Seed", precision=0)
                            p_t2w_btn = gr.Button("Generate Video", variant="primary")

                        with gr.Column():
                            p_t2w_video = gr.Video(label="Generated Video")
                            p_t2w_log = gr.Textbox(label="Log", lines=15, max_lines=20)

                    # Input order must match run_predict_text2world's signature.
                    p_t2w_btn.click(
                        fn=run_predict_text2world,
                        inputs=[p_t2w_prompt, p_t2w_negative, p_t2w_frames, p_t2w_height,
                                p_t2w_width, p_t2w_steps, p_t2w_cfg, p_t2w_seed],
                        outputs=[p_t2w_video, p_t2w_log]
                    )

                # Image2World
                with gr.TabItem("Image2World"):
                    with gr.Row():
                        with gr.Column():
                            p_i2w_image = gr.Image(label="Input Image", type="pil")
                            p_i2w_prompt = gr.Textbox(
                                label="Prompt",
                                placeholder="Describe how the world should evolve...",
                                lines=2,
                                value="The scene comes to life with movement, realistic motion, cinematic"
                            )
                            p_i2w_negative = gr.Textbox(
                                label="Negative Prompt",
                                value="static, frozen, low quality, blurry"
                            )
                            with gr.Row():
                                p_i2w_frames = gr.Slider(17, 97, step=8, value=49, label="Frames")
                                p_i2w_steps = gr.Slider(10, 50, step=5, value=30, label="Steps")
                            with gr.Row():
                                p_i2w_cfg = gr.Slider(1.0, 15.0, step=0.5, value=7.0, label="CFG")
                                p_i2w_seed = gr.Number(value=42, label="Seed", precision=0)
                            p_i2w_btn = gr.Button("Generate Video", variant="primary")

                        with gr.Column():
                            p_i2w_video = gr.Video(label="Generated Video")
                            p_i2w_log = gr.Textbox(label="Log", lines=15, max_lines=20)

                    # Input order must match run_predict_image2world's signature.
                    p_i2w_btn.click(
                        fn=run_predict_image2world,
                        inputs=[p_i2w_image, p_i2w_prompt, p_i2w_negative, p_i2w_frames,
                                p_i2w_steps, p_i2w_cfg, p_i2w_seed],
                        outputs=[p_i2w_video, p_i2w_log]
                    )

        # Tab 2: Transfer2.5
        with gr.TabItem("Transfer 2.5"):
            gr.Markdown("""
            ### World-to-World Translation

            Transform videos between domains/styles while preserving structure.

            **Control Types:**
            - **blur**: Gaussian blur for general structure
            - **edge**: Canny edge detection
            - **depth**: Depth estimation
            - **segmentation**: Semantic segmentation

            **Common Transforms:** day→night, sunny→rainy, clear→foggy, urban→rural
            """)

            with gr.Row():
                with gr.Column():
                    t_video = gr.Video(label="Input Video")
                    t_prompt = gr.Textbox(
                        label="Prompt",
                        placeholder="Describe the target domain/style...",
                        lines=2,
                        value="Transform to nighttime scene with city lights, photorealistic"
                    )
                    t_negative = gr.Textbox(
                        label="Negative Prompt",
                        value="daytime, bright sun, low quality, artifacts"
                    )
                    t_control = gr.Dropdown(
                        ["blur", "edge", "depth", "segmentation"],
                        label="Control Type",
                        value="blur"
                    )
                    with gr.Row():
                        t_steps = gr.Slider(10, 50, step=5, value=30, label="Steps")
                        t_cfg = gr.Slider(1.0, 15.0, step=0.5, value=7.0, label="CFG")
                    with gr.Row():
                        t_ctrl_scale = gr.Slider(0.5, 2.0, step=0.1, value=1.0, label="Control Scale")
                        t_seed = gr.Number(value=42, label="Seed", precision=0)
                    t_btn = gr.Button("Transform Video", variant="primary")

                with gr.Column():
                    t_output = gr.Video(label="Transformed Video")
                    t_log = gr.Textbox(label="Log", lines=15, max_lines=20)

            # Input order must match run_transfer_video's signature.
            t_btn.click(
                fn=run_transfer_video,
                inputs=[t_video, t_prompt, t_control, t_negative, t_steps, t_cfg,
                        t_ctrl_scale, t_seed],
                outputs=[t_output, t_log]
            )

    gr.Markdown("""
    ---
    **Hardware:** ZeroGPU (NVIDIA H200, 70GB VRAM) | **Precision:** BF16 only

    **Note:** First inference will download and load the model (~5-10 minutes).
    Subsequent inferences will be faster with cached models.
    """)


if __name__ == "__main__":
    demo.launch()