odb9402 committed on Apr 17

Commit

ef79f27

0 Parent(s):

Duplicate from Motif-Technologies/Motif-Video-2B

Browse files

Co-authored-by: Dongpin oh <odb9402@users.noreply.huggingface.co>

Files changed (23) hide show

.gitattributes +42 -0
.gitignore +29 -0
README.md +314 -0
_fm_solvers_unipc.py +759 -0
assets/architecture.png +3 -0
assets/banner.png +3 -0
assets/showcase_i2v.png +3 -0
assets/showcase_t2v.png +3 -0
feature_extractor/preprocessor_config.json +23 -0
inference.py +119 -0
model_index.json +28 -0
motif-video-technical-report.pdf +3 -0
pipeline_motif_video.py +1321 -0
scheduler/scheduler_config.json +18 -0
text_encoder/config.json +252 -0
text_encoder/model.safetensors +3 -0
tokenizer/tokenizer.json +3 -0
tokenizer/tokenizer_config.json +26 -0
transformer/config.json +30 -0
transformer/diffusion_pytorch_model.safetensors +3 -0
transformer/transformer_motif_video.py +1350 -0
vae/config.json +64 -0
vae/diffusion_pytorch_model.safetensors +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,42 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/architecture.png filter=lfs diff=lfs merge=lfs -text
+assets/banner.png filter=lfs diff=lfs merge=lfs -text
+assets/showcase_i2v.png filter=lfs diff=lfs merge=lfs -text
+assets/showcase_t2v.png filter=lfs diff=lfs merge=lfs -text
+tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+assets/i2v_sample.jpg filter=lfs diff=lfs merge=lfs -text
+motif-video-technical-report.pdf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,29 @@

+# Claude Code / Codex
+.claude/
+.codex/
+.codex-review-latest.md
+.hook-state
+.hook-state.lock
+.plans/
+.manuals/
+CLAUDE.md
+_old_claude_files/
+# Internal / test
+test_local.py
+# Environment
+.env
+tmp/
+# Python
+*.pyc
+__pycache__/
+# Experiments
+results/
+experiments/**/outputs/
+experiments/**/checkpoints/
+experiments/**/*.pt
+experiments/**/*.ckpt
+assets/i2v_sample.jpg

README.md ADDED Viewed

	@@ -0,0 +1,314 @@

+---
+license: apache-2.0
+language:
+- en
+tags:
+- text-to-video
+- image-to-video
+- video-generation
+- diffusion-transformer
+pipeline_tag: text-to-video
+library_name: diffusers
+---
+<p align="center">
+  <img src="assets/banner.png" width="100%" alt="Motif-Video 2B teaser"/>
+</p>
+<p align="center">
+  <h1 align="center">Motif-Video 2B</h1>
+</p>
+<p align="center">
+  <b>A micro-budget text-to-video diffusion transformer from Motif Technologies</b>
+</p>
+<p align="center">
+  📑 <a href="https://huggingface.co/Motif-Technologies/Motif-Video-2B/blob/main/motif-video-technical-report.pdf">Technical Report</a> &nbsp;|&nbsp;
+  🤗 <a href="https://huggingface.co/Motif-Technologies/Motif-Video-2B">Hugging Face</a> &nbsp;|&nbsp;
+  🌐 <a href="https://motiftech.io/videoshowcase">Project Page</a>
+</p>
+---
+## 🔥 News
+- **[2026-04-14]** We release **Motif-Video 2B**, our 2B-parameter text-to-video and image-to-video diffusion transformer, together with the full [technical report](https://huggingface.co/Motif-Technologies/Motif-Video-2B/blob/main/motif-video-technical-report.pdf).
+---
+## 📖 Introduction
+Training strong video generation models usually requires massive datasets, large parameter counts, and substantial compute. **Motif-Video 2B** asks whether competitive text-to-video quality is reachable at a much smaller budget — fewer than **10M training clips** and under **100,000 H200 GPU hours** — and shows that the answer is yes, provided the model design explicitly separates objectives that scaling would otherwise leave entangled.
+Our central observation is that prompt alignment, temporal consistency, and fine-detail recovery interfere with one another when handled through the same pathway. Motif-Video 2B addresses this **objective interference** architecturally rather than relying on scale alone, through two contributions:
+- **Shared Cross-Attention.** A residual cross-attention mechanism that reuses self-attention K/V weights to stabilize text–video alignment under long-context token sparsity, where standard joint attention dilutes text influence as the video token sequence grows.
+- **Three-stage DDT-style backbone.** 12 dual-stream + 16 single-stream + 8 DDT decoder layers, separating early modality fusion, joint representation learning, and high-frequency detail reconstruction into dedicated components. Per-block attention analysis shows that the DDT decoder spontaneously develops inter-frame attention structure absent from the encoder layers.
+These are paired with a micro-budget training recipe combining **TREAD token routing** and early-phase **REPA** with a frozen **V-JEPA** teacher — to our knowledge, the first time this combination has been applied to text-to-video training.
+On VBench, Motif-Video 2B reaches **83.76%**, the highest Total Score among open-source models we evaluate, surpassing Wan2.1-14B at **7× fewer parameters** and roughly an order of magnitude less training data.
+<!--
+  Architecture figure — replace with Figure 2 from the technical report
+  (the three-stage backbone + Shared Cross-Attention diagram).
+-->
+<p align="center">
+  <img src="assets/architecture.png" width="90%" alt="Motif-Video 2B architecture"/>
+</p>
+---
+## ✨ Highlights
+- **Two tasks, one set of weights.** A single checkpoint handles both **text-to-video (T2V)** and **image-to-video (I2V)** generation, trained jointly without a learnable task-type embedding.
+- **Up to 720p, 121 frames.** The final model generates 720p video at 121 frames under the standard rectified flow-matching sampler.
+- **Architectural specialization over brute-force scale.** Three-stage backbone with role-separated dual-stream / single-stream / DDT decoder layers.
+- **Shared Cross-Attention.** Stabilizes text alignment under long video-token sequences by grounding cross-attention K/V in the self-attention manifold.
+- **Micro-budget recipe.** TREAD token routing (≈27% per-step FLOP reduction) + early-phase REPA with V-JEPA teacher + offline bucket-balanced sampler (≈90% data utilization, up from ≈20% baseline).
+- **Open and reproducible.** Trained on ~64×H200 GPUs with FSDP2, full curriculum and recipe documented in the technical report.
+---
+## 🏗️ Architecture
+Motif-Video 2B is a flow-matching diffusion transformer organized around a single principle: each component is assigned a well-defined responsibility, and components with conflicting objectives are not asked to share capacity.
+| Component | Choice |
+|---|---|
+| Text encoder | T5Gemma2 (encoder–decoder, UL2-adapted Gemma 3) |
+| Video tokenizer | Wan2.1 VAE (8×8 spatial, 4× temporal compression), 2×2×1 patchify |
+| Backbone | 12 dual-stream + 16 single-stream + 8 DDT decoder layers |
+| Hidden dim / heads | 1536 / 12 heads × 128 |
+| Normalization | QK-normalization throughout |
+| Position encoding | RoPE |
+| Cross-attention | **Shared Cross-Attention** in the single-stream stage |
+| Objective | Rectified flow matching (velocity prediction) |
+| I2V conditioning | First-frame latent + SigLIP image embeddings, with timestep-aware blur |
+A high-level walkthrough of the role separation:
+1. **Dual-stream stage (12 layers).** Text and video tokens are processed through separate self-attention pathways, exchanging information via cross-attention. This prevents premature feature entanglement before either modality has formed coherent representations.
+2. **Single-stream stage (16 layers).** Text and video tokens attend freely in a joint sequence. **Shared Cross-Attention** is attached here to repair the text-attention dilution that emerges as the video token sequence grows.
+3. **DDT decoder (8 layers).** A dedicated velocity decoder atop the 28-layer encoder, freeing the encoder from high-frequency detail reconstruction. Per-block attention analysis shows that the DDT decoder develops inter-frame attention structure that single-stream layers do not.
+For the full derivation of why Shared Cross-Attention shares K/V but not Q, and why this is necessary in addition to standard zero-init of W_O, see Section 3.3 of the [technical report](https://huggingface.co/Motif-Technologies/Motif-Video-2B/blob/main/motif-video-technical-report.pdf).
+<!--
+  Optional: insert Figure 3 (attention heatmaps across the three stages)
+  here as a secondary architecture figure. It is the strongest visual
+  evidence for the role-separation argument.
+-->
+---
+## 🚀 Quickstart / Usage
+### Requirements
+- Python 3.10+
+- CUDA-capable GPU with **24GB+ VRAM** (e.g., A100, H100, RTX 4090)
+```bash
+pip install "diffusers>=0.35.2" "transformers>=5.0.0" torch accelerate ftfy einops sentencepiece regex Pillow
+```
+### Text-to-Video (T2V)
+```python
+import torch
+from diffusers import AdaptiveProjectedGuidance, DiffusionPipeline
+from diffusers.utils import export_to_video
+guider = AdaptiveProjectedGuidance(
+    guidance_scale=8.0,
+    adaptive_projected_guidance_rescale=12.0,
+    adaptive_projected_guidance_momentum=0.1,
+    use_original_formulation=True,
+)
+pipe = DiffusionPipeline.from_pretrained(
+    "Motif-Technologies/Motif-Video-2B",
+    custom_pipeline="pipeline_motif_video",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    guider=guider,
+)
+pipe = pipe.to("cuda")
+output = pipe(
+    prompt="A category-five hurricane, viewed from inside the eye, reveals a circular stadium of cloud walls rising to fifty thousand feet with an eerie disk of blue sky directly overhead. Shot from a NOAA reconnaissance aircraft mounted camera, the perspective looks outward toward the eyewall — a near-vertical curtain of rotating cloud and lightning that is simultaneously terrifying and transcendent. The inner surface of the eyewall catches the setting sun, painting it in improbable shades of peach and rose. The camera slowly pans 360 degrees to complete one full revolution, capturing the entire coliseum of the storm. Below, the ocean surface is a white blur of foam and spray. The documentary-style cinematography strips away all artifice to present the storm as an entity of pure elemental power.",
+    height=736,
+    width=1280,
+    num_frames=121,
+    num_inference_steps=50,
+)
+export_to_video(output.frames[0], "output.mp4", fps=24)
+```
+### Image-to-Video (I2V)
+```python
+import torch
+from diffusers import AdaptiveProjectedGuidance, DiffusionPipeline
+from diffusers.utils import export_to_video, load_image
+guider = AdaptiveProjectedGuidance(
+    guidance_scale=8.0,
+    adaptive_projected_guidance_rescale=12.0,
+    adaptive_projected_guidance_momentum=0.1,
+    use_original_formulation=True,
+)
+pipe = DiffusionPipeline.from_pretrained(
+    "Motif-Technologies/Motif-Video-2B",
+    custom_pipeline="pipeline_motif_video",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    guider=guider,
+)
+pipe = pipe.to("cuda")
+image = load_image("https://huggingface.co/Motif-Technologies/Motif-Video-2B/resolve/main/assets/i2v_sample.jpg")
+output = pipe(
+    prompt="Three friends stride through a sun-bleached meadow as a warm breeze ripples the tall dry grass around their legs. The woman on the left turns her head to share a quiet laugh, the woman in the center pushes a loose curl behind her ear, and the man on the right tilts his face toward the sky. The camera drifts gently alongside them at walking pace, handheld, with soft overcast light.",
+    image=image,
+    height=736,
+    width=1280,
+    num_frames=121,
+    num_inference_steps=50,
+)
+export_to_video(output.frames[0], "output.mp4", fps=24)
+```
+### CLI Inference
+```bash
+# Text-to-Video
+python inference.py \
+  --prompt "A time-lapse of a flower blooming in a dark room, dramatic lighting" \
+  --output t2v_output.mp4
+# Image-to-Video
+python inference.py \
+  --image assets/i2v_sample.jpg \
+  --prompt "Three friends stride through a meadow as a warm breeze ripples the tall grass" \
+  --output i2v_output.mp4
+```
+See `inference.py` for all available options (`--help`).
+### Recommended Settings
+| Parameter | Default | Notes |
+|---|---|---|
+| Resolution | 1280x736 | 720p, best quality |
+| Frames | 121 | ~5 seconds at 24fps |
+| Guidance scale | 8.0 | |
+| Scheduler shift | 15.0 | Pre-configured in scheduler config |
+| Inference steps | 50 | |
+| dtype | bfloat16 | Recommended for H100/A100 |
+---
+## 📊 Performance
+### VBench
+Motif-Video 2B achieves the highest **Total Score** among open-source models we evaluate.
+| Model | Params | Total | Quality | Semantic |
+|---|---|---|---|---|
+| Wan2.2-T2V (prompt-opt.) | A14B | 84.23 | 85.42 | 79.50 |
+| **Motif-Video 2B (Ours)** | **2B** | **83.76** | **84.59** | **80.44** |
+| SANA-Video | 2B | 83.71 | 84.35 | 81.35 |
+| Wan2.1-T2V | 14B | 83.69 | 85.59 | 76.11 |
+| OpenSora 2.0 (T2I2V) | 11B | 83.60 | 84.40 | 80.30 |
+| Wan2.1-T2V | 1.3B | 83.31 | 85.23 | 75.65 |
+| HunyuanVideo | 13B | 83.24 | 85.09 | 75.82 |
+| CogVideoX1.5-5B (prompt-opt.) | 5B | 82.17 | 82.78 | 79.76 |
+| Step-Video-T2V | 30B | 81.83 | 84.46 | 71.28 |
+| LTX-Video | 2B | 80.00 | 82.30 | 70.79 |
+Notable per-dimension highlights for Motif-Video 2B (open-source):
+- **Spatial Relationship: 83.02%** — best among open-source models
+- **Semantic Score: 80.44%** — highest among open-source models reporting per-dimension results
+- **Object Class: 92.93%**, **Multiple Objects: 77.29%**, **Imaging Quality: 70.50%** — second-best in their categories
+The full 16-dimension breakdown is in Table 3 of the [technical report](https://huggingface.co/Motif-Technologies/Motif-Video-2B/blob/main/motif-video-technical-report.pdf).
+> **A note on VBench vs. perceptual quality.** Motif-Video 2B leads on VBench Total Score, but in our internal side-by-side comparisons against Wan2.1-T2V-14B we observe a perceptual gap in favor of the larger model on temporal stability and fine human anatomy. We discuss the sources of this gap (uniform dimension weighting, near-correct semantic credit) in Section 7 of the report. We report the gap explicitly rather than smoothing it over.
+### Human evaluation
+In a blind pairwise study against six contemporaneous open-source baselines (SANA-Video, LTX-Video 2, Wan2.1-14B, Wan2.1-1.3B, Wan2.2-5B, CogVideoX-5B) on 40 LLM-generated prompts, Motif-Video 2B is preferred over both **SANA-Video** (similar parameter count) and **Wan2.1-1.3B** (similar parameter count, larger training corpus) on prompt-following and video-fidelity axes. Wan2.1-14B remains the preferred model overall, consistent with its 7× larger parameter count and substantially larger training data.
+---
+## 🎬 Showcase
+<!--
+  Insert the qualitative grids from the technical report here:
+    - Figure 1 / Figure 12: T2V multi-prompt frame strips
+    - Figure 13: I2V example (input image + generated frames)
+  Use full-width or 2-column layout, matching Wan2.1's "Showcase" section.
+-->
+### Text-to-Video
+<p align="center">
+  <img src="assets/showcase_t2v.png" width="100%" alt="Motif-Video 2B T2V samples"/>
+</p>
+### Image-to-Video
+<p align="center">
+  <img src="assets/showcase_i2v.png" width="100%" alt="Motif-Video 2B I2V samples"/>
+</p>
+---
+## ⚠️ Limitations
+We report limitations as the boundary conditions under which the design decisions in this report should be interpreted, not as caveats.
+- **Micro-scale semantic distortion.** Motif-Video 2B occasionally produces sub-object-level artifacts that leave the category label intact but break perceptual plausibility — distorted hands on close-up human subjects, degraded body structure under high-displacement motion, and attribute leakage between visually similar co-present subjects. We attribute these primarily to data coverage rather than backbone design.
+- **Temporal failures.** Three distinct modes that frame-level metrics do not surface: (i) physically implausible liquid / cloth / collision dynamics, (ii) coherence loss under high scene complexity (multi-agent crowds), and (iii) unintended mid-clip scene transitions in long sequences.
+- **Recipe components are evaluated jointly, not in isolation.** We do not present per-component ablations for Shared Cross-Attention, the DDT decoder, REPA phasing, or TREAD routing at full scale. Readers should interpret our results as evidence that the *composed* recipe works at 2B, not as a marginal-contribution claim about any single component.
+We view temporal stability and data coverage — not architectural depth — as the primary remaining ceilings on this model. Both are natural axes for a future iteration, and the current architecture is built to absorb improvements along them.
+---
+## 📚 Citation
+If you find Motif-Video 2B useful in your research, please cite:
+```bibtex
+@techreport{motifvideo2b2026,
+  title  = {Motif-Video 2B: Technical Report},
+  author = {Motif Technologies},
+  year   = {2026},
+  institution = {Motif Technologies},
+  url    = {https://huggingface.co/Motif-Technologies/Motif-Video-2B/blob/main/motif-video-technical-report.pdf}
+}
+```
+---
+## 🙏 Acknowledgements
+We build on a number of excellent open-source projects, including the **Wan2.1 VAE** [Wan Team, 2025], **T5Gemma / Gemma 3** [Google], **TREAD** [Krause et al., 2025], **REPA** with the **V-JEPA** family of visual encoders [Bardes et al.], **DDT** [Wang et al.], and the broader **diffusers** and **Accelerate** ecosystems. Compute was provisioned on Microsoft Azure and orchestrated with **SkyPilot** on Kubernetes.
+---
+## 📄 License
+<!-- TODO: confirm final license — apache-2.0 placeholder above. -->
+This model is released under the Apache 2.0 License. See `LICENSE` for details.

_fm_solvers_unipc.py ADDED Viewed

	@@ -0,0 +1,759 @@

+# Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py
+# Converted UniPC scheduler for flow matching
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import math
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import (
+    KarrasDiffusionSchedulers,
+    SchedulerMixin,
+    SchedulerOutput,
+)
+from diffusers.utils import deprecate, is_scipy_available
+if is_scipy_available():
+    # No-op: the availability check is evaluated, but nothing is imported or configured under it here.
+    pass
+class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    `FlowUniPCMultistepScheduler` is a flow-matching variant of `UniPCMultistepScheduler`, a training-free framework
+    designed for the fast sampling of diffusion models.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        solver_order (`int`, default `2`):
+            The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
+            due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
+            unconditional sampling.
+        prediction_type (`str`, defaults to "flow_prediction"):
+            Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts
+            the flow of the diffusion process.
+        shift (`float`, *optional*, defaults to 1.0):
+            Static shift applied to the sigma schedule as `shift * sigma / (1 + (shift - 1) * sigma)`. Only used when
+            `use_dynamic_shifting=False`; `set_timesteps` accepts a per-call `shift` that takes precedence.
+        use_dynamic_shifting (`bool`, defaults to `False`):
+            Whether to skip the static `shift` and instead shift the sigmas in `set_timesteps` through `time_shift`.
+            When `True`, `set_timesteps` requires a `mu` value.
+        thresholding (`bool`, defaults to `False`):
+            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
+            as Stable Diffusion.
+        dynamic_thresholding_ratio (`float`, defaults to 0.995):
+            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
+        sample_max_value (`float`, defaults to 1.0):
+            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
+        predict_x0 (`bool`, defaults to `True`):
+            Whether to use the updating algorithm on the predicted x0.
+        solver_type (`str`, default `bh2`):
+            Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
+            otherwise. The names `midpoint`, `heun` and `logrho` are accepted and remapped to `bh2`.
+        lower_order_final (`bool`, default `True`):
+            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
+            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
+        disable_corrector (`list`, default `[]`):
+            Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)`
+            and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is
+            usually disabled during the first few steps.
+        solver_p (`SchedulerMixin`, default `None`):
+            Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        steps_offset (`int`, defaults to 0):
+            An offset added to the inference steps, as required by some model families.
+        final_sigmas_type (`str`, defaults to `"zero"`):
+            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+    Note:
+        `use_karras_sigmas` and `use_exponential_sigmas` are not parameters of `__init__` in this implementation.
+    """
+    # Names of every member of `KarrasDiffusionSchedulers`.
+    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
+    order = 1
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        solver_order: int = 2,
+        prediction_type: str = "flow_prediction",
+        shift: Optional[float] = 1.0,
+        use_dynamic_shifting=False,
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        predict_x0: bool = True,
+        solver_type: str = "bh2",
+        lower_order_final: bool = True,
+        disable_corrector: List[int] = [],
+        solver_p: Optional[SchedulerMixin] = None,
+        timestep_spacing: str = "linspace",
+        steps_offset: int = 0,
+        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
+    ):
+        if solver_type not in ["bh1", "bh2"]:
+            if solver_type in ["midpoint", "heun", "logrho"]:
+                self.register_to_config(solver_type="bh2")
+            else:
+                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+        self.predict_x0 = predict_x0
+        # setable values
+        self.num_inference_steps = None
+        alphas = np.linspace(1, 1 / num_train_timesteps, num_train_timesteps)[::-1].copy()
+        sigmas = 1.0 - alphas
+        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
+        if not use_dynamic_shifting:
+            # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
+            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)  # pyright: ignore
+        self.sigmas = sigmas
+        self.timesteps = sigmas * num_train_timesteps
+        self.model_outputs = [None] * solver_order
+        self.timestep_list = [None] * solver_order
+        self.lower_order_nums = 0
+        self.disable_corrector = disable_corrector
+        self.solver_p = solver_p
+        self.last_sample = None
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+        self.sigma_min = self.sigmas[-1].item()
+        self.sigma_max = self.sigmas[0].item()
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        Returns the private `_step_index` attribute, which is `None` after `__init__` and after `set_timesteps`.
+        """
+        return self._step_index
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        Returns the private `_begin_index` attribute, which is `None` until `set_begin_index` is called and is reset
+        to `None` by `set_timesteps`.
+        """
+        return self._begin_index
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0) -> None:
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+        Note that `set_timesteps` resets the stored value to `None`, so call this after `set_timesteps`.
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+    # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps
+    def set_timesteps(
+        self,
+        num_inference_steps: Union[int, None] = None,
+        device: Optional[Union[str, torch.device]] = None,
+        sigmas: Optional[List[float]] = None,
+        mu: Optional[Union[float, None]] = None,
+        shift: Optional[Union[float, None]] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                Total number of the spacing of the time steps.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        if self.config.use_dynamic_shifting and mu is None:
+            raise ValueError(" you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`")
+        if sigmas is None:
+            sigmas = np.linspace(self.sigma_max, self.sigma_min, num_inference_steps + 1).copy()[:-1]  # pyright: ignore
+        if self.config.use_dynamic_shifting:
+            sigmas = self.time_shift(mu, 1.0, sigmas)  # pyright: ignore
+        else:
+            if shift is None:
+                shift = self.config.shift
+            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)  # pyright: ignore
+        if self.config.final_sigmas_type == "sigma_min":
+            sigma_last = self.config.sigma_min
+        elif self.config.final_sigmas_type == "zero":
+            sigma_last = 0
+        else:
+            raise ValueError(
+                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
+            )
+        timesteps = sigmas * self.config.num_train_timesteps
+        sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)  # pyright: ignore
+        self.sigmas = torch.from_numpy(sigmas)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
+        self.num_inference_steps = len(timesteps)
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+        self.last_sample = None
+        if self.solver_p:
+            self.solver_p.set_timesteps(self.num_inference_steps, device=device)
+        # add an index counter for schedulers that allow duplicated timesteps
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
+    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
+        """
+        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
+        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
+        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
+        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
+        photorealism as well as better image-text alignment, especially when using very large guidance weights."
+        https://arxiv.org/abs/2205.11487
+        """
+        dtype = sample.dtype
+        batch_size, channels, *remaining_dims = sample.shape
+        if dtype not in (torch.float32, torch.float64):
+            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half
+        # Flatten sample for doing quantile calculation along each image
+        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
+        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"
+        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
+        s = torch.clamp(
+            s, min=1, max=self.config.sample_max_value
+        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
+        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
+        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"
+        sample = sample.reshape(batch_size, channels, *remaining_dims)
+        sample = sample.to(dtype)
+        return sample
+    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
+    def _sigma_to_t(self, sigma):
+        return sigma * self.config.num_train_timesteps
+    def _sigma_to_alpha_sigma_t(self, sigma):
+        return 1 - sigma, sigma
+    # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps
+    def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
+        return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+    def convert_model_output(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Convert the model output to the corresponding type the UniPC algorithm needs.
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+        Returns:
+            `torch.Tensor`:
+                The converted model output.
+        """
+        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError("missing `sample` as a required keyward argument")
+        if timestep is not None:
+            deprecate(
+                "timesteps",
+                "1.0.0",
+                "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        sigma = self.sigmas[self.step_index]
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        if self.predict_x0:
+            if self.config.prediction_type == "flow_prediction":
+                sigma_t = self.sigmas[self.step_index]
+                x0_pred = sample - sigma_t * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
+                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
+                )
+            if self.config.thresholding:
+                x0_pred = self._threshold_sample(x0_pred)
+            return x0_pred
+        else:
+            if self.config.prediction_type == "flow_prediction":
+                sigma_t = self.sigmas[self.step_index]
+                epsilon = sample - (1 - sigma_t) * model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
+                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
+                )
+            if self.config.thresholding:
+                sigma_t = self.sigmas[self.step_index]
+                x0_pred = sample - sigma_t * model_output
+                x0_pred = self._threshold_sample(x0_pred)
+                epsilon = model_output + x0_pred
+            return epsilon
+    def multistep_uni_p_bh_update(
+        self,
+        model_output: torch.Tensor,
+        *args,
+        sample: Optional[torch.Tensor] = None,
+        order: Optional[int] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if it is specified.
+        The step goes from `self.sigmas[self.step_index]` to `self.sigmas[self.step_index + 1]` and reads the stored
+        history in `self.model_outputs` and `self.timestep_list`.
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model at the current timestep. Only forwarded to
+                `self.solver_p`; the built-in update uses the entries of `self.model_outputs` instead.
+            prev_timestep (`int`):
+                Deprecated and ignored. May be given as the first positional extra argument or as a keyword; a
+                non-`None` value is reported through `deprecate`.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process. May also be given as the second
+                positional extra argument.
+            order (`int`):
+                The order of UniP at this timestep (corresponds to the *p* in UniPC-p). May also be given as the
+                third positional extra argument.
+        Returns:
+            `torch.Tensor`:
+                The sample tensor at the previous timestep, cast to the dtype of `sample`.
+        Raises:
+            ValueError: If `sample` or `order` is not supplied.
+            NotImplementedError: If `config.solver_type` is neither `"bh1"` nor `"bh2"`.
+        """
+        prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None)
+        if sample is None:
+            if len(args) > 1:
+                sample = args[1]
+            else:
+                raise ValueError(" missing `sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 2:
+                order = args[2]
+            else:
+                raise ValueError(" missing `order` as a required keyward argument")
+        if prev_timestep is not None:
+            deprecate(
+                "prev_timestep",
+                "1.0.0",
+                "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        model_output_list = self.model_outputs
+        s0 = self.timestep_list[-1]  # latest stored timestep; only consumed by `solver_p`
+        m0 = model_output_list[-1]  # latest stored (converted) model output
+        x = sample
+        if self.solver_p:  # delegate the whole predictor step to the user-supplied solver
+            x_t = self.solver_p.step(model_output, s0, x).prev_sample
+            return x_t
+        sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]  # pyright: ignore
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)  # log(alpha / sigma) at the target sigma
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)  # log(alpha / sigma) at the current sigma
+        h = lambda_t - lambda_s0  # step size measured in lambda
+        device = sample.device
+        rks = []
+        D1s = []
+        for i in range(1, order):  # visit the `order - 1` older history entries
+            si = self.step_index - i  # pyright: ignore
+            mi = model_output_list[-(i + 1)]
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            rk = (lambda_si - lambda_s0) / h  # lambda offset of entry i relative to the step size
+            rks.append(rk)
+            D1s.append((mi - m0) / rk)  # pyright: ignore
+        rks.append(1.0)
+        rks = torch.tensor(rks, device=device)
+        R = []
+        b = []
+        hh = -h if self.predict_x0 else h  # sign flips with the prediction mode
+        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
+        h_phi_k = h_phi_1 / hh - 1
+        factorial_i = 1
+        if self.config.solver_type == "bh1":
+            B_h = hh
+        elif self.config.solver_type == "bh2":
+            B_h = torch.expm1(hh)
+        else:
+            raise NotImplementedError()
+        for i in range(1, order + 1):  # row i of R is rks ** (i - 1); b holds the matching right-hand side
+            R.append(torch.pow(rks, i - 1))
+            b.append(h_phi_k * factorial_i / B_h)
+            factorial_i *= i + 1
+            h_phi_k = h_phi_k / hh - 1 / factorial_i
+        R = torch.stack(R)
+        b = torch.tensor(b, device=device)
+        if len(D1s) > 0:
+            D1s = torch.stack(D1s, dim=1)  # (B, K)
+            # for order 2, we use a simplified version
+            if order == 2:
+                rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
+            else:
+                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]).to(device).to(x.dtype)
+        else:
+            D1s = None  # order 1: no history differences, `rhos_p` is never read
+        if self.predict_x0:
+            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
+            if D1s is not None:
+                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)  # pyright: ignore
+            else:
+                pred_res = 0
+            x_t = x_t_ - alpha_t * B_h * pred_res
+        else:
+            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
+            if D1s is not None:
+                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)  # pyright: ignore
+            else:
+                pred_res = 0
+            x_t = x_t_ - sigma_t * B_h * pred_res
+        x_t = x_t.to(x.dtype)
+        return x_t
+    def multistep_uni_c_bh_update(
+        self,
+        this_model_output: torch.Tensor,
+        *args,
+        last_sample: Optional[torch.Tensor] = None,
+        this_sample: Optional[torch.Tensor] = None,
+        order: Optional[int] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        One step for the UniC (B(h) version).
+        The correction spans `self.sigmas[self.step_index - 1]` to `self.sigmas[self.step_index]` and reads the
+        stored history in `self.model_outputs`.
+        Args:
+            this_model_output (`torch.Tensor`):
+                The model outputs at `x_t`.
+            this_timestep (`int`):
+                Deprecated and ignored. May be given as the first positional extra argument or as a keyword; a
+                non-`None` value is reported through `deprecate`.
+            last_sample (`torch.Tensor`):
+                The generated sample before the last predictor `x_{t-1}`. May also be given as the second positional
+                extra argument.
+            this_sample (`torch.Tensor`):
+                The generated sample after the last predictor `x_{t}`. May also be given as the third positional
+                extra argument. Only its device is read; the corrected value is rebuilt from `last_sample`.
+            order (`int`):
+                The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`. May also be
+                given as the fourth positional extra argument.
+        Returns:
+            `torch.Tensor`:
+                The corrected sample tensor at the current timestep, cast to the dtype of `last_sample`.
+        Raises:
+            ValueError: If `last_sample`, `this_sample` or `order` is not supplied.
+            NotImplementedError: If `config.solver_type` is neither `"bh1"` nor `"bh2"`.
+        """
+        this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None)
+        if last_sample is None:
+            if len(args) > 1:
+                last_sample = args[1]
+            else:
+                raise ValueError(" missing`last_sample` as a required keyward argument")
+        if this_sample is None:
+            if len(args) > 2:
+                this_sample = args[2]
+            else:
+                raise ValueError(" missing`this_sample` as a required keyward argument")
+        if order is None:
+            if len(args) > 3:
+                order = args[3]
+            else:
+                raise ValueError(" missing`order` as a required keyward argument")
+        if this_timestep is not None:
+            deprecate(
+                "this_timestep",
+                "1.0.0",
+                "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
+            )
+        model_output_list = self.model_outputs
+        m0 = model_output_list[-1]  # latest stored (converted) model output
+        x = last_sample
+        x_t = this_sample  # overwritten below by the corrected value
+        model_t = this_model_output
+        sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1]  # pyright: ignore
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
+        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
+        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)  # log(alpha / sigma) at the current sigma
+        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)  # log(alpha / sigma) at the previous sigma
+        h = lambda_t - lambda_s0  # step size measured in lambda
+        device = this_sample.device
+        rks = []
+        D1s = []
+        for i in range(1, order):  # visit the `order - 1` older history entries
+            si = self.step_index - (i + 1)  # pyright: ignore
+            mi = model_output_list[-(i + 1)]
+            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
+            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
+            rk = (lambda_si - lambda_s0) / h  # lambda offset of entry i relative to the step size
+            rks.append(rk)
+            D1s.append((mi - m0) / rk)  # pyright: ignore
+        rks.append(1.0)
+        rks = torch.tensor(rks, device=device)
+        R = []
+        b = []
+        hh = -h if self.predict_x0 else h  # sign flips with the prediction mode
+        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
+        h_phi_k = h_phi_1 / hh - 1
+        factorial_i = 1
+        if self.config.solver_type == "bh1":
+            B_h = hh
+        elif self.config.solver_type == "bh2":
+            B_h = torch.expm1(hh)
+        else:
+            raise NotImplementedError()
+        for i in range(1, order + 1):  # row i of R is rks ** (i - 1); b holds the matching right-hand side
+            R.append(torch.pow(rks, i - 1))
+            b.append(h_phi_k * factorial_i / B_h)
+            factorial_i *= i + 1
+            h_phi_k = h_phi_k / hh - 1 / factorial_i
+        R = torch.stack(R)
+        b = torch.tensor(b, device=device)
+        if len(D1s) > 0:
+            D1s = torch.stack(D1s, dim=1)
+        else:
+            D1s = None  # order 1: only the new model output contributes
+        # for order 1, we use a simplified version
+        if order == 1:
+            rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
+        else:
+            rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)
+        if self.predict_x0:
+            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
+            if D1s is not None:
+                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
+            else:
+                corr_res = 0
+            D1_t = model_t - m0  # difference contributed by the freshly evaluated model output
+            x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
+        else:
+            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
+            if D1s is not None:
+                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
+            else:
+                corr_res = 0
+            D1_t = model_t - m0  # difference contributed by the freshly evaluated model output
+            x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
+        x_t = x_t.to(x.dtype)
+        return x_t
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self.timesteps
+        indices = (schedule_timesteps == timestep).nonzero()
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+        return indices[pos].item()
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
+    def _init_step_index(self, timestep):
+        """
+        Initialize the step_index counter for the scheduler.
+        """
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        return_dict: bool = True,
+        generator=None,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
+        the multistep UniPC.
+        Each call optionally corrects the incoming `sample` with UniC, appends the converted model output and
+        `timestep` to the history buffers, runs the UniP predictor, and finally increments the step index.
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from learned diffusion model.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain. Used to initialise the step index on the first
+                call and stored in `self.timestep_list`.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
+            generator:
+                Not referenced by this method.
+        Returns:
+            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
+                tuple is returned where the first element is the sample tensor.
+        Raises:
+            ValueError: If `self.num_inference_steps` is `None`.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+        if self.step_index is None:
+            self._init_step_index(timestep)
+        use_corrector = (
+            self.step_index > 0 and self.step_index - 1 not in self.disable_corrector and self.last_sample is not None  # pyright: ignore
+        )  # correct only after the first step, when not disabled for the previous index, and a prior sample exists
+        model_output_convert = self.convert_model_output(model_output, sample=sample)
+        if use_corrector:
+            sample = self.multistep_uni_c_bh_update(
+                this_model_output=model_output_convert,
+                last_sample=self.last_sample,
+                this_sample=sample,
+                order=self.this_order,
+            )
+        for i in range(self.config.solver_order - 1):  # shift both history buffers one slot towards the front
+            self.model_outputs[i] = self.model_outputs[i + 1]
+            self.timestep_list[i] = self.timestep_list[i + 1]
+        self.model_outputs[-1] = model_output_convert
+        self.timestep_list[-1] = timestep  # pyright: ignore
+        if self.config.lower_order_final:
+            this_order = min(self.config.solver_order, len(self.timesteps) - self.step_index)  # pyright: ignore
+        else:
+            this_order = self.config.solver_order
+        self.this_order = min(this_order, self.lower_order_nums + 1)  # warmup for multistep
+        assert self.this_order > 0
+        self.last_sample = sample  # kept so the next call's corrector can start from it
+        prev_sample = self.multistep_uni_p_bh_update(
+            model_output=model_output,  # pass the original non-converted model output, in case solver-p is used
+            sample=sample,
+            order=self.this_order,
+        )
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+        # upon completion increase step index by one
+        self._step_index += 1  # pyright: ignore
+        if not return_dict:
+            return (prev_sample,)
+        return SchedulerOutput(prev_sample=prev_sample)
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+        Returns:
+            `torch.Tensor`:
+                A scaled input sample.
+        """
+        return sample
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.Tensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            schedule_timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        if self.begin_index is None:
+            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
+        elif self.step_index is not None:
+            # add_noise is called after first denoising step (for inpainting)
+            step_indices = [self.step_index] * timesteps.shape[0]
+        else:
+            # add noise is called before first denoising step to create initial latent(img2img)
+            step_indices = [self.begin_index] * timesteps.shape[0]
+        sigma = sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+        noisy_samples = alpha_t * original_samples + sigma_t * noise
+        return noisy_samples
+    def __len__(self):
+        return self.config.num_train_timesteps

assets/architecture.png ADDED Viewed

Git LFS Details

SHA256: 33f619ed4c78c185e5e40fec1b774dee5573e3f8f3a405785ebe9552b2a02c33
Pointer size: 131 Bytes
Size of remote file: 260 kB

assets/banner.png ADDED Viewed

Git LFS Details

SHA256: c01efdcc5579a31fb8926717fbc8ef24c317c9e2d852b4c084132b60da1ca602
Pointer size: 132 Bytes
Size of remote file: 7.57 MB

assets/showcase_i2v.png ADDED Viewed

Git LFS Details

SHA256: 2b605ed63797b53532df7a0a52e1168c7847b3f2e6c5c4a4dfad0b901648c2f7
Pointer size: 133 Bytes
Size of remote file: 11.4 MB

assets/showcase_t2v.png ADDED Viewed

Git LFS Details

SHA256: c78f3dd58ad8562a082275a955bbbfbfc85cb7e48ce3c28ca55fb1fb625ef139
Pointer size: 132 Bytes
Size of remote file: 3.85 MB

feature_extractor/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": false,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 896,
+    "width": 896
+  }
+}

inference.py ADDED Viewed

	@@ -0,0 +1,119 @@

+#!/usr/bin/env python3
+"""Motif-Video 2B — Text-to-Video & Image-to-Video inference.
+GPU requirements: ~24GB VRAM for 720p (1280x736, 121 frames).
+Tested with: torch>=2.0, diffusers>=0.35.2, transformers>=5.0.0
+Uses Adaptive Projected Guidance (APG) by default for best quality.
+"""
+import argparse
+import torch
+from diffusers import AdaptiveProjectedGuidance, DiffusionPipeline
+from diffusers.utils import export_to_video
+def parse_args():
+    parser = argparse.ArgumentParser(description="Motif-Video 2B Inference (T2V / I2V)")
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="Motif-Technologies/Motif-Video-2B",
+        help="HuggingFace model ID or local checkpoint path (uses trust_remote_code=True)",
+    )
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        default="A category-five hurricane, viewed from inside the eye, reveals a circular stadium of cloud walls rising to fifty thousand feet with an eerie disk of blue sky directly overhead. Shot from a NOAA reconnaissance aircraft mounted camera, the perspective looks outward toward the eyewall — a near-vertical curtain of rotating cloud and lightning that is simultaneously terrifying and transcendent. The inner surface of the eyewall catches the setting sun, painting it in improbable shades of peach and rose. The camera slowly pans 360 degrees to complete one full revolution, capturing the entire coliseum of the storm. Below, the ocean surface is a white blur of foam and spray. The documentary-style cinematography strips away all artifice to present the storm as an entity of pure elemental power.",
+        help="Text prompt for video generation",
+    )
+    parser.add_argument(
+        "--image",
+        type=str,
+        default=None,
+        help="Path to input image for I2V mode (omit for T2V)",
+    )
+    parser.add_argument(
+        "--negative-prompt",
+        type=str,
+        default=None,
+        help="Negative prompt (default: built-in pipeline default)",
+    )
+    parser.add_argument("--output", type=str, default="output.mp4", help="Output video file path")
+    parser.add_argument("--num-frames", type=int, default=121, help="Number of frames to generate (121 = ~5s at 24fps)")
+    parser.add_argument("--height", type=int, default=736, help="Video height in pixels")
+    parser.add_argument("--width", type=int, default=1280, help="Video width in pixels")
+    parser.add_argument("--guidance-scale", type=float, default=8.0, help="Classifier-free guidance scale")
+    parser.add_argument("--num-inference-steps", type=int, default=50, help="Number of denoising steps")
+    parser.add_argument("--fps", type=int, default=24, help="Output video frame rate")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="bfloat16",
+        choices=["float16", "bfloat16", "float32"],
+        help="Model dtype",
+    )
+    return parser.parse_args()
+def main():
+    args = parse_args()
+    dtype_map = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}
+    torch_dtype = dtype_map[args.dtype]
+    mode = "I2V" if args.image else "T2V"
+    print(f"[{mode}] Loading model from: {args.model_path}")
+    guider = AdaptiveProjectedGuidance(
+        guidance_scale=args.guidance_scale,
+        adaptive_projected_guidance_rescale=12.0,
+        adaptive_projected_guidance_momentum=0.1,
+        eta=0.0,
+        use_original_formulation=True,
+    )
+    pipe = DiffusionPipeline.from_pretrained(
+        args.model_path,
+        custom_pipeline="pipeline_motif_video",
+        trust_remote_code=True,
+        torch_dtype=torch_dtype,
+        guider=guider,
+    )
+    pipe = pipe.to("cuda")
+    generator = torch.Generator(device="cuda").manual_seed(args.seed)
+    # Load image for I2V mode
+    image = None
+    if args.image:
+        from PIL import Image
+        image = Image.open(args.image).convert("RGB")
+        print(f"[I2V] Input image: {args.image} ({image.size[0]}x{image.size[1]})")
+    print(f"Generating video: {args.width}x{args.height}, {args.num_frames} frames, {args.num_inference_steps} steps")
+    pipe_kwargs = dict(
+        prompt=args.prompt,
+        image=image,
+        height=args.height,
+        width=args.width,
+        num_frames=args.num_frames,
+        num_inference_steps=args.num_inference_steps,
+        generator=generator,
+        frame_rate=args.fps,
+    )
+    if args.negative_prompt is not None:
+        pipe_kwargs["negative_prompt"] = args.negative_prompt
+    output = pipe(**pipe_kwargs)
+    video_frames = output.frames[0]
+    export_to_video(video_frames, args.output, fps=args.fps)
+    print(f"Video saved to: {args.output}")
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()

model_index.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "_class_name": "MotifVideoPipeline",
+  "_diffusers_version": "0.35.2",
+  "scheduler": [
+    "diffusers",
+    "FlowMatchEulerDiscreteScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "T5Gemma2Model"
+  ],
+  "tokenizer": [
+    "transformers",
+    "GemmaTokenizer"
+  ],
+  "transformer": [
+    "transformer_motif_video",
+    "MotifVideoTransformer3DModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKLWan"
+  ],
+  "feature_extractor": [
+    "transformers",
+    "SiglipImageProcessor"
+  ]
+}

motif-video-technical-report.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f931b222d303bee5b62053b9734a021bb9310388cf575fbab83ffdcceca378ab
+size 17260596

pipeline_motif_video.py ADDED Viewed

	@@ -0,0 +1,1321 @@

+# Copyright 2026 Motif Technologies, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import html
+import inspect
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import ftfy
+import numpy as np
+import regex as re
+import torch
+from diffusers import (
+    AdaptiveProjectedGuidance,
+    AutoencoderKLWan,
+    ClassifierFreeGuidance,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    FlowMatchEulerDiscreteScheduler,
+    SkipLayerGuidance,
+    UniPCMultistepScheduler,
+)
+from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+from diffusers.utils import BaseOutput, is_torch_xla_available, logging, replace_example_docstring
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.video_processor import VideoProcessor
+from einops import rearrange
+from PIL import Image
+from torch import Tensor
+from diffusers.guiders.adaptive_projected_guidance import MomentumBuffer
+from diffusers.guiders.guider_utils import GuiderOutput
+from ._fm_solvers_unipc import FlowUniPCMultistepScheduler
+from transformers import BatchEncoding, PreTrainedTokenizerBase, SiglipImageProcessor, T5Gemma2Model
+if is_torch_xla_available():  # torch_xla is optional; import it only when diffusers reports it as available
+    import torch_xla.core.xla_model as xm
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False  # `xm` is left undefined in this case
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import MotifVideoPipeline
+        >>> from diffusers.utils import export_to_video
+        >>> # Load the Motif Video pipeline
+        >>> motif_video_model_id = "MotifTechnologies/Motif-Video"
+        >>> pipe = MotifVideoPipeline.from_pretrained(motif_video_model_id, torch_dtype=torch.bfloat16)
+        >>> pipe.to("cuda")
+        >>> prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
+        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
+        >>> video = pipe(
+        ...     prompt=prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     width=640,
+        ...     height=352,
+        ...     num_frames=65,
+        ...     num_inference_steps=50,
+        ... ).frames[0]
+        >>> export_to_video(video, "output.mp4", fps=16)
+        ```
+"""
+@dataclass
+class MotifVideoPipelineOutput(BaseOutput):
+    r"""
+    Output class for Motif Video pipelines.
+    Args:
+        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+            `(batch_size, num_frames, channels, height, width)`.
+    """
+    frames: torch.Tensor
+"""Video-aware Adaptive Projected Guidance (APG).
+Standard APG normalizes over all spatial dimensions [C, T, H, W], which collapses
+temporal variation. This module normalizes over [C, H, W] only, preserving
+per-frame independence.
+"""
+def video_normalized_guidance(
+    pred_cond: torch.Tensor,
+    pred_uncond: torch.Tensor,
+    guidance_scale: float,
+    momentum_buffer: MomentumBuffer | None = None,
+    eta: float = 1.0,
+    norm_threshold: float = 0.0,
+    use_original_formulation: bool = False,
+) -> torch.Tensor:
+    """APG with video-aware normalization: normalize over [C, H, W], exclude T.
+    For 5D input [B, C, T, H, W], dim=[-1, -2, -4] normalizes per-frame (W, H, C),
+    keeping the T dimension independent. For 4D input [B, C, H, W], falls back to
+    standard [-1, -2, -3] behavior.
+    """
+    diff = pred_cond - pred_uncond
+    if len(diff.shape) == 5:
+        # [B, C, T, H, W] → normalize over W(-1), H(-2), C(-4), skip T(-3)
+        dim = [-1, -2, -4]
+    else:
+        # [B, C, H, W] → standard behavior
+        dim = [-i for i in range(1, len(diff.shape))]
+    if momentum_buffer is not None:
+        momentum_buffer.update(diff)
+        diff = momentum_buffer.running_average
+    if norm_threshold > 0:
+        ones = torch.ones_like(diff)
+        diff_norm = diff.norm(p=2, dim=dim, keepdim=True)
+        scale_factor = torch.minimum(ones, norm_threshold / diff_norm)
+        diff = diff * scale_factor
+    v0, v1 = diff.double(), pred_cond.double()
+    v1 = torch.nn.functional.normalize(v1, dim=dim)
+    v0_parallel = (v0 * v1).sum(dim=dim, keepdim=True) * v1
+    v0_orthogonal = v0 - v0_parallel
+    diff_parallel, diff_orthogonal = v0_parallel.type_as(diff), v0_orthogonal.type_as(diff)
+    normalized_update = diff_orthogonal + eta * diff_parallel
+    pred = pred_cond if use_original_formulation else pred_uncond
+    pred = pred + guidance_scale * normalized_update
+    return pred
+class VideoAdaptiveProjectedGuidance(AdaptiveProjectedGuidance):
+    """APG variant that normalizes over [C, H, W] per frame, excluding the T dimension."""
+    def forward(self, pred_cond: torch.Tensor, pred_uncond: torch.Tensor | None = None) -> GuiderOutput:
+        pred = None
+        if not self._is_apg_enabled():
+            pred = pred_cond
+        else:
+            pred = video_normalized_guidance(
+                pred_cond,
+                pred_uncond,
+                self.guidance_scale,
+                self.momentum_buffer,
+                self.eta,
+                self.adaptive_projected_guidance_rescale,
+                self.use_original_formulation,
+            )
+        if self.guidance_rescale > 0.0:
+            from diffusers.guiders.classifier_free_guidance import rescale_noise_cfg
+            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)
+        return GuiderOutput(pred=pred, pred_cond=pred_cond, pred_uncond=pred_uncond)
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.15,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+def get_linear_quadratic_sigmas(
+    num_inference_steps: int,
+    linear_quadratic_emulating_steps: int = 250,
+) -> np.ndarray:
+    """
+    Compute a linear-quadratic sigma schedule for flow matching.
+    This schedule combines:
+    - First half: Linear interpolation from high noise to medium noise (slow denoising)
+    - Second half: Quadratic interpolation from medium noise to clean (faster denoising)
+    Convention:
+    - sigma=1.0 represents pure noise
+    - sigma=0.0 represents clean image
+    - Output sigmas are in descending order (1.0 → ~0)
+    Args:
+        num_inference_steps: Total number of denoising steps (must be even).
+        linear_quadratic_emulating_steps: Controls the slope of linear interpolation.
+            Higher values result in gentler slope in the first half.
+    Returns:
+        np.ndarray: Array of sigma values with shape (num_inference_steps,).
+            The scheduler will append a terminal 0.
+    Raises:
+        ValueError: If num_inference_steps is not even.
+    Reference:
+        Linear-quadratic timestep schedule for improved flow matching inference.
+    """
+    if num_inference_steps % 2 != 0:
+        raise ValueError(
+            f"num_inference_steps must be even for linear-quadratic schedule, but got {num_inference_steps}"
+        )
+    steps = num_inference_steps
+    N = linear_quadratic_emulating_steps
+    half_steps = steps // 2
+    # First half: linear interpolation from 1 toward 0
+    # Takes first half_steps values from linspace(1, 0, N+1)
+    linear_part = np.linspace(1.0, 0.0, N + 1)[:half_steps]
+    # Second half: quadratic interpolation
+    # Formula: x^2 * (half_steps/N - 1) - (half_steps/N - 1)
+    #        = (half_steps/N - 1) * (x^2 - 1)
+    # This maps x=0 to (half_steps/N - 1) * (-1) = 1 - half_steps/N
+    # and maps x=1 to 0
+    x = np.linspace(0.0, 1.0, half_steps + 1)
+    scale_factor = half_steps / N - 1  # negative value
+    quadratic_part = x**2 * scale_factor - scale_factor
+    # Concatenate and exclude the last 0 (scheduler appends terminal 0)
+    sigmas = np.concatenate([linear_part, quadratic_part])
+    sigmas = sigmas[:-1]  # Remove trailing 0, scheduler will append it
+    return sigmas.astype(np.float32)
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    use_linear_quadratic_schedule: bool = False,
+    linear_quadratic_emulating_steps: int = 250,
+    **kwargs,
+):
+    """
+    Retrieve timesteps from the scheduler.
+    Args:
+        scheduler: The noise scheduler to use.
+        num_inference_steps: Number of denoising steps.
+        device: Device to place timesteps on.
+        timesteps: Custom timestep values (mutually exclusive with sigmas).
+        sigmas: Custom sigma values (mutually exclusive with timesteps).
+        use_linear_quadratic_schedule: If True, use linear-quadratic sigma schedule.
+            This overrides the default linear schedule. Requires num_inference_steps
+            to be even.
+        linear_quadratic_emulating_steps: Controls the linear portion slope.
+            Higher values result in gentler slope in the first half. Default: 250.
+        **kwargs: Additional arguments passed to scheduler.set_timesteps().
+    Returns:
+        Tuple of (timesteps, num_inference_steps).
+    Raises:
+        ValueError: If both timesteps and sigmas are provided, or if
+            use_linear_quadratic_schedule is True but num_inference_steps is odd.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    # Handle linear-quadratic schedule: compute sigmas if flag is set
+    if use_linear_quadratic_schedule:
+        if sigmas is not None:
+            raise ValueError(
+                "Cannot use both `sigmas` and `use_linear_quadratic_schedule`. "
+                "The linear-quadratic schedule computes sigmas automatically."
+            )
+        if num_inference_steps is None:
+            raise ValueError("`num_inference_steps` must be provided when using `use_linear_quadratic_schedule`.")
+        sigmas = get_linear_quadratic_sigmas(
+            num_inference_steps=num_inference_steps,
+            linear_quadratic_emulating_steps=linear_quadratic_emulating_steps,
+        )
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+def basic_clean(text):
+    text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+def whitespace_clean(text):
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+def prompt_clean(text):
+    text = whitespace_clean(basic_clean(text))
+    return text
+class MotifVideoPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-video generation using MotifVideoTransformer.
+    Args:
+        transformer ([`MotifVideoTransformer3DModel`]):
+            Conditional Transformer architecture to denoise the encoded video latents.
+        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
+        vae ([`AutoencoderKLWan`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
+        text_encoder ([`T5Gemma2Model`]):
+            Primary text encoder for encoding text prompts into embeddings.
+        tokenizer ([`PreTrainedTokenizerBase`]):
+            Tokenizer corresponding to the primary text encoder.
+        guider ([`ClassifierFreeGuidance`] or [`SkipLayerGuidance`] or [`AdaptiveProjectedGuidance`] or [`VideoAdaptiveProjectedGuidance`], *optional*):
+            The guidance method to use. If `None`, it defaults to `ClassifierFreeGuidance()`.
+    """
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _optional_components = ["feature_extractor"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    def __init__(
+        self,
+        scheduler: Union[
+            FlowMatchEulerDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+            UniPCMultistepScheduler,
+            FlowUniPCMultistepScheduler,
+        ],
+        vae: AutoencoderKLWan,
+        text_encoder: T5Gemma2Model,
+        tokenizer: PreTrainedTokenizerBase,
+        transformer,
+        guider: Optional[
+            Union[ClassifierFreeGuidance, SkipLayerGuidance, AdaptiveProjectedGuidance, VideoAdaptiveProjectedGuidance]
+        ] = None,
+        feature_extractor: Optional[SiglipImageProcessor] = None,
+    ):
+        super().__init__()
+        self.guider = ClassifierFreeGuidance() if guider is None else guider
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
+        self.transformer_spatial_patch_size = (
+            self.transformer.config.patch_size if getattr(self, "transformer", None) is not None else 2
+        )
+        self.transformer_temporal_patch_size = (
+            self.transformer.config.patch_size_t if getattr(self, "transformer") is not None else 1
+        )
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+        self.tokenizer_max_length = (
+            self.tokenizer.model_max_length if getattr(self, "tokenizer", None) is not None else 512
+        )
+    def _get_default_embeds(
+        self,
+        text_encoder,
+        tokenizer: PreTrainedTokenizerBase,
+        prompt: Union[str, List[str]],
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        dtype = dtype or text_encoder.dtype
+        text_inputs = tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_attention_mask=True,
+            return_tensors="pt",
+        )
+        text_inputs = BatchEncoding(
+            {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in text_inputs.items()}
+        )
+        prompt_embeds = text_encoder(**text_inputs)[0]
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        return prompt_embeds, text_inputs.attention_mask
+    def _average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
+        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+        denom = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)  # avoid div by zero
+        return last_hidden.sum(dim=1) / denom
+    def _get_prompt_embeds(
+        self,
+        text_encoder: T5Gemma2Model,
+        tokenizer: PreTrainedTokenizerBase,
+        prompt: Union[str, List[str]] | None = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        device = device or self._execution_device
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        prompt_embeds_kwargs = {
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+            "prompt": prompt,
+            "max_sequence_length": max_sequence_length,
+            "device": device,
+            "dtype": dtype,
+        }
+        # T5Gemma2Model bundles encoder and decoder/LM head, while _get_default_embeds expects an encoder-only model
+        # (similar to T5EncoderModel/T5GemmaEncoderModel), so we pass the encoder submodule explicitly here.
+        if isinstance(text_encoder, T5Gemma2Model):
+            prompt_embeds_kwargs["text_encoder"] = text_encoder.encoder
+        prompt_embeds, prompt_attention_mask = self._get_default_embeds(**prompt_embeds_kwargs)
+        pooled_prompt_embeds = self._average_pool(prompt_embeds, prompt_attention_mask)
+        return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds
+    # Keep encode_prompt structure, uses _get_prompt_embeds internally
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 512,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        device = device or self._execution_device
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        prompt_embeds_kwargs = {
+            "device": device,
+            "dtype": dtype,
+        }
+        if prompt_embeds is None:
+            prompt_embeds, prompt_attention_mask, pooled_prompt_embeds = self._get_prompt_embeds(
+                text_encoder=self.text_encoder,
+                tokenizer=self.tokenizer,
+                prompt=prompt,
+                max_sequence_length=max_sequence_length,
+                **prompt_embeds_kwargs,
+            )
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        seq_len = prompt_embeds.shape[1]
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+        if pooled_prompt_embeds is not None:
+            pooled_prompt_embeds = pooled_prompt_embeds.repeat_interleave(num_videos_per_prompt, dim=0)
+        # Keep attention mask handling
+        prompt_attention_mask = prompt_attention_mask.bool()
+        prompt_attention_mask = prompt_attention_mask.view(batch_size, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat_interleave(num_videos_per_prompt, dim=0)
+        return (
+            prompt_embeds,
+            pooled_prompt_embeds,
+            prompt_attention_mask,
+        )
+    @property
+    def vision_encoder(self):
+        """Get the vision encoder from T5Gemma2.
+        T5Gemma2 has vision_tower.vision_model structure.
+        Will raise AttributeError if not available.
+        """
+        return self.text_encoder.encoder.vision_tower.vision_model
+    def encode_image(
+        self,
+        image: Image.Image,
+        batch_size: int = 1,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> torch.Tensor:
+        """Encode image to embeddings using SigLIP vision encoder."""
+        device = device or self._execution_device
+        dtype = dtype or self.transformer.dtype
+        image_embeds = self._get_image_embeds(
+            image_encoder=self.vision_encoder,
+            feature_extractor=self.feature_extractor,
+            image=image,
+            device=device,
+        )
+        image_embeds = image_embeds.repeat(batch_size, 1, 1)
+        return image_embeds.to(device=device, dtype=dtype)
+    @staticmethod
+    def _get_image_embeds(
+        image_encoder,
+        feature_extractor: SiglipImageProcessor,
+        image,
+        device: torch.device,
+    ) -> torch.Tensor:
+        """Helper to encode single image with SigLIP.
+        Args:
+            image_encoder: The SigLIP vision encoder model.
+            feature_extractor: SiglipImageProcessor for preprocessing.
+            image: Can be either:
+                - PIL.Image.Image: Will be preprocessed by feature_extractor
+                - torch.Tensor: Assumed to be in [0, 1] range, will be normalized and passed to encoder
+            device: Device to place tensors on.
+        Returns:
+            Image embeddings from the vision encoder.
+        """
+        image_encoder_dtype = next(image_encoder.parameters()).dtype
+        if isinstance(image, torch.Tensor):
+            image = feature_extractor.preprocess(
+                images=image.float(),
+                do_resize=True,
+                do_rescale=False,
+                do_normalize=True,
+                do_convert_rgb=True,
+                return_tensors="pt",
+            )
+        else:
+            image = feature_extractor.preprocess(
+                images=image,
+                do_resize=True,
+                do_rescale=False,
+                do_normalize=True,
+                do_convert_rgb=True,
+                return_tensors="pt",
+            )
+        image = image.to(device, dtype=image_encoder_dtype)
+        return image_encoder(**image).last_hidden_state
+    @torch.compiler.disable
+    def _prepare_first_frame_conditioning(
+        self,
+        video: torch.Tensor,
+        latents: torch.Tensor,
+        use_conditioning: bool,
+        generator: Optional[torch.Generator] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
+        """Prepare first frame conditioning tensors.
+        This method implements batch-level conditioning where entire
+        batches are either I2V (all samples conditioned) or T2V (no conditioning). This
+        prevents mode confusion within batches.
+        For I2V mode:
+        1. Extract and VAE-encode first frame from video
+        2. Create latent_condition by repeating first frame across time (frame 0 only)
+        3. Create latent_mask with 1.0 at frame 0
+        4. Get image_embeds from vision encoder
+        For T2V mode:
+        1. Pad with zeros for latent_condition and latent_mask
+        Args:
+            video: Input video tensor [batch_size, frames, channels, height, width] in [-1, 1]
+            latents: Latents [batch_size, lantent_channels, latent_num_frames, latent_height, latent_width]
+            use_conditioning: Whether to use first-frame conditioning (True for I2V, False for T2V)
+            generator: Optional random number generator for reproducibility
+        Returns:
+            Tuple of (latent_condition, latent_mask, image_embeds).
+            - latent_condition: [B, C, F, H, W] conditioning signal (zeros for T2V)
+            - latent_mask: [B, 1, F, H, W] binary mask (zeros for T2V)
+            - image_embeds: [B, N, D] image embeddings from vision encoder or None for T2V
+        """
+        batch_size, lantent_channels, latent_num_frames, latent_height, latent_width = latents.shape
+        device = latents.device
+        dtype = latents.dtype
+        # Determine if we should use conditioning
+        use_conditioning = use_conditioning and (latent_num_frames > 1)
+        # Initialize conditioning tensors
+        latent_condition = torch.zeros(
+            batch_size, lantent_channels, latent_num_frames, latent_height, latent_width, device=device, dtype=dtype
+        )
+        latent_mask = torch.zeros(
+            batch_size, 1, latent_num_frames, latent_height, latent_width, device=device, dtype=dtype
+        )
+        image_embeds = None
+        if use_conditioning:
+            with torch.no_grad():
+                # Encode first frame for latent_condition
+                first_frame_latents = self.vae.encode(
+                    rearrange(video[:, 0:1], "b f c h w -> b c f h w")
+                ).latent_dist.sample(generator=generator)
+            first_frame_latents = self._normalize_latents(
+                latents=first_frame_latents,
+                latents_mean=self.vae.config.latents_mean,
+                latents_std=self.vae.config.latents_std,
+            )
+            # Create latent_condition by repeating first frame across time
+            latent_condition = first_frame_latents.repeat(1, 1, latent_num_frames, 1, 1)
+            latent_condition[:, :, 1:, :, :] = 0
+            # latent_mask: 1.0 at frame 0, 0.0 elsewhere
+            latent_mask[:, :, 0] = 1.0
+            # image_embeds from vision encoder
+            first_frame_vision = video[:, 0]  # [B, C, H, W]
+            first_frame_vision = ((first_frame_vision + 1) / 2).clamp(0, 1)
+            with torch.no_grad():
+                image_embeds = self._get_image_embeds(
+                    image_encoder=self.vision_encoder,
+                    feature_extractor=self.feature_extractor,
+                    image=first_frame_vision,
+                    device=device,
+                )
+        return latent_condition, latent_mask, image_embeds
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        height,
+        width,
+        batch_size,
+        callback_on_step_end_tensor_inputs=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        # Resolution must be divisible by VAE scale factor * transformer patch size
+        # (e.g. 8 * 2 = 16 for default config) to avoid latent/patch dimension mismatch.
+        spatial_divisor = self.vae_scale_factor_spatial * self.transformer_spatial_patch_size
+        if height % spatial_divisor != 0 or width % spatial_divisor != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {spatial_divisor} "
+                f"(vae_scale={self.vae_scale_factor_spatial} * patch_size={self.transformer_spatial_patch_size}) "
+                f"but are {height} and {width}."
+            )
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        # Validate negative_prompt: must be None, str, or list with matching batch_size
+        if negative_prompt is not None:
+            if not isinstance(negative_prompt, (str, list)):
+                raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+            if isinstance(negative_prompt, list) and len(negative_prompt) != batch_size:
+                raise ValueError(
+                    f"`negative_prompt` list length ({len(negative_prompt)}) must match batch_size ({batch_size})."
+                )
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+    def _prepare_negative_prompt(
+        self,
+        negative_prompt: Optional[Union[str, List[str]]],
+        batch_size: int,
+    ) -> List[str]:
+        """
+        Prepare negative_prompt to match batch_size.
+        Args:
+            negative_prompt: None, a single string, or a list of strings matching batch_size.
+            batch_size: The number of prompts in the batch.
+        Returns:
+            A list of negative prompts with length equal to batch_size.
+        """
+        if negative_prompt is None:
+            return [""] * batch_size
+        if isinstance(negative_prompt, str):
+            return [negative_prompt] * batch_size
+        return negative_prompt
+    @staticmethod
+    def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor:
+        batch_size, num_channels, num_frames, height, width = latents.shape
+        post_patch_num_frames = num_frames // patch_size_t
+        post_patch_height = height // patch_size
+        post_patch_width = width // patch_size
+        latents = latents.reshape(
+            batch_size,
+            -1,
+            post_patch_num_frames,
+            patch_size_t,
+            post_patch_height,
+            patch_size,
+            post_patch_width,
+            patch_size,
+        )
+        latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3)
+        return latents
+    @staticmethod
+    def _unpack_latents(
+        latents: torch.Tensor,
+        num_frames: int,
+        height: int,
+        width: int,
+        patch_size: int = 1,
+        patch_size_t: int = 1,
+    ) -> torch.Tensor:
+        batch_size = latents.size(0)
+        latents = latents.reshape(
+            batch_size,
+            num_frames,
+            height,
+            width,
+            -1,
+            patch_size_t,
+            patch_size,
+            patch_size,
+        )
+        latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3)
+        return latents
+    @staticmethod
+    def _normalize_latents(
+        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor
+    ) -> torch.Tensor:
+        # Normalize latents across the channel dimension [B, C, F, H, W]
+        latents_mean = torch.tensor(latents_mean).view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents_std = torch.tensor(latents_std).view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents = (latents - latents_mean) / latents_std
+        return latents
+    @staticmethod
+    def _denormalize_latents(
+        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor
+    ) -> torch.Tensor:
+        # Denormalize latents across the channel dimension [B, C, F, H, W]
+        latents_mean = torch.tensor(latents_mean).view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents_std = torch.tensor(latents_std).view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
+        latents = latents * latents_std + latents_mean
+        return latents
+    def prepare_latents(
+        self,
+        batch_size: int = 1,
+        num_channels_latents: int = 16,
+        height: int = 352,
+        width: int = 640,
+        num_frames: int = 65,
+        dtype: Optional[torch.dtype] = None,
+        device: Optional[torch.device] = None,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            (num_frames - 1) // self.vae_scale_factor_temporal + 1,
+            height // self.vae_scale_factor_spatial,
+            width // self.vae_scale_factor_spatial,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        return latents
+    @property
+    def num_timesteps(self):
+        # Read-only view of `self._num_timesteps`; the attribute is assigned outside this
+        # excerpt (presumably during `__call__` — TODO confirm).
+        return self._num_timesteps
+    @property
+    def current_timestep(self):
+        # Read-only view of `self._current_timestep`; the attribute is assigned outside this
+        # excerpt (presumably inside the denoising loop — TODO confirm).
+        return self._current_timestep
+    @property
+    def attention_kwargs(self):
+        # Read-only view of `self._attention_kwargs`; presumably the `attention_kwargs`
+        # argument of `__call__` — TODO confirm where it is stored.
+        return self._attention_kwargs
+    @property
+    def interrupt(self):
+        # Read-only view of `self._interrupt`; how the flag is set and consumed is not
+        # visible in this excerpt.
+        return self._interrupt
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] | None = None,
+        image=None,
+        negative_prompt: Optional[Union[str, List[str]]] = "text overlay, graphic overlay, watermark, logo, subtitles, timestamp, broadcast graphics, UI elements, random letters, frozen pose, rigid, static expression, jerky motion, mechanical motion, discontinuous motion, flat framing, depthless, dull lighting, monotone, crushed shadows, blown-out highlights, shifting background, fading background, poor continuity, identity drift, deformation, flickering, ghosting, smearing, duplication, mutated proportions, inconsistent clothing, flat colors, desaturated, tonally compressed, poor background separation, exposure shift, uneven brightness, color balance shift",
+        height: int = 736,
+        width: int = 1280,
+        num_frames: int = 121,
+        frame_rate: int = 24,
+        num_inference_steps: int = 50,
+        timesteps: List[int] | None = None,
+        use_linear_quadratic_schedule: bool = False,
+        linear_quadratic_emulating_steps: int = 250,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+        use_attention_mask: bool = True,
+        vae_batch_size: int | None = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, one has to pass
+                `prompt_embeds` instead.
+            image (`PIL.Image.Image` or `torch.Tensor`, *optional*):
+                Conditioning image for image-to-video generation. A PIL image is converted to RGB, resized to
+                `(width, height)` with LANCZOS resampling and rescaled from `[0, 1]` to `[-1, 1]`. Any other
+                value must already be a tensor; 3-D (`[C, H, W]`) and 4-D (`[B, C, H, W]`) tensors are expanded
+                to `[B, 1, C, H, W]`. When `None`, first-frame conditioning is prepared with
+                `use_conditioning=False`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. Defaults to a built-in list of common
+                video artifacts. Only encoded when the guider is enabled; `negative_prompt_embeds` can be passed
+                instead.
+            height (`int`, defaults to `736`):
+                The height in pixels of the generated video.
+            width (`int`, defaults to `1280`):
+                The width in pixels of the generated video.
+            num_frames (`int`, defaults to `121`):
+                The number of video frames to generate.
+            frame_rate (`int`, defaults to `24`):
+                Frame rate for the output video. NOTE(review): this argument is not read anywhere in the body of
+                this method.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. Forwarded to `retrieve_timesteps`.
+            use_linear_quadratic_schedule (`bool`, defaults to `False`):
+                Whether to use a linear-quadratic sigma schedule instead of the default linear schedule.
+                This schedule combines linear interpolation in the first half (slow denoising at high noise)
+                with quadratic interpolation in the second half (faster denoising toward clean image).
+                Requires `num_inference_steps` to be even. Not forwarded when the scheduler is a
+                `DPMSolverMultistepScheduler`, `UniPCMultistepScheduler` or `FlowUniPCMultistepScheduler`.
+            linear_quadratic_emulating_steps (`int`, defaults to `250`):
+                Controls the slope of linear interpolation in the first half of the linear-quadratic schedule.
+                Higher values result in a gentler slope. Only used when `use_linear_quadratic_schedule=True`.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                PyTorch Generator object(s) for deterministic generation.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
+                If not provided, pooled text embeddings will be generated from `prompt` input argument.
+            prompt_attention_mask (`torch.Tensor`, *optional*):
+                Pre-generated attention mask for text embeddings.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
+                input argument.
+            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
+                Pre-generated attention mask for negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format ("pil" or "np"). With `"latent"` the denoised latents are returned as-is, without
+                denormalization or VAE decoding.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a `MotifVideoPipelineOutput`.
+            attention_kwargs (`dict`, *optional*):
+                Arguments passed to the attention processor.
+            callback_on_step_end (`Callable`, *optional*):
+                Callback function called at the end of each step as
+                `callback_on_step_end(self, i, t, callback_kwargs)`. The returned dict may override `latents`,
+                `prompt_embeds` and `negative_prompt_embeds`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                Names of local tensors to include in the callback. Replaced by the callback's own `tensor_inputs`
+                when a `PipelineCallback` or `MultiPipelineCallbacks` instance is passed.
+            max_sequence_length (`int` defaults to `512`):
+                Maximum sequence length for the tokenizer.
+            use_attention_mask (`bool`, defaults to `True`):
+                Whether to pass the prompt attention masks to the transformer as `encoder_attention_mask`.
+            vae_batch_size (`int`, *optional*):
+                When set and the latent batch is larger than this value, the VAE decodes the latents in chunks of
+                at most `vae_batch_size` samples and the results are concatenated along the batch dimension.
+        Examples:
+        Returns:
+            [`~pipelines.motif_video.MotifVideoPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, returns [`~pipelines.motif_video.MotifVideoPipelineOutput`],
+                otherwise returns a tuple where the first element is a list of generated video frames.
+        """
+        # Callback objects declare which tensors they need; that overrides the caller-supplied list.
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+        # 1. Define call parameters (batch_size needed for check_inputs)
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            # NOTE(review): this runs before `check_inputs`, so passing neither `prompt` nor
+            # `prompt_embeds` fails here with an AttributeError instead of a validation error.
+            batch_size = prompt_embeds.shape[0]
+        # 2. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            batch_size=batch_size,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+        )
+        # Reset the per-call state exposed through the read-only properties.
+        self._attention_kwargs = attention_kwargs
+        self._interrupt = False
+        self._current_timestep = None
+        # Auto-upgrade AdaptiveProjectedGuidance to VideoAdaptiveProjectedGuidance
+        # for video generation. Video-aware APG normalizes per-frame [C,H,W] instead
+        # of collapsing the temporal axis, preserving motion quality.
+        # The exact-type check leaves subclasses untouched; the replacement is stored on
+        # `self.guider`, so it persists after this call returns.
+        if type(self.guider) is AdaptiveProjectedGuidance:
+            self.guider = VideoAdaptiveProjectedGuidance(
+                guidance_scale=self.guider.guidance_scale,
+                adaptive_projected_guidance_rescale=self.guider.adaptive_projected_guidance_rescale,
+                adaptive_projected_guidance_momentum=self.guider.adaptive_projected_guidance_momentum,
+                eta=self.guider.eta,
+                use_original_formulation=self.guider.use_original_formulation,
+            )
+        device = self._execution_device
+        # 3. Prepare text embeddings
+        prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
+            prompt=prompt,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+        # Negative embeddings are only computed when the guider is enabled; otherwise the
+        # `negative_*` values stay exactly as the caller passed them (possibly `None`).
+        if self.guider._enabled:
+            negative_prompt = self._prepare_negative_prompt(negative_prompt, batch_size)
+            negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask = self.encode_prompt(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                prompt_embeds=negative_prompt_embeds,
+                pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                prompt_attention_mask=negative_prompt_attention_mask,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+        # 4. Prepare latent variables
+        num_channels_latents = self.vae.config.z_dim
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            num_frames,
+            self.transformer.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # 4.5 Preprocess image for I2V conditioning
+        if image is not None:
+            from PIL import Image as PILImage
+            if isinstance(image, PILImage.Image):
+                # PIL's resize takes (width, height); the array is HWC and is permuted to CHW.
+                image = image.convert("RGB").resize((width, height), PILImage.LANCZOS)
+                image = torch.from_numpy(np.array(image)).permute(2, 0, 1).float() / 255.0
+                image = image * 2.0 - 1.0  # [0,1] -> [-1,1]
+                image = image.unsqueeze(0)  # [1, C, H, W]
+            # Handle [C, H, W] -> [1, C, H, W]
+            # (non-PIL inputs must expose `.dim()`, i.e. already be tensors)
+            if image.dim() == 3:
+                image = image.unsqueeze(0)
+            # [B, C, H, W] -> [B, 1, C, H, W] for video format
+            if image.dim() == 4:
+                image = image.unsqueeze(1)
+            image = image.to(device=device, dtype=self.vae.dtype)
+        # 5. Prepare timesteps (including mu calculation)
+        # Recalculate latent dims based on VAE for mu calculation
+        latent_height = height // self.vae_scale_factor_spatial
+        latent_width = width // self.vae_scale_factor_spatial
+        latent_num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        # Calculate sequence length based on *packed* dimensions if transformer uses packing
+        # Packed dims: H/patch, W/patch, F/patch_t
+        packed_latent_height = latent_height // self.transformer_spatial_patch_size
+        packed_latent_width = latent_width // self.transformer_spatial_patch_size
+        packed_latent_num_frames = latent_num_frames // self.transformer_temporal_patch_size
+        video_sequence_length = packed_latent_num_frames * packed_latent_height * packed_latent_width
+        # Compute sigmas: use linear-quadratic schedule if enabled, otherwise default linear
+        _is_flow_multistep = isinstance(
+            self.scheduler,
+            (DPMSolverMultistepScheduler, UniPCMultistepScheduler, FlowUniPCMultistepScheduler),
+        )
+        # Compute mu once, shared by both branches (required by FlowUniPCMultistepScheduler)
+        # The literals are fallbacks used only when the scheduler config lacks the key.
+        mu = calculate_shift(
+            video_sequence_length,
+            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("max_image_seq_len", 4096),
+            self.scheduler.config.get("base_shift", 0.5),
+            self.scheduler.config.get("max_shift", 1.15),
+        )
+        if _is_flow_multistep:
+            # DPMSolver/UniPC manage their own sigma schedule via use_flow_sigmas + flow_shift.
+            # Pass mu for dynamic shifting support (required by FlowUniPCMultistepScheduler).
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler,
+                num_inference_steps,
+                device,
+                timesteps,
+                mu=mu,
+            )
+        else:
+            if use_linear_quadratic_schedule:
+                # Linear-quadratic schedule computes sigmas internally in retrieve_timesteps
+                sigmas = None
+            else:
+                # Linear schedule from 1.0 down to 1/num_inference_steps (inclusive).
+                sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler,
+                num_inference_steps,
+                device,
+                timesteps,
+                sigmas=sigmas,
+                use_linear_quadratic_schedule=use_linear_quadratic_schedule,
+                linear_quadratic_emulating_steps=linear_quadratic_emulating_steps,
+                mu=mu,
+            )
+        # Get conditioning tensors
+        latent_condition, latent_mask, image_embeds = self._prepare_first_frame_conditioning(
+            image,
+            latents,
+            use_conditioning=image is not None,
+            generator=generator,
+        )
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+        # 6. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # Once interrupted, every remaining iteration is skipped without stepping the scheduler.
+                if self.interrupt:
+                    continue
+                self._current_timestep = t
+                # Concatenate current latents with conditioning for this timestep
+                # [latents | latent_condition | latent_mask]
+                hidden_states = torch.cat([latents, latent_condition, latent_mask], dim=1)
+                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+                timestep = t.expand(latents.shape[0])
+                # Step 1: Collect model inputs needed for the guidance method
+                # conditional inputs should always be first element in the tuple
+                guider_inputs = {
+                    "encoder_hidden_states": (prompt_embeds, negative_prompt_embeds),
+                }
+                if use_attention_mask:
+                    guider_inputs["encoder_attention_mask"] = (prompt_attention_mask, negative_prompt_attention_mask)
+                if self.transformer.config.pooled_projection_dim is not None:
+                    guider_inputs["pooled_projections"] = (pooled_prompt_embeds, negative_pooled_prompt_embeds)
+                # The same image embeddings are supplied for both entries of the tuple.
+                if image_embeds is not None:
+                    guider_inputs["image_embeds"] = (image_embeds, image_embeds)
+                # Step 2: Update guider's internal state for this denoising step
+                self.guider.set_state(step=i, num_inference_steps=num_inference_steps, timestep=t)
+                # Sigma injection for guiders that support sigma-based gating
+                # (Kynkäänniemi 2024). Must precede `prepare_inputs` because
+                # `num_conditions` → `_is_cfg_enabled()` reads `_current_sigma`.
+                # Duck-typed so diffusers-native guiders are unaffected; guard
+                # on scheduler too since some schedulers don't expose `sigmas`.
+                if hasattr(self.guider, "_current_sigma") and hasattr(self.scheduler, "sigmas"):
+                    self.guider._current_sigma = float(self.scheduler.sigmas[i])
+                # Step 3: Prepare batched model inputs based on the guidance method
+                # The guider splits model inputs into separate batches for conditional/unconditional predictions.
+                # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
+                # you will get a guider_state with two batches:
+                #   guider_state = [
+                #       {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},      # conditional batch
+                #       {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
+                #   ]
+                # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
+                guider_state = self.guider.prepare_inputs(guider_inputs)
+                # Step 4: Run the denoiser for each batch
+                # Each batch in guider_state represents a different conditioning (conditional, unconditional, etc.).
+                # We run the model once per batch and store the noise prediction in guider_state_batch.noise_pred.
+                for guider_state_batch in guider_state:
+                    self.guider.prepare_models(self.transformer)
+                    # Extract conditioning kwargs for this batch (e.g., encoder_hidden_states)
+                    cond_kwargs = {
+                        input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()
+                    }
+                    # Optional guider attribute; guiders without it leave TREAD enabled (False).
+                    tread_disabled = getattr(self.guider, "_current_tread_disabled", False)
+                    # Override TREAD selection ratio per batch if the guider provides one
+                    selection_ratio = getattr(self.guider, "_current_selection_ratio", None)
+                    tread_mixin = getattr(self.transformer, "_inference_tread_mixin", None)
+                    if (
+                        selection_ratio is not None
+                        and tread_mixin is not None
+                        and tread_mixin._tread_route is not None
+                    ):
+                        tread_mixin._tread_route["sel"] = selection_ratio
+                    # e.g. "pred_cond"/"pred_uncond"
+                    context_name = getattr(guider_state_batch, self.guider._identifier_key)
+                    with self.transformer.cache_context(context_name):
+                        # Run denoiser and store noise prediction in this batch
+                        noise_pred = self.transformer(
+                            hidden_states=hidden_states,
+                            timestep=timestep,
+                            attention_kwargs=self.attention_kwargs,
+                            return_dict=False,
+                            tread_disabled=tread_disabled,
+                            **cond_kwargs,
+                        )[0].clone()
+                        guider_state_batch.noise_pred = noise_pred
+                    # Cleanup model (e.g., remove hooks)
+                    self.guider.cleanup_models(self.transformer)
+                # Step 5: Combine predictions using the guidance method
+                # The guider takes all noise predictions from guider_state and combines them according to the guidance algorithm.
+                # Continuing the CFG example, the guider receives:
+                #   guider_state = [
+                #       {"encoder_hidden_states": prompt_embeds, "noise_pred": noise_pred_cond, "__guidance_identifier__": "pred_cond"},      # batch 0
+                #       {"encoder_hidden_states": negative_prompt_embeds, "noise_pred": noise_pred_uncond, "__guidance_identifier__": "pred_uncond"},  # batch 1
+                #   ]
+                # And extracts predictions using the __guidance_identifier__:
+                #   pred_cond = guider_state[0]["noise_pred"]      # extracts noise_pred_cond
+                #   pred_uncond = guider_state[1]["noise_pred"]    # extracts noise_pred_uncond
+                # Then applies CFG formula:
+                #   noise_pred = pred_uncond + guidance_scale * (pred_cond - pred_uncond)
+                # Returns GuiderOutput(pred=noise_pred, pred_cond=pred_cond, pred_uncond=pred_uncond)
+                noise_pred = self.guider(guider_state)[0]
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                # call the callback, if provided
+                if callback_on_step_end is not None:
+                    # Requested tensors are looked up by name among this method's local variables.
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    # Handle negative embeds if needed by callback
+                    if "negative_prompt_embeds" in callback_outputs:
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds")
+                # Advance the progress bar on the final timestep, or once past the warmup steps on
+                # every `scheduler.order`-th iteration.
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+        self._current_timestep = None
+        # 7. Decode latents unless the caller asked for raw latents
+        if output_type == "latent":
+            video = latents
+        else:
+            latents = latents.to(self.vae.dtype)
+            latents = self._denormalize_latents(latents, self.vae.config.latents_mean, self.vae.config.latents_std)
+            if vae_batch_size is not None and latents.shape[0] > vae_batch_size:
+                # Decode in batch-dimension chunks of at most `vae_batch_size`, then stitch them together.
+                video_chunks = []
+                for i in range(0, latents.shape[0], vae_batch_size):
+                    chunk = latents[i : i + vae_batch_size]
+                    video_chunks.append(self.vae.decode(chunk, return_dict=False)[0])
+                video = torch.cat(video_chunks, dim=0)
+                del video_chunks
+            else:
+                video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        # Offload all models
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (video,)
+        # Wrap the result in `MotifVideoPipelineOutput`
+        return MotifVideoPipelineOutput(frames=video)

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "_class_name": "FlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.36.0",
+  "base_image_seq_len": 256,
+  "base_shift": 0.5,
+  "invert_sigmas": false,
+  "max_image_seq_len": 4096,
+  "max_shift": 1.15,
+  "num_train_timesteps": 1000,
+  "shift": 15.0,
+  "shift_terminal": null,
+  "stochastic_sampling": false,
+  "time_shift_type": "exponential",
+  "use_beta_sigmas": false,
+  "use_dynamic_shifting": false,
+  "use_exponential_sigmas": false,
+  "use_karras_sigmas": false
+}

text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,252 @@

+{
+  "architectures": [
+    "T5Gemma2Model"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "classifier_dropout_rate": 0.0,
+  "decoder": {
+    "_sliding_window_pattern": 6,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_logit_softcapping": null,
+    "dropout_rate": 0.0,
+    "dtype": "bfloat16",
+    "final_logit_softcapping": null,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 10240,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "t5gemma2_decoder",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 34,
+    "num_key_value_heads": 4,
+    "query_pre_attn_scalar": 256,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "full_attention": {
+        "factor": 8.0,
+        "rope_theta": 1000000,
+        "rope_type": "linear"
+      },
+      "sliding_attention": {
+        "rope_theta": 10000,
+        "rope_type": "default"
+      }
+    },
+    "sliding_window": 1024,
+    "use_bidirectional_attention": false,
+    "use_cache": true,
+    "vocab_size": 262144
+  },
+  "dropout_rate": 0.0,
+  "dtype": "bfloat16",
+  "encoder": {
+    "attention_dropout": 0.0,
+    "boi_token_index": 255999,
+    "dropout_rate": 0.0,
+    "dtype": "bfloat16",
+    "eoi_token_index": 256000,
+    "image_token_index": 256001,
+    "initializer_range": 0.02,
+    "mm_tokens_per_image": 256,
+    "model_type": "t5gemma2_encoder",
+    "text_config": {
+      "_name_or_path": "",
+      "_sliding_window_pattern": 6,
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_bias": false,
+      "attention_dropout": 0.0,
+      "attn_logit_softcapping": null,
+      "bos_token_id": 2,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "dropout_rate": 0.0,
+      "dtype": "bfloat16",
+      "eos_token_id": 1,
+      "final_logit_softcapping": null,
+      "finetuning_task": null,
+      "head_dim": 256,
+      "hidden_activation": "gelu_pytorch_tanh",
+      "hidden_size": 2560,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "initializer_range": 0.02,
+      "intermediate_size": 10240,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "layer_types": [
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention",
+        "sliding_attention"
+      ],
+      "max_position_embeddings": 131072,
+      "model_type": "t5gemma2_text",
+      "num_attention_heads": 8,
+      "num_hidden_layers": 34,
+      "num_key_value_heads": 4,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "pad_token_id": 0,
+      "prefix": null,
+      "problem_type": null,
+      "query_pre_attn_scalar": 256,
+      "return_dict": true,
+      "rms_norm_eps": 1e-06,
+      "rope_parameters": {
+        "full_attention": {
+          "factor": 8.0,
+          "rope_theta": 1000000,
+          "rope_type": "linear"
+        },
+        "sliding_attention": {
+          "rope_theta": 10000,
+          "rope_type": "default"
+        }
+      },
+      "sep_token_id": null,
+      "sliding_window": 1024,
+      "task_specific_params": null,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "use_bidirectional_attention": false,
+      "use_cache": true,
+      "vocab_size": 262144
+    },
+    "vision_config": {
+      "_name_or_path": "",
+      "add_cross_attention": false,
+      "architectures": null,
+      "attention_dropout": 0.0,
+      "bos_token_id": null,
+      "chunk_size_feed_forward": 0,
+      "cross_attention_hidden_size": null,
+      "decoder_start_token_id": null,
+      "dropout_rate": 0.0,
+      "dtype": "bfloat16",
+      "eos_token_id": null,
+      "finetuning_task": null,
+      "hidden_act": "gelu_pytorch_tanh",
+      "hidden_size": 1152,
+      "id2label": {
+        "0": "LABEL_0",
+        "1": "LABEL_1"
+      },
+      "image_size": 896,
+      "intermediate_size": 4304,
+      "is_decoder": false,
+      "is_encoder_decoder": false,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1
+      },
+      "layer_norm_eps": 1e-06,
+      "model_type": "siglip_vision_model",
+      "num_attention_heads": 16,
+      "num_channels": 3,
+      "num_hidden_layers": 27,
+      "output_attentions": false,
+      "output_hidden_states": false,
+      "pad_token_id": null,
+      "patch_size": 14,
+      "prefix": null,
+      "problem_type": null,
+      "return_dict": true,
+      "sep_token_id": null,
+      "task_specific_params": null,
+      "tie_encoder_decoder": false,
+      "tie_word_embeddings": true,
+      "tokenizer_class": null,
+      "vision_use_head": false,
+      "vocab_size": 262144
+    },
+    "vocab_size": 262144
+  },
+  "eoi_token_index": 256000,
+  "eos_token_id": 1,
+  "image_token_index": 256001,
+  "initializer_range": 0.02,
+  "is_encoder_decoder": true,
+  "model_type": "t5gemma2",
+  "pad_token_id": 0,
+  "transformers_version": "5.0.0rc1",
+  "vocab_size": 262144
+}

text_encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c7dd568c34c56a521475124f226983dc191e57aa9b1cac9a22a87dcc753cb57
+size 16360212008

tokenizer/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3220c5bec16e78ddf8e59c08fecdede7e8d31820cb5b3e69f17fed6a29a0b30c
+size 33378248

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "additional_special_tokens": null,
+  "backend": "tokenizers",
+  "boi_token": "<start_of_image>",
+  "bos_token": "<bos>",
+  "clean_up_tokenization_spaces": false,
+  "eoi_token": "<end_of_image>",
+  "eos_token": "<eos>",
+  "image_token": "<image_soft_token>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "model_specific_special_tokens": {
+    "boi_token": "<start_of_image>",
+    "eoi_token": "<end_of_image>",
+    "image_token": "<image_soft_token>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "right",
+  "processor_class": "Gemma3Processor",
+  "sp_model_kwargs": null,
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

transformer/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_class_name": "MotifVideoTransformer3DModel",
+  "_diffusers_version": "0.36.0",
+  "_library": "diffusers",
+  "attention_head_dim": 128,
+  "base_latent_size": null,
+  "image_condition_type": null,
+  "image_embed_dim": 1152,
+  "in_channels": 33,
+  "mlp_ratio": 4.0,
+  "norm_type": "layer_norm",
+  "num_attention_heads": 12,
+  "num_decoder_layers": 8,
+  "num_layers": 12,
+  "num_single_layers": 24,
+  "out_channels": 16,
+  "patch_size": 2,
+  "patch_size_t": 1,
+  "pooled_projection_dim": null,
+  "qk_norm": "rms_norm",
+  "rope_axes_dim": [
+    16,
+    56,
+    56
+  ],
+  "rope_theta": 10000.0,
+  "text_embed_dim": 2560,
+  "enable_text_cross_attention_dual": false,
+  "enable_text_cross_attention_single": true
+}

transformer/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a8b17a188d358d9d0e9b097f5fc58c094f37a82a00cbb8895e54c2bdd73f6ff
+size 7849331048

transformer/transformer_motif_video.py ADDED Viewed

	@@ -0,0 +1,1350 @@

+# Copyright 2026 Motif Technologies. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from functools import lru_cache
+from typing import Any, Dict, List, Optional, Tuple, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.hooks._helpers import TransformerBlockMetadata, TransformerBlockRegistry
+from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
+from diffusers.models.attention import FeedForward
+from diffusers.models.attention_processor import Attention, AttentionProcessor
+from diffusers.models.cache_utils import CacheMixin
+from diffusers.models.embeddings import (
+    PixArtAlphaTextProjection,
+    TimestepEmbedding,
+    Timesteps,
+)
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import (
+    AdaLayerNormContinuous,
+    AdaLayerNormZero,
+    AdaLayerNormZeroSingle,
+)
+from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+# Stub functions for TREAD (Token REduction with Approximated Distillation).
+# These stubs ensure TREAD code paths are never activated during inference
+# without requiring the motif_core package.
def is_tread_start(block_idx, start, end):
    """Stub for the TREAD start-of-region check.

    Always returns ``False`` regardless of the arguments, so callers never
    enter a TREAD code path.
    """
    return False
def is_tread_end(block_idx, start, end):
    """Stub for the TREAD end-of-region check.

    Always returns ``False`` regardless of the arguments, so callers never
    enter a TREAD code path.
    """
    return False
# Module-level logger obtained from the diffusers logging helper.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
# Divisor applied to the raw timestep in MotifVideoRotaryPosEmbed.forward to normalize it.
NUM_TRAIN_TIMESTEPS = 1000
def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Tuple[torch.Tensor, torch.Tensor],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
) -> torch.Tensor:
    """
    Rotate ``x`` with rotary positional embeddings (RoPE).

    In the real-valued mode the (cos, sin) tables may be given either as 2D tensors [L, Dh],
    which are broadcast over batch and heads, or as 4D tensors (e.g. [B, 1, L, Dh]) so that
    each batch element can carry its own table.

    Args:
        x: Input tensor of shape [B, H, L, Dh].
        freqs_cis: Tuple of (cos, sin) tensors when `use_real` is True; otherwise a single
            tensor that is unsqueezed at dim 2 and multiplied with the complex view of `x`.
        use_real: Whether to use the real-valued RoPE formulation.
        use_real_unbind_dim: How the last dimension of `x` is split into (real, imag) halves:
            -1 treats consecutive pairs as (real, imag), -2 treats the first and second half
            of the last dimension as real and imag.

    Returns:
        Tensor with rotary embeddings applied, same shape and dtype as `x`.

    Raises:
        RuntimeError: If cos/sin are not 2D or 4D, or their last two dims do not match `x`.
        ValueError: If `use_real_unbind_dim` is neither -1 nor -2.
    """
    if not use_real:
        # Complex formulation: pair up the last dim, multiply by the complex frequencies.
        x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        rotated = torch.view_as_real(x_complex * freqs_cis.unsqueeze(2)).flatten(3)
        return rotated.type_as(x)

    cos, sin = freqs_cis
    if cos.dim() == 2:
        # [L, Dh] -> [1, 1, L, Dh] so the tables broadcast over batch and heads.
        cos = cos[None, None]
        sin = sin[None, None]
    if cos.dim() != 4 or sin.dim() != 4:
        raise RuntimeError(f"RoPE must be 2D or 4D, got cos={cos.dim()}D, sin={sin.dim()}D")

    cos = cos.to(x.device)
    sin = sin.to(x.device)
    if (cos.size(-2), cos.size(-1)) != (x.size(-2), x.size(-1)):
        raise RuntimeError(
            f"RoPE shape mismatch: rope[-2:]=({cos.size(-2)},{cos.size(-1)}) vs x[-2:]=({x.size(-2)},{x.size(-1)})"
        )

    if use_real_unbind_dim == -1:
        # Interleaved layout: (real, imag) are adjacent entries of the last dim.
        real_part, imag_part = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
        x_rotated = torch.stack([-imag_part, real_part], dim=-1).flatten(3)
    elif use_real_unbind_dim == -2:
        # Split layout: first half of the last dim is real, second half is imag.
        real_part, imag_part = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)
        x_rotated = torch.cat([-imag_part, real_part], dim=-1)
    else:
        raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")

    # The rotation is evaluated in float32 and cast back to the input dtype.
    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
class MotifVideoAttnProcessor2_0:
    """
    Attention processor for the Motif-Video transformer blocks, built on
    `F.scaled_dot_product_attention`.

    `__call__` runs in one of two modes:

    * Cross-attention mode (`query_input` is given): the query is taken as already projected, keys and
      values are produced by `attn.to_k` / `attn.to_v` from `key_input` / `value_input`, RoPE is applied
      to the query only, and no output projection is applied.
    * Joint mode (default): latent and encoder tokens take part in a single attention call. Encoder
      tokens either share the latent projections (`attn.add_q_proj is None`) or use the dedicated
      `attn.add_q_proj` / `attn.add_k_proj` / `attn.add_v_proj` projections.
    """

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "MotifVideoAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0."
            )

    @staticmethod
    def _split_heads(tensor: torch.Tensor, heads: int) -> torch.Tensor:
        """Reshape [B, L, H * Dh] into [B, H, L, Dh]."""
        return tensor.unflatten(2, (heads, -1)).transpose(1, 2)

    @staticmethod
    def _attend(
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Run SDPA and merge the heads back into [B, L, H * Dh], cast to the query dtype."""
        attended = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )
        attended = attended.transpose(1, 2).flatten(2, 3)
        return attended.to(query.dtype)

    @staticmethod
    def _rope_latent_tokens(
        tokens: torch.Tensor,
        image_rotary_emb: Tuple[torch.Tensor, torch.Tensor],
        context_len: int,
    ) -> torch.Tensor:
        """Apply RoPE to the leading latent tokens only; the trailing `context_len` tokens are kept as is."""
        return torch.cat(
            [
                apply_rotary_emb(tokens[:, :, :-context_len], image_rotary_emb),
                tokens[:, :, -context_len:],
            ],
            dim=2,
        )

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        query_input: Optional[torch.Tensor] = None,
        key_input: Optional[torch.Tensor] = None,
        value_input: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            attn: Attention module providing the projections, norms and head count.
            hidden_states: Latent-stream tokens (ignored in cross-attention mode).
            encoder_hidden_states: Optional encoder/context tokens for joint attention.
            attention_mask: Optional mask forwarded to `F.scaled_dot_product_attention`.
            image_rotary_emb: Optional (cos, sin) tuple passed to `apply_rotary_emb`.
            query_input: Pre-projected query; its presence selects cross-attention mode.
            key_input: Input to `attn.to_k` in cross-attention mode.
            value_input: Input to `attn.to_v` in cross-attention mode.

        Returns:
            `(hidden_states, encoder_hidden_states)`. The second element is `None` in cross-attention
            mode and when no `encoder_hidden_states` was given; in that last case no output projection
            is applied to `hidden_states` either.
        """
        heads = attn.heads

        # Cross-attention mode: query already projected externally, so skip to_q and only
        # reshape + norm + RoPE it. K/V go through to_k/to_v as usual.
        if query_input is not None:
            query = self._split_heads(query_input, heads)
            key = self._split_heads(attn.to_k(key_input), heads)
            value = self._split_heads(attn.to_v(value_input), heads)
            if attn.norm_q is not None:
                query = attn.norm_q(query)
            if attn.norm_k is not None:
                key = attn.norm_k(key)
            if image_rotary_emb is not None:
                query = apply_rotary_emb(query, image_rotary_emb)
            return self._attend(query, key, value, attention_mask), None

        has_context = encoder_hidden_states is not None
        shares_projections = attn.add_q_proj is None
        context_len = encoder_hidden_states.shape[1] if has_context else 0

        # Without dedicated context projections the context tokens are appended to the
        # latent tokens and go through the same to_q/to_k/to_v.
        if shares_projections and has_context:
            hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)

        # 1. QKV projections
        query = self._split_heads(attn.to_q(hidden_states), heads)
        key = self._split_heads(attn.to_k(hidden_states), heads)
        value = self._split_heads(attn.to_v(hidden_states), heads)

        # 2. QK normalization
        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # 3. Rotary positional embeddings applied to the latent stream only
        if image_rotary_emb is not None:
            if shares_projections and has_context:
                query = self._rope_latent_tokens(query, image_rotary_emb, context_len)
                key = self._rope_latent_tokens(key, image_rotary_emb, context_len)
            else:
                query = apply_rotary_emb(query, image_rotary_emb)
                key = apply_rotary_emb(key, image_rotary_emb)

        # 4. Encoder condition QKV projection and normalization
        if not shares_projections and has_context:
            encoder_query = self._split_heads(attn.add_q_proj(encoder_hidden_states), heads)
            encoder_key = self._split_heads(attn.add_k_proj(encoder_hidden_states), heads)
            encoder_value = self._split_heads(attn.add_v_proj(encoder_hidden_states), heads)
            if attn.norm_added_q is not None:
                encoder_query = attn.norm_added_q(encoder_query)
            if attn.norm_added_k is not None:
                encoder_key = attn.norm_added_k(encoder_key)
            query = torch.cat([query, encoder_query], dim=2)
            key = torch.cat([key, encoder_key], dim=2)
            value = torch.cat([value, encoder_value], dim=2)

        # 5. Attention
        hidden_states = self._attend(query, key, value, attention_mask)

        # 6. Split the streams again and apply the output projections that exist
        if has_context:
            hidden_states, encoder_hidden_states = (
                hidden_states[:, :-context_len],
                hidden_states[:, -context_len:],
            )
            if getattr(attn, "to_out", None) is not None:
                hidden_states = attn.to_out[0](hidden_states)
                hidden_states = attn.to_out[1](hidden_states)
            if getattr(attn, "to_add_out", None) is not None:
                encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
        return hidden_states, encoder_hidden_states
class MotifVideoPatchEmbed(nn.Module):
    """Embed a 5D latent into a token sequence with a non-overlapping strided 3D convolution."""

    def __init__(
        self,
        patch_size: Union[int, Tuple[int, int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        super().__init__()
        # A single int means the same patch size along all three conv dimensions.
        if isinstance(patch_size, int):
            patch_size = (patch_size, patch_size, patch_size)
        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project the input and flatten all patch positions into one sequence axis."""
        patches = self.proj(hidden_states)
        # BCFHW -> BNC
        return patches.flatten(2).transpose(1, 2)
class MotifVideoAdaNorm(nn.Module):
    """Map a conditioning embedding to two gates via SiLU followed by a linear layer."""

    def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
        super().__init__()
        # A missing (or falsy) out_features falls back to twice the input width.
        if not out_features:
            out_features = 2 * in_features
        self.linear = nn.Linear(in_features, out_features)
        self.nonlinearity = nn.SiLU()

    def forward(self, temb: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return `(gate_msa, gate_mlp)`, each with a singleton axis inserted at dim 1."""
        projected = self.linear(self.nonlinearity(temb))
        # Split along dim 1 into the two gates, then add an axis at dim 1 to each.
        gate_msa, gate_mlp = (part.unsqueeze(1) for part in projected.chunk(2, dim=1))
        return gate_msa, gate_mlp
+class MotifVideoConditionEmbedding(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        pooled_projection_dim: int | None,
+    ):
+        super().__init__()
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+        if isinstance(pooled_projection_dim, int):
+            self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        pooled_projection: torch.Tensor | None = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        timesteps_proj = self.time_proj(timestep)
+        timestep_embedder_dtype = next(self.timestep_embedder.parameters()).dtype
+        conditioning = self.timestep_embedder(timesteps_proj.to(timestep_embedder_dtype))  # (N, D)
+        if pooled_projection is not None:
+            conditioning = conditioning + self.text_embedder(pooled_projection)
+        token_replace_emb = None
+        return conditioning, token_replace_emb
+# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L485-L486
def find_correction_factor(num_rotations, dim, base, max_position_embeddings):
    """
    Evaluate `dim * ln(max_position_embeddings / (2 * pi * num_rotations)) / (2 * ln(base))`.

    This is the inverse of the per-dimension frequency formula: it maps a number of rotations
    to a (fractional) dimension index. The result is a tensor whose dtype follows
    `num_rotations` when that is a tensor and is float32 otherwise.
    """
    if isinstance(num_rotations, torch.Tensor):
        dtype = num_rotations.dtype
    else:
        dtype = torch.float32
    max_positions = torch.as_tensor(max_position_embeddings, dtype=dtype)
    log_ratio = torch.log(max_positions / (num_rotations * 2 * math.pi))
    numerator = dim * log_ratio
    denominator = 2 * math.log(base)
    return numerator / denominator
+# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L489-L495
def find_correction_range(low_ratio, high_ratio, dim, base, ori_max_pe_len):
    """
    Find the correction range for NTK-by-parts interpolation.

    The lower bound is the floored correction factor for `low_ratio`, clamped to be at
    least 0; the upper bound is the ceiled correction factor for `high_ratio`, clamped to
    be at most `dim - 1`.
    """
    lower = torch.floor(find_correction_factor(low_ratio, dim, base, ori_max_pe_len))
    upper = torch.ceil(find_correction_factor(high_ratio, dim, base, ori_max_pe_len))
    # Clamp values just in case
    lower = torch.clamp(lower, min=0)
    upper = torch.clamp(upper, max=dim - 1)
    return lower, upper
+# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L498-L504
def linear_ramp_mask(min_val, max_val, num_dim):
    """
    Build a length-`num_dim` ramp that is 0 up to `min_val`, rises linearly to 1 at `max_val`
    and is clamped to [0, 1].

    Args:
        min_val: Index (number or tensor) where the ramp starts rising.
        max_val: Index (number or tensor) where the ramp reaches 1.
        num_dim: Number of ramp entries; entry `i` corresponds to index `i`.

    Returns:
        A float tensor of `num_dim` values in [0, 1]. It is created on the device of the
        first tensor bound, or on the default device when both bounds are plain numbers.
    """
    # When both bounds coincide, widen the interval slightly to avoid a division by zero.
    # The widened bound is a new value: the caller's `max_val` tensor is never modified in place.
    if bool(torch.as_tensor(min_val == max_val).all()):
        max_val = max_val + 0.001

    # Build the index ramp next to the bounds so tensor bounds on another device still work.
    device = next((v.device for v in (min_val, max_val) if isinstance(v, torch.Tensor)), None)
    positions = torch.arange(num_dim, dtype=torch.float32, device=device)
    return torch.clamp((positions - min_val) / (max_val - min_val), 0, 1)
+# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L507-L511
def find_newbase_ntk(dim, base, scale):
    """
    Calculate the new base for NTK-aware scaling: `base * scale ** (dim / (dim - 2))`.

    For `dim <= 2` the exponent is undefined (division by zero at `dim == 2`), so the
    original `base` is returned unchanged in that case.
    """
    if dim > 2:
        exponent = dim / (dim - 2)
        return base * (scale**exponent)
    # Degenerate dimension: no NTK adjustment.
    return base
+# Copied from https://github.com/guyyariv/DyPE/blob/5dd4fab99b479ee487754140d717bfb888a6afa2/flux/transformer_flux.py#L514-L652
def get_1d_rotary_pos_embed(
    dim: int,
    pos: Union[np.ndarray, int],
    theta: float = 10000.0,
    use_real=False,
    linear_factor=1.0,
    ntk_factor=1.0,
    repeat_interleave_real=True,
    freqs_dtype=torch.float32,
    yarn=False,
    max_pe_len=None,
    ori_max_pe_len=64,
    dype=False,
    current_timestep=1.0,
):
    """
    Precompute the frequency tensor for complex exponentials with RoPE.
    Supports YaRN interpolation for vision transformers.

    Args:
        dim (`int`):
            Dimension of the frequency tensor. Must be even.
        pos (`np.ndarray`, `torch.Tensor` or `int`):
            Position indices for the frequency tensor. [S] or scalar. An `int` is expanded to
            `torch.arange(pos)`.
        theta (`float`, *optional*, defaults to 10000.0):
            Scaling factor for frequency computation.
        use_real (`bool`, *optional*, defaults to False):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        linear_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for linear interpolation (only used outside the YaRN path).
        ntk_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for NTK-Aware RoPE (only used outside the YaRN path).
        repeat_interleave_real (`bool`, *optional*, defaults to True):
            If True and use_real, real and imaginary parts are interleaved with themselves to reach dim.
            Otherwise, they are concatenated.
        freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
            Data type of the frequency tensor.
        yarn (`bool`, *optional*, defaults to False):
            If True, use YaRN interpolation combining NTK, linear, and base methods. It only takes
            effect when `max_pe_len` is given and larger than `ori_max_pe_len`.
        max_pe_len (`int` or `torch.Tensor`, *optional*):
            Maximum position encoding length (current patches for vision models).
        ori_max_pe_len (`int`, *optional*, defaults to 64):
            Original maximum position encoding length (base patches for vision models).
        dype (`bool`, *optional*, defaults to False):
            If True, enable DyPE (Dynamic Position Encoding) with timestep-aware scaling.
        current_timestep (`float` or `torch.Tensor`, *optional*, defaults to 1.0):
            Current timestep for DyPE, normalized to [0, 1] where 1 is pure noise.

    Returns:
        `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
            If use_real=True, returns a tuple of (cos, sin) tensors, each [S, D].
    """
    assert dim % 2 == 0
    if isinstance(pos, int):
        pos = torch.arange(pos)
    if isinstance(pos, np.ndarray):
        pos = torch.from_numpy(pos)
    device = pos.device

    # Per-pair exponents 2i / dim shared by every frequency variant below.
    exponents = torch.arange(0, dim, 2, dtype=freqs_dtype, device=device) / dim
    use_yarn = bool(yarn and max_pe_len is not None and max_pe_len > ori_max_pe_len)

    if use_yarn:
        if not isinstance(max_pe_len, torch.Tensor):
            max_pe_len = torch.tensor(max_pe_len, dtype=freqs_dtype, device=device)
        scale = torch.clamp_min(max_pe_len / ori_max_pe_len, 1.0)
        beta_0 = 1.25
        beta_1 = 0.75
        gamma_0 = 16
        gamma_1 = 2

        freqs_base = 1.0 / (theta**exponents)
        freqs_linear = 1.0 / torch.einsum("..., f -> ... f", scale, theta**exponents)

        new_base = find_newbase_ntk(dim, theta, scale)
        # find_newbase_ntk returns the plain `theta` number when dim <= 2; wrap it so the
        # tensor-only calls below do not fail.
        if not isinstance(new_base, torch.Tensor):
            new_base = torch.as_tensor(new_base, dtype=freqs_dtype, device=device)
        if new_base.dim() > 0:
            new_base = new_base.view(-1, 1)
        freqs_ntk = 1.0 / torch.pow(new_base, exponents)
        if freqs_ntk.dim() > 1:
            freqs_ntk = freqs_ntk.squeeze()

        if dype:
            # torch.pow has no (number, number) overload, so a plain-float timestep (the
            # documented default) must become a tensor first. It is kept as a CPU scalar so
            # the derived bounds stay usable with the CPU-side ramp construction.
            if not isinstance(current_timestep, torch.Tensor):
                current_timestep = torch.tensor(current_timestep, dtype=freqs_dtype)
            dype_exponent = 2.0 * torch.pow(current_timestep, 2.0)
            beta_0 = torch.pow(beta_0, dype_exponent)
            beta_1 = torch.pow(beta_1, dype_exponent)

        # Blend linear-interpolated and NTK frequencies over the beta range.
        low, high = find_correction_range(beta_0, beta_1, dim, theta, ori_max_pe_len)
        high = torch.clamp(high, max=dim // 2)
        freqs_mask = 1 - linear_ramp_mask(low, high, dim // 2).to(device).to(freqs_dtype)
        freqs = freqs_linear * (1 - freqs_mask) + freqs_ntk * freqs_mask

        if dype:
            gamma_0 = torch.pow(gamma_0, dype_exponent)
            gamma_1 = torch.pow(gamma_1, dype_exponent)

        # Blend the result with the unscaled base frequencies over the gamma range.
        low, high = find_correction_range(gamma_0, gamma_1, dim, theta, ori_max_pe_len)
        high = torch.clamp(high, max=dim // 2)
        freqs_mask = 1 - linear_ramp_mask(low, high, dim // 2).to(device).to(freqs_dtype)
        freqs = freqs * (1 - freqs_mask) + freqs_base * freqs_mask
    else:
        theta_ntk = theta * ntk_factor
        freqs = 1.0 / (theta_ntk**exponents) / linear_factor

    freqs = torch.outer(pos, freqs)
    is_npu = freqs.device.type == "npu"
    if is_npu:
        freqs = freqs.float()

    if use_real and repeat_interleave_real:
        freqs_cos = freqs.cos().repeat_interleave(2, dim=1, output_size=freqs.shape[1] * 2).float()
        freqs_sin = freqs.sin().repeat_interleave(2, dim=1, output_size=freqs.shape[1] * 2).float()
        if use_yarn:
            # Magnitude correction applied only on the YaRN path.
            mscale = torch.where(scale <= 1.0, 1.0, 0.1 * torch.log(scale) + 1.0).to(scale)
            freqs_cos = freqs_cos * mscale
            freqs_sin = freqs_sin * mscale
        return freqs_cos, freqs_sin
    elif use_real:
        freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float()
        freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float()
        return freqs_cos, freqs_sin
    else:
        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
        return freqs_cis
+class MotifVideoRotaryPosEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: int,
+        patch_size_t: int,
+        rope_dim: List[int],
+        theta: float = 256.0,
+        base_latent_size: int | None = None,
+    ):
+        """
+        Rotary Positional Embedding (RoPE) for video latents.
+        Args:
+            patch_size (`int`):
+                Spatial patch size (e.g., 2).
+            patch_size_t (`int`):
+                Temporal patch size (e.g., 1).
+            rope_dim (`List[int]`):
+                Dimensions for RoPE across [Time, Height, Width] axes.
+            theta (`float`, *optional*, defaults to 256.0):
+                Base frequency for rotary embeddings.
+            base_latent_size (`int`, *optional*):
+                The maximum spatial dimension (in latent units) seen during training,
+                i.e. `training_resolution / vae_scale_factor_spatial`.
+                For example, for 1280x1280 training images and a VAE spatial downscale
+                (`vae_scale_factor_spatial`) of 8, this would be 160; for a downscale
+                of 16, it would be 80.
+        """
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_size_t = patch_size_t
+        self.rope_dim = rope_dim
+        self.theta = theta
+        self.base_latent_size = base_latent_size
+    @lru_cache(maxsize=8)
+    def _get_base_patch_grid_size(self, base_latent_size: Optional[int], patch_size: int) -> Optional[int]:
+        return base_latent_size // patch_size if base_latent_size else None
+    @lru_cache(maxsize=8)
+    def _get_dynamic_interpolation_scale(self, h: int, w: int, base_grid_size: int) -> float:
+        return math.sqrt(h * w / (base_grid_size**2))
+    def forward(self, hidden_states: torch.Tensor, timestep: Optional[torch.Tensor] = None) -> torch.Tensor:
+        if self.training:
+            assert self.base_latent_size is None, (
+                "RoPE interpolation/extrapolation logic should only be enabled for inference. "
+                f"During training, base_latent_size must be None, but got {self.base_latent_size!r}."
+            )
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        rope_sizes = [num_frames // self.patch_size_t, height // self.patch_size, width // self.patch_size]
+        axes_grids = []
+        for i in range(3):
+            # Note: The following line diverges from original behaviour. We create the grid on the device, whereas
+            # original implementation creates it on CPU and then moves it to device. This results in numerical
+            # differences in layerwise debugging outputs, but visually it is the same.
+            grid = torch.arange(0, rope_sizes[i], device=hidden_states.device, dtype=torch.float32)
+            axes_grids.append(grid)
+        grid = torch.meshgrid(*axes_grids, indexing="ij")  # [W, H, T]
+        grid = torch.stack(grid, dim=0)  # [3, W, H, T]
+        base_patch_grid_size = self._get_base_patch_grid_size(self.base_latent_size, self.patch_size)
+        if base_patch_grid_size is not None:
+            if base_patch_grid_size <= 0:
+                raise ValueError(f"base_patch_grid_size must be a positive number, got {base_patch_grid_size}.")
+            dynamic_interpolation_scale = self._get_dynamic_interpolation_scale(
+                rope_sizes[1], rope_sizes[2], base_patch_grid_size
+            )
+        normalized_timestep = torch.tensor(1.0)
+        if not self.training and timestep is not None:
+            normalized_timestep = timestep[0] / NUM_TRAIN_TIMESTEPS
+        freqs = []
+        for i in range(3):
+            common_kwargs = {
+                "dim": self.rope_dim[i],
+                "pos": grid[i].reshape(-1),
+                "theta": self.theta,
+                "use_real": True,
+                "freqs_dtype": torch.float64,
+            }
+            # Apply scaling only to spatial dimensions (Height and Width, i=1 and i=2)
+            if i > 0 and base_patch_grid_size is not None and dynamic_interpolation_scale > 1.0:
+                # We project the training base to the current size using the uniform scale factor.
+                # max_pe_len tells the RoPE logic the "new" maximum length it's dealing with.
+                max_pe_len = torch.tensor(
+                    base_patch_grid_size * dynamic_interpolation_scale,
+                    dtype=torch.float64,
+                    device=hidden_states.device,
+                )
+                freq = get_1d_rotary_pos_embed(
+                    **common_kwargs,
+                    yarn=True,  # Enable Yet Another RoPE extensioN (YARN) for extrapolation
+                    max_pe_len=max_pe_len,
+                    ori_max_pe_len=base_patch_grid_size,  # The original training scale
+                    dype=True,  # Enable Dynamic Position Encoding (time-aware)
+                    current_timestep=normalized_timestep,
+                )
+            else:
+                # Time dimension OR within training bounds -> Standard RoPE
+                freq = get_1d_rotary_pos_embed(**common_kwargs)
+            freqs.append(freq)
+        freqs_cos = torch.cat([f[0] for f in freqs], dim=1)  # (W * H * T, D / 2)
+        freqs_sin = torch.cat([f[1] for f in freqs], dim=1)  # (W * H * T, D / 2)
+        return freqs_cos, freqs_sin
class MotifVideoImageProjection(nn.Module):
    """Project image embeddings to the transformer width: LayerNorm -> Linear -> GELU -> Linear -> LayerNorm."""

    def __init__(self, in_features: int, hidden_size: int):
        super().__init__()
        self.norm_in = nn.LayerNorm(in_features)
        self.linear_1 = nn.Linear(in_features, in_features)
        self.act_fn = nn.GELU()
        self.linear_2 = nn.Linear(in_features, hidden_size)
        self.norm_out = nn.LayerNorm(hidden_size)

    def forward(self, image_embeds: torch.Tensor) -> torch.Tensor:
        """Run the five stages in order; the last dimension goes from `in_features` to `hidden_size`."""
        stages = (self.norm_in, self.linear_1, self.act_fn, self.linear_2, self.norm_out)
        hidden_states = image_embeds
        for stage in stages:
            hidden_states = stage(hidden_states)
        return hidden_states
class MotifVideoSingleTransformerBlock(nn.Module):
    """
    Single-stream transformer block: latent and encoder tokens are concatenated, share one
    modulated norm, and run an attention branch and an MLP branch in parallel before a joint
    output projection. Optionally adds a text cross-attention step on the latent tokens.
    """

    def __init__(
        self,
        num_attention_heads: int,
        attention_head_dim: int,
        mlp_ratio: float = 4.0,
        qk_norm: str = "rms_norm",
        norm_type: str = "layer_norm",
        enable_text_cross_attention: bool = False,
    ) -> None:
        super().__init__()
        hidden_size = num_attention_heads * attention_head_dim
        mlp_dim = int(hidden_size * mlp_ratio)

        self.attn = Attention(
            query_dim=hidden_size,
            cross_attention_dim=None,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=hidden_size,
            bias=True,
            processor=MotifVideoAttnProcessor2_0(),
            qk_norm=qk_norm,
            eps=1e-6,
            pre_only=True,
        )

        self.enable_text_cross_attention = enable_text_cross_attention
        if enable_text_cross_attention:
            self.cross_attn_query_proj = nn.Linear(hidden_size, hidden_size)
            # NOTE(review): this norm is created here but forward() never applies it —
            # confirm whether that is intentional.
            self.cross_attn_query_norm = nn.LayerNorm(hidden_size, eps=1e-6)
            # Zero-initialised output projection: the cross-attention branch starts as a no-op.
            self.cross_attn_out_proj = nn.Linear(hidden_size, hidden_size)
            nn.init.zeros_(self.cross_attn_out_proj.weight)
            nn.init.zeros_(self.cross_attn_out_proj.bias)

        self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type=norm_type)
        self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
        self.act_mlp = nn.GELU(approximate="tanh")
        self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        token_replace_emb: torch.Tensor | None = None,
        first_frame_num_tokens: int | None = None,
        image_embed_seq_len: int = 0,
        encoder_attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Returns the updated `(hidden_states, encoder_hidden_states)` pair.

        `token_replace_emb` and `first_frame_num_tokens` are accepted but not used here.
        `image_embed_seq_len` is the number of leading encoder tokens excluded from the text
        cross-attention keys/values and from `encoder_attention_mask`.
        """
        context_len = encoder_hidden_states.shape[1]
        joint_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)

        # 1. Input normalization; the MLP branch reads the same normalized joint sequence
        normed_joint, gate = self.norm(joint_states, emb=temb)
        mlp_branch = self.act_mlp(self.proj_mlp(normed_joint))
        normed_latent = normed_joint[:, :-context_len, :]
        normed_context = normed_joint[:, -context_len:, :]

        # 2. Attention
        attn_output, context_attn_output = self.attn(
            hidden_states=normed_latent,
            encoder_hidden_states=normed_context,
            attention_mask=attention_mask,
            image_rotary_emb=image_rotary_emb,
        )

        # Text cross-attention: Q=proj(attn_output), K/V=normed text, reuse self.attn weights
        if self.enable_text_cross_attention:
            text_tokens = normed_context[:, image_embed_seq_len:, :]
            if encoder_attention_mask is None:
                text_mask = None
            else:
                text_mask = encoder_attention_mask[:, image_embed_seq_len:]
                text_mask = text_mask.unsqueeze(1).unsqueeze(1).to(torch.bool)  # [B, 1, 1, L_txt]
            cross_query = self.cross_attn_query_proj(attn_output)
            cross_output, _ = self.attn(
                hidden_states=cross_query,
                query_input=cross_query,
                key_input=text_tokens,
                value_input=text_tokens,
                attention_mask=text_mask,
                image_rotary_emb=image_rotary_emb,
            )
            attn_output = attn_output + self.cross_attn_out_proj(cross_output)

        attn_branch = torch.cat([attn_output, context_attn_output], dim=1)

        # 3. Modulation and residual connection
        fused = torch.cat([attn_branch, mlp_branch], dim=2)
        updated = gate.unsqueeze(1) * self.proj_out(fused)
        updated = updated + joint_states

        return updated[:, :-context_len, :], updated[:, -context_len:, :]
+class MotifVideoTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        mlp_ratio: float,
+        qk_norm: str = "rms_norm",
+        norm_type: str = "layer_norm",
+        enable_text_cross_attention: bool = False,
+    ) -> None:
+        super().__init__()
+        hidden_size = num_attention_heads * attention_head_dim
+        self.norm1 = AdaLayerNormZero(hidden_size, norm_type=norm_type)
+        self.norm1_context = AdaLayerNormZero(hidden_size, norm_type=norm_type)
+        self.attn = Attention(
+            query_dim=hidden_size,
+            cross_attention_dim=None,
+            added_kv_proj_dim=hidden_size,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=hidden_size,
+            context_pre_only=False,
+            bias=True,
+            processor=MotifVideoAttnProcessor2_0(),
+            qk_norm=qk_norm,
+            eps=1e-6,
+        )
+        self.enable_text_cross_attention = enable_text_cross_attention
+        if enable_text_cross_attention:
+            self.cross_attn_query_proj = nn.Linear(hidden_size, hidden_size)
+            self.cross_attn_query_norm = nn.LayerNorm(hidden_size, eps=1e-6)
+            self.cross_attn_out_proj = nn.Linear(hidden_size, hidden_size)
+            nn.init.zeros_(self.cross_attn_out_proj.weight)
+            nn.init.zeros_(self.cross_attn_out_proj.bias)
+        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm2_context = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
+        self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        token_replace_emb: torch.Tensor | None = None,
+        first_frame_num_tokens: int | None = None,
+        image_embed_seq_len: int = 0,
+        encoder_attention_mask: torch.Tensor | None = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # 1. Input normalization
+        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+        norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+            encoder_hidden_states, emb=temb
+        )
+        # 2. Joint attention
+        attn_output, context_attn_output = self.attn(
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            attention_mask=attention_mask,
+            image_rotary_emb=image_rotary_emb,
+        )
+        # 3. Modulation and residual connection
+        hidden_states = hidden_states + attn_output * gate_msa.unsqueeze(1)
+        # Text cross-attention: Q=proj(attn_output), K/V=normed text, reuse self.attn weights
+        if self.enable_text_cross_attention:
+            txt_kv = norm_encoder_hidden_states[:, image_embed_seq_len:, :]
+            text_mask = None
+            if encoder_attention_mask is not None:
+                text_mask = encoder_attention_mask[:, image_embed_seq_len:]
+                text_mask = text_mask.unsqueeze(1).unsqueeze(1).to(torch.bool)  # [B, 1, 1, L_txt]
+            cross_q = self.cross_attn_query_proj(attn_output)
+            cross_output, _ = self.attn(
+                hidden_states=cross_q,
+                query_input=cross_q,
+                key_input=txt_kv,
+                value_input=txt_kv,
+                attention_mask=text_mask,
+                image_rotary_emb=image_rotary_emb,
+            )
+            hidden_states = hidden_states + self.cross_attn_out_proj(cross_output)
+        encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa.unsqueeze(1)
+        norm_hidden_states = self.norm2(hidden_states)
+        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+        norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+        # 4. Feed-forward
+        ff_output = self.ff(norm_hidden_states)
+        context_ff_output = self.ff_context(norm_encoder_hidden_states)
+        hidden_states = hidden_states + gate_mlp.unsqueeze(1) * ff_output
+        encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+        return hidden_states, encoder_hidden_states
+# Register both block classes with the transformer block registry. The metadata records where each
+# block's forward places its outputs: `hidden_states` at tuple index 0 and `encoder_hidden_states`
+# at index 1, matching the `return hidden_states, encoder_hidden_states` of both forwards.
+# NOTE(review): presumably consumed by diffusers hook/cache utilities — confirm against the registry's users.
+TransformerBlockRegistry.register(
+    model_class=MotifVideoTransformerBlock,
+    metadata=TransformerBlockMetadata(
+        return_hidden_states_index=0,
+        return_encoder_hidden_states_index=1,
+    ),
+)
+TransformerBlockRegistry.register(
+    model_class=MotifVideoSingleTransformerBlock,
+    metadata=TransformerBlockMetadata(
+        return_hidden_states_index=0,
+        return_encoder_hidden_states_index=1,
+    ),
+)
+class MotifVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
+    r"""
+    A Transformer model for video-like data used in [MotifVideo](https://huggingface.co/motif/motifvideo).
+    Args:
+        in_channels (`int`, defaults to `33`):
+            The number of channels in the input.
+        out_channels (`int`, defaults to `16`):
+            The number of channels in the output. A falsy value falls back to `in_channels`.
+        num_attention_heads (`int`, defaults to `24`):
+            The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of channels in each head.
+        num_layers (`int`, defaults to `20`):
+            The number of layers of dual-stream blocks to use.
+        num_single_layers (`int`, defaults to `40`):
+            The number of layers of single-stream blocks to use.
+        num_decoder_layers (`int`, defaults to `0`):
+            How many of the trailing single-stream blocks are run as the decoder stage in `forward`.
+            These blocks are always built without text cross-attention.
+        mlp_ratio (`float`, defaults to `4.0`):
+            The ratio of the hidden layer size to the input size in the feedforward network.
+        patch_size (`int`, defaults to `2`):
+            The size of the spatial patches to use in the patch embedding layer.
+        patch_size_t (`int`, defaults to `1`):
+            The size of the temporal patches to use in the patch embedding layer.
+        qk_norm (`str`, defaults to `rms_norm`):
+            The normalization to use for the query and key projections in the attention layers.
+        norm_type (`str`, defaults to `layer_norm`):
+            The normalization type passed to the transformer blocks and to the output norm.
+        text_embed_dim (`int`, defaults to `4096`):
+            Input dimension of text embeddings from the text encoder.
+        image_embed_dim (`int`, *optional*):
+            Input dimension of image embeddings. When set, an image projection layer (`image_embedder`)
+            is created; when `None`, no such layer exists.
+        pooled_projection_dim (`int`, *optional*):
+            Passed to the timestep/condition embedding (`MotifVideoConditionEmbedding`); presumably the
+            dimension of `pooled_projections`.
+        rope_theta (`float`, defaults to `256.0`):
+            The value of theta to use in the RoPE layer.
+        rope_axes_dim (`Tuple[int]`, defaults to `(16, 56, 56)`):
+            The dimensions of the axes to use in the RoPE layer.
+        base_latent_size (`int`, *optional*):
+            The maximum spatial dimension (in latent units) seen during training.
+            For example, if trained on 1280x1280 with a VAE downscale of 16, this is 80.
+        enable_text_cross_attention_dual (`bool`, defaults to `False`):
+            Whether the dual-stream blocks are built with the extra text cross-attention branch.
+        enable_text_cross_attention_single (`bool`, defaults to `False`):
+            Whether the non-decoder single-stream blocks are built with the extra text cross-attention branch.
+    """
+    # Class-level flags read by the base-class machinery (not visible in this file section).
+    _supports_gradient_checkpointing = True
+    _skip_layerwise_casting_patterns = ["x_embedder", "context_embedder", "norm"]
+    _no_split_modules = [
+        "MotifVideoTransformerBlock",
+        "MotifVideoSingleTransformerBlock",
+        "MotifVideoPatchEmbed",
+    ]
+    @register_to_config
+    def __init__(
+        self,
+        in_channels: int = 33,
+        out_channels: int = 16,
+        num_attention_heads: int = 24,
+        attention_head_dim: int = 128,
+        num_layers: int = 20,
+        num_single_layers: int = 40,
+        num_decoder_layers: int = 0,
+        mlp_ratio: float = 4.0,
+        patch_size: int = 2,
+        patch_size_t: int = 1,
+        qk_norm: str = "rms_norm",
+        norm_type: str = "layer_norm",
+        text_embed_dim: int = 4096,
+        image_embed_dim: int | None = None,
+        pooled_projection_dim: int | None = None,
+        rope_theta: float = 256.0,
+        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
+        base_latent_size: int | None = None,
+        enable_text_cross_attention_dual: bool = False,
+        enable_text_cross_attention_single: bool = False,
+    ) -> None:
+        super().__init__()
+        inner_dim = num_attention_heads * attention_head_dim
+        out_channels = out_channels or in_channels
+        # 1. Latent and condition embedders
+        self.x_embedder = MotifVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
+        self.context_embedder = PixArtAlphaTextProjection(in_features=text_embed_dim, hidden_size=inner_dim)
+        # First frame conditioning: Image conditioning embedders
+        self.image_embed_dim = image_embed_dim
+        if image_embed_dim is not None:
+            # Project image embeddings from vision encoder to transformer dim
+            self.image_embedder = MotifVideoImageProjection(in_features=image_embed_dim, hidden_size=inner_dim)
+        self.time_text_embed = MotifVideoConditionEmbedding(inner_dim, pooled_projection_dim)
+        # 2. RoPE
+        self.rope = MotifVideoRotaryPosEmbed(
+            patch_size, patch_size_t, rope_axes_dim, rope_theta, base_latent_size=base_latent_size
+        )
+        # Cross-attention config
+        self.enable_text_cross_attention_dual = enable_text_cross_attention_dual
+        self.enable_text_cross_attention_single = enable_text_cross_attention_single
+        # 3. Dual stream transformer blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                MotifVideoTransformerBlock(
+                    num_attention_heads,
+                    attention_head_dim,
+                    mlp_ratio=mlp_ratio,
+                    qk_norm=qk_norm,
+                    norm_type=norm_type,
+                    enable_text_cross_attention=enable_text_cross_attention_dual,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        # 4. Single stream transformer blocks
+        # Encoder blocks get cross-attention; decoder blocks do not (no text stream in decoder)
+        num_encoder_single = num_single_layers - num_decoder_layers
+        self.single_transformer_blocks = nn.ModuleList(
+            [
+                MotifVideoSingleTransformerBlock(
+                    num_attention_heads,
+                    attention_head_dim,
+                    mlp_ratio=mlp_ratio,
+                    qk_norm=qk_norm,
+                    norm_type=norm_type,
+                    enable_text_cross_attention=enable_text_cross_attention_single
+                    if i < num_encoder_single
+                    else False,
+                )
+                for i in range(num_single_layers)
+            ]
+        )
+        # 5. Output projection
+        self.norm_out = AdaLayerNormContinuous(
+            inner_dim, inner_dim, elementwise_affine=False, eps=1e-6, norm_type=norm_type
+        )
+        self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
+        # Verify cross-attention config matches actual block state.
+        # Catches silent misconfiguration (e.g. checkpoint config with renamed keys).
+        for i, block in enumerate(self.transformer_blocks):
+            if block.enable_text_cross_attention != enable_text_cross_attention_dual:
+                raise ValueError(
+                    f"transformer_blocks[{i}].enable_text_cross_attention="
+                    f"{block.enable_text_cross_attention}, expected {enable_text_cross_attention_dual}. "
+                    f"Check checkpoint config.json key names match __init__ parameters."
+                )
+        num_encoder_single = num_single_layers - num_decoder_layers
+        for i, block in enumerate(self.single_transformer_blocks):
+            expected = enable_text_cross_attention_single if i < num_encoder_single else False
+            if block.enable_text_cross_attention != expected:
+                raise ValueError(
+                    f"single_transformer_blocks[{i}].enable_text_cross_attention="
+                    f"{block.enable_text_cross_attention}, expected {expected}. "
+                    f"Check checkpoint config.json key names match __init__ parameters."
+                )
+        self.gradient_checkpointing = False
+        self.num_decoder_layers = num_decoder_layers
+    @property
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor()
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    def _maybe_gradient_checkpoint_block(self, block, *args):
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            return self._gradient_checkpointing_func(block, *args)
+        return block(*args)
+    def _get_unwrapped_blocks(self, blocks):
+        if hasattr(blocks, "_checkpoint_wrapped_module"):
+            return blocks._checkpoint_wrapped_module
+        elif hasattr(blocks, "module"):
+            return blocks.module
+        return blocks
+    def _create_attention_mask(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Create attention mask of shape [B, 1, 1, N] where N = L + E,
+        based on latent tokens (always valid) and the encoder mask.
+        Args:
+            hidden_states: [B, L, D]
+            encoder_attention_mask: [B, E] (required)
+        Returns:
+            attention_mask: [B, 1, 1, N]
+        """
+        attention_mask = F.pad(
+            encoder_attention_mask.to(torch.bool),
+            (hidden_states.shape[1], 0),
+            value=True,
+        )
+        attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, L+E]
+        return attention_mask
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_attention_mask: torch.Tensor | None = None,
+        pooled_projections: torch.Tensor | None = None,
+        image_embeds: torch.Tensor | None = None,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+        tread_mixin: Optional[Any] = None,
+        tread_disabled: bool = False,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Forward pass of the MotifVideoTransformer3DModel.
+        Args:
+            hidden_states: Input latent tensor [B, C, F, H, W].
+            timestep: Diffusion timesteps [B].
+            encoder_hidden_states: Text conditioning [B, E, D].
+            encoder_attention_mask: Mask for text conditioning [B, E].
+            pooled_projections: Pooled text embeddings [B, D].
+            image_embeds: Optional image embeddings from vision encoder [B, N, D].
+            attention_kwargs: Additional arguments for attention processors.
+            return_dict: Whether to return a Transformer2DModelOutput.
+            tread_mixin: Optional TreadMixin instance for token reduction.
+            tread_disabled: When True, force tread_mixin to None (dense pass).
+                torch.compile specializes on this bool, producing separate graphs
+                for dense vs routed without attribute toggling.
+        Returns:
+            Transformer2DModelOutput or tuple containing the predicted samples.
+        """
+        if tread_disabled:
+            tread_mixin = None
+        elif tread_mixin is None:
+            tread_mixin = getattr(self, "_inference_tread_mixin", None)
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+        p, p_t = self.config.patch_size, self.config.patch_size_t
+        post_patch_num_frames = num_frames // p_t
+        post_patch_height = height // p
+        post_patch_width = width // p
+        first_frame_num_tokens = 1 * post_patch_height * post_patch_width
+        # 1. RoPE
+        image_rotary_emb = self.rope(hidden_states, timestep=timestep)
+        # 2. Conditional embeddings
+        temb, token_replace_emb = self.time_text_embed(timestep, pooled_projections)
+        hidden_states = self.x_embedder(hidden_states)
+        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+        # First frame conditioning: Image embeddings from vision encoder
+        if image_embeds is not None:
+            # image_embeds: [B, N, D_img] -> [B, N, D]
+            image_embeds = self.image_embedder(image_embeds)
+            encoder_hidden_states = torch.cat([image_embeds, encoder_hidden_states], dim=1)
+            # Extend attention mask for image tokens
+            if encoder_attention_mask is not None:
+                image_mask = torch.ones(
+                    image_embeds.shape[0],
+                    image_embeds.shape[1],
+                    device=encoder_attention_mask.device,
+                    dtype=encoder_attention_mask.dtype,
+                )
+                encoder_attention_mask = torch.cat([image_mask, encoder_attention_mask], dim=1)
+        # image_embed_seq_len: used by cross-attention blocks to slice text from encoder_hidden_states
+        image_embed_seq_len = image_embeds.shape[1] if image_embeds is not None else 0
+        decoder_hidden_states = hidden_states.clone()
+        if encoder_attention_mask is not None:
+            attention_mask = self._create_attention_mask(
+                hidden_states=hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+            )
+        else:
+            attention_mask = None
+        # TREAD state initialization: manage token reduction manually to support activation checkpointing
+        tread_active = False
+        current_route = None
+        ids_keep = None
+        x_full = None
+        orig_mask = attention_mask
+        orig_rope = image_rotary_emb
+        latent_len = hidden_states.shape[1]
+        # 4. Dual stream transformer blocks (Encoder)
+        for i, block in enumerate(self.transformer_blocks):
+            # Drop tokens if (1) TREAD is enabled, (2) current block is within the TREAD route.
+            if is_tread_start(tread_mixin, tread_active, i):
+                tread_active = True
+                current_route = tread_mixin._tread_route
+                # Reduce sequence length at the start of a TREAD route
+                ids_keep = tread_mixin.keep_indices(hidden_states, current_route["sel"]).to(hidden_states.device)
+                x_full = hidden_states.contiguous()
+                hidden_states = tread_mixin.gather_tokens(hidden_states, ids_keep)
+                attention_mask = tread_mixin.adjust_mask(orig_mask, latent_len, ids_keep)
+                image_rotary_emb = tread_mixin.gather_rope(orig_rope, ids_keep)
+            hidden_states, encoder_hidden_states = self._maybe_gradient_checkpoint_block(
+                block,
+                hidden_states,
+                encoder_hidden_states,
+                temb,
+                attention_mask,
+                image_rotary_emb,
+                token_replace_emb,
+                first_frame_num_tokens,
+                image_embed_seq_len,
+                encoder_attention_mask,
+            )
+            if is_tread_end(tread_mixin, tread_active, i):
+                # Restore full sequence length at the end of a TREAD route
+                hidden_states = tread_mixin.scatter_tokens(hidden_states, ids_keep, x_full)
+                tread_active = False
+                current_route = None
+                ids_keep = None
+                x_full = None
+                attention_mask = orig_mask
+                image_rotary_emb = orig_rope
+        # We need to unwrap the blocks because CheckpointWrapper does not support len(),
+        # which is required for slicing the blocks into encoder and decoder parts.
+        single_transformer_blocks = self.single_transformer_blocks
+        # 5. Single stream transformer blocks (Encoder)
+        num_dual = len(self.transformer_blocks)
+        for i, block in enumerate(
+            single_transformer_blocks[: len(single_transformer_blocks) - self.num_decoder_layers]
+        ):
+            # Drop tokens if (1) TREAD is enabled, (2) current block is within the TREAD route.
+            abs_i = num_dual + i
+            if is_tread_start(tread_mixin, tread_active, abs_i):
+                tread_active = True
+                current_route = tread_mixin._tread_route
+                # Reduce sequence length at the start of a TREAD route
+                ids_keep = tread_mixin.keep_indices(hidden_states, current_route["sel"]).to(hidden_states.device)
+                x_full = hidden_states.contiguous()
+                hidden_states = tread_mixin.gather_tokens(hidden_states, ids_keep)
+                attention_mask = tread_mixin.adjust_mask(orig_mask, latent_len, ids_keep)
+                image_rotary_emb = tread_mixin.gather_rope(orig_rope, ids_keep)
+            hidden_states, encoder_hidden_states = self._maybe_gradient_checkpoint_block(
+                block,
+                hidden_states,
+                encoder_hidden_states,
+                temb,
+                attention_mask,
+                image_rotary_emb,
+                token_replace_emb,
+                first_frame_num_tokens,
+                image_embed_seq_len,
+                encoder_attention_mask,
+            )
+            if is_tread_end(tread_mixin, tread_active, abs_i):
+                # Restore full sequence length at the end of a TREAD route
+                hidden_states = tread_mixin.scatter_tokens(hidden_states, ids_keep, x_full)
+                tread_active = False
+                current_route = None
+                ids_keep = None
+                x_full = None
+                attention_mask = orig_mask
+                image_rotary_emb = orig_rope
+        # 6. Single stream transformer blocks (Decoder)
+        if self.num_decoder_layers > 0:
+            encoder_hidden_states = hidden_states
+            attention_mask = None
+            num_single = len(single_transformer_blocks)
+            for i, block in enumerate(single_transformer_blocks[-self.num_decoder_layers :]):
+                abs_i = num_dual + (num_single - self.num_decoder_layers) + i
+                if is_tread_start(tread_mixin, tread_active, abs_i):
+                    tread_active = True
+                    current_route = tread_mixin._tread_route
+                    # Reduce sequence length at the start of a TREAD route
+                    ids_keep = tread_mixin.keep_indices(decoder_hidden_states, current_route["sel"]).to(
+                        decoder_hidden_states.device
+                    )
+                    x_full = encoder_hidden_states.contiguous()
+                    x_t_full = decoder_hidden_states.contiguous()
+                    decoder_hidden_states = tread_mixin.gather_tokens(decoder_hidden_states, ids_keep)
+                    encoder_hidden_states = tread_mixin.gather_tokens(encoder_hidden_states, ids_keep)
+                    attention_mask = tread_mixin.adjust_mask(orig_mask, latent_len, ids_keep)
+                    image_rotary_emb = tread_mixin.gather_rope(orig_rope, ids_keep)
+                decoder_hidden_states, encoder_hidden_states = self._maybe_gradient_checkpoint_block(
+                    block,
+                    decoder_hidden_states,
+                    encoder_hidden_states,
+                    temb,
+                    attention_mask,
+                    image_rotary_emb,
+                    token_replace_emb,
+                    first_frame_num_tokens,
+                )
+                if is_tread_end(tread_mixin, tread_active, abs_i):
+                    # Restore full sequence length at the end of a TREAD route
+                    decoder_hidden_states = tread_mixin.scatter_tokens(decoder_hidden_states, ids_keep, x_t_full)
+                    encoder_hidden_states = tread_mixin.scatter_tokens(encoder_hidden_states, ids_keep, x_full)
+                    tread_active = False
+                    current_route = None
+                    ids_keep = None
+                    x_full = None
+                    x_t_full = None
+                    attention_mask = orig_mask
+                    image_rotary_emb = orig_rope
+            hidden_states = decoder_hidden_states
+        # 7. Output projection
+        hidden_states = self.norm_out(hidden_states, temb)
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = hidden_states.reshape(
+            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1, p_t, p, p
+        )
+        hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
+        hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+        if not return_dict:
+            return (hidden_states,)
+        return Transformer2DModelOutput(
+            sample=hidden_states,
+        )

vae/config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "_class_name": "AutoencoderKLWan",
+  "_diffusers_version": "0.35.2",
+  "_name_or_path": "Wan-AI/Wan2.2-T2V-A14B-Diffusers",
+  "attn_scales": [],
+  "base_dim": 96,
+  "decoder_base_dim": null,
+  "dim_mult": [
+    1,
+    2,
+    4,
+    4
+  ],
+  "dropout": 0.0,
+  "in_channels": 3,
+  "is_residual": false,
+  "latents_mean": [
+    -0.7571,
+    -0.7089,
+    -0.9113,
+    0.1075,
+    -0.1745,
+    0.9653,
+    -0.1517,
+    1.5508,
+    0.4134,
+    -0.0715,
+    0.5517,
+    -0.3632,
+    -0.1922,
+    -0.9497,
+    0.2503,
+    -0.2921
+  ],
+  "latents_std": [
+    2.8184,
+    1.4541,
+    2.3275,
+    2.6558,
+    1.2196,
+    1.7708,
+    2.6052,
+    2.0743,
+    3.2687,
+    2.1526,
+    2.8652,
+    1.5579,
+    1.6382,
+    1.1253,
+    2.8251,
+    1.916
+  ],
+  "num_res_blocks": 2,
+  "out_channels": 3,
+  "patch_size": null,
+  "scale_factor_spatial": 8,
+  "scale_factor_temporal": 4,
+  "temperal_downsample": [
+    false,
+    true,
+    true
+  ],
+  "z_dim": 16
+}

vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6e524b3fffede1787a74e81b30976dce5400c4439ba64222168e607ed19e793
+size 507591892