Buckets:
| # ============================================================================= | |
| # LTX-2 Video-to-Video (IC-LoRA) Training Configuration | |
| # ============================================================================= | |
| # | |
| # This configuration is for training In-Context LoRA (IC-LoRA) adapters that | |
| # enable video-to-video transformations. IC-LoRA learns to apply visual | |
| # transformations (e.g., depth-to-video, pose control, style transfer, etc.) | |
| # by conditioning on reference videos. | |
| # | |
| # Key differences from text-to-video LoRA: | |
| # - Uses reference videos as conditioning input alongside text prompts | |
| # - Requires preprocessed reference latents in addition to target latents | |
| # - Validation requires reference videos to demonstrate the transformation | |
| # | |
| # Dataset structure for IC-LoRA training: | |
| # preprocessed_data_root/ | |
| # ├── latents/ # Target video latents (what the model learns to generate) | |
| # ├── conditions/ # Text embeddings for each video | |
| # └── reference_latents/ # Reference video latents (conditioning input) | |
| # | |
| # Dataset metadata columns: video, reference_video, caption | |
| # | |
| # ============================================================================= | |
| # ----------------------------------------------------------------------------- | |
| # Model Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Specifies the base model to fine-tune and the training mode. | |
| model: | |
| # Path to the LTX-2 model checkpoint (.safetensors file) | |
| # This should be a local path to your downloaded model | |
| model_path: "path/to/ltx-2-model.safetensors" | |
| # Path to the text encoder model directory | |
| # For LTX-2, this is typically the Gemma-based text encoder | |
| text_encoder_path: "path/to/gemma-text-encoder" | |
| # Training mode: "lora" for efficient adapter training, "full" for full fine-tuning | |
| # IC-LoRA reference conditioning is intended for LoRA adapter training. | |
| training_mode: "lora" | |
| # Optional: Path to resume training from a checkpoint | |
| # Can be a checkpoint file (.safetensors) or directory (uses latest checkpoint) | |
| load_checkpoint: null | |
| # ----------------------------------------------------------------------------- | |
| # LoRA Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Controls the Low-Rank Adaptation parameters for efficient fine-tuning. | |
| lora: | |
| # Rank of the LoRA matrices (higher = more capacity but more parameters) | |
| # Typical values: 8, 16, 32, 64. Start with 16-32 for IC-LoRA. | |
| rank: 32 | |
| # Alpha scaling factor (usually set equal to rank) | |
| # The effective scaling is alpha/rank, so alpha=rank means scaling of 1.0 | |
| alpha: 32 | |
| # Dropout probability for LoRA layers (0.0 = no dropout) | |
| # Can help with regularization if overfitting occurs | |
| dropout: 0.0 | |
| # Which transformer modules to apply LoRA to | |
| # The LTX-2 transformer has separate attention and FFN blocks for video and audio: | |
| # | |
| # VIDEO MODULES: | |
| # - attn1.to_k, attn1.to_q, attn1.to_v, attn1.to_out.0 (video self-attention) | |
| # - attn2.to_k, attn2.to_q, attn2.to_v, attn2.to_out.0 (video cross-attention to text) | |
| # - ff.net.0.proj, ff.net.2 (video feed-forward) | |
| # | |
| # AUDIO MODULES (not used for video-only IC-LoRA): | |
| # - audio_attn1.to_k, audio_attn1.to_q, audio_attn1.to_v, audio_attn1.to_out.0 (audio self-attention) | |
| # - audio_attn2.to_k, audio_attn2.to_q, audio_attn2.to_v, audio_attn2.to_out.0 (audio cross-attention to text) | |
| # - audio_ff.net.0.proj, audio_ff.net.2 (audio feed-forward) | |
| # | |
| # AUDIO-VIDEO CROSS-ATTENTION MODULES (for cross-modal interaction, not used for video-only IC-LoRA): | |
| # - audio_to_video_attn.to_k, audio_to_video_attn.to_q, audio_to_video_attn.to_v, audio_to_video_attn.to_out.0 | |
| # (Q from video, K/V from audio - allows video to attend to audio features) | |
| # - video_to_audio_attn.to_k, video_to_audio_attn.to_q, video_to_audio_attn.to_v, video_to_audio_attn.to_out.0 | |
| # (Q from audio, K/V from video - allows audio to attend to video features) | |
| # | |
| # For IC-LoRA (video-only), we explicitly target video modules. | |
| # Including FFN layers often improves transformation quality. | |
| target_modules: | |
| # Video self-attention | |
| - "attn1.to_k" | |
| - "attn1.to_q" | |
| - "attn1.to_v" | |
| - "attn1.to_out.0" | |
| # Video cross-attention | |
| - "attn2.to_k" | |
| - "attn2.to_q" | |
| - "attn2.to_v" | |
| - "attn2.to_out.0" | |
| # Video feed-forward (often improves transformation quality) | |
| - "ff.net.0.proj" | |
| - "ff.net.2" | |
| # ----------------------------------------------------------------------------- | |
| # Training Strategy Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Defines the video-to-video (IC-LoRA) training approach using the unified | |
| # flexible strategy. Reference conditioning concatenates pre-encoded reference | |
| # video latents to the target sequence. Reference tokens participate in | |
| # bidirectional self-attention but receive no noise and are excluded from loss. | |
| training_strategy: | |
| # Strategy name: "flexible" for the unified conditioning framework | |
| # Supports all training modes (T2V, I2V, V2V, A2V, V2A, etc.) through | |
| # modality-specific configuration blocks. | |
| name: "flexible" | |
| # Video modality configuration | |
| video: | |
| # Whether the model generates video (true) or uses it as frozen conditioning (false) | |
| is_generated: true | |
| # Directory name (within preprocessed_data_root) containing target video latents | |
| latents_dir: "latents" | |
| # Conditions applied to the video modality during training | |
| conditions: | |
| # Reference conditioning (IC-LoRA): concatenates pre-encoded reference video | |
| # latents to the target sequence. The model learns to transform the reference | |
| # into the target video based on the text prompt. | |
| - type: reference | |
| # Directory name (within preprocessed_data_root) containing reference video latents | |
| # These are the conditioning inputs that guide the transformation | |
| latents_dir: "reference_latents" | |
| # Probability of applying reference conditioning per training sample | |
| probability: 1.0 | |
| # Optional first-frame conditioning to improve I2V capabilities | |
| # At low probability, this teaches the model to also accept first-frame input | |
| - type: first_frame | |
| probability: 0.2 | |
| # ----------------------------------------------------------------------------- | |
| # Optimization Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Controls the training optimization parameters. | |
| optimization: | |
| # Learning rate for the optimizer | |
| # Typical range for LoRA: 1e-5 to 1e-4 (note: the value set below, 2e-4, sits above this range) | |
| learning_rate: 2e-4 | |
| # Total number of training steps | |
| steps: 3000 | |
| # Batch size per GPU | |
| # Reduce if running out of memory | |
| batch_size: 1 | |
| # Number of gradient accumulation steps | |
| # Effective batch size = batch_size * gradient_accumulation_steps * num_gpus | |
| gradient_accumulation_steps: 1 | |
| # Maximum gradient norm for clipping (helps training stability) | |
| max_grad_norm: 1.0 | |
| # Optimizer type: "adamw" (standard) or "adamw8bit" (memory-efficient) | |
| optimizer_type: "adamw" | |
| # Learning rate scheduler type | |
| # Options: "constant", "linear", "cosine", "cosine_with_restarts", "polynomial" | |
| scheduler_type: "linear" | |
| # Additional scheduler parameters (depends on scheduler_type) | |
| scheduler_params: { } | |
| # Enable gradient checkpointing to reduce memory usage | |
| # Recommended for training with limited GPU memory | |
| enable_gradient_checkpointing: true | |
| # ----------------------------------------------------------------------------- | |
| # Acceleration Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Hardware acceleration and memory optimization settings. | |
| acceleration: | |
| # Mixed precision training mode | |
| # Options: "no" (fp32), "fp16" (half precision), "bf16" (bfloat16, recommended) | |
| mixed_precision_mode: "bf16" | |
| # Model quantization for reduced memory usage | |
| # Options: null (none), "int8-quanto", "int4-quanto", "int2-quanto", "fp8-quanto", "fp8uz-quanto" | |
| quantization: null | |
| # Load text encoder in 8-bit precision to save memory | |
| # Useful when GPU memory is limited | |
| load_text_encoder_in_8bit: false | |
| # Offload optimizer state to CPU during validation video sampling and restore it after. | |
| # Frees VRAM for the VAE decoder when optimizer state is large (full fine-tune, high-rank | |
| # LoRA). No effect under FSDP (sharded state). | |
| offload_optimizer_during_validation: false | |
| # ----------------------------------------------------------------------------- | |
| # Data Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Specifies the training data location and loading parameters. | |
| data: | |
| # Root directory containing preprocessed training data | |
| # Should contain: latents/, conditions/, and reference_latents/ subdirectories | |
| preprocessed_data_root: "/path/to/preprocessed/data" | |
| # Number of worker processes for data loading | |
| # Workers run in parallel to speed up data loading | |
| num_dataloader_workers: 2 | |
| # ----------------------------------------------------------------------------- | |
| # Validation Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Controls validation sampling during training. | |
| # NOTE: Validation sampling uses simplified inference pipelines and prioritizes speed over | |
| # maximum quality. For production-quality inference, use `packages/ltx-pipelines`. | |
| validation: | |
| # Validation samples — each sample describes a self-contained generation request. | |
| # For IC-LoRA, each sample includes a reference condition pointing to the conditioning video. | |
| samples: | |
| - prompt: >- | |
| A man in a casual blue jacket walks along a winding path through a lush green park on a | |
| bright sunny afternoon. Tall oak trees line the pathway, their leaves rustling gently in | |
| the breeze. Dappled sunlight creates shifting patterns on the ground as he strolls at a | |
| relaxed pace, occasionally looking up at the scenery around him. The audio captures | |
| footsteps on gravel, birds singing in the trees, distant children playing, and the soft | |
| whisper of wind through the foliage. | |
| conditions: | |
| - type: reference | |
| video: "/path/to/reference_video_1.mp4" | |
| # Set these to match --reference-downscale-factor / --reference-temporal-scale-factor | |
| # if reference latents were preprocessed at reduced spatial or temporal resolution. | |
| downscale_factor: 1 | |
| temporal_scale_factor: 1 | |
| include_in_output: true | |
| - prompt: >- | |
| A fluffy orange tabby cat sits perfectly still on a wooden windowsill, its green eyes | |
| intently tracking small birds hopping on a branch just outside the glass. The cat's ears | |
| twitch and rotate, following every movement. Warm afternoon light illuminates its fur, | |
| creating a soft golden glow. Behind the cat, a cozy living room with a bookshelf and | |
| houseplants is visible. The audio features gentle purring, occasional soft meows, muffled | |
| bird chirps through the window, and quiet ambient room sounds. | |
| conditions: | |
| - type: reference | |
| video: "/path/to/reference_video_2.mp4" | |
| # Set these to match --reference-downscale-factor / --reference-temporal-scale-factor | |
| # if reference latents were preprocessed at reduced spatial or temporal resolution. | |
| downscale_factor: 1 | |
| temporal_scale_factor: 1 | |
| include_in_output: true | |
| # Negative prompt to avoid unwanted artifacts | |
| negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted" | |
| # Output video dimensions [width, height, frames] | |
| # Width and height must be divisible by 32 | |
| # Frames must satisfy: frames % 8 == 1 (e.g., 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, ...) | |
| video_dims: [ 512, 512, 81 ] | |
| # Frame rate for generated videos | |
| frame_rate: 25.0 | |
| # Random seed for reproducible validation outputs | |
| seed: 42 | |
| # Number of denoising steps for validation inference | |
| # Higher values = better quality but slower generation | |
| inference_steps: 30 | |
| # Generate validation videos every N training steps | |
| # Set to null to disable validation during training | |
| interval: 100 | |
| # Classifier-free guidance scale | |
| # Higher values = stronger adherence to prompt but may introduce artifacts | |
| guidance_scale: 4.0 | |
| # STG (Spatio-Temporal Guidance) parameters for improved video quality | |
| # STG is combined with CFG for better temporal coherence | |
| stg_scale: 1.0 # Recommended: 1.0 (0.0 disables STG) | |
| stg_blocks: [29] # Recommended: single block 29 | |
| stg_mode: "stg_v" # "stg_v" for video-only (no audio training) | |
| # Whether to generate audio in validation samples | |
| # Can be enabled even when not training the audio branch | |
| generate_audio: false | |
| # Skip validation at the beginning of training (step 0) | |
| skip_initial_validation: false | |
| # ----------------------------------------------------------------------------- | |
| # Checkpoint Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Controls model checkpoint saving during training. | |
| checkpoints: | |
| # Save a checkpoint every N steps | |
| # Set to null to disable intermediate checkpoints | |
| interval: 250 | |
| # Number of most recent checkpoints to keep | |
| # Set to -1 to keep all checkpoints | |
| keep_last_n: 3 | |
| # Precision to use when saving checkpoint weights | |
| # Options: "bfloat16" (default, smaller files) or "float32" (full precision) | |
| precision: "bfloat16" | |
| # ----------------------------------------------------------------------------- | |
| # Flow Matching Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Parameters for the flow matching training objective. | |
| flow_matching: | |
| # Timestep sampling mode | |
| # "shifted_logit_normal" is recommended for LTX-2 models | |
| timestep_sampling_mode: "shifted_logit_normal" | |
| # Additional parameters for timestep sampling | |
| timestep_sampling_params: { } | |
| # ----------------------------------------------------------------------------- | |
| # Hugging Face Hub Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Settings for uploading trained models to the Hugging Face Hub. | |
| hub: | |
| # Whether to push the trained model to the Hub | |
| push_to_hub: false | |
| # Repository ID on Hugging Face Hub (e.g., "username/my-ic-lora-model") | |
| # Required if push_to_hub is true | |
| hub_model_id: null | |
| # ----------------------------------------------------------------------------- | |
| # Weights & Biases Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Settings for experiment tracking with W&B. | |
| wandb: | |
| # Enable W&B logging | |
| enabled: false | |
| # W&B project name | |
| project: "ltx-2-trainer" | |
| # W&B username or team (null uses default account) | |
| entity: null | |
| # Tags to help organize runs | |
| tags: [ "ltx2", "ic-lora", "v2v" ] | |
| # Log validation media (video/audio) to W&B | |
| log_validation_videos: true | |
| # ----------------------------------------------------------------------------- | |
| # General Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Global settings for the training run. | |
| # Random seed for reproducibility | |
| seed: 42 | |
| # Directory to save outputs (checkpoints, validation videos, logs) | |
| output_dir: "outputs/v2v_ic_lora" | |
Xet Storage Details
- Size:
- 15.6 kB
- Xet hash:
- ecf0e1ae614afc44f38e877728c9f551c8ae227250b7e98c03736a5f7f7ee650
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.