---
# Training configuration: model architecture, button/action set, trainer
# settings and experiment logging.
#
# NOTE(review): this file was recovered from a copy whose line breaks had been
# collapsed. The nesting below is reconstructed from key order and the inline
# comments -- in particular, that input_size/ch_0/ch_max belong to `encoder`,
# that `buttons` belongs to `model`, that `use_log1p_scaling` belongs to
# `train`, and that `logging` is a top-level section. Confirm against the
# config loader's schema.

model:
  # NOTE(review): train.model_id below is "simple" while this is "vpt" --
  # confirm the two ids are meant to differ.
  model_id: vpt
  middle_only: false

  # Simple encoder config (spatial conv + temporal transformer)
  encoder:
    d_model: 1024
    n_heads: 16
    n_layers: 12
    max_seq_len: 128  # Maximum temporal sequence length
    input_size: 128  # Spatial resolution of input frames (assumes square)
    ch_0: 64
    ch_max: 512

  # Extended button configuration with keyboard and mouse inputs.
  # Each entry has an input device `type`, a numeric `code` and a display
  # `label`.
  # NOTE(review): the codes look like Windows virtual-key codes (87 = W,
  # 160 = LShift, 1/2 = left/right mouse button) -- confirm with the recorder.
  buttons:
    # Movement keys
    - type: keyboard
      code: 87  # W keycode
      label: "W"
    - type: keyboard
      code: 65  # A keycode
      label: "A"
    - type: keyboard
      code: 83  # S keycode
      label: "S"
    - type: keyboard
      code: 68  # D keycode
      label: "D"
    - type: keyboard
      code: 160  # Shift
      label: "LShift"
    - type: keyboard
      code: 70  # F
      label: "F"
    - type: mouse
      code: 1  # Left mouse button
      label: "LMB"
    - type: mouse
      code: 2  # Right mouse button
      label: "RMB"
    - type: keyboard
      code: 32  # Space
      label: "Space"
    - type: keyboard
      code: 82  # R
      label: "R"
    - type: keyboard
      code: 69  # E
      label: "E"
    - type: keyboard
      code: 86  # V
      label: "V"
    - type: keyboard
      code: 67  # C
      label: "C"
    - type: keyboard
      code: 17  # Ctrl
      label: "Ctrl"
    - type: keyboard
      code: 49  # 1
      label: "1"
    - type: keyboard
      code: 50  # 2
      label: "2"
    - type: keyboard
      code: 51  # 3
      label: "3"
    - type: keyboard
      code: 73  # I
      label: "I"
    - type: keyboard
      code: 9  # Tab
      label: "Tab"
    - type: keyboard
      code: 27  # Esc
      label: "Esc"

train:
  model_id: simple
  trainer_id: basic

  # Training data
  data_dir: /mnt/data/idm_data
  target_size: [128, 128]
  window_length: 32
  batch_size: 16

  # Data used for periodic sampling (same directory as the training data)
  sample_data_dir: /mnt/data/idm_data
  n_samples: 4
  # NOTE(review): 512 exceeds model.encoder.max_seq_len (128) -- presumably
  # sampling processes the long window in chunks; confirm.
  sample_window_length: 512

  epochs: 1000

  # Optimizer
  opt: AdamW
  opt_kwargs:
    lr: 1.0e-4
    betas: [0.9, 0.95]
    eps: 1.0e-15
    weight_decay: 1.0e-2

  # Checkpointing
  checkpoint_dir: ./checkpoints/simpler_vpt
  output_path: ./checkpoints/simpler_vpt/ema
  resume_ckpt: null  # latest

  sample_interval: 500
  save_interval: 5000

  # Use log1p scaling for mouse inputs
  use_log1p_scaling: true

logging:
  name: shahbuland
  project: new-idms-vpt
  run_name: baseline-32frame-vpt