| model: | |
| model_id: vpt | |
| middle_only: false | |
| # Simple encoder config (spatial conv + temporal transformer) | |
| encoder: | |
| d_model: 1024 | |
| n_heads: 16 | |
| n_layers: 12 | |
| max_seq_len: 128 # Maximum temporal sequence length | |
| input_size: 128 # Spatial resolution of input frames (assumes square) | |
| ch_0: 64 | |
| ch_max: 512 | |
| # Extended button configuration with keyboard and mouse inputs | |
| buttons: | |
| # Movement keys | |
| - type: keyboard | |
| code: 87 # W keycode | |
| label: "W" | |
| - type: keyboard | |
| code: 65 # A keycode | |
| label: "A" | |
| - type: keyboard | |
| code: 83 # S keycode | |
| label: "S" | |
| - type: keyboard | |
| code: 68 # D keycode | |
| label: "D" | |
| - type: keyboard | |
| code: 160 # Left Shift (VK_LSHIFT) | |
| label: "LShift" | |
| - type: keyboard | |
| code: 70 # F | |
| label: "F" | |
| - type: mouse | |
| code: 1 # Left mouse button | |
| label: "LMB" | |
| - type: mouse | |
| code: 2 # Right mouse button | |
| label: "RMB" | |
| - type: keyboard | |
| code: 32 # Space | |
| label: "Space" | |
| - type: keyboard | |
| code: 82 # R | |
| label: "R" | |
| - type: keyboard | |
| code: 69 # E | |
| label: "E" | |
| - type: keyboard | |
| code: 86 # V | |
| label: "V" | |
| - type: keyboard | |
| code: 67 # C | |
| label: "C" | |
| - type: keyboard | |
| code: 17 # Ctrl | |
| label: "Ctrl" | |
| - type: keyboard | |
| code: 49 # 1 | |
| label: "1" | |
| - type: keyboard | |
| code: 50 # 2 | |
| label: "2" | |
| - type: keyboard | |
| code: 51 # 3 | |
| label: "3" | |
| - type: keyboard | |
| code: 73 # I | |
| label: "I" | |
| - type: keyboard | |
| code: 9 # Tab | |
| label: "Tab" | |
| - type: keyboard | |
| code: 27 # Esc | |
| label: "Esc" | |
| train: | |
| model_id: simple | |
| trainer_id: basic | |
| data_dir: /mnt/data/idm_data | |
| target_size: [128, 128] | |
| window_length: 32 | |
| batch_size: 16 | |
| sample_data_dir: /mnt/data/idm_data | |
| n_samples: 4 | |
| sample_window_length: 512 | |
| epochs: 1000 | |
| opt: AdamW | |
| opt_kwargs: | |
| lr: 1.0e-4 | |
| betas: [0.9, 0.95] | |
| eps: 1.0e-15 | |
| weight_decay: 1.0e-2 | |
| checkpoint_dir: ./checkpoints/simpler_vpt | |
| output_path: ./checkpoints/simpler_vpt/ema | |
| resume_ckpt: null #latest | |
| sample_interval: 500 | |
| save_interval: 5000 | |
| # Use log1p scaling for mouse inputs | |
| use_log1p_scaling: true | |
| logging: | |
| name: shahbuland | |
| project: new-idms-vpt | |
| run_name: baseline-32frame-vpt |