owl-idm-vpt-v0 / config.yml
shahbuland's picture
Upload folder using huggingface_hub
04ca33a verified
Raw
History Blame Contribute Delete
2.32 kB
# Model section: model selector, encoder hyperparameters, and the list of
# buttons (keyboard keys and mouse buttons) the model is configured with.
# NOTE(review): the indentation of this file was lost; the nesting below
# (`encoder` and `buttons` as children of `model`) is reconstructed from key
# order and the section comments. Confirm against the config schema.
model:
  model_id: vpt
  middle_only: false

  # Simple encoder config (spatial conv + temporal transformer)
  encoder:
    d_model: 1024
    n_heads: 16
    n_layers: 12
    max_seq_len: 128  # Maximum temporal sequence length
    input_size: 128  # Spatial resolution of input frames (assumes square)
    ch_0: 64
    ch_max: 512

  # Extended button configuration with keyboard and mouse inputs.
  # Each entry has a `type` (keyboard | mouse), a numeric `code`, and a
  # display `label`.
  # NOTE(review): the codes look like Windows virtual-key codes
  # (e.g. 160 = VK_LSHIFT, 1/2 = left/right mouse button); confirm against
  # whatever recorded the input data.
  buttons:
    # Movement keys
    - type: keyboard
      code: 87  # W keycode
      label: "W"
    - type: keyboard
      code: 65  # A keycode
      label: "A"
    - type: keyboard
      code: 83  # S keycode
      label: "S"
    - type: keyboard
      code: 68  # D keycode
      label: "D"
    # Other keys and mouse buttons
    - type: keyboard
      code: 160  # Shift
      label: "LShift"
    - type: keyboard
      code: 70  # F
      label: "F"
    - type: mouse
      code: 1  # Left mouse button
      label: "LMB"
    - type: mouse
      code: 2  # Right mouse button
      label: "RMB"
    - type: keyboard
      code: 32  # Space
      label: "Space"
    - type: keyboard
      code: 82  # R
      label: "R"
    - type: keyboard
      code: 69  # E
      label: "E"
    - type: keyboard
      code: 86  # V
      label: "V"
    - type: keyboard
      code: 67  # C
      label: "C"
    - type: keyboard
      code: 17  # Ctrl
      label: "Ctrl"
    # Labels below are quoted so they stay strings rather than integers.
    - type: keyboard
      code: 49  # 1
      label: "1"
    - type: keyboard
      code: 50  # 2
      label: "2"
    - type: keyboard
      code: 51  # 3
      label: "3"
    - type: keyboard
      code: 73  # I
      label: "I"
    - type: keyboard
      code: 9  # Tab
      label: "Tab"
    - type: keyboard
      code: 27  # Esc
      label: "Esc"
# Training section: data locations, windowing, optimizer, checkpointing and
# periodic sampling settings.
# NOTE(review): the indentation of this file was lost; the nesting below
# (optimizer arguments under `opt_kwargs`, everything else directly under
# `train`) is reconstructed from key order. Confirm against the config schema.
train:
  # NOTE(review): this differs from the `model_id: vpt` given in the model
  # section; confirm which value the trainer actually reads.
  model_id: simple
  trainer_id: basic

  # Training data
  data_dir: /mnt/data/idm_data
  target_size: [128, 128]
  window_length: 32
  batch_size: 16

  # Periodic sampling
  # NOTE(review): sample_data_dir is the same path as data_dir, so samples
  # are drawn from the training data; confirm that is intended.
  sample_data_dir: /mnt/data/idm_data
  n_samples: 4
  # NOTE(review): 512 is larger than the encoder's max_seq_len (128);
  # confirm the sampler handles windows longer than that.
  sample_window_length: 512

  # Optimization
  epochs: 1000
  opt: AdamW
  opt_kwargs:
    lr: 1.0e-4
    betas: [0.9, 0.95]
    eps: 1.0e-15
    weight_decay: 1.0e-2

  # Checkpointing
  checkpoint_dir: ./checkpoints/simpler_vpt
  output_path: ./checkpoints/simpler_vpt/ema
  resume_ckpt: null  # latest
  sample_interval: 500
  save_interval: 5000

  # Use log1p scaling for mouse inputs
  use_log1p_scaling: true
# Logging section: account/entity name, project, and run name for the
# experiment logger.
# NOTE(review): the indentation of this file was lost; the three keys are
# nested under `logging` here. Confirm that `logging` is a top-level section
# and which logging backend consumes it.
logging:
  name: shahbuland
  project: new-idms-vpt
  run_name: baseline-32frame-vpt