Robotics
ONNX
English
Chinese
real-world
dual-arm
whole body control
manipulation
File size: 7,532 Bytes
77e28ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# Top-level run and training settings.
# NOTE(review): the code that reads these keys is not visible in this file;
# the groupings below are inferred from key names only - confirm against the
# training script before relying on them.
#
# Run label, log directory and random seed.
TAG: debug
LOG_DIR: tensorboard_logs
seed: 7
# Backbone identifier (same string as MODEL.vla_name), dataset location/name
# and output directories.
# NOTE(review): data_root_dir mentions "334_tasks" while wandb_project and
# exp_name below mention "376_tasks" - confirm which is intended.
vla_path: paligemma-3b-pt-224
data_root_dir: /galaxea_dataset/galaxea/pp_project/rlds_334_tasks_distributed/
dataset_name: bbox_training_r1_lite_5_parts
run_root_dir: runs/base
adapter_tmp_dir: adapter_tmp_weights
# Absolute, site-specific filesystem paths; hf_token points at a file rather
# than holding a token value inline.
hf_token: /galaxea_fulltime/share/.hf_token
ckpt: /galaxea_fulltime/pretrained_ckpts/pi0_libero/pi0_torch_state.pt
# LoRA / quantization / precision switches. lora_rank and lora_dropout are
# referenced from the MODEL.*.lora sections via ${lora_rank} / ${lora_dropout}.
use_lora: false
lora_rank: 32
lora_dropout: 0.0
use_quantization: false
enable_bf16: true
model_param_to_bf16: false
vla_training_strategy: vla-full-train
# Optimizer and learning-rate schedule values.
weight_decay: 1.0e-06
batch_size: 4
grad_accumulation_steps: 1
learning_rate: 2.5e-05
warmup_steps: 500
lr_scheduler_type: cosine
# Augmentation toggle, run length, checkpoint/log cadence.
# NOTE(review): units of save_steps / log_steps (optimizer steps vs. batches)
# are not visible here - confirm.
image_aug: true
max_epochs: 8
save_steps: 23523
log_steps: 100
use_torch_compile: false
# Weights & Biases project/entity and experiment name.
wandb_project: 1101_pnp_rla_image_condition_376_tasks_5_parts
wandb_entity: cuijianning1996-galaxea-ai
exp_name: 376_tasks_img_as_cond_with_randomly_rotated_bbox
# EMA is disabled in this config; the ema sub-keys are presumably only read
# when use_ema is true - verify against the trainer.
use_ema: false
ema:
  update_after_step: 0
  power: 0.67
# Dataset and data-pipeline settings.
DATASET:
  # Per-part enable flags for the robot.
  robot_cfg:
    with_left_arm: true
    with_right_arm: true
    with_torso: false
    with_chassis: false
  use_relative_joint_action: true
  window_size: 1
  future_action_window_size: 31
  # MODEL.num_input_images is derived from this list:
  # window_size * len(camera_views).
  camera_views: [head_condition, head, wrist_left, wrist_right]
  shuffle_buffer_size: 10000
  balance_weights: false
  use_last_action: false
  share_datasets_statistics: true
  short_prompt: true
  aug_instruction_kwargs:
    drop_high_level_prob: 1.0
    bbox_as_instruction: false
    image_condition: true
    # Single-line string; folded here only to keep the line short.
    image_condition_lang_prefix: >-
      Pick the object in the first image and place into the tableware.
    bbox_jitter_ratio: 0.0
  action_proprio_normalization_type: normal
  use_pretrained_data_stats: false
  proprio_noise_std: 0.05
  # Per-camera image augmentation. Only head, wrist_left and wrist_right have
  # entries; head_condition has none. The two wrist cameras share identical
  # settings and differ from head only by the extra random_drop_all_image
  # entry, which is listed first in their augment_order.
  image_augment_kwargs:
    head:
      random_brightness: [0.2]
      random_contrast: [0.8, 1.2]
      random_saturation: [0.8, 1.2]
      random_hue: [0.05]
      augment_order:
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue
    wrist_left:
      random_brightness: [0.2]
      random_contrast: [0.8, 1.2]
      random_saturation: [0.8, 1.2]
      random_hue: [0.05]
      random_drop_all_image: [0.3]
      augment_order:
        - random_drop_all_image
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue
    wrist_right:
      random_brightness: [0.2]
      random_contrast: [0.8, 1.2]
      random_saturation: [0.8, 1.2]
      random_hue: [0.05]
      random_drop_all_image: [0.3]
      augment_order:
        - random_drop_all_image
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue
model_family: galaxea_zero
# Model definition.
# Values written as ${...} are config interpolations resolved at load time;
# `eval` is presumably a custom resolver registered by the project - verify.
# Booleans are written lowercase and nulls explicitly, matching the rest of
# this file.
MODEL:
  name: vla.galaxea_zero.GalaxeaZeroWrapper
  vla_name: "paligemma-3b-pt-224"
  load_inside: false
  pretrained_model_path: /galaxea_fulltime/pretrained_ckpts/cache/paligemma-3b-pt-224
  input_ids: true
  action_expert_only: false
  image_token_index: 257152
  vocab_size: 257216
  pad_token_id: 0
  cond_steps: 1  # len proprio
  horizon_steps: 32
  action_dim: 26  # 2 x [QPOS (6) + gripper (1)] + Torso Velocity (6) + Chassis Velocity (6)
  # 2 * [QPOS (6) + gripper (1)] + 4 (torso) + 3 (base vel) = 21.
  # NOTE(review): the earlier comment also listed "+ last action(26)"; that
  # term is not part of 21 (DATASET.use_last_action is false in this file) -
  # confirm the expected value when use_last_action is enabled.
  proprio_dim: 21
  max_text_tokens: 55  # 55 for galaxea0002
  # num_input_images * vision.num_image_tokens + max_text_tokens
  max_seq_len: ${eval:'${MODEL.num_input_images} * ${MODEL.vision.num_image_tokens} + ${MODEL.max_text_tokens}'}
  max_image_text_tokens: ${MODEL.max_seq_len}  # = ${max_seq_len}
  action_decoder_layers: 2
  flow_sampling: beta
  num_inference_steps: 10
  final_action_clip_value: null  # data normalized in [-1,1]
  action_expert_adaptive_mode: null
  # DATASET.window_size * len(DATASET.camera_views)
  num_input_images: ${eval:'${DATASET.window_size} * len(${DATASET.camera_views})'}
  vision:
    name: vla.model.paligemma.siglip.SiglipVisionModel
    hidden_size: 1152  # siglip
    intermediate_size: 4304
    num_hidden_layers: 27
    num_attention_heads: 16
    num_channels: 3
    image_size: 224
    patch_size: 14
    layer_norm_eps: 0.000001
    attention_dropout: 0.0
    num_image_tokens: 256
    lora:
      r: ${lora_rank}
      dropout: ${lora_dropout}
    use_quantize: false
    use_lora: false
  vision_projector:
    name: vla.model.paligemma.siglip.PaliGemmaMultiModalProjector
    vision_config:
      hidden_size: 1152
      projection_dim: 2048
    lora:
      r: ${lora_rank}
      dropout: ${lora_dropout}
    use_quantize: false
    use_lora: false
  joint:
    name: vla.model.g0.joint_model.JointModel
    action_expert_adaptive_mode: null
    mixture:
      vlm:  # gemma
        hidden_size: 2048
        intermediate_size: 16384
        use_final_norm: false
        cache: true
        use_quantize: false
        use_lora: false
        adaptive_mode: null  # not applicable for gemma
      proprio:
        hidden_size: 1024
        intermediate_size: 4096
        use_final_norm: true  # technically no, but sharing weights with action anyway
        cache: true
        use_quantize: false
        use_lora: false
        adaptive_mode: null
      action:
        hidden_size: 1024
        intermediate_size: 4096
        use_final_norm: true
        cache: false
        use_quantize: false
        use_lora: false
        adaptive_mode: null
    time_hidden_size: 256  # only applicable if using adaptive
    lora:
      r: ${lora_rank}
      dropout: ${lora_dropout}
    num_hidden_layers: 18
    num_attention_heads: 8
    num_key_value_heads: 1
    head_dim: 256
    max_position_embeddings: 8192
    rms_norm_eps: 0.000001
    rope_theta: 10000.0
    attention_bias: false
    attention_dropout: 0.0
    pad_token_id: 0
#################################################################################################################
# For evaluation
#################################################################################################################
EVALUATION:
  checkpoint: null                           # Pretrained checkpoint path
  load_in_8bit: false                        # (For OpenVLA only) Load with 8-bit quantization
  load_in_4bit: false                        # (For OpenVLA only) Load with 4-bit quantization
  center_crop: true                          # Center crop? (if trained w/ random crop image aug)
  #################################################################################################################
  # Environment-specific parameters (the option list below was written for LIBERO)
  #################################################################################################################
  # Task suite. LIBERO options: libero_spatial, libero_object, libero_goal, libero_10, libero_90.
  # NOTE(review): the configured value is not one of the listed options - confirm the evaluator accepts it.
  task_suite_name: "simpler_widowx"
  num_steps_wait: 10                         # Number of steps to wait for objects to stabilize in sim
  num_trials_per_task: 24                    # Number of rollouts per task
  use_wrist_image: false
  #################################################################################################################
  # Utils
  #################################################################################################################
  # Extra note to add in run ID for logging. Must be `null` for "no note":
  # a bare `None` is parsed by YAML as the string "None", not as null.
  run_id_note: null
  local_log_dir: "./experiments/logs"        # Local directory for eval logs
  use_wandb: false                           # Whether to also log results in Weights & Biases
  seed: 7                                    # Random Seed (for reproducibility)