Robotics
ONNX
English
Chinese
real-world
dual-arm
whole body control
manipulation
File size: 9,101 Bytes
5f3ded9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# ---- Run-level settings ----
# NOTE(review): presumably the global RNG seed for the run — confirm against the trainer.
seed: 7
# Absolute path to a step checkpoint (step_124838.pt) from an earlier run directory.
# NOTE(review): presumably training resumes from this file — confirm.
resume_ckpt: /vla_fulltime/jianning.cui/code/GalaxeaFM/runs/merge_pipeline/real/r1lite_g0_pp_bbox_400_tasks/2025-12-22_05-53-31/checkpoints/step_124838.pt
# Filled in at runtime by Hydra's built-in `hydra:` resolver; also reused by
# `logger.dir` and `edp.save_dir` through interpolation.
output_dir: ${hydra:runtime.output_dir}
# NOTE(review): presumably the interval, in training steps, between saved checkpoints — confirm.
checkpointing_steps: 17834
# ---- Experiment logger ----
logger:
  type: wandb
  # NOTE(review): presumably the logging interval in training steps — confirm.
  log_steps: 10
  # Name of the selected `task` config-group option, read from Hydra's runtime choices.
  task: ${hydra:runtime.choices.task}
  # `split` is not a built-in OmegaConf/Hydra resolver, so the application must
  # register it before this file is resolved.
  # NOTE(review): judging by the arguments, `project` takes piece 0 and
  # `experiment_name` takes piece -1 of the task name — confirm the separator.
  project: ${split:${logger.task},0}
  experiment_name: ${split:${logger.task},-1}
  mode: online
  workspace: cuijianning1996-galaxea-ai
  # Resolves to a `wandb` subdirectory of the top-level `output_dir`.
  dir: ${output_dir}/wandb
# ---- Validation / evaluation settings ----
# Note: this differs from `model.batch_size` (2).
batch_size_val: 16
eval_episodes_num: 1
# NOTE(review): a third checkpoint path, distinct from `resume_ckpt` and
# `model.pretrained_ckpt`; which component reads it is not visible in this file.
ckpt_path: /data/trt_ckpts/model_state_dict.pt
env: R1ProBlocksStackEasy
target_controller_type: bimanual_relaxed_ik
# Referenced by `edp.tags` through interpolation.
tags: null
# ---- edp ----
# NOTE(review): looks like a run-metadata record (most fields are null here and
# the rest mirror values defined elsewhere) — confirm what consumes it.
edp:
  card: null
  # Built from Hydra's `now` resolver with strftime patterns: YYYY-MM-DD_HH-MM-SS.
  training_time: ${now:%Y-%m-%d}_${now:%H-%M-%S}
  git_branch: null
  git_commit: null
  root: null
  repo_ids: null
  # The three interpolations below mirror the top-level `output_dir` and `tags`
  # and the `model.max_steps` value.
  save_dir: ${output_dir}
  tags: ${tags}
  max_steps: ${model.max_steps}
  # Mirrors `model.batch_size`.
  batch_size: ${model.batch_size}
# ---- Dataset ----
# `_target_` names the class that Hydra instantiates for this section.
data:
  _target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset
  dataset_dirs: null
  # Per-key layout of actions, state and camera images. `model.processor`
  # reads this whole mapping through `${data.shape_meta}`.
  shape_meta:
    # Action keys: 6 + 1 + 6 + 1 = 14 values, matching `model.model_arch.action_dim`.
    action:
    - key: left_arm
      raw_shape: 6
      shape: 6
    - key: left_gripper
      raw_shape: 1
      shape: 1
    - key: right_arm
      raw_shape: 6
      shape: 6
    - key: right_gripper
      raw_shape: 1
      shape: 1
    # State keys: same four keys and sizes as `action`.
    state:
    - key: left_arm
      raw_shape: 6
      shape: 6
    - key: left_gripper
      raw_shape: 1
      shape: 1
    - key: right_arm
      raw_shape: 6
      shape: 6
    - key: right_gripper
      raw_shape: 1
      shape: 1
    # Four image keys. NOTE(review): shapes are presumably
    # [channels, height, width] — confirm against the dataset class.
    images:
    - key: head_condition
      raw_shape: [3, 224, 224]
      shape: [3, 224, 224]
    - key: head_rgb
      raw_shape: [3, 720, 1280]
      shape: [3, 224, 224]
    - key: left_wrist_rgb
      raw_shape: [3, 720, 1280]
      shape: [3, 224, 224]
    - key: right_wrist_rgb
      raw_shape: [3, 720, 1280]
      shape: [3, 224, 224]
  # `action_size` and `obs_size` are reused by `model.model_arch.horizon_steps`
  # and `model.model_arch.cond_steps` / `model.processor.num_obs_steps`.
  action_size: 32
  past_action_size: 0
  obs_size: 1
  ee_start_moving_thresh: 0.0
  val_set_proportion: 0.05
  use_bbox_condition: true
  dataset_root: /galaxea_dataset/galaxea/pp_project/lerobot_with_bbox
  # Both spellings are listed. NOTE(review): presumably prefix matching is
  # case-sensitive — confirm.
  dataset_prefixes: [BENCH, Bench]
# ---- Model, optimisation and preprocessing ----
model:
  # Checkpoint and numeric-precision switches.
  pretrained_ckpt: /galaxea_dataset/mnt/tmp/pp_wt_img_cond/checkpoints/org2fm_v2.pt
  use_pretrained_norm_stats: true
  model_weights_to_bf16: false
  enable_bf16_training: true
  use_torch_compile: false
  find_unused_parameters: false
  # Data-loading settings. `batch_size` is mirrored by `edp.batch_size`.
  batch_size: 2
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  # Training length: `max_steps` is null in this config while `max_epochs` is set;
  # `max_steps` is mirrored by `edp.max_steps`.
  max_epochs: 4
  max_steps: null
  grad_accumulation_steps: 2
  # Optimizer and learning-rate schedule.
  use_8bit_optimizer: false
  learning_rate: 2.5e-05
  weight_decay: 1.0e-06
  betas:
  - 0.9
  - 0.999
  lr_scheduler_type: cosine
  warmup_steps: 500
  max_grad_norm: 1.0
  # `use_ema` is false in this config; the `ema` parameters are still present.
  use_ema: false
  ema:
    update_after_step: 0
    power: 0.67
  use_sync_bn: false
  # ---- Input/output processor ----
  # `_target_` names the class that Hydra instantiates for this section.
  processor:
    _target_: galaxea_fm.processors.galaxea_zero_processor.GalaxeaZeroProcessor
    # Both values are taken from the `data` section.
    shape_meta: ${data.shape_meta}
    num_obs_steps: ${data.obs_size}
    action_state_transforms:
    - _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform
      keys: [left_arm, right_arm]
    use_stepwise_action_norm: true
    # Default normalisation mode, with a per-key exception for the two gripper actions.
    norm_default_mode: z-score
    norm_exception_mode:
      action:
        left_gripper: 0/100
        right_gripper: 0/100
    action_state_merger:
      _target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign
    # Image pipelines. In this config every camera key uses the same three
    # transforms, and `train_transforms` and `val_transforms` are identical:
    # Resize to 224x224, ToTensor, Normalize with mean 0.5 / std 0.5 per channel.
    train_transforms:
      head_condition:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      head_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      left_wrist_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      right_wrist_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
    val_transforms:
      head_condition:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      head_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      left_wrist_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      right_wrist_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
    # Equals the number of image keys in `data.shape_meta.images`; also feeds
    # `model.model_arch.num_input_images`.
    num_output_cameras: 4
    use_zh_instruction: false
    drop_high_level_prob: 1.0
    # Token ids are taken from `model.model_arch`.
    pad_token_id: ${model.model_arch.pad_token_id}
    image_token_index: ${model.model_arch.image_token_index}
    # Same path as `model.model_arch.pretrained_model_path`.
    tokenizer_params:
      pretrained_model_name_or_path: /data/google/paligemma-3b-pt-224
      local_files_only: false
      token: null
    # Token and camera budgets are taken from `model.model_arch`.
    max_text_tokens: ${model.model_arch.max_text_tokens}
    max_image_text_tokens: ${model.model_arch.max_image_text_tokens}
    num_input_cameras: ${model.model_arch.num_input_images}
    num_image_tokens_per_camera: ${model.model_arch.vision.num_image_tokens}
  # ---- Policy architecture ----
  # `_target_` names the class that Hydra instantiates for this section.
  model_arch:
    _target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy
    model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero
    # Same path as `model.processor.tokenizer_params.pretrained_model_name_or_path`.
    pretrained_model_path: /data/google/paligemma-3b-pt-224
    vla_training_strategy: vla-full-train
    backbone_lr_multiplier: 1.0
    # Token ids; `model.processor` reads `image_token_index` and `pad_token_id`
    # from here through interpolation.
    image_token_index: 257152
    pad_token_id: 0
    vocab_size: 257216
    # Both step counts come from the `data` section (obs_size: 1, action_size: 32).
    cond_steps: ${data.obs_size}
    horizon_steps: ${data.action_size}
    max_text_tokens: 55
    # The next value is ONE plain scalar wrapped over two lines; YAML folds the
    # line break into a single space. Keep the continuation line indented and do
    # not insert a comment between the two lines.
    # NOTE(review): `eval` is not a built-in OmegaConf resolver and must be
    # registered by the application; assuming it evaluates the arithmetic, the
    # result is 4 * 256 + 55 = 1079 — confirm.
    max_image_text_tokens: ${eval:'${model.model_arch.num_input_images} * ${model.model_arch.vision.num_image_tokens}
      + ${model.model_arch.max_text_tokens}'}
    # NOTE(review): under the same assumption this is 1 * 4 = 4 — confirm.
    num_input_images: ${eval:'${model.model_arch.cond_steps} * ${model.processor.num_output_cameras}'}
    num_extra_image_tokens_per_camera: 0
    final_action_clip_value: null
    # 14 equals the sum of the action (and state) shapes in `data.shape_meta`: 6 + 1 + 6 + 1.
    action_dim: 14
    proprio_dim: 14
    action_decoder_layers: 2
    action_expert_adaptive_mode: null
    flow_sampling: beta
    num_inference_steps: 10
    # Vision encoder settings. Consistency check: (image_size / patch_size)^2 =
    # (224 / 14)^2 = 256, which equals `num_image_tokens`.
    vision:
      name: galaxea_fm.models.galaxea_zero.paligemma.siglip.SiglipVisionModel
      hidden_size: 1152
      intermediate_size: 4304
      num_hidden_layers: 27
      num_attention_heads: 16
      num_channels: 3
      image_size: 224
      patch_size: 14
      layer_norm_eps: 1.0e-06
      attention_dropout: 0.0
      num_image_tokens: 256
    # Projector settings: `hidden_size` repeats `vision.hidden_size` (1152) and
    # `projection_dim` equals `joint.mixture.vlm.hidden_size` (2048). The values are
    # literal copies, not interpolations, so keep them in sync by hand.
    vision_projector:
      name: galaxea_fm.models.galaxea_zero.paligemma.siglip.PaliGemmaMultiModalProjector
      vision_config:
        hidden_size: 1152
        projection_dim: 2048
    # Joint model over three mixture entries: vlm, proprio and action.
    joint:
      name: galaxea_fm.models.galaxea_zero.joint_model.JointModel
      # Repeats `model_arch.action_expert_adaptive_mode` (both null) as a literal value.
      action_expert_adaptive_mode: null
      mixture:
        # Unlike `proprio` and `action`, this entry has no `adaptive_mode` key.
        vlm:
          hidden_size: 2048
          intermediate_size: 16384
          use_final_norm: false
          cache: true
        proprio:
          hidden_size: 1024
          intermediate_size: 4096
          use_final_norm: true
          cache: true
          adaptive_mode: null
        # Same sizes as `proprio`, but with `cache: false`.
        action:
          hidden_size: 1024
          intermediate_size: 4096
          use_final_norm: true
          cache: false
          adaptive_mode: null
      time_hidden_size: 256
      # Attention settings. Observation: num_attention_heads * head_dim =
      # 8 * 256 = 2048, the `vlm` hidden size.
      # NOTE(review): presumably shared by all three mixture entries — confirm.
      num_hidden_layers: 18
      num_attention_heads: 8
      num_key_value_heads: 1
      head_dim: 256
      max_position_embeddings: 8192
      rms_norm_eps: 1.0e-06
      rope_theta: 10000.0
      attention_bias: false
      attention_dropout: 0.0