| tags: null |
| seed: 7 |
| resume_ckpt: null |
| output_dir: ${hydra:runtime.output_dir} |
| dataset_stats_cache_dir: ${oc.env:GALAXEA_FM_DATASET_STATS_CACHE_DIR} |
| min_batch_size: 1 |
| max_batch_size: 256 |
| num_test_steps: 3 |
| checkpointing_steps: 5000 |
| logger: |
| type: swanlab |
| log_steps: 10 |
| task: ${hydra:runtime.choices.task} |
| project: ${split:${logger.task},0} |
| experiment_name: ${split:${logger.task},-1} |
| mode: cloud |
| workspace: Galaxea-AI |
| dir: null |
| batch_size_val: 16 |
| eval_episodes_num: 1 |
| ckpt_path: null |
| env: R1ProBlocksStackEasy |
| target_controller_type: bimanual_relaxed_ik |
| edp: |
| card: null |
| training_time: ${now:%Y-%m-%d}_${now:%H-%M-%S} |
| git_branch: null |
| git_commit: null |
| root: null |
| repo_ids: null |
| save_dir: ${output_dir} |
| tags: ${tags} |
| max_steps: ${model.max_steps} |
| batch_size: ${model.batch_size} |
| EVALUATION: |
| task_suite_names: |
| - libero_10 |
| - libero_spatial |
| - libero_object |
| - libero_goal |
| num_steps_wait: 10 |
| replan_steps: 5 |
| num_trials: 50 |
| output_dir: ${output_dir} |
| run_id_note: null |
| env_num: 50 |
| data: |
| dataset: |
| _target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset |
| dataset_dirs: |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1225/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251224_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251224_6011_B1_007 |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1225/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251224_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251224_v2.0_6011_B1_007 |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data1225/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251225_6011_B1_007_v20251226_101622 |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data1225/Beijing_Demo_Handover_Gift_And_Box_Moving_Hands_251225_v2.0_6011_B1_007_v20251226_101627 |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1226/Beijing_Demo_Handover_Gift_And_Box_Moving_Gifts_251226_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Moving_Gifts_251226_v2.0_6011_B1_007 |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1227/Beijing_Demo_Handover_Gift_And_Box_Fallen_Gifts_251227_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Fallen_Gifts_251227_v2.0_6011_B1_007 |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1229/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251229_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251229_v2.0_6011_B1_007 |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1230/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007 |
| - /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1230/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007-2/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007-2 |
| shape_meta: |
| action: |
| - key: left_ee_pose |
| raw_shape: 7 |
| shape: 9 |
| - key: left_gripper |
| raw_shape: 1 |
| shape: 1 |
| - key: right_ee_pose |
| raw_shape: 7 |
| shape: 9 |
| - key: right_gripper |
| raw_shape: 1 |
| shape: 1 |
| - key: torso |
| raw_shape: 4 |
| shape: 4 |
| state: |
| - key: left_ee_pose |
| raw_shape: 7 |
| shape: 9 |
| - key: left_gripper |
| raw_shape: 1 |
| shape: 1 |
| - key: right_ee_pose |
| raw_shape: 7 |
| shape: 9 |
| - key: right_gripper |
| raw_shape: 1 |
| shape: 1 |
| - key: torso |
| raw_shape: 4 |
| shape: 4 |
| images: |
| - key: head_rgb |
| raw_shape: |
| - 3 |
| - 360 |
| - 640 |
| shape: |
| - 3 |
| - ${model.model_arch.input_image_size.0} |
| - ${model.model_arch.input_image_size.1} |
| - key: left_wrist_rgb |
| raw_shape: |
| - 3 |
| - 480 |
| - 640 |
| shape: |
| - 3 |
| - ${model.model_arch.input_image_size.0} |
| - ${model.model_arch.input_image_size.1} |
| - key: right_wrist_rgb |
| raw_shape: |
| - 3 |
| - 480 |
| - 640 |
| shape: |
| - 3 |
| - ${model.model_arch.input_image_size.0} |
| - ${model.model_arch.input_image_size.1} |
| action_size: 32 |
| past_action_size: 0 |
| obs_size: 1 |
| ee_start_moving_thresh: 0.002 |
| val_set_proportion: 0.05 |
| processor: |
| _target_: galaxea_fm.processors.base_processor.BaseProcessor |
| shape_meta: ${data.dataset.shape_meta} |
| num_obs_steps: ${data.dataset.obs_size} |
| action_state_transforms: |
| - _target_: galaxea_fm.transforms.relative_action.RelativePoseTransform |
| keys: |
| - left_ee_pose |
| - right_ee_pose |
| - _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform |
| keys: |
| - torso |
| - _target_: galaxea_fm.transforms.rotation.PoseRotationTransform |
| rotation_type: rotation_6d |
| category_keys: |
| action: |
| - left_ee_pose |
| - right_ee_pose |
| state: |
| - left_ee_pose |
| - right_ee_pose |
| use_stepwise_action_norm: true |
| norm_default_mode: q01/q99 |
| norm_exception_mode: |
| action: |
| left_gripper: 0/100 |
| right_gripper: 0/100 |
| action_state_merger: |
| _target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign |
| train_transforms: |
| head_rgb: |
| - _target_: torchvision.transforms.Resize |
| size: ${model.model_arch.input_image_size} |
| - _target_: galaxea_fm.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Normalize |
| mean: |
| - 0.5 |
| - 0.5 |
| - 0.5 |
| std: |
| - 0.5 |
| - 0.5 |
| - 0.5 |
| left_wrist_rgb: ${data.processor.train_transforms.head_rgb} |
| right_wrist_rgb: ${data.processor.train_transforms.head_rgb} |
| val_transforms: |
| head_rgb: |
| - _target_: torchvision.transforms.Resize |
| size: ${model.model_arch.input_image_size} |
| - _target_: galaxea_fm.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Normalize |
| mean: |
| - 0.5 |
| - 0.5 |
| - 0.5 |
| std: |
| - 0.5 |
| - 0.5 |
| - 0.5 |
| left_wrist_rgb: ${data.processor.val_transforms.head_rgb} |
| right_wrist_rgb: ${data.processor.val_transforms.head_rgb} |
| drop_high_level_prob: 1.0 |
| use_zh_instruction: false |
| num_output_images: 3 |
| action_output_dim: 24 |
| proprio_output_dim: 24 |
| model: |
| pretrained_ckpt: /efm-nas/efm-nas/group-yaq/ziyang.jiao/model_res/real/r1pro_g0tiny_pretrain/2026-01-20_10-12-35/checkpoints/step_390000.pt |
| use_pretrained_norm_stats: true |
| model_weights_to_bf16: false |
| enable_bf16_training: true |
| use_torch_compile: false |
| find_unused_parameters: true |
| batch_size: 20 |
| num_workers: 12 |
| pin_memory: true |
| persistent_workers: true |
| max_epochs: null |
| max_steps: 50000 |
| grad_accumulation_steps: 1 |
| use_8bit_optimizer: false |
| learning_rate: 6.0e-05 |
| weight_decay: 0.001 |
| betas: |
| - 0.9 |
| - 0.95 |
| lr_scheduler_type: cosine |
| warmup_steps: 480 |
| max_grad_norm: 1.0 |
| use_ema: false |
| ema: |
| update_after_step: 0 |
| power: 0.67 |
| use_sync_bn: false |
| model_arch: |
| _target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy |
| model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero |
| tokenizer: |
| _target_: galaxea_fm.models.vla_tiny.smolvlm2.tokenizer.SmolVLM2Tokenizer |
| tokenizer_params: |
| pretrained_model_name_or_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct |
| local_files_only: true |
| pad_token_id: ${model.model_arch.pad_token_id} |
| image_token_index: ${model.model_arch.image_token_index} |
| max_text_tokens: ${model.model_arch.max_text_tokens} |
| num_tokens_per_image: ${model.model_arch.vision.num_image_tokens} |
| num_input_images: ${model.model_arch.num_input_images} |
| pretrained_model_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct |
| vla_training_strategy: vla-full-train |
| backbone_lr_multiplier: 0.1 |
| image_token_index: 49190 |
| pad_token_id: 2 |
| vocab_size: 49280 |
| fill_padded_with_token: true |
| embed_token_key_prefix: model.text_model.embed_tokens |
| cond_steps: ${data.dataset.obs_size} |
| horizon_steps: ${data.dataset.action_size} |
| max_text_tokens: 55 |
| max_image_text_tokens: ${eval:'${model.model_arch.num_input_images} * (${model.model_arch.vision.num_image_tokens} |
| + 3) + ${model.model_arch.max_text_tokens}'} |
| num_input_images: ${eval:'${model.model_arch.cond_steps} * ${data.processor.num_output_images}'} |
| input_image_size: |
| - ${model.model_arch.vision.image_size} |
| - ${model.model_arch.vision.image_size} |
| final_action_clip_value: null |
| action_dim: ${data.processor.action_output_dim} |
| proprio_dim: ${data.processor.proprio_output_dim} |
| action_decoder_layers: 2 |
| action_expert_adaptive_mode: null |
| flow_sampling: beta |
| num_inference_steps: 10 |
| vision: |
| name: galaxea_fm.models.vla_tiny.smolvlm2.smolvlm2_vision.SmolVLMVisionTransformer |
| key_prefix: model.vision_model |
| hidden_size: 768 |
| intermediate_size: 3072 |
| num_hidden_layers: 12 |
| num_attention_heads: 12 |
| num_channels: 3 |
| image_size: 512 |
| patch_size: 16 |
| layer_norm_eps: 1.0e-06 |
| attention_dropout: 0.0 |
| num_image_tokens: 64 |
| vision_projector: |
| name: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMConnector |
| key_prefix: model.connector |
| vision_config: |
| scale_factor: 4 |
| hidden_size: 768 |
| projection_dim: ${model.model_arch.joint.mixture.vlm.hidden_size} |
| num_input_images: ${model.model_arch.num_input_images} |
| text_config: |
| hidden_size: ${model.model_arch.joint.mixture.vlm.hidden_size} |
| joint: |
| name: galaxea_fm.models.galaxea_zero.joint_model.JointModel |
| key_prefix: model.text_model |
| action_expert_adaptive_mode: null |
| module_names: |
| mlp: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextMLP |
| norm: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRMSNorm |
| rope: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRotaryEmbedding |
| mixture: |
| vlm: |
| hidden_size: 960 |
| intermediate_size: 2560 |
| use_final_norm: true |
| cache: true |
| proprio: |
| hidden_size: 720 |
| intermediate_size: 2048 |
| use_final_norm: true |
| cache: true |
| adaptive_mode: null |
| action: |
| hidden_size: 720 |
| intermediate_size: 2048 |
| use_final_norm: true |
| cache: false |
| adaptive_mode: null |
| time_hidden_size: 256 |
| num_hidden_layers: 16 |
| num_attention_heads: 15 |
| num_key_value_heads: 5 |
| head_dim: 64 |
| max_position_embeddings: 8192 |
| rms_norm_eps: 1.0e-05 |
| rope_theta: 100000.0 |
| attention_bias: false |
| attention_dropout: 0.0 |
|
|