---
# Composed Hydra/OmegaConf run configuration.
#
# Values of the form ${...} are interpolations resolved at load time:
#   ${hydra:...} / ${now:...}  - Hydra-provided resolvers
#   ${split:...} / ${eval:...} - presumably custom resolvers registered by the
#                                project; confirm they exist before loading
#   ${a.b.c}                   - absolute reference to another key in this file
#
# NOTE(review): this file was restored from a whitespace-collapsed dump.
# Nesting was rebuilt from the interpolation paths used inside the file
# (e.g. ${logger.task}, ${data.obs_size}, ${model.model_arch.vision.*}) and
# from key order. Places where the parent of a key could not be confirmed
# that way carry their own NOTE(review) comment.

seed: 7
resume_ckpt: /vla_fulltime/jianning.cui/code/GalaxeaFM/runs/merge_pipeline/real/r1lite_g0_pp_bbox_400_tasks/2025-12-22_05-53-31/checkpoints/step_124838.pt
output_dir: ${hydra:runtime.output_dir}
checkpointing_steps: 17834

logger:
  type: wandb
  log_steps: 10
  task: ${hydra:runtime.choices.task}
  # project / experiment_name take the first / last element produced by the
  # `split` resolver applied to the task name.
  project: ${split:${logger.task},0}
  experiment_name: ${split:${logger.task},-1}
  mode: online
  workspace: cuijianning1996-galaxea-ai
  dir: ${output_dir}/wandb

# NOTE(review): the keys from here down to `tags` are placed at root level;
# `tags` must be (edp.tags references ${tags}), the others are inferred.
batch_size_val: 16
eval_episodes_num: 1
ckpt_path: /data/trt_ckpts/model_state_dict.pt
env: R1ProBlocksStackEasy
target_controller_type: bimanual_relaxed_ik
tags: null

edp:
  card: null
  training_time: ${now:%Y-%m-%d}_${now:%H-%M-%S}
  git_branch: null
  git_commit: null
  root: null
  repo_ids: null
  save_dir: ${output_dir}
  tags: ${tags}
  # NOTE(review): max_steps / batch_size are assumed to belong to `edp`
  # (they directly follow edp.tags); confirm against the original file.
  max_steps: ${model.max_steps}
  batch_size: ${model.batch_size}

data:
  _target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset
  dataset_dirs: null
  # Per-key shape description, also passed to the processor through
  # ${data.shape_meta}. Each entry lists a raw_shape and a shape.
  shape_meta:
    # The four action entries sum to 6 + 1 + 6 + 1 = 14, which matches
    # model.model_arch.action_dim below.
    action:
      - key: left_arm
        raw_shape: 6
        shape: 6
      - key: left_gripper
        raw_shape: 1
        shape: 1
      - key: right_arm
        raw_shape: 6
        shape: 6
      - key: right_gripper
        raw_shape: 1
        shape: 1
    # Same layout as `action`; sums to 14, matching proprio_dim below.
    state:
      - key: left_arm
        raw_shape: 6
        shape: 6
      - key: left_gripper
        raw_shape: 1
        shape: 1
      - key: right_arm
        raw_shape: 6
        shape: 6
      - key: right_gripper
        raw_shape: 1
        shape: 1
    # Four image streams; the three *_rgb streams list a 3x720x1280
    # raw_shape and a 3x224x224 shape.
    images:
      - key: head_condition
        raw_shape: [3, 224, 224]
        shape: [3, 224, 224]
      - key: head_rgb
        raw_shape: [3, 720, 1280]
        shape: [3, 224, 224]
      - key: left_wrist_rgb
        raw_shape: [3, 720, 1280]
        shape: [3, 224, 224]
      - key: right_wrist_rgb
        raw_shape: [3, 720, 1280]
        shape: [3, 224, 224]
  action_size: 32  # referenced as model.model_arch.horizon_steps
  past_action_size: 0
  obs_size: 1  # referenced as processor.num_obs_steps and model_arch.cond_steps
  ee_start_moving_thresh: 0.0
  val_set_proportion: 0.05
  use_bbox_condition: true
  dataset_root: /galaxea_dataset/galaxea/pp_project/lerobot_with_bbox
  dataset_prefixes:
    - BENCH
    - Bench

model:
  pretrained_ckpt: /galaxea_dataset/mnt/tmp/pp_wt_img_cond/checkpoints/org2fm_v2.pt
  use_pretrained_norm_stats: true
  model_weights_to_bf16: false
  enable_bf16_training: true
  use_torch_compile: false
  find_unused_parameters: false

  # Data loading / schedule. max_steps is null while max_epochs is set.
  batch_size: 2
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  max_epochs: 4
  max_steps: null
  grad_accumulation_steps: 2

  # Optimizer and learning-rate schedule.
  use_8bit_optimizer: false
  learning_rate: 2.5e-05
  weight_decay: 1.0e-06
  betas: [0.9, 0.999]
  lr_scheduler_type: cosine
  warmup_steps: 500
  max_grad_norm: 1.0

  use_ema: false
  ema:
    update_after_step: 0
    power: 0.67
  # NOTE(review): use_sync_bn is assumed to be a sibling of `ema`, not a
  # member of it; confirm.
  use_sync_bn: false

  processor:
    _target_: galaxea_fm.processors.galaxea_zero_processor.GalaxeaZeroProcessor
    shape_meta: ${data.shape_meta}
    num_obs_steps: ${data.obs_size}
    action_state_transforms:
      - _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform
        keys:
          - left_arm
          - right_arm
    # NOTE(review): the three normalization keys below are assumed to be
    # processor-level options rather than arguments of the transform above.
    use_stepwise_action_norm: true
    norm_default_mode: z-score
    # Per-key overrides of norm_default_mode.
    norm_exception_mode:
      action:
        left_gripper: 0/100
        right_gripper: 0/100
    action_state_merger:
      _target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign

    # Image pipelines. Every camera, for both train and val, uses the same
    # three steps: Resize to 224x224, ToTensor, Normalize(mean=0.5, std=0.5).
    train_transforms:
      head_condition:
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean: [0.5, 0.5, 0.5]
          std: [0.5, 0.5, 0.5]
      head_rgb:
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean: [0.5, 0.5, 0.5]
          std: [0.5, 0.5, 0.5]
      left_wrist_rgb:
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean: [0.5, 0.5, 0.5]
          std: [0.5, 0.5, 0.5]
      right_wrist_rgb:
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean: [0.5, 0.5, 0.5]
          std: [0.5, 0.5, 0.5]
    val_transforms:
      head_condition:
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean: [0.5, 0.5, 0.5]
          std: [0.5, 0.5, 0.5]
      head_rgb:
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean: [0.5, 0.5, 0.5]
          std: [0.5, 0.5, 0.5]
      left_wrist_rgb:
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean: [0.5, 0.5, 0.5]
          std: [0.5, 0.5, 0.5]
      right_wrist_rgb:
        - _target_: torchvision.transforms.Resize
          size: [224, 224]
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean: [0.5, 0.5, 0.5]
          std: [0.5, 0.5, 0.5]

    # Equals the number of entries in data.shape_meta.images (4); also feeds
    # model_arch.num_input_images.
    num_output_cameras: 4
    use_zh_instruction: false
    drop_high_level_prob: 1.0
    pad_token_id: ${model.model_arch.pad_token_id}
    image_token_index: ${model.model_arch.image_token_index}
    tokenizer_params:
      pretrained_model_name_or_path: /data/google/paligemma-3b-pt-224
      local_files_only: false
      # NOTE(review): `token` is assumed to be the last tokenizer_params key.
      token: null
    max_text_tokens: ${model.model_arch.max_text_tokens}
    max_image_text_tokens: ${model.model_arch.max_image_text_tokens}
    num_input_cameras: ${model.model_arch.num_input_images}
    num_image_tokens_per_camera: ${model.model_arch.vision.num_image_tokens}

  model_arch:
    _target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy
    model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero
    pretrained_model_path: /data/google/paligemma-3b-pt-224
    vla_training_strategy: vla-full-train
    backbone_lr_multiplier: 1.0
    image_token_index: 257152
    pad_token_id: 0
    vocab_size: 257216
    cond_steps: ${data.obs_size}
    horizon_steps: ${data.action_size}
    max_text_tokens: 55
    # Derived sizes. With the values in this file, and assuming the `eval`
    # resolver evaluates the arithmetic expression (confirm):
    #   num_input_images      = 1 * 4          = 4
    #   max_image_text_tokens = 4 * 256 + 55   = 1079
    max_image_text_tokens: ${eval:'${model.model_arch.num_input_images} * ${model.model_arch.vision.num_image_tokens} + ${model.model_arch.max_text_tokens}'}
    num_input_images: ${eval:'${model.model_arch.cond_steps} * ${model.processor.num_output_cameras}'}
    num_extra_image_tokens_per_camera: 0
    final_action_clip_value: null
    action_dim: 14
    proprio_dim: 14
    action_decoder_layers: 2
    action_expert_adaptive_mode: null
    flow_sampling: beta
    num_inference_steps: 10

    vision:
      name: galaxea_fm.models.galaxea_zero.paligemma.siglip.SiglipVisionModel
      hidden_size: 1152
      intermediate_size: 4304
      num_hidden_layers: 27
      num_attention_heads: 16
      num_channels: 3
      image_size: 224
      patch_size: 14
      layer_norm_eps: 1.0e-06
      attention_dropout: 0.0
      # (image_size / patch_size)^2 = (224 / 14)^2 = 256
      num_image_tokens: 256

    vision_projector:
      name: galaxea_fm.models.galaxea_zero.paligemma.siglip.PaliGemmaMultiModalProjector
      vision_config:
        hidden_size: 1152
        # NOTE(review): projection_dim is assumed to sit inside
        # vision_config; confirm. It equals mixture.vlm.hidden_size (2048).
        projection_dim: 2048

    joint:
      name: galaxea_fm.models.galaxea_zero.joint_model.JointModel
      action_expert_adaptive_mode: null
      # One entry per mixture member (vlm, proprio, action).
      mixture:
        vlm:
          hidden_size: 2048
          intermediate_size: 16384
          use_final_norm: false
          cache: true
        proprio:
          hidden_size: 1024
          intermediate_size: 4096
          use_final_norm: true
          cache: true
          adaptive_mode: null
        action:
          hidden_size: 1024
          intermediate_size: 4096
          use_final_norm: true
          cache: false
          adaptive_mode: null
      # NOTE(review): the keys below are assumed to be joint-level settings
      # (siblings of `mixture`), not members of mixture.action; confirm.
      time_hidden_size: 256
      num_hidden_layers: 18
      num_attention_heads: 8
      num_key_value_heads: 1
      head_dim: 256
      max_position_embeddings: 8192
      rms_norm_eps: 1.0e-06
      rope_theta: 10000.0
      attention_bias: false
      attention_dropout: 0.0