# Composed Hydra/OmegaConf configuration for training and evaluating the
# GalaxeaZero policy on a Galaxea LeRobot dataset.
#
# NOTE(review): the block structure below was rebuilt from a flattened copy of
# this file. Nesting was derived from the absolute interpolation paths used in
# the file itself (e.g. `${data.processor.num_output_images}`); keys whose
# parent could not be confirmed that way carry a NOTE(review) comment.
#
# Resolvers used: `hydra:` and `now:` (Hydra), `oc.env:` (OmegaConf).
# `split:` and `eval:` are not OmegaConf built-ins; presumably they are
# registered by the project before this config is resolved — TODO confirm.

tags: null
seed: 7
resume_ckpt: null
output_dir: ${hydra:runtime.output_dir}
# Read from the environment; no fallback value is given here.
dataset_stats_cache_dir: ${oc.env:GALAXEA_FM_DATASET_STATS_CACHE_DIR}
checkpointing_steps: 10000

logger:
  type: swanlab
  log_steps: 10
  # Name of the selected `task` config-group choice.
  task: ${hydra:runtime.choices.task}
  # Both derived from `logger.task` via `split` (index 0 and index -1).
  project: ${split:${logger.task},0}
  experiment_name: ${split:${logger.task},-1}
  # NOTE(review): `mode`, `workspace` and `dir` are assumed to belong to
  # `logger`; nothing in the file references them by path.
  mode: cloud
  workspace: Galaxea-AI
  dir: null

# NOTE(review): the five keys below are assumed to be top-level; confirm
# against the consumer.
batch_size_val: 16
eval_episodes_num: 1
ckpt_path: null
env: R1ProBlocksStackEasy
target_controller_type: bimanual_relaxed_ik

edp:
  card: null
  training_time: ${now:%Y-%m-%d}_${now:%H-%M-%S}
  git_branch: null
  git_commit: null
  root: null
  repo_ids: null
  # The remaining entries mirror values defined elsewhere in this file.
  save_dir: ${output_dir}
  tags: ${tags}
  # `model.max_steps` is null in this file, so this resolves to null as well.
  max_steps: ${model.max_steps}
  batch_size: ${model.batch_size}

# NOTE(review): assumed to be a top-level section (not a child of `edp`), with
# `run_id_note` and `env_num` as members — confirm.
EVALUATION:
  task_suite_names:
    - libero_10
    - libero_spatial
    - libero_object
    - libero_goal
  num_steps_wait: 10
  replan_steps: 5
  num_trials: 50
  output_dir: ${output_dir}
  run_id_note: null
  env_num: 50

data:
  dataset:
    _target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset
    # Placeholder path; replace with the real dataset location(s).
    dataset_dirs:
      - /To/Your/Data
    # Per-key shapes. `raw_shape` and `shape` differ for the pose keys (7 vs 9)
    # and for the images (native resolution vs model input size).
    # NOTE(review): presumably `raw_shape` is the stored size and `shape` the
    # size after the processor's transforms — confirm.
    shape_meta:
      action:
        - key: left_ee_pose
          raw_shape: 7
          shape: 9
        - key: left_gripper
          raw_shape: 1
          shape: 1
        - key: right_ee_pose
          raw_shape: 7
          shape: 9
        - key: right_gripper
          raw_shape: 1
          shape: 1
        - key: torso
          raw_shape: 4
          shape: 4
      state:
        - key: left_ee_pose
          raw_shape: 7
          shape: 9
        - key: left_gripper
          raw_shape: 1
          shape: 1
        - key: right_ee_pose
          raw_shape: 7
          shape: 9
        - key: right_gripper
          raw_shape: 1
          shape: 1
        - key: torso
          raw_shape: 4
          shape: 4
      images:
        - key: head_rgb
          raw_shape:
            - 3
            - 360
            - 640
          shape:
            - 3
            - ${model.model_arch.input_image_size.0}
            - ${model.model_arch.input_image_size.1}
        - key: left_wrist_rgb
          raw_shape:
            - 3
            - 480
            - 640
          shape:
            - 3
            - ${model.model_arch.input_image_size.0}
            - ${model.model_arch.input_image_size.1}
        - key: right_wrist_rgb
          raw_shape:
            - 3
            - 480
            - 640
          shape:
            - 3
            - ${model.model_arch.input_image_size.0}
            - ${model.model_arch.input_image_size.1}
    # Referenced as `model.model_arch.horizon_steps`.
    action_size: 32
    past_action_size: 0
    # Referenced as `data.processor.num_obs_steps` and
    # `model.model_arch.cond_steps`.
    obs_size: 1
    # NOTE(review): the two keys below are assumed to belong to `dataset`
    # rather than directly to `data` — confirm.
    ee_start_moving_thresh: 0.0
    val_set_proportion: 0.0

  processor:
    _target_: galaxea_fm.processors.base_processor.BaseProcessor
    shape_meta: ${data.dataset.shape_meta}
    num_obs_steps: ${data.dataset.obs_size}
    action_state_transforms:
      - _target_: galaxea_fm.transforms.relative_action.RelativePoseTransform
        keys:
          - left_ee_pose
          - right_ee_pose
      - _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform
        keys:
          - torso
      - _target_: galaxea_fm.transforms.rotation.PoseRotationTransform
        rotation_type: rotation_6d
        category_keys:
          action:
            - left_ee_pose
            - right_ee_pose
          state:
            - left_ee_pose
            - right_ee_pose
    # NOTE(review): the normalisation keys and `action_state_merger` are
    # assumed to sit at processor level, not inside the last transform above.
    use_stepwise_action_norm: true
    norm_default_mode: q01/q99
    # Per-key overrides of `norm_default_mode`; only the gripper actions are
    # listed.
    norm_exception_mode:
      action:
        left_gripper: 0/100
        right_gripper: 0/100
    action_state_merger:
      _target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign
    train_transforms:
      head_rgb:
        - _target_: torchvision.transforms.Resize
          size: ${model.model_arch.input_image_size}
        - _target_: torchvision.transforms.ColorJitter
          brightness: 0.3
          contrast: 0.4
          saturation: 0.5
          hue: 0.3
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
      # Both wrist cameras reuse the head camera pipeline.
      left_wrist_rgb: ${data.processor.train_transforms.head_rgb}
      right_wrist_rgb: ${data.processor.train_transforms.head_rgb}
    # Same pipeline as `train_transforms` minus the ColorJitter step.
    val_transforms:
      head_rgb:
        - _target_: torchvision.transforms.Resize
          size: ${model.model_arch.input_image_size}
        - _target_: galaxea_fm.transforms.image.ToTensor
        - _target_: torchvision.transforms.Normalize
          mean:
            - 0.5
            - 0.5
            - 0.5
          std:
            - 0.5
            - 0.5
            - 0.5
      left_wrist_rgb: ${data.processor.val_transforms.head_rgb}
      right_wrist_rgb: ${data.processor.val_transforms.head_rgb}
    drop_high_level_prob: 1.0
    use_zh_instruction: false
    # Equals the number of image keys under `data.dataset.shape_meta.images`;
    # presumably the two must stay in sync — confirm.
    num_output_images: 3
    # Both equal the sum of `shape` over the five action/state keys
    # (9 + 1 + 9 + 1 + 4 = 24).
    action_output_dim: 24
    proprio_output_dim: 24

model:
  pretrained_ckpt: null
  use_pretrained_norm_stats: false
  model_weights_to_bf16: false
  enable_bf16_training: true
  use_torch_compile: false
  find_unused_parameters: true
  batch_size: 18
  num_workers: 12
  pin_memory: true
  persistent_workers: true
  max_epochs: 10
  max_steps: null
  grad_accumulation_steps: 1
  use_8bit_optimizer: false
  learning_rate: 0.0001
  weight_decay: 0.01
  betas:
    - 0.9
    - 0.95
  lr_scheduler_type: constant_with_warmup
  warmup_steps: 10000
  max_grad_norm: 1.0
  use_ema: false
  # NOTE(review): `update_after_step` and `power` are assumed to be the only
  # members of `ema`, with `use_sync_bn` back at model level — confirm.
  ema:
    update_after_step: 0
    power: 0.67
  use_sync_bn: false

  model_arch:
    _target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy
    model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero
    tokenizer:
      _target_: galaxea_fm.models.vla_tiny.smolvlm2.tokenizer.SmolVLM2Tokenizer
      tokenizer_params:
        # Same path as `model_arch.pretrained_model_path`; keep the two in sync.
        pretrained_model_name_or_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct
        local_files_only: true
      # The five keys below mirror `model_arch` values, so they cannot sit at
      # `model_arch` level themselves.
      # NOTE(review): assumed to be siblings of `tokenizer_params` rather than
      # members of it — confirm.
      pad_token_id: ${model.model_arch.pad_token_id}
      image_token_index: ${model.model_arch.image_token_index}
      max_text_tokens: ${model.model_arch.max_text_tokens}
      num_tokens_per_image: ${model.model_arch.vision.num_image_tokens}
      num_input_images: ${model.model_arch.num_input_images}
    # NOTE(review): the three keys below are assumed to be at `model_arch`
    # level rather than inside `tokenizer` — confirm.
    pretrained_model_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct
    vla_training_strategy: vla-full-train
    backbone_lr_multiplier: 0.1
    image_token_index: 49190
    pad_token_id: 2
    vocab_size: 49280
    fill_padded_with_token: true
    embed_token_key_prefix: model.text_model.embed_tokens
    cond_steps: ${data.dataset.obs_size}
    horizon_steps: ${data.dataset.action_size}
    max_text_tokens: 55
    # num_input_images * (num_image_tokens + 3) + max_text_tokens.
    max_image_text_tokens: ${eval:'${model.model_arch.num_input_images} * (${model.model_arch.vision.num_image_tokens} + 3) + ${model.model_arch.max_text_tokens}'}
    # cond_steps * num_output_images.
    num_input_images: ${eval:'${model.model_arch.cond_steps} * ${data.processor.num_output_images}'}
    # Square input: both entries come from `vision.image_size`.
    input_image_size:
      - ${model.model_arch.vision.image_size}
      - ${model.model_arch.vision.image_size}
    final_action_clip_value: null
    action_dim: ${data.processor.action_output_dim}
    proprio_dim: ${data.processor.proprio_output_dim}
    action_decoder_layers: 2
    action_expert_adaptive_mode: null
    flow_sampling: beta
    num_inference_steps: 10

    vision:
      name: galaxea_fm.models.vla_tiny.smolvlm2.smolvlm2_vision.SmolVLMVisionTransformer
      key_prefix: model.vision_model
      hidden_size: 768
      intermediate_size: 3072
      num_hidden_layers: 12
      num_attention_heads: 12
      num_channels: 3
      image_size: 512
      patch_size: 16
      layer_norm_eps: 1.0e-06
      attention_dropout: 0.0
      # NOTE(review): (image_size / patch_size)^2 / scale_factor^2
      # = 1024 / 16 = 64, with `scale_factor` taken from
      # `vision_projector.vision_config`; presumably that is where this value
      # comes from — confirm.
      num_image_tokens: 64

    vision_projector:
      name: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMConnector
      key_prefix: model.connector
      # NOTE(review): `projection_dim` and `num_input_images` are assumed to
      # be members of `vision_config`, and `text_config` its sibling — confirm
      # against the connector's constructor.
      vision_config:
        scale_factor: 4
        hidden_size: 768
        projection_dim: ${model.model_arch.joint.mixture.vlm.hidden_size}
        num_input_images: ${model.model_arch.num_input_images}
      text_config:
        hidden_size: ${model.model_arch.joint.mixture.vlm.hidden_size}

    joint:
      name: galaxea_fm.models.galaxea_zero.joint_model.JointModel
      key_prefix: model.text_model
      action_expert_adaptive_mode: null
      module_names:
        mlp: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextMLP
        norm: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRMSNorm
        rope: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRotaryEmbedding
      mixture:
        vlm:
          # Equals num_attention_heads * head_dim below (15 * 64 = 960).
          hidden_size: 960
          intermediate_size: 2560
          use_final_norm: true
          cache: true
        proprio:
          hidden_size: 720
          intermediate_size: 2048
          use_final_norm: true
          cache: true
          adaptive_mode: null
        action:
          hidden_size: 720
          intermediate_size: 2048
          use_final_norm: true
          cache: false
          adaptive_mode: null
      # NOTE(review): the keys below are assumed to be at `joint` level
      # (shared by all mixture members) rather than inside `mixture.action`.
      time_hidden_size: 256
      num_hidden_layers: 16
      num_attention_heads: 15
      num_key_value_heads: 5
      head_dim: 64
      max_position_embeddings: 8192
      rms_norm_eps: 1.0e-05
      rope_theta: 100000.0
      attention_bias: false
      attention_dropout: 0.0