# Run-level settings. Values written as ${...} are OmegaConf/Hydra
# interpolations; they are resolved when the config value is accessed.
# `tags` is also mirrored into edp.tags further down.
tags: null
seed: 7
resume_ckpt: null
# Hydra resolver: the output directory of the current run.
output_dir: ${hydra:runtime.output_dir}
# Read from the environment. No default is supplied, so resolving this value
# fails when GALAXEA_FM_DATASET_STATS_CACHE_DIR is not set.
dataset_stats_cache_dir: ${oc.env:GALAXEA_FM_DATASET_STATS_CACHE_DIR}
checkpointing_steps: 10000
# Experiment-logger settings.
logger:
  type: swanlab
  log_steps: 10
  # Name of the `task` config-group choice selected for this run.
  task: ${hydra:runtime.choices.task}
  # `split` is not an OmegaConf built-in resolver, so the project has to
  # register it before these values are resolved.
  # NOTE(review): presumably it splits the task name on a separator and returns
  # the element at the given index (0 = first, -1 = last) -- confirm against
  # the resolver's definition.
  project: ${split:${logger.task},0}
  experiment_name: ${split:${logger.task},-1}
  mode: cloud
  workspace: Galaxea-AI
  dir: null
# Validation / rollout settings.
# NOTE(review): the entry point that consumes these keys is not visible in
# this file -- confirm which script reads them.
batch_size_val: 16
eval_episodes_num: 1
ckpt_path: null
env: R1ProBlocksStackEasy
target_controller_type: bimanual_relaxed_ik
# Run metadata. Several fields mirror values defined elsewhere in this file
# through interpolation rather than repeating them.
# NOTE(review): the null fields are presumably filled in at runtime -- confirm.
edp:
  card: null
  # Hydra's `now` resolver with strftime patterns: date and time joined by "_".
  training_time: ${now:%Y-%m-%d}_${now:%H-%M-%S}
  git_branch: null
  git_commit: null
  root: null
  repo_ids: null
  save_dir: ${output_dir}
  tags: ${tags}
  max_steps: ${model.max_steps}
  batch_size: ${model.batch_size}
# Benchmark evaluation settings.
# NOTE(review): this is the only upper-case top-level key in the file. It is
# left unchanged because renaming it would break any consumer that reads it
# by this exact name.
EVALUATION:
  task_suite_names:
  - libero_10
  - libero_spatial
  - libero_object
  - libero_goal
  num_steps_wait: 10
  replan_steps: 5
  num_trials: 50
  # Shares the run's top-level output directory.
  output_dir: ${output_dir}
  run_id_note: null
  env_num: 50
# Dataset and sample-processing configuration.
data:
  dataset:
    _target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset
    # Placeholder: point this at the real dataset directory (or directories).
    dataset_dirs:
    - /To/Your/Data
    # Per-key dimensions. `raw_shape` and `shape` differ for the end-effector
    # poses (7 vs 9).
    # NOTE(review): presumably 7 = position + quaternion as stored and
    # 9 = position + rotation_6d after PoseRotationTransform in data.processor
    # -- confirm.
    shape_meta:
      # Sum of `shape`: 9 + 1 + 9 + 1 + 4 = 24.
      action:
      - {key: left_ee_pose, raw_shape: 7, shape: 9}
      - {key: left_gripper, raw_shape: 1, shape: 1}
      - {key: right_ee_pose, raw_shape: 7, shape: 9}
      - {key: right_gripper, raw_shape: 1, shape: 1}
      - {key: torso, raw_shape: 4, shape: 4}
      # Same keys and dimensions as `action`.
      state:
      - {key: left_ee_pose, raw_shape: 7, shape: 9}
      - {key: left_gripper, raw_shape: 1, shape: 1}
      - {key: right_ee_pose, raw_shape: 7, shape: 9}
      - {key: right_gripper, raw_shape: 1, shape: 1}
      - {key: torso, raw_shape: 4, shape: 4}
      # Target `shape` takes its last two entries from the model's
      # input_image_size. Those lists stay in block style because "{" and "}"
      # are indicators inside a flow sequence.
      images:
      - key: head_rgb
        raw_shape: [3, 360, 640]
        shape:
        - 3
        - ${model.model_arch.input_image_size.0}
        - ${model.model_arch.input_image_size.1}
      - key: left_wrist_rgb
        raw_shape: [3, 480, 640]
        shape:
        - 3
        - ${model.model_arch.input_image_size.0}
        - ${model.model_arch.input_image_size.1}
      - key: right_wrist_rgb
        raw_shape: [3, 480, 640]
        shape:
        - 3
        - ${model.model_arch.input_image_size.0}
        - ${model.model_arch.input_image_size.1}
    # Referenced by model.model_arch.horizon_steps.
    action_size: 32
    past_action_size: 0
    # Referenced by data.processor.num_obs_steps and model.model_arch.cond_steps.
    obs_size: 1
    ee_start_moving_thresh: 0.0
    val_set_proportion: 0.0
  # Sample processor. It shares the dataset's shape_meta and observation window
  # through interpolation instead of repeating them.
  processor:
    _target_: galaxea_fm.processors.base_processor.BaseProcessor
    shape_meta: ${data.dataset.shape_meta}
    num_obs_steps: ${data.dataset.obs_size}
    # NOTE(review): presumably applied in the listed order -- confirm in
    # BaseProcessor.
    action_state_transforms:
    - _target_: galaxea_fm.transforms.relative_action.RelativePoseTransform
      keys: [left_ee_pose, right_ee_pose]
    - _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform
      keys: [torso]
    - _target_: galaxea_fm.transforms.rotation.PoseRotationTransform
      rotation_type: rotation_6d
      category_keys:
        action: [left_ee_pose, right_ee_pose]
        state: [left_ee_pose, right_ee_pose]
    use_stepwise_action_norm: true
    norm_default_mode: q01/q99
    # NOTE(review): presumably per-key overrides of norm_default_mode --
    # confirm. The values are quoted so they are unmistakably strings.
    norm_exception_mode:
      action:
        left_gripper: '0/100'
        right_gripper: '0/100'
    action_state_merger:
      _target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign
    # Training-time image pipeline, defined once for the head camera.
    train_transforms:
      head_rgb:
      - _target_: torchvision.transforms.Resize
        size: ${model.model_arch.input_image_size}
      - _target_: torchvision.transforms.ColorJitter
        brightness: 0.3
        contrast: 0.4
        saturation: 0.5
        hue: 0.3
      - _target_: galaxea_fm.transforms.image.ToTensor
      # Normalize computes (x - mean) / std per channel.
      # NOTE(review): this maps [0, 1] inputs to [-1, 1]; assumes the preceding
      # ToTensor yields values in [0, 1] -- confirm.
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      # Both wrist cameras reuse the head-camera pipeline via interpolation.
      left_wrist_rgb: ${data.processor.train_transforms.head_rgb}
      right_wrist_rgb: ${data.processor.train_transforms.head_rgb}
    # Validation-time image pipeline: resize, tensor conversion and
    # normalization only (no ColorJitter step).
    val_transforms:
      head_rgb:
      - _target_: torchvision.transforms.Resize
        size: ${model.model_arch.input_image_size}
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      # Both wrist cameras reuse the head-camera pipeline via interpolation.
      left_wrist_rgb: ${data.processor.val_transforms.head_rgb}
      right_wrist_rgb: ${data.processor.val_transforms.head_rgb}
    drop_high_level_prob: 1.0
    use_zh_instruction: false
    # Equals the number of camera keys under data.dataset.shape_meta.images (3).
    # Also feeds model.model_arch.num_input_images.
    num_output_images: 3
    # 9 + 1 + 9 + 1 + 4 = 24: the sum of the per-key `shape` values under
    # data.dataset.shape_meta.action (and, identically, under .state).
    # Keep these in sync if shape_meta changes. Referenced by
    # model.model_arch.action_dim and model.model_arch.proprio_dim.
    action_output_dim: 24
    proprio_output_dim: 24
# Model, optimizer and training-loop settings.
model:
  # --- checkpoint loading and precision ---
  pretrained_ckpt: null
  use_pretrained_norm_stats: false
  model_weights_to_bf16: false
  enable_bf16_training: true
  use_torch_compile: false
  find_unused_parameters: true
  # --- data loading ---
  # batch_size is mirrored into edp.batch_size.
  batch_size: 18
  num_workers: 12
  pin_memory: true
  persistent_workers: true
  # --- schedule length ---
  max_epochs: 10
  # Mirrored into edp.max_steps.
  max_steps: null
  grad_accumulation_steps: 1
  # --- optimizer and LR schedule ---
  use_8bit_optimizer: false
  learning_rate: 0.0001
  weight_decay: 0.01
  betas: [0.9, 0.95]
  lr_scheduler_type: constant_with_warmup
  # NOTE(review): with max_steps unset, check that 10 epochs of this dataset
  # actually exceed the 10000 warmup steps.
  warmup_steps: 10000
  max_grad_norm: 1.0
  # --- EMA / normalization layers ---
  use_ema: false
  ema:
    update_after_step: 0
    power: 0.67
  use_sync_bn: false
  # Policy architecture. Sizes that must agree with the data pipeline are
  # interpolated from `data.*` rather than repeated here.
  model_arch:
    _target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy
    model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero
    tokenizer:
      _target_: galaxea_fm.models.vla_tiny.smolvlm2.tokenizer.SmolVLM2Tokenizer
      tokenizer_params:
        # Follows pretrained_model_path (defined below) so the tokenizer and
        # the backbone weights cannot point at different checkpoints when only
        # one of the two paths is overridden.
        pretrained_model_name_or_path: ${model.model_arch.pretrained_model_path}
        local_files_only: true
      pad_token_id: ${model.model_arch.pad_token_id}
      image_token_index: ${model.model_arch.image_token_index}
      max_text_tokens: ${model.model_arch.max_text_tokens}
      num_tokens_per_image: ${model.model_arch.vision.num_image_tokens}
      num_input_images: ${model.model_arch.num_input_images}
    # Single source of truth for the pretrained checkpoint directory.
    pretrained_model_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct
    vla_training_strategy: vla-full-train
    backbone_lr_multiplier: 0.1
    image_token_index: 49190
    pad_token_id: 2
    vocab_size: 49280
    fill_padded_with_token: true
    embed_token_key_prefix: model.text_model.embed_tokens
    cond_steps: ${data.dataset.obs_size}
    horizon_steps: ${data.dataset.action_size}
    max_text_tokens: 55
    # `eval` is not an OmegaConf built-in resolver; the project must register
    # it. NOTE(review): presumably it evaluates the arithmetic expression --
    # confirm. With the values in this file that is 3 * (64 + 3) + 55 = 256.
    # Written as a folded block scalar (">-"): the lines are joined with single
    # spaces and no trailing newline is kept, giving one expression string.
    max_image_text_tokens: >-
      ${eval:'${model.model_arch.num_input_images} *
      (${model.model_arch.vision.num_image_tokens} + 3) +
      ${model.model_arch.max_text_tokens}'}
    # With the values in this file: 1 * 3 = 3.
    num_input_images: ${eval:'${model.model_arch.cond_steps} * ${data.processor.num_output_images}'}
    # Square input: both sides come from vision.image_size.
    input_image_size:
    - ${model.model_arch.vision.image_size}
    - ${model.model_arch.vision.image_size}
    final_action_clip_value: null
    action_dim: ${data.processor.action_output_dim}
    proprio_dim: ${data.processor.proprio_output_dim}
    action_decoder_layers: 2
    action_expert_adaptive_mode: null
    flow_sampling: beta
    num_inference_steps: 10
    # Vision encoder hyperparameters.
    vision:
      name: galaxea_fm.models.vla_tiny.smolvlm2.smolvlm2_vision.SmolVLMVisionTransformer
      key_prefix: model.vision_model
      hidden_size: 768
      intermediate_size: 3072
      num_hidden_layers: 12
      num_attention_heads: 12
      num_channels: 3
      # Also drives model_arch.input_image_size (both sides).
      image_size: 512
      patch_size: 16
      layer_norm_eps: 1.0e-06
      attention_dropout: 0.0
      # NOTE(review): (512 / 16)^2 = 1024 patches; dividing by
      # vision_projector.vision_config.scale_factor^2 (4^2 = 16) gives 64.
      # Presumably that is where this value comes from -- confirm, and keep it
      # in sync if image_size, patch_size or scale_factor change.
      num_image_tokens: 64
    # Connector from vision features to the VLM embedding width.
    vision_projector:
      name: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMConnector
      key_prefix: model.connector
      vision_config:
        scale_factor: 4
        # NOTE(review): same value as vision.hidden_size (768) but written as a
        # literal; presumably the two must match -- confirm, and consider
        # interpolating ${model.model_arch.vision.hidden_size}.
        hidden_size: 768
        projection_dim: ${model.model_arch.joint.mixture.vlm.hidden_size}
        num_input_images: ${model.model_arch.num_input_images}
      text_config:
        hidden_size: ${model.model_arch.joint.mixture.vlm.hidden_size}
    # Joint transformer over three streams (vlm / proprio / action), each with
    # its own hidden and intermediate sizes under `mixture`.
    joint:
      name: galaxea_fm.models.galaxea_zero.joint_model.JointModel
      key_prefix: model.text_model
      # NOTE(review): same key and value as model_arch.action_expert_adaptive_mode;
      # presumably the two are meant to stay equal -- confirm.
      action_expert_adaptive_mode: null
      module_names:
        mlp: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextMLP
        norm: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRMSNorm
        rope: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRotaryEmbedding
      mixture:
        vlm:
          # Referenced by vision_projector (projection_dim and
          # text_config.hidden_size).
          hidden_size: 960
          intermediate_size: 2560
          use_final_norm: true
          cache: true
        proprio:
          hidden_size: 720
          intermediate_size: 2048
          use_final_norm: true
          cache: true
          adaptive_mode: null
        action:
          hidden_size: 720
          intermediate_size: 2048
          use_final_norm: true
          # The only stream with cache disabled.
          cache: false
          adaptive_mode: null
      time_hidden_size: 256
      num_hidden_layers: 16
      # 15 heads * head_dim 64 = 960, which equals mixture.vlm.hidden_size.
      num_attention_heads: 15
      # 15 / 5 = 3 attention heads per key/value head.
      num_key_value_heads: 5
      head_dim: 64
      max_position_embeddings: 8192
      rms_norm_eps: 1.0e-05
      rope_theta: 100000.0
      attention_bias: false
      attention_dropout: 0.0