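# Listing metadata carried over with this file (not part of the config):
# File size: 9,101 Bytes
# 5f3ded9 (presumably a revision hash -- confirm)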

# Run-level settings.
seed: 7
# Absolute path to the `.pt` checkpoint to resume from (file name: step_124838).
resume_ckpt: /vla_fulltime/jianning.cui/code/GalaxeaFM/runs/merge_pipeline/real/r1lite_g0_pp_bbox_400_tasks/2025-12-22_05-53-31/checkpoints/step_124838.pt
# Taken from Hydra's runtime output directory; reused below by `logger.dir`
# and `edp.save_dir`.
output_dir: '${hydra:runtime.output_dir}'
# NOTE(review): 7 * 17834 = 124838, the step number in `resume_ckpt`.
checkpointing_steps: 17834
# Experiment logger settings.
logger:
  type: wandb
  log_steps: 10
  # Name of the selected Hydra `task` config choice.
  task: '${hydra:runtime.choices.task}'
  # Both names are derived from `logger.task` through the `split` resolver
  # (index 0 and index -1 respectively).
  project: '${split:${logger.task},0}'
  experiment_name: '${split:${logger.task},-1}'
  mode: online
  workspace: cuijianning1996-galaxea-ai
  # Logger files live in a `wandb` sub-directory of the run output directory.
  dir: '${output_dir}/wandb'
# Validation / evaluation settings.
# NOTE(review): which entry points read these keys is not visible in this
# file -- confirm before changing them.
batch_size_val: 16
eval_episodes_num: 1
ckpt_path: /data/trt_ckpts/model_state_dict.pt
env: R1ProBlocksStackEasy
target_controller_type: bimanual_relaxed_ik
# No tags are set here; `edp.tags` interpolates this value.
tags: null
# `edp` section: descriptive fields are left null in this file; the remaining
# fields mirror other parts of the config through interpolation.
edp:
  card: null
  # Built from Hydra's `now` resolver as <date>_<time>.
  training_time: '${now:%Y-%m-%d}_${now:%H-%M-%S}'
  git_branch: null
  git_commit: null
  root: null
  repo_ids: null
  # Same location as the top-level `output_dir`.
  save_dir: '${output_dir}'
  # Mirrors the top-level `tags` (null in this file).
  tags: '${tags}'
  # Mirror `model.max_steps` (null in this file) and `model.batch_size`.
  max_steps: '${model.max_steps}'
  batch_size: '${model.batch_size}'
# Dataset configuration. `shape_meta` is also referenced by
# `model.processor.shape_meta`.
data:
  _target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset
  dataset_dirs: null
  shape_meta:
    # Action entries: 6 + 1 + 6 + 1 = 14, which equals
    # `model.model_arch.action_dim`.
    action:
    - key: left_arm
      raw_shape: 6
      shape: 6
    - key: left_gripper
      raw_shape: 1
      shape: 1
    - key: right_arm
      raw_shape: 6
      shape: 6
    - key: right_gripper
      raw_shape: 1
      shape: 1
    # State entries: same keys and sizes as `action`; the total of 14 equals
    # `model.model_arch.proprio_dim`.
    state:
    - key: left_arm
      raw_shape: 6
      shape: 6
    - key: left_gripper
      raw_shape: 1
      shape: 1
    - key: right_arm
      raw_shape: 6
      shape: 6
    - key: right_gripper
      raw_shape: 1
      shape: 1
    # Image entries. Shapes are presumably (channels, height, width) -- TODO
    # confirm against the dataset class. All four `shape` values are
    # 3 x 224 x 224; only `head_condition` already has that `raw_shape`.
    images:
    - key: head_condition
      raw_shape: [3, 224, 224]
      shape: [3, 224, 224]
    - key: head_rgb
      raw_shape: [3, 720, 1280]
      shape: [3, 224, 224]
    - key: left_wrist_rgb
      raw_shape: [3, 720, 1280]
      shape: [3, 224, 224]
    - key: right_wrist_rgb
      raw_shape: [3, 720, 1280]
      shape: [3, 224, 224]
  # Referenced by `model.model_arch.horizon_steps`.
  action_size: 32
  past_action_size: 0
  # Referenced by `model.processor.num_obs_steps` and
  # `model.model_arch.cond_steps`.
  obs_size: 1
  ee_start_moving_thresh: 0.0
  val_set_proportion: 0.05
  use_bbox_condition: true
  dataset_root: /galaxea_dataset/galaxea/pp_project/lerobot_with_bbox
  # The two prefixes differ only in letter case.
  dataset_prefixes: [BENCH, Bench]
# Model section: checkpoint source, precision flags, data-loader and optimizer
# settings. The `processor` and `model_arch` sub-sections follow.
model:
  pretrained_ckpt: /galaxea_dataset/mnt/tmp/pp_wt_img_cond/checkpoints/org2fm_v2.pt
  use_pretrained_norm_stats: true

  # Precision / compilation flags.
  model_weights_to_bf16: false
  enable_bf16_training: true
  use_torch_compile: false
  find_unused_parameters: false

  # Data loading. `batch_size` is mirrored by `edp.batch_size`.
  batch_size: 2
  num_workers: 4
  pin_memory: true
  persistent_workers: true

  # Schedule length. `max_steps` is null here and is mirrored by
  # `edp.max_steps`.
  max_epochs: 4
  max_steps: null
  grad_accumulation_steps: 2

  # Optimizer and learning-rate schedule.
  use_8bit_optimizer: false
  learning_rate: 2.5e-05
  weight_decay: 1.0e-06
  betas: [0.9, 0.999]
  lr_scheduler_type: cosine
  warmup_steps: 500
  max_grad_norm: 1.0

  # `use_ema` is false; the `ema` values below are still present in the
  # config.
  use_ema: false
  ema:
    update_after_step: 0
    power: 0.67
  use_sync_bn: false
  # Processor configuration. Shape metadata and the observation window come
  # from the `data` section; token-related limits come from
  # `model.model_arch`.
  processor:
    _target_: galaxea_fm.processors.galaxea_zero_processor.GalaxeaZeroProcessor
    shape_meta: '${data.shape_meta}'
    num_obs_steps: '${data.obs_size}'
    # One transform, applied to the two arm keys only (grippers are not
    # listed).
    action_state_transforms:
    - _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform
      keys: [left_arm, right_arm]
    use_stepwise_action_norm: true
    norm_default_mode: z-score
    # The gripper action keys are listed as exceptions to
    # `norm_default_mode`.
    norm_exception_mode:
      action:
        left_gripper: '0/100'
        right_gripper: '0/100'
    action_state_merger:
      _target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign
    # Training image pipelines. Every camera key uses the same three steps:
    # Resize to 224 x 224, ToTensor, Normalize with mean 0.5 / std 0.5 on
    # each of the three channels.
    train_transforms:
      head_condition:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      head_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      left_wrist_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      right_wrist_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
    # Validation image pipelines: entry for entry the same as
    # `train_transforms`.
    val_transforms:
      head_condition:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      head_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      left_wrist_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
      right_wrist_rgb:
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      - _target_: galaxea_fm.transforms.image.ToTensor
      - _target_: torchvision.transforms.Normalize
        mean: [0.5, 0.5, 0.5]
        std: [0.5, 0.5, 0.5]
    # Used by `model.model_arch.num_input_images`; 4 is also the number of
    # camera keys configured above.
    num_output_cameras: 4
    use_zh_instruction: false
    drop_high_level_prob: 1.0
    # Token ids and limits are taken from the architecture section.
    pad_token_id: '${model.model_arch.pad_token_id}'
    image_token_index: '${model.model_arch.image_token_index}'
    # Same path as `model.model_arch.pretrained_model_path`.
    tokenizer_params:
      pretrained_model_name_or_path: /data/google/paligemma-3b-pt-224
      local_files_only: false
      token: null
    max_text_tokens: '${model.model_arch.max_text_tokens}'
    max_image_text_tokens: '${model.model_arch.max_image_text_tokens}'
    num_input_cameras: '${model.model_arch.num_input_images}'
    num_image_tokens_per_camera: '${model.model_arch.vision.num_image_tokens}'
  # Policy architecture. Several values are derived from `data` and
  # `model.processor` through interpolation.
  model_arch:
    _target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy
    model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero
    pretrained_model_path: /data/google/paligemma-3b-pt-224
    vla_training_strategy: vla-full-train
    backbone_lr_multiplier: 1.0
    # Referenced by `model.processor.image_token_index` / `.pad_token_id`.
    image_token_index: 257152
    pad_token_id: 0
    vocab_size: 257216
    cond_steps: '${data.obs_size}'
    horizon_steps: '${data.action_size}'
    max_text_tokens: 55
    # With the values in this file: 4 * 256 + 55 = 1079 (assuming the `eval`
    # resolver evaluates the arithmetic expression -- confirm).
    max_image_text_tokens: >-
      ${eval:'${model.model_arch.num_input_images} * ${model.model_arch.vision.num_image_tokens}
      + ${model.model_arch.max_text_tokens}'}
    # With the values in this file: 1 * 4 = 4 (same assumption about `eval`).
    num_input_images: "${eval:'${model.model_arch.cond_steps} * ${model.processor.num_output_cameras}'}"
    num_extra_image_tokens_per_camera: 0
    final_action_clip_value: null
    # Both equal the 6 + 1 + 6 + 1 total of `data.shape_meta.action` /
    # `.state`.
    action_dim: 14
    proprio_dim: 14
    action_decoder_layers: 2
    action_expert_adaptive_mode: null
    flow_sampling: beta
    num_inference_steps: 10
    # Vision encoder. 224 / 14 = 16 patches per side and 16 * 16 = 256, which
    # matches `num_image_tokens`.
    vision:
      name: galaxea_fm.models.galaxea_zero.paligemma.siglip.SiglipVisionModel
      hidden_size: 1152
      intermediate_size: 4304
      num_hidden_layers: 27
      num_attention_heads: 16
      num_channels: 3
      image_size: 224
      patch_size: 14
      layer_norm_eps: 1.0e-06
      attention_dropout: 0.0
      num_image_tokens: 256
    # Projector: `hidden_size` equals `vision.hidden_size` (1152) and
    # `projection_dim` equals `joint.mixture.vlm.hidden_size` (2048).
    vision_projector:
      name: galaxea_fm.models.galaxea_zero.paligemma.siglip.PaliGemmaMultiModalProjector
      vision_config:
        hidden_size: 1152
        projection_dim: 2048
    # Joint model with three mixture members: `vlm`, `proprio` and `action`.
    joint:
      name: galaxea_fm.models.galaxea_zero.joint_model.JointModel
      action_expert_adaptive_mode: null
      mixture:
        vlm:
          hidden_size: 2048
          intermediate_size: 16384
          use_final_norm: false
          cache: true
        # `proprio` and `action` share sizes; they differ only in `cache`.
        proprio:
          hidden_size: 1024
          intermediate_size: 4096
          use_final_norm: true
          cache: true
          adaptive_mode: null
        action:
          hidden_size: 1024
          intermediate_size: 4096
          use_final_norm: true
          cache: false
          adaptive_mode: null
      time_hidden_size: 256
      num_hidden_layers: 18
      # 8 heads * head_dim 256 = 2048, the `vlm` hidden size.
      num_attention_heads: 8
      num_key_value_heads: 1
      head_dim: 256
      max_position_embeddings: 8192
      rms_norm_eps: 1.0e-06
      rope_theta: 10000.0
      attention_bias: false
      attention_dropout: 0.0