# Top-level run settings. NOTE(review): the code that consumes these keys is not
# part of this file, so the notes below record only what the file itself shows.
TAG: debug
LOG_DIR: tensorboard_logs
# EVALUATION.seed further down carries the same value as a separate key
# (there is no interpolation between the two).
seed: 7
# Same string as MODEL.vla_name.
vla_path: paligemma-3b-pt-224
data_root_dir: /galaxea_dataset/galaxea/pp_project/rlds_334_tasks_distributed/
dataset_name: bbox_training_r1_lite_5_parts
run_root_dir: runs/base
adapter_tmp_dir: adapter_tmp_weights
# The value is a filesystem path; presumably the token is read from that file
# rather than stored here - TODO confirm.
hf_token: /galaxea_fulltime/share/.hf_token
ckpt: /galaxea_fulltime/pretrained_ckpts/pi0_libero/pi0_torch_state.pt
# Not interpolated anywhere in this file: MODEL.vision, MODEL.vision_projector and
# MODEL.joint.mixture.* each hard-code their own use_lora flag.
# NOTE(review): confirm how this top-level flag relates to those.
use_lora: false
# lora_rank and lora_dropout are interpolated into the `lora` sub-sections of
# MODEL.vision, MODEL.vision_projector and MODEL.joint.
lora_rank: 32
lora_dropout: 0.0
# The MODEL sections use a differently named, hard-coded key (use_quantize).
use_quantization: false
enable_bf16: true
model_param_to_bf16: false
vla_training_strategy: vla-full-train
weight_decay: 1.0e-06
batch_size: 4
grad_accumulation_steps: 1
learning_rate: 2.5e-05
warmup_steps: 500
lr_scheduler_type: cosine
image_aug: true
max_epochs: 8
save_steps: 23523
log_steps: 100
use_torch_compile: false
wandb_project: 1101_pnp_rla_image_condition_376_tasks_5_parts
wandb_entity: cuijianning1996-galaxea-ai
exp_name: 376_tasks_img_as_cond_with_randomly_rotated_bbox
# The ema.* sub-keys are defined even though use_ema is false; presumably they
# are only read when use_ema is true - TODO confirm.
use_ema: false
ema:
  update_after_step: 0
  power: 0.67
# Dataset and augmentation settings.
# NOTE(review): key semantics live in the data-loading code, which is not shown
# in this file; comments below state only what the file itself establishes.
DATASET:
  robot_cfg:
    with_left_arm: true
    with_right_arm: true
    with_torso: false
    with_chassis: false
  use_relative_joint_action: true
  # window_size and the number of camera_views entries are multiplied together
  # by the MODEL.num_input_images interpolation.
  window_size: 1
  # One less than MODEL.horizon_steps (32).
  future_action_window_size: 31
  camera_views: [head_condition, head, wrist_left, wrist_right]
  shuffle_buffer_size: 10000
  balance_weights: false
  use_last_action: false
  share_datasets_statistics: true
  short_prompt: true
  aug_instruction_kwargs:
    drop_high_level_prob: 1.0
    bbox_as_instruction: false
    image_condition: true
    # Folded scalar: the two lines below load as one single-line string.
    image_condition_lang_prefix: >-
      Pick the object in the first image and place into
      the tableware.
    bbox_jitter_ratio: 0.0
  action_proprio_normalization_type: normal
  use_pretrained_data_stats: false
  proprio_noise_std: 0.05
  # Per-view augmentation parameters. Only head, wrist_left and wrist_right have
  # entries; head_condition (listed in camera_views) has none.
  image_augment_kwargs:
    head:
      random_brightness: [0.2]
      random_contrast: [0.8, 1.2]
      random_saturation: [0.8, 1.2]
      random_hue: [0.05]
      augment_order:
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue
    # Both wrist views repeat the head settings and add random_drop_all_image,
    # which is listed first in their augment_order.
    wrist_left:
      random_brightness: [0.2]
      random_contrast: [0.8, 1.2]
      random_saturation: [0.8, 1.2]
      random_hue: [0.05]
      random_drop_all_image: [0.3]
      augment_order:
        - random_drop_all_image
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue
    wrist_right:
      random_brightness: [0.2]
      random_contrast: [0.8, 1.2]
      random_saturation: [0.8, 1.2]
      random_hue: [0.05]
      random_drop_all_image: [0.3]
      augment_order:
        - random_drop_all_image
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue
model_family: galaxea_zero
# Model definition. Values written as ${...} are interpolations resolved by the
# config loader; `eval` is not a built-in resolver, so it is presumably
# registered by the project code - TODO confirm.
MODEL:
  name: vla.galaxea_zero.GalaxeaZeroWrapper
  # Same string as the top-level vla_path.
  vla_name: 'paligemma-3b-pt-224'
  load_inside: false
  pretrained_model_path: /galaxea_fulltime/pretrained_ckpts/cache/paligemma-3b-pt-224
  input_ids: true
  action_expert_only: false
  image_token_index: 257152
  vocab_size: 257216
  pad_token_id: 0
  cond_steps: 1  # len proprio
  # One more than DATASET.future_action_window_size (31).
  horizon_steps: 32
  action_dim: 26  # 2 x [QPOS (6) + gripper (1)] + Torso Velocity (6) + Chassis Velocity (6)
  # 2 * [QPOS (6) + gripper (1)] + 4 (torso) + 3 (base vel) = 21.
  # NOTE(review): the earlier comment also listed "+ last action (26)", which
  # would give 47; DATASET.use_last_action is false here - confirm that is why
  # the last action is not counted.
  proprio_dim: 21
  max_text_tokens: 55  # 55 for galaxea0002
  # num_input_images * vision.num_image_tokens + max_text_tokens; with the
  # values in this file that is 4 * 256 + 55 = 1079, assuming the `eval`
  # resolver evaluates the arithmetic expression.
  max_seq_len: "${eval:'${MODEL.num_input_images} * ${MODEL.vision.num_image_tokens} + ${MODEL.max_text_tokens}'}"
  max_image_text_tokens: '${MODEL.max_seq_len}'  # = ${max_seq_len}
  action_decoder_layers: 2
  flow_sampling: beta
  num_inference_steps: 10
  final_action_clip_value: null  # data normalized in [-1,1]
  action_expert_adaptive_mode: null
  # DATASET.window_size * LEN(DATASET.camera_views); 1 * 4 with the values in this file.
  num_input_images: "${eval:'${DATASET.window_size} * len(${DATASET.camera_views})'}"
  vision:
    name: vla.model.paligemma.siglip.SiglipVisionModel
    hidden_size: 1152  # siglip
    intermediate_size: 4304
    num_hidden_layers: 27
    num_attention_heads: 16
    num_channels: 3
    image_size: 224
    patch_size: 14
    layer_norm_eps: 0.000001
    attention_dropout: 0.0
    # Equals (image_size / patch_size)^2 = (224 / 14)^2.
    num_image_tokens: 256
    lora:
      r: '${lora_rank}'
      dropout: '${lora_dropout}'
    use_quantize: false
    use_lora: false
  vision_projector:
    name: vla.model.paligemma.siglip.PaliGemmaMultiModalProjector
    vision_config:
      # hidden_size matches vision.hidden_size; projection_dim matches
      # joint.mixture.vlm.hidden_size.
      hidden_size: 1152
      projection_dim: 2048
    lora:
      r: '${lora_rank}'
      dropout: '${lora_dropout}'
    use_quantize: false
    use_lora: false
  joint:
    name: vla.model.g0.joint_model.JointModel
    action_expert_adaptive_mode: null
    mixture:
      vlm:  # gemma
        hidden_size: 2048
        intermediate_size: 16384
        use_final_norm: false
        cache: true
        use_quantize: false
        use_lora: false
        adaptive_mode: null  # not applicable for gemma
      proprio:
        hidden_size: 1024
        intermediate_size: 4096
        use_final_norm: true  # technically no, but sharing weights with action anyway
        cache: true
        use_quantize: false
        use_lora: false
        adaptive_mode: null
      action:
        hidden_size: 1024
        intermediate_size: 4096
        use_final_norm: true
        cache: false
        use_quantize: false
        use_lora: false
        adaptive_mode: null
    time_hidden_size: 256  # only applicable if using adaptive
    lora:
      r: '${lora_rank}'
      dropout: '${lora_dropout}'
    num_hidden_layers: 18
    num_attention_heads: 8
    num_key_value_heads: 1
    head_dim: 256
    max_position_embeddings: 8192
    rms_norm_eps: 0.000001
    rope_theta: 10000.0
    attention_bias: false
    attention_dropout: 0.0
    pad_token_id: 0
#################################################################################################################
# For evaluation
#################################################################################################################
EVALUATION:
  checkpoint: null                           # Pretrained checkpoint path
  load_in_8bit: false                        # (For OpenVLA only) Load with 8-bit quantization
  load_in_4bit: false                        # (For OpenVLA only) Load with 4-bit quantization
  center_crop: true                          # Center crop? (if trained w/ random crop image aug)
  #################################################################################################################
  # Evaluation-environment parameters
  #################################################################################################################
  # Task suite. Documented LIBERO options: libero_spatial, libero_object, libero_goal, libero_10, libero_90.
  # NOTE(review): the configured value "simpler_widowx" is not one of the options listed above - confirm the
  # evaluation code accepts it.
  task_suite_name: "simpler_widowx"
  num_steps_wait: 10                         # Number of steps to wait for objects to stabilize in sim
  num_trials_per_task: 24                    # Number of rollouts per task
  use_wrist_image: false
  #################################################################################################################
  # Utils
  #################################################################################################################
  # Extra note to add in run ID for logging. Use YAML null when there is no note: a bare `None` is loaded as
  # the string "None", not as a null value.
  run_id_note: null
  local_log_dir: "./experiments/logs"        # Local directory for eval logs
  use_wandb: false                           # Whether to also log results in Weights & Biases
  # Random seed (for reproducibility). Same value as the top-level `seed`, but a separate key.
  seed: 7