# File size: 7,532 Bytes (revision 77e28ab)
TAG: debug
# ---------------------------------------------------------------------------
# Top-level run settings.
# Keys are kept in their original order; only grouping comments, blank lines
# and the nesting of the `ema` mapping were (re)introduced.
# ---------------------------------------------------------------------------

# Logging, seeding and filesystem locations.
LOG_DIR: tensorboard_logs
seed: 7
vla_path: paligemma-3b-pt-224
data_root_dir: /galaxea_dataset/galaxea/pp_project/rlds_334_tasks_distributed/
dataset_name: bbox_training_r1_lite_5_parts
run_root_dir: runs/base
adapter_tmp_dir: adapter_tmp_weights
# NOTE(review): this is a filesystem path, presumably to a file that holds
# the token rather than the token itself; confirm with the consumer.
hf_token: /galaxea_fulltime/share/.hf_token
ckpt: /galaxea_fulltime/pretrained_ckpts/pi0_libero/pi0_torch_state.pt

# LoRA, quantization and precision.
# `lora_rank` and `lora_dropout` are referenced elsewhere in this file through
# the `${lora_rank}` and `${lora_dropout}` interpolations.
use_lora: false
lora_rank: 32
lora_dropout: 0.0
use_quantization: false
enable_bf16: true
model_param_to_bf16: false
vla_training_strategy: vla-full-train

# Optimisation and schedule.
weight_decay: 1.0e-06
batch_size: 4
grad_accumulation_steps: 1
learning_rate: 2.5e-05
warmup_steps: 500
lr_scheduler_type: cosine
image_aug: true
max_epochs: 8
save_steps: 23523
log_steps: 100
use_torch_compile: false

# Experiment tracking.
wandb_project: 1101_pnp_rla_image_condition_376_tasks_5_parts
wandb_entity: cuijianning1996-galaxea-ai
exp_name: 376_tasks_img_as_cond_with_randomly_rotated_bbox

# Exponential moving average of the weights.
# NOTE(review): `ema` is presumably only consulted when `use_ema` is true;
# confirm with the training script.
use_ema: false
ema:
  update_after_step: 0
  power: 0.67

# ---------------------------------------------------------------------------
# Dataset pipeline settings.
# `window_size` and `camera_views` are read elsewhere in this file through
# `${DATASET.window_size}` and `${DATASET.camera_views}`, so both must be
# direct children of `DATASET`.
# ---------------------------------------------------------------------------
DATASET:
  robot_cfg:
    with_left_arm: true
    with_right_arm: true
    with_torso: false
    with_chassis: false
    # NOTE(review): the original nesting was lost; this key is assumed to
    # belong to `robot_cfg` rather than to `DATASET` itself. Confirm.
    use_relative_joint_action: true
  window_size: 1
  future_action_window_size: 31
  camera_views:
    - head_condition
    - head
    - wrist_left
    - wrist_right
  shuffle_buffer_size: 10000
  balance_weights: false
  use_last_action: false
  share_datasets_statistics: true
  short_prompt: true

  # NOTE(review): the five keys below are assumed to be the members of
  # `aug_instruction_kwargs`; confirm where the mapping ends.
  aug_instruction_kwargs:
    drop_high_level_prob: 1.0
    bbox_as_instruction: false
    image_condition: true
    # Folded scalar: resolves to one line joined by a single space, with no
    # trailing newline (same value as the original two-line plain scalar).
    image_condition_lang_prefix: >-
      Pick the object in the first image and place into
      the tableware.
    bbox_jitter_ratio: 0.0

  action_proprio_normalization_type: normal
  use_pretrained_data_stats: false
  proprio_noise_std: 0.05

  # Per-camera image augmentation. `augment_order` lists the augmentation
  # names of the same camera entry in the order given by the original file.
  # There is no entry for the `head_condition` view.
  image_augment_kwargs:
    head:
      random_brightness:
        - 0.2
      random_contrast:
        - 0.8
        - 1.2
      random_saturation:
        - 0.8
        - 1.2
      random_hue:
        - 0.05
      augment_order:
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue
    wrist_left:
      random_brightness:
        - 0.2
      random_contrast:
        - 0.8
        - 1.2
      random_saturation:
        - 0.8
        - 1.2
      random_hue:
        - 0.05
      random_drop_all_image:
        - 0.3
      augment_order:
        - random_drop_all_image
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue
    wrist_right:
      random_brightness:
        - 0.2
      random_contrast:
        - 0.8
        - 1.2
      random_saturation:
        - 0.8
        - 1.2
      random_hue:
        - 0.05
      random_drop_all_image:
        - 0.3
      augment_order:
        - random_drop_all_image
        - random_brightness
        - random_contrast
        - random_saturation
        - random_hue

# NOTE(review): the original nesting was lost; `model_family` is kept as a
# top-level key. Confirm it is not expected under another section.
model_family: galaxea_zero

# ---------------------------------------------------------------------------
# Model definition.
# Interpolations elsewhere in this section address `MODEL.num_input_images`,
# `MODEL.max_text_tokens`, `MODEL.max_seq_len` and
# `MODEL.vision.num_image_tokens`, which fixes the nesting of those keys.
# Interpolation strings are quoted so YAML always loads them as plain strings.
# ---------------------------------------------------------------------------
MODEL:
  name: vla.galaxea_zero.GalaxeaZeroWrapper
  vla_name: "paligemma-3b-pt-224"
  load_inside: false
  pretrained_model_path: /galaxea_fulltime/pretrained_ckpts/cache/paligemma-3b-pt-224
  input_ids: true
  action_expert_only: false
  image_token_index: 257152
  vocab_size: 257216
  pad_token_id: 0
  cond_steps: 1  # len proprio
  horizon_steps: 32
  # 2 x [QPOS (6) + gripper (1)] + Torso Velocity (6) + Chassis Velocity (6)
  action_dim: 26
  # 2 * [QPOS (6) + gripper (1)] + 4 (torso) + 3 (base vel) = 21.
  # The original note also lists "+ last action(26)"; that term is not part
  # of this value. NOTE(review): presumably because DATASET.use_last_action
  # is false; confirm.
  proprio_dim: 21
  max_text_tokens: 55  # 55 for galaxea0002
  # num_input_images * num_image_tokens + max_text_tokens
  max_seq_len: "${eval:'${MODEL.num_input_images} * ${MODEL.vision.num_image_tokens} + ${MODEL.max_text_tokens}'}"
  max_image_text_tokens: "${MODEL.max_seq_len}"  # = ${max_seq_len}
  action_decoder_layers: 2
  flow_sampling: beta
  num_inference_steps: 10
  final_action_clip_value: null  # data normalized in [-1,1]
  action_expert_adaptive_mode: null
  # $DATASET.window_size * LEN($DATASET.camera_views)
  num_input_images: "${eval:'${DATASET.window_size} * len(${DATASET.camera_views})'}"

  vision:
    name: vla.model.paligemma.siglip.SiglipVisionModel
    hidden_size: 1152  # siglip
    intermediate_size: 4304
    num_hidden_layers: 27
    num_attention_heads: 16
    num_channels: 3
    image_size: 224
    patch_size: 14
    layer_norm_eps: 0.000001
    attention_dropout: 0.0
    num_image_tokens: 256
    lora:
      r: "${lora_rank}"
      dropout: "${lora_dropout}"
    # NOTE(review): these flags are literals, not interpolations of the
    # top-level `use_quantization` / `use_lora` keys; confirm this is intended.
    use_quantize: false
    use_lora: false

  vision_projector:
    name: vla.model.paligemma.siglip.PaliGemmaMultiModalProjector
    vision_config:
      hidden_size: 1152
      projection_dim: 2048
    lora:
      r: "${lora_rank}"
      dropout: "${lora_dropout}"
    use_quantize: false
    use_lora: false

  joint:
    name: vla.model.g0.joint_model.JointModel
    action_expert_adaptive_mode: null
    mixture:
      vlm:  # gemma
        hidden_size: 2048
        intermediate_size: 16384
        use_final_norm: false
        cache: true
        use_quantize: false
        use_lora: false
        adaptive_mode: null  # not applicable for gemma
      proprio:
        hidden_size: 1024
        intermediate_size: 4096
        # technically no, but sharing weights with action anyway
        use_final_norm: true
        cache: true
        use_quantize: false
        use_lora: false
        adaptive_mode: null
      action:
        hidden_size: 1024
        intermediate_size: 4096
        use_final_norm: true
        cache: false
        use_quantize: false
        use_lora: false
        adaptive_mode: null
    time_hidden_size: 256  # only applicable if using adaptive
    lora:
      r: "${lora_rank}"
      dropout: "${lora_dropout}"
    num_hidden_layers: 18
    num_attention_heads: 8
    num_key_value_heads: 1
    head_dim: 256
    max_position_embeddings: 8192
    rms_norm_eps: 0.000001
    rope_theta: 10000.0
    attention_bias: false
    attention_dropout: 0.0
    pad_token_id: 0

#################################################################################################################
# For evaluation
#################################################################################################################
EVALUATION:
  checkpoint: null  # Pretrained checkpoint path
  load_in_8bit: false  # (For OpenVLA only) Load with 8-bit quantization
  load_in_4bit: false  # (For OpenVLA only) Load with 4-bit quantization
  center_crop: true  # Center crop? (if trained w/ random crop image aug)

  #################################################################################################################
  # LIBERO environment-specific parameters
  #################################################################################################################
  # Task suite. Options listed by the original comment: libero_spatial,
  # libero_object, libero_goal, libero_10, libero_90.
  # NOTE(review): the configured value is not one of the listed options;
  # confirm whether the comment or the value is stale.
  task_suite_name: "simpler_widowx"
  num_steps_wait: 10  # Number of steps to wait for objects to stabilize in sim
  num_trials_per_task: 24  # Number of rollouts per task
  use_wrist_image: false

  #################################################################################################################
  # Utils
  #################################################################################################################
  # Extra note to add in run ID for logging.
  # Was the bare word `None`, which YAML loads as the string "None" rather
  # than as a null value; `null` matches `checkpoint` above.
  run_id_note: null
  local_log_dir: "./experiments/logs"  # Local directory for eval logs
  use_wandb: false  # Whether to also log results in Weights & Biases
  seed: 7  # Random Seed (for reproducibility)
# End of configuration.