Robotics
ONNX
English
Chinese
real-world
dual-arm
whole body control
manipulation
G0-VLA / G0Tiny_handover /config.yaml
lllliuxiao23's picture
update g0tiny handover
005eeea verified
Raw
History Blame
11 kB
# Run-level settings.
# `tags` is null here and is forwarded further down as `${tags}`.
tags: null
seed: 7
resume_ckpt: null
# Taken from Hydra's runtime output directory; reused below as `${output_dir}`.
output_dir: ${hydra:runtime.output_dir}
# Read from the GALAXEA_FM_DATASET_STATS_CACHE_DIR environment variable.
# No default is supplied to oc.env, so the variable must be set before this
# value is resolved.
dataset_stats_cache_dir: ${oc.env:GALAXEA_FM_DATASET_STATS_CACHE_DIR}
min_batch_size: 1
max_batch_size: 256
num_test_steps: 3
# NOTE(review): presumably "save a checkpoint every 5000 steps" - confirm
# against the trainer.
checkpointing_steps: 5000
# Experiment logging configuration (logger type: swanlab).
logger:
type: swanlab
log_steps: 10
# Name of the `task` config-group option selected through Hydra.
task: ${hydra:runtime.choices.task}
# `project` and `experiment_name` are both derived from `logger.task` with a
# `split` resolver, using index 0 and index -1 respectively.
# NOTE(review): `split` is not an OmegaConf built-in resolver, so the
# application has to register it - confirm the separator it splits on.
project: ${split:${logger.task},0}
experiment_name: ${split:${logger.task},-1}
mode: cloud
workspace: Galaxea-AI
dir: null
# Validation / evaluation scalars.
batch_size_val: 16
eval_episodes_num: 1
ckpt_path: null
# NOTE(review): `env` names a block-stacking environment while the dataset
# directories in this file are handover recordings - presumably an inherited
# default; confirm it is not used by this run.
env: R1ProBlocksStackEasy
target_controller_type: bimanual_relaxed_ik
# Run metadata listed after `edp:`.
# NOTE(review): the meaning of `edp` is not visible in this file.
edp:
card: null
# Timestamp built from Hydra's `now` resolver as <YYYY-MM-DD>_<HH-MM-SS>.
training_time: ${now:%Y-%m-%d}_${now:%H-%M-%S}
git_branch: null
git_commit: null
root: null
repo_ids: null
# The remaining entries mirror values defined elsewhere in this file via
# interpolation (run-level output_dir and tags, model.max_steps and
# model.batch_size).
save_dir: ${output_dir}
tags: ${tags}
max_steps: ${model.max_steps}
batch_size: ${model.batch_size}
# Evaluation settings; task_suite_names lists four libero_* suites.
# NOTE(review): these suite names do not match the handover dataset
# directories used for training in this file - presumably inherited
# defaults; confirm they are unused for this run.
EVALUATION:
task_suite_names:
- libero_10
- libero_spatial
- libero_object
- libero_goal
num_steps_wait: 10
replan_steps: 5
num_trials: 50
# Mirrors the run-level output_dir.
output_dir: ${output_dir}
run_id_note: null
env_num: 50
# Data section. Other entries in this file refer to values below through
# `${data.dataset.*}` and `${data.processor.*}` paths.
data:
dataset:
# `_target_` is the dotted path of the dataset class.
_target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset
# Absolute dataset locations under /efm-nas.
# NOTE(review): the third and fourth entries use `data1225` (no underscore)
# and do not repeat the final directory name, unlike the other seven entries
# (`data_1225`, `data_1226`, ... with the last component repeated) - confirm
# that these two paths exist exactly as written.
dataset_dirs:
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1225/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251224_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251224_6011_B1_007
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1225/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251224_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251224_v2.0_6011_B1_007
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data1225/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251225_6011_B1_007_v20251226_101622
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data1225/Beijing_Demo_Handover_Gift_And_Box_Moving_Hands_251225_v2.0_6011_B1_007_v20251226_101627
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1226/Beijing_Demo_Handover_Gift_And_Box_Moving_Gifts_251226_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Moving_Gifts_251226_v2.0_6011_B1_007
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1227/Beijing_Demo_Handover_Gift_And_Box_Fallen_Gifts_251227_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Fallen_Gifts_251227_v2.0_6011_B1_007
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1229/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251229_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251229_v2.0_6011_B1_007
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1230/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007
- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1230/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007-2/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007-2
# Shape description, referenced by the processor as
# `${data.dataset.shape_meta}`.
# The action list and the state list carry the same five keys with the same
# sizes. The `shape` values of each list add up to 9 + 1 + 9 + 1 + 4 = 24,
# which equals action_output_dim / proprio_output_dim set later in this file.
# NOTE(review): the ee_pose entries grow from raw_shape 7 to shape 9,
# presumably position + quaternion converted to position + 6D rotation by the
# rotation_6d PoseRotationTransform configured in the processor - confirm.
shape_meta:
action:
- key: left_ee_pose
raw_shape: 7
shape: 9
- key: left_gripper
raw_shape: 1
shape: 1
- key: right_ee_pose
raw_shape: 7
shape: 9
- key: right_gripper
raw_shape: 1
shape: 1
- key: torso
raw_shape: 4
shape: 4
state:
- key: left_ee_pose
raw_shape: 7
shape: 9
- key: left_gripper
raw_shape: 1
shape: 1
- key: right_ee_pose
raw_shape: 7
shape: 9
- key: right_gripper
raw_shape: 1
shape: 1
- key: torso
raw_shape: 4
shape: 4
# Camera inputs. The head camera has raw shape 3 x 360 x 640; both wrist
# cameras have raw shape 3 x 480 x 640. All three share the same target shape
# 3 x H x W, where H and W come from model.model_arch.input_image_size (both
# elements of that list resolve to model.model_arch.vision.image_size).
images:
- key: head_rgb
raw_shape:
- 3
- 360
- 640
shape:
- 3
- ${model.model_arch.input_image_size.0}
- ${model.model_arch.input_image_size.1}
- key: left_wrist_rgb
raw_shape:
- 3
- 480
- 640
shape:
- 3
- ${model.model_arch.input_image_size.0}
- ${model.model_arch.input_image_size.1}
- key: right_wrist_rgb
raw_shape:
- 3
- 480
- 640
shape:
- 3
- ${model.model_arch.input_image_size.0}
- ${model.model_arch.input_image_size.1}
# Window sizes. action_size is consumed as model.model_arch.horizon_steps
# (`${data.dataset.action_size}`); obs_size is consumed as
# model.model_arch.cond_steps and as the processor's num_obs_steps
# (`${data.dataset.obs_size}`).
action_size: 32
past_action_size: 0
obs_size: 1
ee_start_moving_thresh: 0.002
# NOTE(review): presumably the fraction of data held out for validation
# (5%) - confirm against the dataset class.
val_set_proportion: 0.05
# Processor definition, referenced elsewhere as `${data.processor.*}`.
processor:
_target_: galaxea_fm.processors.base_processor.BaseProcessor
# Both values are reused from the dataset section through interpolation.
shape_meta: ${data.dataset.shape_meta}
num_obs_steps: ${data.dataset.obs_size}
# Three transforms, listed in this order: a relative-pose transform for the
# two end-effector poses, a relative-joint transform for the torso, and a
# pose-rotation transform (rotation_type rotation_6d) listing the two
# end-effector poses under both `action` and `state`.
action_state_transforms:
- _target_: galaxea_fm.transforms.relative_action.RelativePoseTransform
keys:
- left_ee_pose
- right_ee_pose
- _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform
keys:
- torso
- _target_: galaxea_fm.transforms.rotation.PoseRotationTransform
rotation_type: rotation_6d
category_keys:
action:
- left_ee_pose
- right_ee_pose
state:
- left_ee_pose
- right_ee_pose
use_stepwise_action_norm: true
# Normalisation modes: `q01/q99` by default, with `0/100` as the exception for
# the two gripper action keys.
# NOTE(review): presumably 1st/99th-percentile scaling versus a fixed 0..100
# range - confirm against the processor implementation.
norm_default_mode: q01/q99
norm_exception_mode:
action:
left_gripper: 0/100
right_gripper: 0/100
action_state_merger:
_target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign
# Image pipelines. For head_rgb the steps are Resize to
# model.model_arch.input_image_size, then ToTensor, then Normalize with mean
# 0.5 and std 0.5 on each of the three channels, i.e. (x - 0.5) / 0.5.
# Both wrist cameras reuse the head_rgb list through interpolation.
# The train and val pipelines list the same steps with the same values.
train_transforms:
head_rgb:
- _target_: torchvision.transforms.Resize
size: ${model.model_arch.input_image_size}
- _target_: galaxea_fm.transforms.image.ToTensor
- _target_: torchvision.transforms.Normalize
mean:
- 0.5
- 0.5
- 0.5
std:
- 0.5
- 0.5
- 0.5
left_wrist_rgb: ${data.processor.train_transforms.head_rgb}
right_wrist_rgb: ${data.processor.train_transforms.head_rgb}
val_transforms:
head_rgb:
- _target_: torchvision.transforms.Resize
size: ${model.model_arch.input_image_size}
- _target_: galaxea_fm.transforms.image.ToTensor
- _target_: torchvision.transforms.Normalize
mean:
- 0.5
- 0.5
- 0.5
std:
- 0.5
- 0.5
- 0.5
left_wrist_rgb: ${data.processor.val_transforms.head_rgb}
right_wrist_rgb: ${data.processor.val_transforms.head_rgb}
drop_high_level_prob: 1.0
use_zh_instruction: false
# num_output_images feeds model.model_arch.num_input_images (multiplied there
# by cond_steps); 3 matches the three camera keys listed under `images`.
num_output_images: 3
# Consumed as model.model_arch.action_dim and model.model_arch.proprio_dim.
# 24 equals the sum of the per-key `shape` values in shape_meta
# (9 + 1 + 9 + 1 + 4).
action_output_dim: 24
proprio_output_dim: 24
# Model and optimisation settings. `model.max_steps` and `model.batch_size`
# are also referenced earlier in this file through interpolation.
model:
# Absolute path to the checkpoint used for initialisation.
pretrained_ckpt: /efm-nas/efm-nas/group-yaq/ziyang.jiao/model_res/real/r1pro_g0tiny_pretrain/2026-01-20_10-12-35/checkpoints/step_390000.pt
use_pretrained_norm_stats: true
model_weights_to_bf16: false
enable_bf16_training: true
use_torch_compile: false
find_unused_parameters: true
batch_size: 20
num_workers: 12
pin_memory: true
persistent_workers: true
# max_epochs is null while max_steps is 50000.
# NOTE(review): presumably the run length is bounded by steps only - confirm.
max_epochs: null
max_steps: 50000
grad_accumulation_steps: 1
use_8bit_optimizer: false
learning_rate: 6.0e-05
weight_decay: 0.001
betas:
- 0.9
- 0.95
lr_scheduler_type: cosine
warmup_steps: 480
max_grad_norm: 1.0
# use_ema is false.
# NOTE(review): the settings listed after `ema:` are presumably ignored in
# that case - confirm.
use_ema: false
ema:
update_after_step: 0
power: 0.67
use_sync_bn: false
# Policy architecture, referenced elsewhere as `${model.model_arch.*}`.
model_arch:
_target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy
model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero
# Tokenizer settings. The tokenizer is loaded from a local directory with
# local_files_only set; the five interpolated entries that follow mirror the
# model_arch values of the same meaning.
tokenizer:
_target_: galaxea_fm.models.vla_tiny.smolvlm2.tokenizer.SmolVLM2Tokenizer
tokenizer_params:
pretrained_model_name_or_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct
local_files_only: true
pad_token_id: ${model.model_arch.pad_token_id}
image_token_index: ${model.model_arch.image_token_index}
max_text_tokens: ${model.model_arch.max_text_tokens}
num_tokens_per_image: ${model.model_arch.vision.num_image_tokens}
num_input_images: ${model.model_arch.num_input_images}
pretrained_model_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct
vla_training_strategy: vla-full-train
backbone_lr_multiplier: 0.1
image_token_index: 49190
pad_token_id: 2
vocab_size: 49280
fill_padded_with_token: true
embed_token_key_prefix: model.text_model.embed_tokens
# cond_steps and horizon_steps take the dataset's obs_size (1) and
# action_size (32).
cond_steps: ${data.dataset.obs_size}
horizon_steps: ${data.dataset.action_size}
max_text_tokens: 55
# The next value spans two lines. With the values in this file the expression
# is num_input_images * (num_image_tokens + 3) + max_text_tokens
# = 3 * (64 + 3) + 55 = 256.
# NOTE(review): `eval` is not an OmegaConf built-in resolver, so the
# application has to register it - confirm.
max_image_text_tokens: ${eval:'${model.model_arch.num_input_images} * (${model.model_arch.vision.num_image_tokens}
+ 3) + ${model.model_arch.max_text_tokens}'}
# cond_steps * num_output_images = 1 * 3 = 3 with the values in this file.
num_input_images: ${eval:'${model.model_arch.cond_steps} * ${data.processor.num_output_images}'}
# Square input: both elements resolve to vision.image_size.
input_image_size:
- ${model.model_arch.vision.image_size}
- ${model.model_arch.vision.image_size}
final_action_clip_value: null
action_dim: ${data.processor.action_output_dim}
proprio_dim: ${data.processor.proprio_output_dim}
action_decoder_layers: 2
action_expert_adaptive_mode: null
flow_sampling: beta
num_inference_steps: 10
# Vision encoder settings, referenced as `${model.model_arch.vision.*}`.
# NOTE(review): (image_size / patch_size)^2 / scale_factor^2
# = (512 / 16)^2 / 4^2 = 64, which equals num_image_tokens - presumably by
# design of the connector; confirm before changing any of these values
# independently.
vision:
name: galaxea_fm.models.vla_tiny.smolvlm2.smolvlm2_vision.SmolVLMVisionTransformer
key_prefix: model.vision_model
hidden_size: 768
intermediate_size: 3072
num_hidden_layers: 12
num_attention_heads: 12
num_channels: 3
image_size: 512
patch_size: 16
layer_norm_eps: 1.0e-06
attention_dropout: 0.0
num_image_tokens: 64
# Connector between the vision encoder and the language model. projection_dim
# and the text hidden_size both resolve to joint.mixture.vlm.hidden_size.
vision_projector:
name: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMConnector
key_prefix: model.connector
vision_config:
scale_factor: 4
hidden_size: 768
projection_dim: ${model.model_arch.joint.mixture.vlm.hidden_size}
num_input_images: ${model.model_arch.num_input_images}
text_config:
hidden_size: ${model.model_arch.joint.mixture.vlm.hidden_size}
# Joint model settings, referenced as `${model.model_arch.joint.*}`.
joint:
name: galaxea_fm.models.galaxea_zero.joint_model.JointModel
key_prefix: model.text_model
action_expert_adaptive_mode: null
# Dotted paths of the MLP, norm and rotary-embedding modules.
module_names:
mlp: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextMLP
norm: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRMSNorm
rope: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRotaryEmbedding
# Three mixture entries: vlm (hidden 960, intermediate 2560), proprio and
# action (both hidden 720, intermediate 2048). Only `action` sets cache to
# false.
mixture:
vlm:
hidden_size: 960
intermediate_size: 2560
use_final_norm: true
cache: true
proprio:
hidden_size: 720
intermediate_size: 2048
use_final_norm: true
cache: true
adaptive_mode: null
action:
hidden_size: 720
intermediate_size: 2048
use_final_norm: true
cache: false
adaptive_mode: null
time_hidden_size: 256
# num_attention_heads * head_dim = 15 * 64 = 960, the vlm hidden_size above.
# There are 15 attention heads and 5 key/value heads.
num_hidden_layers: 16
num_attention_heads: 15
num_key_value_heads: 5
head_dim: 64
max_position_embeddings: 8192
rms_norm_eps: 1.0e-05
rope_theta: 100000.0
attention_bias: false
attention_dropout: 0.0