# update g0tiny handover

# 005eeea verified 5 months ago

# 11 kB

	tags: null
	seed: 7
	resume_ckpt: null
	output_dir: ${hydra:runtime.output_dir}
	dataset_stats_cache_dir: ${oc.env:GALAXEA_FM_DATASET_STATS_CACHE_DIR}
	min_batch_size: 1
	max_batch_size: 256
	num_test_steps: 3
	checkpointing_steps: 5000
	logger:
	type: swanlab
	log_steps: 10
	task: ${hydra:runtime.choices.task}
	project: ${split:${logger.task},0}
	experiment_name: ${split:${logger.task},-1}
	mode: cloud
	workspace: Galaxea-AI
	dir: null
	batch_size_val: 16
	eval_episodes_num: 1
	ckpt_path: null
	env: R1ProBlocksStackEasy
	target_controller_type: bimanual_relaxed_ik
	edp:
	card: null
	training_time: ${now:%Y-%m-%d}_${now:%H-%M-%S}
	git_branch: null
	git_commit: null
	root: null
	repo_ids: null
	save_dir: ${output_dir}
	tags: ${tags}
	max_steps: ${model.max_steps}
	batch_size: ${model.batch_size}
	EVALUATION:
	task_suite_names:
	- libero_10
	- libero_spatial
	- libero_object
	- libero_goal
	num_steps_wait: 10
	replan_steps: 5
	num_trials: 50
	output_dir: ${output_dir}
	run_id_note: null
	env_num: 50
	data:
	dataset:
	_target_: galaxea_fm.data.galaxea_lerobot_dataset.GalaxeaLerobotDataset
	dataset_dirs:
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1225/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251224_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251224_6011_B1_007
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1225/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251224_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251224_v2.0_6011_B1_007
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data1225/Beijing_Demo_Handover_Gift_And_Box_Delay_Hands_v2.0_251225_6011_B1_007_v20251226_101622
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data1225/Beijing_Demo_Handover_Gift_And_Box_Moving_Hands_251225_v2.0_6011_B1_007_v20251226_101627
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1226/Beijing_Demo_Handover_Gift_And_Box_Moving_Gifts_251226_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Moving_Gifts_251226_v2.0_6011_B1_007
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1227/Beijing_Demo_Handover_Gift_And_Box_Fallen_Gifts_251227_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Fallen_Gifts_251227_v2.0_6011_B1_007
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1229/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251229_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251229_v2.0_6011_B1_007
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1230/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007
	- /efm-nas/efm-nas/group-pxj/hairui.ren/dataset/gift/data_1230/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007-2/Beijing_Demo_Handover_Gift_And_Box_Normal_Grab_Gifts_251230_v2.0_6011_B1_007-2
	shape_meta:
	action:
	- key: left_ee_pose
	raw_shape: 7
	shape: 9
	- key: left_gripper
	raw_shape: 1
	shape: 1
	- key: right_ee_pose
	raw_shape: 7
	shape: 9
	- key: right_gripper
	raw_shape: 1
	shape: 1
	- key: torso
	raw_shape: 4
	shape: 4
	state:
	- key: left_ee_pose
	raw_shape: 7
	shape: 9
	- key: left_gripper
	raw_shape: 1
	shape: 1
	- key: right_ee_pose
	raw_shape: 7
	shape: 9
	- key: right_gripper
	raw_shape: 1
	shape: 1
	- key: torso
	raw_shape: 4
	shape: 4
	images:
	- key: head_rgb
	raw_shape:
	- 3
	- 360
	- 640
	shape:
	- 3
	- ${model.model_arch.input_image_size.0}
	- ${model.model_arch.input_image_size.1}
	- key: left_wrist_rgb
	raw_shape:
	- 3
	- 480
	- 640
	shape:
	- 3
	- ${model.model_arch.input_image_size.0}
	- ${model.model_arch.input_image_size.1}
	- key: right_wrist_rgb
	raw_shape:
	- 3
	- 480
	- 640
	shape:
	- 3
	- ${model.model_arch.input_image_size.0}
	- ${model.model_arch.input_image_size.1}
	action_size: 32
	past_action_size: 0
	obs_size: 1
	ee_start_moving_thresh: 0.002
	val_set_proportion: 0.05
	processor:
	_target_: galaxea_fm.processors.base_processor.BaseProcessor
	shape_meta: ${data.dataset.shape_meta}
	num_obs_steps: ${data.dataset.obs_size}
	action_state_transforms:
	- _target_: galaxea_fm.transforms.relative_action.RelativePoseTransform
	keys:
	- left_ee_pose
	- right_ee_pose
	- _target_: galaxea_fm.transforms.relative_action.RelativeJointTransform
	keys:
	- torso
	- _target_: galaxea_fm.transforms.rotation.PoseRotationTransform
	rotation_type: rotation_6d
	category_keys:
	action:
	- left_ee_pose
	- right_ee_pose
	state:
	- left_ee_pose
	- right_ee_pose
	use_stepwise_action_norm: true
	norm_default_mode: q01/q99
	norm_exception_mode:
	action:
	left_gripper: 0/100
	right_gripper: 0/100
	action_state_merger:
	_target_: galaxea_fm.transforms.action_state_merger.ConcatLeftAlign
	train_transforms:
	head_rgb:
	- _target_: torchvision.transforms.Resize
	size: ${model.model_arch.input_image_size}
	- _target_: galaxea_fm.transforms.image.ToTensor
	- _target_: torchvision.transforms.Normalize
	mean:
	- 0.5
	- 0.5
	- 0.5
	std:
	- 0.5
	- 0.5
	- 0.5
	left_wrist_rgb: ${data.processor.train_transforms.head_rgb}
	right_wrist_rgb: ${data.processor.train_transforms.head_rgb}
	val_transforms:
	head_rgb:
	- _target_: torchvision.transforms.Resize
	size: ${model.model_arch.input_image_size}
	- _target_: galaxea_fm.transforms.image.ToTensor
	- _target_: torchvision.transforms.Normalize
	mean:
	- 0.5
	- 0.5
	- 0.5
	std:
	- 0.5
	- 0.5
	- 0.5
	left_wrist_rgb: ${data.processor.val_transforms.head_rgb}
	right_wrist_rgb: ${data.processor.val_transforms.head_rgb}
	drop_high_level_prob: 1.0
	use_zh_instruction: false
	num_output_images: 3
	action_output_dim: 24
	proprio_output_dim: 24
	model:
	pretrained_ckpt: /efm-nas/efm-nas/group-yaq/ziyang.jiao/model_res/real/r1pro_g0tiny_pretrain/2026-01-20_10-12-35/checkpoints/step_390000.pt
	use_pretrained_norm_stats: true
	model_weights_to_bf16: false
	enable_bf16_training: true
	use_torch_compile: false
	find_unused_parameters: true
	batch_size: 20
	num_workers: 12
	pin_memory: true
	persistent_workers: true
	max_epochs: null
	max_steps: 50000
	grad_accumulation_steps: 1
	use_8bit_optimizer: false
	learning_rate: 6.0e-05
	weight_decay: 0.001
	betas:
	- 0.9
	- 0.95
	lr_scheduler_type: cosine
	warmup_steps: 480
	max_grad_norm: 1.0
	use_ema: false
	ema:
	update_after_step: 0
	power: 0.67
	use_sync_bn: false
	model_arch:
	_target_: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZeroPolicy
	model_name: galaxea_fm.models.galaxea_zero.galaxea_zero_policy.GalaxeaZero
	tokenizer:
	_target_: galaxea_fm.models.vla_tiny.smolvlm2.tokenizer.SmolVLM2Tokenizer
	tokenizer_params:
	pretrained_model_name_or_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct
	local_files_only: true
	pad_token_id: ${model.model_arch.pad_token_id}
	image_token_index: ${model.model_arch.image_token_index}
	max_text_tokens: ${model.model_arch.max_text_tokens}
	num_tokens_per_image: ${model.model_arch.vision.num_image_tokens}
	num_input_images: ${model.model_arch.num_input_images}
	pretrained_model_path: /efm-nas/efm-nas/efm-shared/pretrained_model/smolvlm2-500m-video-instruct
	vla_training_strategy: vla-full-train
	backbone_lr_multiplier: 0.1
	image_token_index: 49190
	pad_token_id: 2
	vocab_size: 49280
	fill_padded_with_token: true
	embed_token_key_prefix: model.text_model.embed_tokens
	cond_steps: ${data.dataset.obs_size}
	horizon_steps: ${data.dataset.action_size}
	max_text_tokens: 55
	max_image_text_tokens: ${eval:'${model.model_arch.num_input_images} * (${model.model_arch.vision.num_image_tokens}
	+ 3) + ${model.model_arch.max_text_tokens}'}
	num_input_images: ${eval:'${model.model_arch.cond_steps} * ${data.processor.num_output_images}'}
	input_image_size:
	- ${model.model_arch.vision.image_size}
	- ${model.model_arch.vision.image_size}
	final_action_clip_value: null
	action_dim: ${data.processor.action_output_dim}
	proprio_dim: ${data.processor.proprio_output_dim}
	action_decoder_layers: 2
	action_expert_adaptive_mode: null
	flow_sampling: beta
	num_inference_steps: 10
	vision:
	name: galaxea_fm.models.vla_tiny.smolvlm2.smolvlm2_vision.SmolVLMVisionTransformer
	key_prefix: model.vision_model
	hidden_size: 768
	intermediate_size: 3072
	num_hidden_layers: 12
	num_attention_heads: 12
	num_channels: 3
	image_size: 512
	patch_size: 16
	layer_norm_eps: 1.0e-06
	attention_dropout: 0.0
	num_image_tokens: 64
	vision_projector:
	name: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMConnector
	key_prefix: model.connector
	vision_config:
	scale_factor: 4
	hidden_size: 768
	projection_dim: ${model.model_arch.joint.mixture.vlm.hidden_size}
	num_input_images: ${model.model_arch.num_input_images}
	text_config:
	hidden_size: ${model.model_arch.joint.mixture.vlm.hidden_size}
	joint:
	name: galaxea_fm.models.galaxea_zero.joint_model.JointModel
	key_prefix: model.text_model
	action_expert_adaptive_mode: null
	module_names:
	mlp: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextMLP
	norm: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRMSNorm
	rope: galaxea_fm.models.vla_tiny.smolvlm2.modules.SmolVLMTextRotaryEmbedding
	mixture:
	vlm:
	hidden_size: 960
	intermediate_size: 2560
	use_final_norm: true
	cache: true
	proprio:
	hidden_size: 720
	intermediate_size: 2048
	use_final_norm: true
	cache: true
	adaptive_mode: null
	action:
	hidden_size: 720
	intermediate_size: 2048
	use_final_norm: true
	cache: false
	adaptive_mode: null
	time_hidden_size: 256
	num_hidden_layers: 16
	num_attention_heads: 15
	num_key_value_heads: 5
	head_dim: 64
	max_position_embeddings: 8192
	rms_norm_eps: 1.0e-05
	rope_theta: 100000.0
	attention_bias: false
	attention_dropout: 0.0