# mgovind7
# /

# UniLACT

# robot manipulation

# multi-modal perception

# vision-language-action

# Model card Files Files and versions

# UniLACT / unilarn_trained_on_calvin / config.yaml

# mgovind7's picture

# Add UniLACT model weights of 3 stages for calvin

# e792dae 3 months ago

# 2.67 kB

	_target_: unilarn.src.models.unilarn.UniLARN
	codebook_dim: 32
	commit_loss_w: 1.0
	recon_loss_w: 1.0
	recon_depth_loss_w: 1.0
	perceptual_loss_w: 1.0
	image_encoder:
	_target_: transformers.ViTMAEModel.from_pretrained
	pretrained_model_name_or_path: facebook/vit-mae-large
	m_former:
	_target_: unilarn.src.models.m_former.MFormer
	add_pooling_layer: false
	config:
	_target_: transformers.ViTConfig
	query_num: 8
	input_hidden_size: 1024
	num_patches: 197
	attention_probs_dropout_prob: 0.0
	hidden_act: gelu
	hidden_dropout_prob: 0.0
	hidden_size: 768
	initializer_range: 0.02
	intermediate_size: 3072
	layer_norm_eps: 1.0e-12
	model_type: vit
	num_attention_heads: 12
	num_hidden_layers: 4
	qkv_bias: true
	m_former_depth:
	_target_: unilarn.src.models.m_former.MFormer
	add_pooling_layer: false
	config:
	_target_: transformers.ViTConfig
	query_num: 8
	input_hidden_size: 1024
	num_patches: 197
	attention_probs_dropout_prob: 0.0
	hidden_act: gelu
	hidden_dropout_prob: 0.0
	hidden_size: 768
	initializer_range: 0.02
	intermediate_size: 3072
	layer_norm_eps: 1.0e-12
	model_type: vit
	num_attention_heads: 12
	num_hidden_layers: 4
	qkv_bias: true
	vector_quantizer:
	_target_: unilarn.src.models.vector_quantizer.VectorQuantizer2
	n_e: 128
	e_dim: 32
	beta: 0.25
	remap: null
	sane_index_shape: true
	legacy: false
	vector_quantizer_uni:
	_target_: unilarn.src.models.vector_quantizer.VectorQuantizer2
	n_e: 128
	e_dim: 32
	beta: 0.25
	remap: null
	sane_index_shape: true
	legacy: false
	decoder:
	_target_: unilarn.src.models.latent_action_decoder.LatentActionDecoder
	config:
	_target_: transformers.ViTConfig
	query_num: 8
	attention_probs_dropout_prob: 0.0
	hidden_act: gelu
	hidden_dropout_prob: 0.0
	hidden_size: 768
	image_size: 224
	initializer_range: 0.02
	intermediate_size: 3072
	layer_norm_eps: 1.0e-12
	model_type: vit
	num_attention_heads: 12
	num_channels: 3
	num_hidden_layers: 12
	patch_size: 16
	qkv_bias: true
	encoder_stride: 16
	num_patches: 196
	depth_decoder:
	_target_: unilarn.src.models.latent_action_decoder.LatentActionDecoder
	config:
	_target_: transformers.ViTConfig
	query_num: 8
	attention_probs_dropout_prob: 0.0
	hidden_act: gelu
	hidden_dropout_prob: 0.0
	hidden_size: 768
	image_size: 224
	initializer_range: 0.02
	intermediate_size: 3072
	layer_norm_eps: 1.0e-12
	model_type: vit
	num_attention_heads: 12
	num_channels: 3
	num_hidden_layers: 12
	patch_size: 16
	qkv_bias: true
	encoder_stride: 16
	num_patches: 196