# UniLARN model configuration.
#
# Every `_target_` value is the dotted path of the class or callable used to
# build that node (Hydra-instantiate convention; presumably consumed through
# hydra.utils.instantiate -- confirm against the training entry point).
_target_: unilarn.src.models.unilarn.UniLARN

# Same value as `e_dim` in both vector quantizers below.
# NOTE(review): presumably these must stay in sync -- confirm in UniLARN.
codebook_dim: 32

# Weights of the individual loss terms (judging by the `_loss_w` suffix).
commit_loss_w: 1.0
recon_loss_w: 1.0
recon_depth_loss_w: 1.0
perceptual_loss_w: 1.0

# Image backbone: ViT-MAE loaded from the `facebook/vit-mae-large` checkpoint.
image_encoder:
  _target_: transformers.ViTMAEModel.from_pretrained
  pretrained_model_name_or_path: facebook/vit-mae-large

# MFormer for the image branch.
m_former:
  _target_: unilarn.src.models.m_former.MFormer
  add_pooling_layer: false
  config:
    _target_: transformers.ViTConfig
    query_num: 8
    # NOTE(review): presumably the hidden width of `image_encoder` -- confirm.
    input_hidden_size: 1024
    # NOTE(review): one more than the decoders' 196; presumably 196 patches
    # plus one [CLS] token -- confirm.
    num_patches: 197
    attention_probs_dropout_prob: 0.0
    hidden_act: gelu
    hidden_dropout_prob: 0.0
    hidden_size: 768
    initializer_range: 0.02
    intermediate_size: 3072
    layer_norm_eps: 1.0e-12
    model_type: vit
    num_attention_heads: 12
    num_hidden_layers: 4
    qkv_bias: true

# MFormer for the depth branch; settings are identical to `m_former` above.
m_former_depth:
  _target_: unilarn.src.models.m_former.MFormer
  add_pooling_layer: false
  config:
    _target_: transformers.ViTConfig
    query_num: 8
    input_hidden_size: 1024
    num_patches: 197
    attention_probs_dropout_prob: 0.0
    hidden_act: gelu
    hidden_dropout_prob: 0.0
    hidden_size: 768
    initializer_range: 0.02
    intermediate_size: 3072
    layer_norm_eps: 1.0e-12
    model_type: vit
    num_attention_heads: 12
    num_hidden_layers: 4
    qkv_bias: true

# Vector quantizer: 128 codebook entries (`n_e`) of dimension 32 (`e_dim`).
vector_quantizer:
  _target_: unilarn.src.models.vector_quantizer.VectorQuantizer2
  n_e: 128
  e_dim: 32
  beta: 0.25
  remap: null
  sane_index_shape: true
  legacy: false

# Second quantizer; settings are identical to `vector_quantizer` above.
vector_quantizer_uni:
  _target_: unilarn.src.models.vector_quantizer.VectorQuantizer2
  n_e: 128
  e_dim: 32
  beta: 0.25
  remap: null
  sane_index_shape: true
  legacy: false

# Image decoder.
decoder:
  _target_: unilarn.src.models.latent_action_decoder.LatentActionDecoder
  config:
    _target_: transformers.ViTConfig
    query_num: 8
    attention_probs_dropout_prob: 0.0
    hidden_act: gelu
    hidden_dropout_prob: 0.0
    hidden_size: 768
    image_size: 224
    initializer_range: 0.02
    intermediate_size: 3072
    layer_norm_eps: 1.0e-12
    model_type: vit
    num_attention_heads: 12
    num_channels: 3
    num_hidden_layers: 12
    patch_size: 16
    qkv_bias: true
    encoder_stride: 16
    # 196 = (image_size / patch_size)^2 = (224 / 16)^2.
    num_patches: 196

# Depth decoder; settings are identical to `decoder` above.
depth_decoder:
  _target_: unilarn.src.models.latent_action_decoder.LatentActionDecoder
  config:
    _target_: transformers.ViTConfig
    query_num: 8
    attention_probs_dropout_prob: 0.0
    hidden_act: gelu
    hidden_dropout_prob: 0.0
    hidden_size: 768
    image_size: 224
    initializer_range: 0.02
    intermediate_size: 3072
    layer_norm_eps: 1.0e-12
    model_type: vit
    num_attention_heads: 12
    # NOTE(review): same 3 channels as the image decoder -- confirm that the
    # depth target is really 3-channel rather than single-channel.
    num_channels: 3
    num_hidden_layers: 12
    patch_size: 16
    qkv_bias: true
    encoder_stride: 16
    # 196 = (image_size / patch_size)^2 = (224 / 16)^2.
    num_patches: 196