| _target_: unilarn.src.models.unilarn.UniLARN | |
| codebook_dim: 32 | |
| commit_loss_w: 1.0 | |
| recon_loss_w: 1.0 | |
| recon_depth_loss_w: 1.0 | |
| perceptual_loss_w: 1.0 | |
| image_encoder: | |
| _target_: transformers.ViTMAEModel.from_pretrained | |
| pretrained_model_name_or_path: facebook/vit-mae-large | |
| m_former: | |
| _target_: unilarn.src.models.m_former.MFormer | |
| add_pooling_layer: false | |
| config: | |
| _target_: transformers.ViTConfig | |
| query_num: 8 | |
| input_hidden_size: 1024 | |
| num_patches: 197 | |
| attention_probs_dropout_prob: 0.0 | |
| hidden_act: gelu | |
| hidden_dropout_prob: 0.0 | |
| hidden_size: 768 | |
| initializer_range: 0.02 | |
| intermediate_size: 3072 | |
| layer_norm_eps: 1.0e-12 | |
| model_type: vit | |
| num_attention_heads: 12 | |
| num_hidden_layers: 4 | |
| qkv_bias: true | |
| m_former_depth: | |
| _target_: unilarn.src.models.m_former.MFormer | |
| add_pooling_layer: false | |
| config: | |
| _target_: transformers.ViTConfig | |
| query_num: 8 | |
| input_hidden_size: 1024 | |
| num_patches: 197 | |
| attention_probs_dropout_prob: 0.0 | |
| hidden_act: gelu | |
| hidden_dropout_prob: 0.0 | |
| hidden_size: 768 | |
| initializer_range: 0.02 | |
| intermediate_size: 3072 | |
| layer_norm_eps: 1.0e-12 | |
| model_type: vit | |
| num_attention_heads: 12 | |
| num_hidden_layers: 4 | |
| qkv_bias: true | |
| vector_quantizer: | |
| _target_: unilarn.src.models.vector_quantizer.VectorQuantizer2 | |
| n_e: 128 | |
| e_dim: 32 | |
| beta: 0.25 | |
| remap: null | |
| sane_index_shape: true | |
| legacy: false | |
| vector_quantizer_uni: | |
| _target_: unilarn.src.models.vector_quantizer.VectorQuantizer2 | |
| n_e: 128 | |
| e_dim: 32 | |
| beta: 0.25 | |
| remap: null | |
| sane_index_shape: true | |
| legacy: false | |
| decoder: | |
| _target_: unilarn.src.models.latent_action_decoder.LatentActionDecoder | |
| config: | |
| _target_: transformers.ViTConfig | |
| query_num: 8 | |
| attention_probs_dropout_prob: 0.0 | |
| hidden_act: gelu | |
| hidden_dropout_prob: 0.0 | |
| hidden_size: 768 | |
| image_size: 224 | |
| initializer_range: 0.02 | |
| intermediate_size: 3072 | |
| layer_norm_eps: 1.0e-12 | |
| model_type: vit | |
| num_attention_heads: 12 | |
| num_channels: 3 | |
| num_hidden_layers: 12 | |
| patch_size: 16 | |
| qkv_bias: true | |
| encoder_stride: 16 | |
| num_patches: 196 | |
| depth_decoder: | |
| _target_: unilarn.src.models.latent_action_decoder.LatentActionDecoder | |
| config: | |
| _target_: transformers.ViTConfig | |
| query_num: 8 | |
| attention_probs_dropout_prob: 0.0 | |
| hidden_act: gelu | |
| hidden_dropout_prob: 0.0 | |
| hidden_size: 768 | |
| image_size: 224 | |
| initializer_range: 0.02 | |
| intermediate_size: 3072 | |
| layer_norm_eps: 1.0e-12 | |
| model_type: vit | |
| num_attention_heads: 12 | |
| num_channels: 3 | |
| num_hidden_layers: 12 | |
| patch_size: 16 | |
| qkv_bias: true | |
| encoder_stride: 16 | |
| num_patches: 196 | |