---
# Model configuration for `unilact.src.models.unilact.UniLACT`.
#
# Each `_target_` value is a dotted import path; the sibling keys in the same
# mapping look like keyword arguments for that target (Hydra `instantiate`
# convention — confirm against the code that loads this file).
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not parseable YAML. The nesting below was reconstructed by grouping keys
# under the nearest preceding mapping that starts with `_target_`; verify the
# end of the `config` mapping (before `act_dim`) against the GPT2Config and
# UniLACT constructor signatures.
_target_: unilact.src.models.unilact.UniLACT

# Language encoder, loaded through `from_pretrained`.
model_lang:
  _target_: transformers.T5EncoderModel.from_pretrained
  pretrained_model_name_or_path: t5-base

# Vision encoder.
model_vision:
  _target_: unilact.src.models.mae_model.MaeEncoder
  use_obs_feature: true
  pretrained_model_name_or_path: facebook/vit-mae-large

# Causal transformer backbone and its configuration object.
model_causal_transformer:
  _target_: unilact.src.models.trajectory_gpt2.GPT2Model
  config:
    _target_: unilact.src.models.trajectory_gpt2.GPT2Config
    vocab_size: 1
    # Same value as the top-level `hidden_size` below.
    n_embd: 768
    n_layer: 12
    n_head: 12
    activation_function: relu
    dropout: 0.1
    n_positions: 1024

# Top-level model arguments.
act_dim: 7
hidden_size: 768
sequence_length: 2
chunk_size: 3
per_latent_motion_len: 8
latent_motion_codebook_size: 128
latent_motion_pred: true
act_pred: false

# Feature widths; presumably these must match the encoders selected above
# (TODO confirm against the model code).
img_feat_dim: 1024
patch_feat_dim: 1024
lang_feat_dim: 768

mask_latent_motion_probability: 1.0
freeze_lang: true
freeze_vision: true
use_latent_motion_pos_embedding: true
pred_tokens_modality: unified
output_modality_tokens: cross-modal