{
  "vocab_size": 50258,
  "emb_dim": 768,
  "context_length": 256,
  "n_heads": 12,
  "n_layers": 12,
  "drop_rate": 0.0,
  "qkv_bias": true,
  "cross_attention_pos": [2, 5, 8, 11],
  "vision_enabled": true,
  "vision_encoder_type": "torchvision_vit_b_16",
  "vision_pretrained": true,
  "vision_freeze": true,
  "perceiver_num_latents": 128,
  "perceiver_depth": 6,
  "perceiver_heads": 12,
  "perceiver_dim_head": 64,
  "weight_decay": 0.01,
  "learning_rate": 0.0001
}