{ "architectures": [ "Qwen3VLGeometryForConditionalGeneration" ], "depth_drop_prob": 0.0, "depth_loss_weight": 1.0, "depth_only_training": true, "dtype": "bfloat16", "eos_token_id": 151645, "geometry_config": { "dtype": "bfloat16", "enabled": true, "encoder_type": "lingbot_depth", "freeze_encoder": true, "fusion_layers": null, "fusion_method": "resize_and_add", "hidden_size": 1024, "match_post_merge_resolution": false, "merger_hidden_dim": null, "merger_type": "mlp", "model_name_or_path": null, "model_type": "geometry_encoder", "num_heads": 8, "num_layers": 1, "output_hidden_size": null, "reference_frame": "first", "use_3d_position_encoding": true, "encoder_model_config": { "encoder": { "backbone": "dinov2_vitl14", "intermediate_layers": 1, "dim_out": 1024, "strict": false, "depth_emb_mode": "conv_1c", "img_depth_fuse_mode": "cat_token" }, "neck": { "dim_in": [ 1026, 2, 2, 2, 2 ], "dim_out": null, "dim_res_blocks": [ 1024, 256, 128, 64, 32 ], "num_res_blocks": [ 0, 2, 2, 2, 0 ], "res_block_in_norm": "none", "res_block_hidden_norm": "none", "resamplers": [ "conv_transpose", "conv_transpose", "conv_transpose", "bilinear" ] }, "depth_head": { "dim_in": [ 1024, 256, 128, 64, 32 ], "dim_out": [ null, null, null, null, 1 ], "dim_res_blocks": [ 1024, 256, 128, 64, 32 ], "num_res_blocks": [ 0, 1, 1, 1, 0 ], "res_block_in_norm": "none", "res_block_hidden_norm": "none", "resamplers": [ "conv_transpose", "conv_transpose", "conv_transpose", "bilinear" ] }, "mask_head": { "dim_in": [ 1024, 256, 128, 64, 32 ], "dim_out": [ null, null, null, null, 1 ], "dim_res_blocks": [ 1024, 256, 128, 64, 32 ], "num_res_blocks": [ 0, 1, 1, 1, 0 ], "res_block_in_norm": "none", "res_block_hidden_norm": "none", "resamplers": [ "conv_transpose", "conv_transpose", "conv_transpose", "bilinear" ] }, "remap_output": "exp", "remap_depth_in": "log", "num_tokens_range": [ 1200, 3600 ] } }, "image_token_id": 151655, "model_type": "qwen3_vl_geometry", "pad_token_id": 151643, "save_depth_viz_dir": null, "save_depth_viz_interval": 1000, "save_depth_viz_max_per_dataset": 20, "text_config": { "attention_bias": false, "attention_dropout": 0.0, "bos_token_id": 151643, "dtype": "bfloat16", "eos_token_id": 151645, "head_dim": 128, "hidden_act": "silu", "hidden_size": 4096, "initializer_range": 0.02, "intermediate_size": 12288, "max_position_embeddings": 262144, "model_type": "qwen3_vl_geometry_text", "num_attention_heads": 32, "num_hidden_layers": 36, "num_key_value_heads": 8, "rms_norm_eps": 1e-06, "rope_scaling": { "mrope_interleaved": true, "mrope_section": [ 24, 20, 20 ], "rope_type": "default" }, "rope_theta": 5000000, "use_cache": false, "vocab_size": 151936 }, "tie_word_embeddings": false, "transformers_version": "4.57.1", "use_cache": false, "use_depth_decoder": true, "use_encoder_output_for_depth_loss": false, "video_token_id": 151656, "vision_config": { "deepstack_visual_indexes": [ 8, 16, 24 ], "depth": 27, "dtype": "bfloat16", "hidden_act": "gelu_pytorch_tanh", "hidden_size": 1152, "in_channels": 3, "initializer_range": 0.02, "intermediate_size": 4304, "model_type": "qwen3_vl_geometry", "num_heads": 16, "num_position_embeddings": 2304, "out_hidden_size": 4096, "patch_size": 16, "spatial_merge_size": 2, "temporal_patch_size": 2 }, "vision_end_token_id": 151653, "vision_start_token_id": 151652 }