{ "_gradient_checkpointing": false, "architectures": [ "MetaQuery" ], "attn_implementation": null, "connector_num_hidden_layers": 24, "diffusion_model_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers", "discard_mllm_image_feature": false, "dtype": "bfloat16", "in_channels": 32, "input_size": [ 15, 20 ], "loss_type": "flow", "max_input_text_tokens": 256, "max_pixels": 1003520, "min_pixels": 200740, "mllm_id": "google/gemma-2-2b-it", "model_type": "metaquery", "modules_to_freeze": [ "vae", "model.mllm_backbone" ], "modules_to_unfreeze": [], "noise_scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers", "num_metaqueries": 64, "scheduler_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers", "source_vae_feature": true, "system_prompt": "You are a robot and should focus on your actions. Generate a new image that meets the user's instruction while maintaining consistency with the original input where appropriate.", "transformers_version": "4.57.1", "vae_downsample_f": 32, "vae_id": "Efficient-Large-Model/Sana_1600M_512px_diffusers" }