"""Gemma3 Pi0.6 (270m VLM) configuration""" from transformers import Gemma3Config class Gemma3Pi06Config(Gemma3Config): """ Configuration for Gemma3 Pi0.6 - a VLM with 270m language model. This config combines: - Vision tower from google/gemma-3-4b-pt (SigLIP) - Multi-modal projector (reinitialize for dimension compatibility) - Language model from google/gemma-3-270m """ model_type = "gemma3" # Keep gemma3 for LLaMAFactory compatibility def __init__( self, vlm_base_model="google/gemma-3-4b-pt", llm_base_model="google/gemma-3-270m", **kwargs ): # Store base model IDs for reference self.vlm_base_model = vlm_base_model self.llm_base_model = llm_base_model # Load base configs from transformers import AutoConfig vlm_config = AutoConfig.from_pretrained(vlm_base_model, trust_remote_code=True) llm_config = AutoConfig.from_pretrained(llm_base_model, trust_remote_code=True) # Initialize with VLM config (keeps vision_config) super().__init__(**vlm_config.to_dict()) # Override text_config with LLM config (except vocab_size) for key, value in llm_config.to_dict().items(): if key not in ['_name_or_path', 'transformers_version', 'model_type', 'architectures', 'vocab_size']: setattr(self.text_config, key, value) # Keep VLM vocab size for image tokens # VLM: 262208 (262144 base + 64 image tokens) # LLM: 262144 (base only) # We need the extended vocab for multimodal functionality self.text_config.vocab_size = vlm_config.text_config.vocab_size # Apply any user overrides for key, value in kwargs.items(): setattr(self, key, value)