| """Gemma3 Pi0.6 (270m VLM) configuration""" |
|
|
| from transformers import Gemma3Config |
|
|
|
|
class Gemma3Pi06Config(Gemma3Config):
    """Configuration for Gemma3 Pi0.6 - a VLM with a 270m language model.

    This config combines:
    - Vision tower from google/gemma-3-4b-pt (SigLIP)
    - Multi-modal projector (reinitialized for dimension compatibility)
    - Language model from google/gemma-3-270m

    Args:
        vlm_base_model: Name or path passed to ``AutoConfig.from_pretrained``
            to obtain the base VLM config. Its full dict initializes this
            config, and its text vocabulary size is kept.
        llm_base_model: Name or path passed to ``AutoConfig.from_pretrained``
            to obtain the language-model config whose fields are copied onto
            ``text_config``.
        **kwargs: Extra attributes applied last, so they override values taken
            from either base config. A dict given for an attribute that
            already holds a sub-config object (e.g. ``text_config``) is merged
            into that object instead of replacing it.
    """

    # NOTE(review): same ``model_type`` string as the parent class; presumably
    # intentional so this config is treated as a Gemma3 config - confirm.
    model_type = "gemma3"

    def __init__(
        self,
        vlm_base_model="google/gemma-3-4b-pt",
        llm_base_model="google/gemma-3-270m",
        **kwargs
    ):
        # Remember which checkpoints this config was assembled from.
        self.vlm_base_model = vlm_base_model
        self.llm_base_model = llm_base_model

        # Local import, as in the original; only needed while constructing.
        from transformers import AutoConfig

        # NOTE(review): ``trust_remote_code=True`` allows repository-provided
        # code to run when a custom model id is passed. Kept for backward
        # compatibility; only pass model ids you trust.
        vlm_config = AutoConfig.from_pretrained(vlm_base_model, trust_remote_code=True)
        llm_config = AutoConfig.from_pretrained(llm_base_model, trust_remote_code=True)

        # Start from the complete VLM config (vision tower, projector, text).
        super().__init__(**vlm_config.to_dict())

        # Overlay the LLM settings on the text sub-config. Bookkeeping keys
        # are skipped, and ``vocab_size`` is skipped because the VLM's value
        # is restored explicitly below.
        skipped_keys = frozenset(
            {
                "_name_or_path",
                "transformers_version",
                "model_type",
                "architectures",
                "vocab_size",
            }
        )
        for key, value in llm_config.to_dict().items():
            if key not in skipped_keys:
                setattr(self.text_config, key, value)

        # Keep the VLM text vocabulary size rather than the LLM's.
        self.text_config.vocab_size = vlm_config.text_config.vocab_size

        # Apply caller overrides last. A plain ``setattr`` would replace a
        # sub-config object with a raw dict (the form ``to_dict()`` produces
        # and a reload passes back in), breaking later attribute access on
        # e.g. ``self.text_config``; merge such dicts into the object instead.
        for key, value in kwargs.items():
            current = getattr(self, key, None)
            is_sub_config = callable(getattr(current, "to_dict", None))
            if isinstance(value, dict) and is_sub_config:
                for sub_key, sub_value in value.items():
                    setattr(current, sub_key, sub_value)
            else:
                setattr(self, key, value)
|
|