gemma-3-0.7b-vlm-custom / configuration_gemma3_pi06.py
sonsus's picture
Upload folder using huggingface_hub
86096d3 verified
"""Gemma3 Pi0.6 (270m VLM) configuration"""
from transformers import Gemma3Config
class Gemma3Pi06Config(Gemma3Config):
    """
    Configuration for Gemma3 Pi0.6 - a VLM with 270m language model.

    This config combines:
    - Vision tower from google/gemma-3-4b-pt (SigLIP)
    - Multi-modal projector (reinitialize for dimension compatibility)
    - Language model from google/gemma-3-270m

    Note:
        ``__init__`` loads both base configs through
        ``AutoConfig.from_pretrained`` on every instantiation, so both base
        model IDs must be resolvable at construction time (presumably from
        the Hub or a local cache - confirm for offline use).
    """

    model_type = "gemma3"  # Keep gemma3 for LLaMAFactory compatibility

    def __init__(
        self,
        vlm_base_model="google/gemma-3-4b-pt",
        llm_base_model="google/gemma-3-270m",
        **kwargs,
    ):
        """Build the combined config from the VLM and LLM base configs.

        Args:
            vlm_base_model: Model ID or path whose config seeds this config
                (including its ``vision_config`` and text vocabulary size).
            llm_base_model: Model ID or path whose config fields are copied
                onto ``text_config``.
            **kwargs: Overrides applied last. A dict passed for an attribute
                that currently holds a sub-config object (for example
                ``text_config`` or ``vision_config``) is merged into that
                sub-config field by field; every other value is set on this
                config as a plain attribute.
        """
        # Store base model IDs for reference
        self.vlm_base_model = vlm_base_model
        self.llm_base_model = llm_base_model

        # Load base configs
        from transformers import AutoConfig

        # NOTE(review): trust_remote_code=True permits executing code shipped
        # with the referenced repos; confirm it is required for these models.
        vlm_config = AutoConfig.from_pretrained(vlm_base_model, trust_remote_code=True)
        llm_config = AutoConfig.from_pretrained(llm_base_model, trust_remote_code=True)

        # Initialize with VLM config (keeps vision_config)
        super().__init__(**vlm_config.to_dict())

        # Override text_config with LLM config (except vocab_size).
        # Metadata keys are skipped so they are not copied from the LLM config.
        skipped_keys = frozenset(
            ['_name_or_path', 'transformers_version', 'model_type', 'architectures', 'vocab_size']
        )
        for key, value in llm_config.to_dict().items():
            if key not in skipped_keys:
                setattr(self.text_config, key, value)

        # Keep VLM vocab size for image tokens
        # VLM: 262208 (262144 base + 64 image tokens)
        # LLM: 262144 (base only)
        # We need the extended vocab for multimodal functionality
        self.text_config.vocab_size = vlm_config.text_config.vocab_size

        # Apply any user overrides
        for key, value in kwargs.items():
            current = getattr(self, key, None)
            if isinstance(value, dict) and hasattr(current, "to_dict"):
                # A serialized sub-config (e.g. ``text_config`` coming from a
                # saved config dict) must not replace the sub-config object
                # with a raw dict; merge its fields into the existing object.
                for sub_key, sub_value in value.items():
                    setattr(current, sub_key, sub_value)
            else:
                setattr(self, key, value)