gemma-3-0.7b-vlm-custom / configuration_gemma3_pi06.py

Upload folder using huggingface_hub

86096d3 verified 5 months ago

1.81 kB

	"""Gemma3 Pi0.6 (270m VLM) configuration"""

	from transformers import Gemma3Config


	class Gemma3Pi06Config(Gemma3Config):
	    """Configuration for Gemma3 Pi0.6 - a VLM with a 270m language model.

	    This config combines:
	    - Vision tower from google/gemma-3-4b-pt (SigLIP)
	    - Multi-modal projector (reinitialize for dimension compatibility)
	    - Language model from google/gemma-3-270m

	    Instantiating the class loads both base configs through
	    ``AutoConfig.from_pretrained``, so both model IDs must be resolvable
	    at construction time.

	    Attributes:
	        vlm_base_model: Model ID the vision/VLM settings were loaded from.
	        llm_base_model: Model ID the language-model settings were loaded from.
	    """

	    model_type = "gemma3"  # Keep gemma3 for LLaMAFactory compatibility

	    # Bookkeeping keys that describe a config rather than the model it
	    # configures; they are never copied from one config onto another.
	    _METADATA_KEYS = frozenset(
	        {"_name_or_path", "transformers_version", "model_type", "architectures"}
	    )
	    # Keys of the LLM config that must not overwrite the VLM text config.
	    # vocab_size is excluded because the VLM value is restored explicitly.
	    _LLM_KEYS_NOT_COPIED = _METADATA_KEYS | {"vocab_size"}

	    def __init__(
	        self,
	        vlm_base_model: str = "google/gemma-3-4b-pt",
	        llm_base_model: str = "google/gemma-3-270m",
	        **kwargs,
	    ) -> None:
	        """Build the combined config from a VLM and an LLM base config.

	        Args:
	            vlm_base_model: Model ID handed to ``AutoConfig.from_pretrained``;
	                its config initializes this object (keeps ``vision_config``).
	            llm_base_model: Model ID handed to ``AutoConfig.from_pretrained``;
	                its entries are copied onto ``self.text_config``.
	            **kwargs: Overrides applied last. A dict given for an attribute
	                that currently holds a config object (e.g. ``text_config``)
	                is merged into that object key by key instead of replacing
	                it; every other value is set directly on this config.
	        """
	        # Imported locally, as in the original implementation.
	        from transformers import AutoConfig, PretrainedConfig

	        # Load base configs.
	        # NOTE(review): trust_remote_code=True lets the referenced
	        # repositories supply their own configuration code; confirm this is
	        # intended when callers pass arbitrary model IDs.
	        vlm_config = AutoConfig.from_pretrained(vlm_base_model, trust_remote_code=True)
	        llm_config = AutoConfig.from_pretrained(llm_base_model, trust_remote_code=True)

	        # Initialize with VLM config (keeps vision_config)
	        super().__init__(**vlm_config.to_dict())

	        # Store base model IDs for reference (after the parent initializer,
	        # so the instance is fully set up before attributes are added).
	        self.vlm_base_model = vlm_base_model
	        self.llm_base_model = llm_base_model

	        # Override text_config with LLM config (except vocab_size)
	        for key, value in llm_config.to_dict().items():
	            if key not in self._LLM_KEYS_NOT_COPIED:
	                setattr(self.text_config, key, value)

	        # Keep VLM vocab size for image tokens
	        # VLM: 262208 (262144 base + 64 image tokens)
	        # LLM: 262144 (base only)
	        # We need the extended vocab for multimodal functionality
	        self.text_config.vocab_size = vlm_config.text_config.vocab_size

	        # Apply any user overrides
	        for key, value in kwargs.items():
	            current = getattr(self, key, None)
	            if isinstance(value, dict) and isinstance(current, PretrainedConfig):
	                # A serialized config carries sub-configs as plain dicts.
	                # Assigning such a dict directly would replace the config
	                # object and break attribute access like
	                # ``self.text_config.vocab_size``, so merge it instead.
	                for sub_key, sub_value in value.items():
	                    if sub_key not in self._METADATA_KEYS:
	                        setattr(current, sub_key, sub_value)
	            else:
	                setattr(self, key, value)