Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

.gitattributes +1 -0
README.md +56 -0
added_tokens.json +3 -0
config.json +82 -0
configuration_gemma3_pi06.py +50 -0
generation_config.json +7 -0
model.safetensors +3 -0
modeling_gemma3_pi06.py +153 -0
preprocessor_config.json +29 -0
processor_config.json +4 -0
special_tokens_map.json +33 -0
tokenizer.json +3 -0
tokenizer.model +3 -0
tokenizer_config.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,56 @@

+# Gemma3-270m-VLM (Pi0.6)
+A Vision-Language Model combining:
+- **Vision Tower**: SigLIP from google/gemma-3-4b-pt (417M params)
+- **Multi-modal Projector**: Randomly initialized (739K params)
+- **Language Model**: google/gemma-3-270m (268M params)
+**Total**: 686M parameters
+## Architecture
+- Vision hidden size: 1152
+- LLM hidden size: 640
+- Vocab size: 262,208 (includes 64 image tokens)
+- Image token index: 262,144
+## Usage
+### With LLaMAFactory
+```bash
+llamafactory-cli train \
+    --stage sft \
+    --model_name_or_path models/gemma3-270m-vlm-with-weights \
+    --template gemma3 \
+    --dataset mllm_demo \
+    --freeze_vision_tower True \
+    --freeze_multi_modal_projector True \
+    --bf16 True \
+    ...
+```
+### With Transformers
+```python
+from transformers import AutoModelForImageTextToText, AutoProcessor
+model = AutoModelForImageTextToText.from_pretrained(
+    "models/gemma3-270m-vlm-with-weights",
+    torch_dtype="bfloat16"
+)
+processor = AutoProcessor.from_pretrained("models/gemma3-270m-vlm-with-weights")
+```
+## Training Recommendations
+1. **Freeze vision tower and projector initially** to train only the LLM
+2. **Use small learning rate** (e.g., 5e-5 or 1e-4)
+3. **Gradually unfreeze** projector after LLM converges
+4. The vision tower can remain frozen throughout, since it is loaded from the pretrained SigLIP encoder
+## Notes
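+- Multi-modal projector is randomly initialized and must be trained before image inputs carry useful signal; while it stays frozen (recommendation 1 above), the LLM only sees untrained image features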
+- The model uses Gemma3 tokenizer with 262,144 base tokens + 64 image tokens
+- Compatible with all Gemma3 features (sliding window attention, etc.)

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

config.json ADDED Viewed

	@@ -0,0 +1,82 @@

+{
+  "architectures": [
+    "Gemma3Pi06ForConditionalGeneration"
+  ],
+  "attn_implementation": null,
+  "auto_map": {
+    "AutoConfig": "configuration_gemma3_pi06.Gemma3Pi06Config",
+    "AutoModelForImageTextToText": "modeling_gemma3_pi06.Gemma3Pi06ForConditionalGeneration"
+  },
+  "boi_token_index": 255999,
+  "dtype": "float32",
+  "eoi_token_index": 256000,
+  "image_token_index": 262144,
+  "initializer_range": 0.02,
+  "llm_base_model": "google/gemma-3-270m",
+  "mm_tokens_per_image": 256,
+  "model_type": "gemma3",
+  "model_variant": "pi06",
+  "text_config": {
+    "_sliding_window_pattern": 6,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_logit_softcapping": null,
+    "dtype": "bfloat16",
+    "final_logit_softcapping": null,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 640,
+    "initializer_range": 0.02,
+    "intermediate_size": 2048,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 32768,
+    "model_type": "gemma3_text",
+    "num_attention_heads": 4,
+    "num_hidden_layers": 18,
+    "num_key_value_heads": 1,
+    "query_pre_attn_scalar": 256,
+    "rms_norm_eps": 1e-06,
+    "rope_local_base_freq": 10000.0,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sliding_window": 512,
+    "use_bidirectional_attention": false,
+    "use_cache": true,
+    "vocab_size": 262208
+  },
+  "transformers_version": "4.57.1",
+  "vision_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "image_size": 896,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "patch_size": 14,
+    "vision_use_head": false
+  },
+  "vlm_base_model": "google/gemma-3-4b-pt"
+}

configuration_gemma3_pi06.py ADDED Viewed

	@@ -0,0 +1,50 @@

+"""Gemma3 Pi0.6 (270m VLM) configuration"""
+from transformers import Gemma3Config
+class Gemma3Pi06Config(Gemma3Config):
+    """
+    Configuration for Gemma3 Pi0.6 - a VLM with 270m language model.
+    This config combines:
+    - Vision tower from google/gemma-3-4b-pt (SigLIP)
+    - Multi-modal projector (reinitialize for dimension compatibility)
+    - Language model from google/gemma-3-270m
+    """
+    model_type = "gemma3"  # Keep gemma3 for LLaMAFactory compatibility
+    def __init__(
+        self,
+        vlm_base_model="google/gemma-3-4b-pt",
+        llm_base_model="google/gemma-3-270m",
+        **kwargs
+    ):
+        # Store base model IDs for reference
+        self.vlm_base_model = vlm_base_model
+        self.llm_base_model = llm_base_model
+        # Load base configs
+        from transformers import AutoConfig
+        vlm_config = AutoConfig.from_pretrained(vlm_base_model, trust_remote_code=True)
+        llm_config = AutoConfig.from_pretrained(llm_base_model, trust_remote_code=True)
+        # Initialize with VLM config (keeps vision_config)
+        super().__init__(**vlm_config.to_dict())
+        # Override text_config with LLM config (except vocab_size)
+        for key, value in llm_config.to_dict().items():
+            if key not in ['_name_or_path', 'transformers_version', 'model_type', 'architectures', 'vocab_size']:
+                setattr(self.text_config, key, value)
+        # Keep VLM vocab size for image tokens
+        # VLM: 262208 (262144 base + 64 image tokens)
+        # LLM: 262144 (base only)
+        # We need the extended vocab for multimodal functionality
+        self.text_config.vocab_size = vlm_config.text_config.vocab_size
+        # Apply any user overrides
+        for key, value in kwargs.items():
+            setattr(self, key, value)

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.57.1"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88da8abcdebae0e4d38bff59a488e752dae7c8a344b41c1776d3f958584df649
+size 2206791152

modeling_gemma3_pi06.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""Gemma3 Pi0.6 (270m VLM) modeling"""
+import torch
+import torch.nn as nn
+from transformers import AutoModelForCausalLM, AutoModelForImageTextToText
+from transformers.models.gemma3.modeling_gemma3 import Gemma3ForConditionalGeneration
+from .configuration_gemma3_pi06 import Gemma3Pi06Config
+class Gemma3Pi06ForConditionalGeneration(Gemma3ForConditionalGeneration):
+    """
+    Gemma3 Pi0.6 - VLM with 270m language model.
+    Combines vision components from gemma-3-4b-pt with language model from gemma-3-270m.
+    """
+    config_class = Gemma3Pi06Config
+    def __init__(self, config: Gemma3Pi06Config):
+        # Initialize with the config (creates architecture with 270m LLM size)
+        super().__init__(config)
+        # Reinitialize projector for correct dimensions
+        # Vision hidden: 1152 -> LLM hidden: 640 (for 270m)
+        vision_hidden = config.vision_config.hidden_size
+        llm_hidden = config.text_config.hidden_size
+        # Recreate projector with correct dimensions
+        self.model.multi_modal_projector.mm_input_projection_weight = nn.Parameter(
+            torch.randn(vision_hidden, llm_hidden) * 0.02
+        )
+        self.model.multi_modal_projector.mm_soft_emb_norm = nn.LayerNorm(
+            vision_hidden, eps=config.text_config.rms_norm_eps
+        )
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        """
+        Load model with weights from two sources:
+        - Vision tower + processor from VLM base (gemma-3-4b-pt)
+        - Language model from LLM base (gemma-3-270m)
+        """
+        # If loading from a saved checkpoint (not initial creation)
+        if kwargs.get('_from_checkpoint', False):
+            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        # Load config
+        config = Gemma3Pi06Config.from_pretrained(
+            pretrained_model_name_or_path,
+            **kwargs.get('config_kwargs', {})
+        )
+        # Get base model IDs
+        vlm_base = config.vlm_base_model
+        llm_base = config.llm_base_model
+        print(f"Loading Gemma3Pi06 model:")
+        print(f"  Vision components from: {vlm_base}")
+        print(f"  Language model from: {llm_base}")
+        # Initialize model with config
+        model = cls(config)
+        # Load vision tower and projector from VLM
+        print(f"  [1/3] Loading vision tower from {vlm_base}...")
+        vlm_model = AutoModelForImageTextToText.from_pretrained(
+            vlm_base,
+            trust_remote_code=True,
+            torch_dtype=kwargs.get('torch_dtype', torch.bfloat16),
+            low_cpu_mem_usage=True,
+        )
+        # Copy vision tower weights
+        model.model.vision_tower.load_state_dict(vlm_model.model.vision_tower.state_dict())
+        print(f"    ✓ Vision tower loaded")
+        # Note: projector will be randomly initialized (new dimensions)
+        print(f"    ⚠ Multi-modal projector randomly initialized (1152 -> 640)")
+        # Load language model from LLM
+        print(f"  [2/3] Loading language model from {llm_base}...")
+        llm_model = AutoModelForCausalLM.from_pretrained(
+            llm_base,
+            trust_remote_code=True,
+            torch_dtype=kwargs.get('torch_dtype', torch.bfloat16),
+            low_cpu_mem_usage=True,
+        )
+        # Copy language model weights with vocab size handling
+        llm_vocab_size = llm_model.model.embed_tokens.weight.shape[0]  # 262144
+        vlm_vocab_size = config.text_config.vocab_size  # 262208 (includes image tokens)
+        # Load LLM state dict
+        llm_state_dict = llm_model.model.state_dict()
+        # Handle embed_tokens: extend with random init for image tokens
+        if llm_vocab_size < vlm_vocab_size:
+            print(f"    ⚠ Extending embed_tokens: {llm_vocab_size} -> {vlm_vocab_size}")
+            llm_embed = llm_state_dict['embed_tokens.weight']
+            # Create extended embedding with same dtype
+            extended_embed = torch.randn(
+                vlm_vocab_size,
+                llm_embed.shape[1],
+                dtype=llm_embed.dtype,
+                device=llm_embed.device
+            ) * 0.02
+            # Copy original embeddings
+            extended_embed[:llm_vocab_size] = llm_embed
+            llm_state_dict['embed_tokens.weight'] = extended_embed
+        model.model.language_model.load_state_dict(llm_state_dict)
+        print(f"    ✓ Language model loaded (vocab extended for image tokens)")
+        # Copy lm_head with vocab size handling
+        print(f"  [3/3] Loading lm_head...")
+        llm_lm_head = llm_model.lm_head.weight
+        if llm_vocab_size < vlm_vocab_size:
+            print(f"    ⚠ Extending lm_head: {llm_vocab_size} -> {vlm_vocab_size}")
+            # Create extended lm_head
+            extended_lm_head = torch.randn(
+                vlm_vocab_size,
+                llm_lm_head.shape[1],
+                dtype=llm_lm_head.dtype,
+                device=llm_lm_head.device
+            ) * 0.02
+            # Copy original weights
+            extended_lm_head[:llm_vocab_size] = llm_lm_head
+            model.lm_head.weight.data = extended_lm_head
+        else:
+            model.lm_head.weight.data = llm_lm_head
+        print(f"    ✓ lm_head loaded (vocab extended for image tokens)")
+        # Move to device if specified
+        if 'device_map' in kwargs:
+            device_map = kwargs['device_map']
+            if device_map != 'auto':
+                model = model.to(device_map)
+        print(f"✓ Gemma3Pi06 model loaded successfully")
+        return model
+    def save_pretrained(self, save_directory, **kwargs):
+        """Save model with special marker to load correctly"""
+        # Mark this as a checkpoint so from_pretrained doesn't try to reload from bases
+        kwargs['_from_checkpoint'] = True
+        return super().save_pretrained(save_directory, **kwargs)

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_pan_and_scan": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Gemma3ImageProcessor",
+  "image_seq_length": 256,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "pan_and_scan_max_num_crops": null,
+  "pan_and_scan_min_crop_size": null,
+  "pan_and_scan_min_ratio_to_activate": null,
+  "processor_class": "Gemma3Processor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 896,
+    "width": 896
+  }
+}

processor_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "image_seq_length": 256,
+  "processor_class": "Gemma3Processor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff