sonsus committed on
Commit
86096d3
·
verified ·
1 Parent(s): f9c4e08

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Gemma3-270m-VLM (Pi0.6)
2
+
3
+ A Vision-Language Model combining:
4
+ - **Vision Tower**: SigLIP from google/gemma-3-4b-pt (417M params)
5
+ - **Multi-modal Projector**: Randomly initialized (739K params)
6
+ - **Language Model**: google/gemma-3-270m (268M params)
7
+
8
+ **Total**: 686M parameters
9
+
10
+ ## Architecture
11
+
12
+ - Vision hidden size: 1152
13
+ - LLM hidden size: 640
14
+ - Vocab size: 262,208 (includes 64 image tokens)
15
+ - Image token index: 262,144
16
+
17
+ ## Usage
18
+
19
+ ### With LLaMAFactory
20
+
21
+ ```bash
22
+ llamafactory-cli train \
23
+ --stage sft \
24
+ --model_name_or_path models/gemma3-270m-vlm-with-weights \
25
+ --template gemma3 \
26
+ --dataset mllm_demo \
27
+ --freeze_vision_tower True \
28
+ --freeze_multi_modal_projector True \
29
+ --bf16 True \
30
+ ...
31
+ ```
32
+
33
+ ### With Transformers
34
+
35
+ ```python
36
+ from transformers import AutoModelForImageTextToText, AutoProcessor
37
+
38
+ model = AutoModelForImageTextToText.from_pretrained(
39
+ "models/gemma3-270m-vlm-with-weights",
40
+ torch_dtype="bfloat16"
41
+ )
42
+ processor = AutoProcessor.from_pretrained("models/gemma3-270m-vlm-with-weights")
43
+ ```
44
+
45
+ ## Training Recommendations
46
+
47
+ 1. **Freeze vision tower and projector initially** to train only the LLM
48
+ 2. **Use a small learning rate** (e.g., 5e-5 or 1e-4)
49
+ 3. **Gradually unfreeze** the projector after the LLM converges
50
+ 4. The vision tower can remain frozen when using the pretrained vision encoder
51
+
52
+ ## Notes
53
+
54
+ - The multi-modal projector is randomly initialized and must be trained before the model is usable on images
55
+ - The model uses the Gemma3 tokenizer with 262,144 base tokens + 64 image tokens
56
+ - Compatible with all Gemma3 features (sliding window attention, etc.)
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma3Pi06ForConditionalGeneration"
4
+ ],
5
+ "attn_implementation": null,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_gemma3_pi06.Gemma3Pi06Config",
8
+ "AutoModelForImageTextToText": "modeling_gemma3_pi06.Gemma3Pi06ForConditionalGeneration"
9
+ },
10
+ "boi_token_index": 255999,
11
+ "dtype": "float32",
12
+ "eoi_token_index": 256000,
13
+ "image_token_index": 262144,
14
+ "initializer_range": 0.02,
15
+ "llm_base_model": "google/gemma-3-270m",
16
+ "mm_tokens_per_image": 256,
17
+ "model_type": "gemma3",
18
+ "model_variant": "pi06",
19
+ "text_config": {
20
+ "_sliding_window_pattern": 6,
21
+ "attention_bias": false,
22
+ "attention_dropout": 0.0,
23
+ "attn_logit_softcapping": null,
24
+ "dtype": "bfloat16",
25
+ "final_logit_softcapping": null,
26
+ "head_dim": 256,
27
+ "hidden_activation": "gelu_pytorch_tanh",
28
+ "hidden_size": 640,
29
+ "initializer_range": 0.02,
30
+ "intermediate_size": 2048,
31
+ "layer_types": [
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "full_attention"
50
+ ],
51
+ "max_position_embeddings": 32768,
52
+ "model_type": "gemma3_text",
53
+ "num_attention_heads": 4,
54
+ "num_hidden_layers": 18,
55
+ "num_key_value_heads": 1,
56
+ "query_pre_attn_scalar": 256,
57
+ "rms_norm_eps": 1e-06,
58
+ "rope_local_base_freq": 10000.0,
59
+ "rope_scaling": null,
60
+ "rope_theta": 1000000.0,
61
+ "sliding_window": 512,
62
+ "use_bidirectional_attention": false,
63
+ "use_cache": true,
64
+ "vocab_size": 262208
65
+ },
66
+ "transformers_version": "4.57.1",
67
+ "vision_config": {
68
+ "attention_dropout": 0.0,
69
+ "hidden_act": "gelu_pytorch_tanh",
70
+ "hidden_size": 1152,
71
+ "image_size": 896,
72
+ "intermediate_size": 4304,
73
+ "layer_norm_eps": 1e-06,
74
+ "model_type": "siglip_vision_model",
75
+ "num_attention_heads": 16,
76
+ "num_channels": 3,
77
+ "num_hidden_layers": 27,
78
+ "patch_size": 14,
79
+ "vision_use_head": false
80
+ },
81
+ "vlm_base_model": "google/gemma-3-4b-pt"
82
+ }
configuration_gemma3_pi06.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gemma3 Pi0.6 (270m VLM) configuration"""
2
+
3
+ from transformers import Gemma3Config
4
+
5
+
6
class Gemma3Pi06Config(Gemma3Config):
    """
    Configuration for Gemma3 Pi0.6 - a VLM with 270m language model.

    This config combines:
    - Vision tower from google/gemma-3-4b-pt (SigLIP)
    - Multi-modal projector (reinitialize for dimension compatibility)
    - Language model from google/gemma-3-270m

    Args:
        vlm_base_model: Model ID whose config seeds this config (including
            its ``vision_config``).
        llm_base_model: Model ID whose config values are copied onto
            ``text_config``.
        **kwargs: Attribute overrides applied last. A dict given for an
            attribute that currently holds a nested config object is
            converted to that config class instead of replacing it.

    Note:
        Constructing this config calls ``AutoConfig.from_pretrained`` for both
        base models with ``trust_remote_code=True``, so both IDs must be
        resolvable (local cache or network) and trusted.
    """

    model_type = "gemma3"  # Keep gemma3 for LLaMAFactory compatibility

    def __init__(
        self,
        vlm_base_model="google/gemma-3-4b-pt",
        llm_base_model="google/gemma-3-270m",
        **kwargs
    ):
        # Imported locally, as the original did for AutoConfig.
        from transformers import AutoConfig, PretrainedConfig

        # Store base model IDs for reference
        self.vlm_base_model = vlm_base_model
        self.llm_base_model = llm_base_model

        # Load base configs
        vlm_config = AutoConfig.from_pretrained(vlm_base_model, trust_remote_code=True)
        llm_config = AutoConfig.from_pretrained(llm_base_model, trust_remote_code=True)

        # Initialize with VLM config (keeps vision_config)
        super().__init__(**vlm_config.to_dict())

        # Override text_config with LLM config. Identity/bookkeeping keys and
        # vocab_size are deliberately not copied.
        skipped_keys = {
            '_name_or_path',
            'transformers_version',
            'model_type',
            'architectures',
            'vocab_size',
        }
        for key, value in llm_config.to_dict().items():
            if key not in skipped_keys:
                setattr(self.text_config, key, value)

        # Keep VLM vocab size for image tokens
        # VLM: 262208 (262144 base + 64 image tokens)
        # LLM: 262144 (base only)
        # We need the extended vocab for multimodal functionality
        self.text_config.vocab_size = vlm_config.text_config.vocab_size

        # Apply any user overrides. When this config is rebuilt from a saved
        # config.json, nested configs ("text_config", "vision_config") arrive
        # here as plain dicts; a raw setattr would replace the config objects
        # with dicts and break later attribute access such as
        # ``config.text_config.hidden_size``. Rebuild the nested config from
        # the dict instead.
        for key, value in kwargs.items():
            current = getattr(self, key, None)
            if isinstance(current, PretrainedConfig) and isinstance(value, dict):
                value = type(current)(**value)
            setattr(self, key, value)
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.57.1"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88da8abcdebae0e4d38bff59a488e752dae7c8a344b41c1776d3f958584df649
3
+ size 2206791152
modeling_gemma3_pi06.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gemma3 Pi0.6 (270m VLM) modeling"""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import AutoModelForCausalLM, AutoModelForImageTextToText
6
+ from transformers.models.gemma3.modeling_gemma3 import Gemma3ForConditionalGeneration
7
+
8
+ from .configuration_gemma3_pi06 import Gemma3Pi06Config
9
+
10
+
11
class Gemma3Pi06ForConditionalGeneration(Gemma3ForConditionalGeneration):
    """
    Gemma3 Pi0.6 - VLM with 270m language model.

    Combines vision components from gemma-3-4b-pt with language model from gemma-3-270m.
    """

    config_class = Gemma3Pi06Config

    def __init__(self, config: Gemma3Pi06Config):
        """Build the architecture from ``config`` and re-create the projector.

        Args:
            config: Combined configuration; ``vision_config.hidden_size`` and
                ``text_config.hidden_size`` determine the projector shape.
        """
        # Initialize with the config (creates architecture with 270m LLM size)
        super().__init__(config)

        # Reinitialize projector for correct dimensions
        # Vision hidden: 1152 -> LLM hidden: 640 (for 270m)
        vision_hidden = config.vision_config.hidden_size
        llm_hidden = config.text_config.hidden_size

        # Recreate projector with correct dimensions
        self.model.multi_modal_projector.mm_input_projection_weight = nn.Parameter(
            torch.randn(vision_hidden, llm_hidden) * 0.02
        )
        # NOTE(review): this replaces whatever norm module the parent class
        # built with nn.LayerNorm. The parent's norm type is not visible in
        # this file; confirm the swap is intended before changing it, since
        # saved weights depend on it.
        self.model.multi_modal_projector.mm_soft_emb_norm = nn.LayerNorm(
            vision_hidden, eps=config.text_config.rms_norm_eps
        )

    @staticmethod
    def _extend_vocab_rows(weight, target_rows):
        """Return a ``(target_rows, dim)`` tensor whose leading rows are ``weight``.

        Rows beyond ``weight.shape[0]`` are drawn from a normal distribution
        scaled by 0.02, using the dtype and device of ``weight``.
        """
        extended = torch.randn(
            target_rows,
            weight.shape[1],
            dtype=weight.dtype,
            device=weight.device
        ) * 0.02
        extended[:weight.shape[0]] = weight
        return extended

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """
        Load model with weights from two sources:
        - Vision tower + processor from VLM base (gemma-3-4b-pt)
        - Language model from LLM base (gemma-3-270m)

        Args:
            pretrained_model_name_or_path: Location of the config (and, on the
                checkpoint path, of the weights).
            *model_args: Forwarded to the parent loader on the checkpoint path.
            **kwargs: Recognised keys:
                ``_from_checkpoint`` (bool) - load saved weights through the
                parent ``from_pretrained`` instead of assembling from bases;
                ``config_kwargs`` (dict) - extra arguments for loading the config;
                ``torch_dtype`` - dtype for the base models (default bfloat16);
                ``device_map`` - if present and not ``'auto'``, passed to
                ``model.to(...)``.

        Returns:
            The assembled (or checkpoint-loaded) model.
        """
        # If loading from a saved checkpoint (not initial creation).
        # The flag is popped so it is not forwarded: the parent loader does
        # not know it, and unknown kwargs end up as arguments to __init__,
        # which only accepts ``config``.
        if kwargs.pop('_from_checkpoint', False):
            return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)

        # Load config
        config = Gemma3Pi06Config.from_pretrained(
            pretrained_model_name_or_path,
            **kwargs.get('config_kwargs', {})
        )

        # Get base model IDs
        vlm_base = config.vlm_base_model
        llm_base = config.llm_base_model
        base_dtype = kwargs.get('torch_dtype', torch.bfloat16)

        print("Loading Gemma3Pi06 model:")
        print(f" Vision components from: {vlm_base}")
        print(f" Language model from: {llm_base}")

        # Initialize model with config
        model = cls(config)

        # Load vision tower and projector from VLM
        print(f" [1/3] Loading vision tower from {vlm_base}...")
        vlm_model = AutoModelForImageTextToText.from_pretrained(
            vlm_base,
            trust_remote_code=True,
            torch_dtype=base_dtype,
            low_cpu_mem_usage=True,
        )

        # Copy vision tower weights
        model.model.vision_tower.load_state_dict(vlm_model.model.vision_tower.state_dict())
        print(" ✓ Vision tower loaded")

        # Note: projector will be randomly initialized (new dimensions)
        print(" ⚠ Multi-modal projector randomly initialized (1152 -> 640)")

        # Load language model from LLM
        print(f" [2/3] Loading language model from {llm_base}...")
        llm_model = AutoModelForCausalLM.from_pretrained(
            llm_base,
            trust_remote_code=True,
            torch_dtype=base_dtype,
            low_cpu_mem_usage=True,
        )

        # Copy language model weights with vocab size handling
        llm_vocab_size = llm_model.model.embed_tokens.weight.shape[0]  # 262144
        vlm_vocab_size = config.text_config.vocab_size  # 262208 (includes image tokens)
        needs_extension = llm_vocab_size < vlm_vocab_size

        # Load LLM state dict
        llm_state_dict = llm_model.model.state_dict()

        # Handle embed_tokens: extend with random init for image tokens
        if needs_extension:
            print(f" ⚠ Extending embed_tokens: {llm_vocab_size} -> {vlm_vocab_size}")
            llm_state_dict['embed_tokens.weight'] = cls._extend_vocab_rows(
                llm_state_dict['embed_tokens.weight'], vlm_vocab_size
            )

        model.model.language_model.load_state_dict(llm_state_dict)
        print(" ✓ Language model loaded (vocab extended for image tokens)")

        # Copy lm_head with vocab size handling
        print(" [3/3] Loading lm_head...")
        llm_lm_head = llm_model.lm_head.weight

        if needs_extension:
            print(f" ⚠ Extending lm_head: {llm_vocab_size} -> {vlm_vocab_size}")
            model.lm_head.weight.data = cls._extend_vocab_rows(llm_lm_head, vlm_vocab_size)
        else:
            model.lm_head.weight.data = llm_lm_head

        print(" ✓ lm_head loaded (vocab extended for image tokens)")

        # Move to device if specified
        if 'device_map' in kwargs:
            device_map = kwargs['device_map']
            if device_map != 'auto':
                model = model.to(device_map)

        print("✓ Gemma3Pi06 model loaded successfully")

        return model

    def save_pretrained(self, save_directory, **kwargs):
        """Save model with special marker to load correctly"""
        # Mark this as a checkpoint so from_pretrained doesn't try to reload from bases
        # NOTE(review): from_pretrained above only inspects its own
        # ``_from_checkpoint`` keyword; nothing visible here writes this marker
        # into the saved files, so callers reloading a saved directory
        # presumably still need to pass ``_from_checkpoint=True`` themselves.
        kwargs['_from_checkpoint'] = True
        return super().save_pretrained(save_directory, **kwargs)
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff