Mineral-Nano-1 / create_safetensors.py
Luke-Bergen's picture
Create create_safetensors.py
6c759b5 verified
#!/usr/bin/env python3
"""
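Create SafeTensors files for Mineral Nano 1 Vision (Mid-Range 4.1B).

This script generates randomly initialized weights and saves them as sharded
SafeTensors files ready for upload to Hugging Face.

NOTE: the tensor shapes defined in this script add up to roughly 2.65B
parameters, not 4.1B; confirm which figure is intended.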
"""
import torch
import json
from safetensors.torch import save_file
from pathlib import Path
import math
def initialize_weights(shape, init_type="normal", std=0.02):
    """Return a newly allocated tensor of ``shape`` filled per ``init_type``.

    Args:
        shape: Tensor shape, forwarded unchanged to the torch factory call.
        init_type: ``"normal"`` draws from a zero-mean normal distribution
            scaled by ``std``; ``"uniform"`` draws from ``U(-limit, limit)``
            with ``limit = sqrt(3) * std`` so the samples have standard
            deviation ``std``; any other value yields an all-zero tensor.
        std: Target standard deviation for the random initialisers.

    Returns:
        A ``torch.Tensor`` of the requested shape.
    """
    if init_type == "normal":
        return torch.randn(shape) * std

    if init_type == "uniform":
        # Var[U(-a, a)] = a**2 / 3, hence a = sqrt(3) * std.  The square root
        # must cover only the constant 3, not the product 3 * std.
        limit = math.sqrt(3.0) * std
        # torch.rand samples [0, 1); rescale to [-limit, limit).
        return torch.rand(shape) * 2 * limit - limit

    # Any unrecognised init_type falls back to zeros.
    return torch.zeros(shape)
def create_mineral_nano_weights():
    """Build the full name -> tensor mapping for Mineral Nano 1 Vision.

    Tensors are created in a fixed order (language model, vision encoder,
    multi-modal projector).  Projection and embedding weights come from
    ``initialize_weights(..., "normal", 0.02)``, norm scales are ones and
    every bias is zeros.

    Returns:
        dict mapping parameter name to ``torch.Tensor``, in creation order.
    """
    # NOTE(review): the shapes below sum to roughly 2.65B parameters
    # (about 10.6 GB at 4 bytes per element), which does not match the
    # "4.1B" / "~16GB" figures in the messages - confirm which is intended.
    print("Creating Mineral Nano 1 Vision SafeTensors (4.1B parameters)...")
    print("This will take a few minutes and use ~16GB RAM temporarily...")

    # ---- Language model dimensions ----
    vocab_size = 64000
    hidden_size = 2048
    intermediate_size = 8192
    num_layers = 28
    num_heads = 32
    num_kv_heads = 8
    # K/V projections have num_kv_heads * head_dim output rows, i.e. they
    # are narrower than the Q projection.
    kv_size = num_kv_heads * (hidden_size // num_heads)

    # ---- Vision encoder dimensions ----
    vision_hidden = 1536
    vision_intermediate = 6144
    vision_layers = 24
    vision_heads = 24  # kept for reference; no tensor shape depends on it
    vision_patches = 1024  # 448/14 = 32, 32*32 = 1024

    weights = {}

    def put_normal(name, shape):
        # All random weights in this model share the same initialiser.
        weights[name] = initialize_weights(shape, "normal", 0.02)

    # ============ LANGUAGE MODEL WEIGHTS ============
    print("\n[1/4] Creating language model embeddings...")
    put_normal("model.embed_tokens.weight", (vocab_size, hidden_size))

    print("[2/4] Creating language model layers (28 layers)...")
    # (sub-module, weight shape) in the order the tensors must be created.
    decoder_linears = (
        ("self_attn.q_proj", (hidden_size, hidden_size)),
        ("self_attn.k_proj", (kv_size, hidden_size)),
        ("self_attn.v_proj", (kv_size, hidden_size)),
        ("self_attn.o_proj", (hidden_size, hidden_size)),
        ("mlp.gate_proj", (intermediate_size, hidden_size)),
        ("mlp.up_proj", (intermediate_size, hidden_size)),
        ("mlp.down_proj", (hidden_size, intermediate_size)),
    )
    for layer_idx in range(num_layers):
        base = f"model.layers.{layer_idx}"
        for module, shape in decoder_linears:
            put_normal(f"{base}.{module}.weight", shape)
        for norm in ("input_layernorm", "post_attention_layernorm"):
            weights[f"{base}.{norm}.weight"] = torch.ones(hidden_size)

        done = layer_idx + 1
        if done % 5 == 0:
            print(f" ✓ Completed {done}/{num_layers} layers")

    # Final layer norm and LM head
    weights["model.norm.weight"] = torch.ones(hidden_size)
    put_normal("lm_head.weight", (vocab_size, hidden_size))

    print("[3/4] Creating vision encoder (24 layers)...")
    # ============ VISION ENCODER WEIGHTS ============
    put_normal(
        "vision_model.embeddings.patch_embedding.weight",
        (vision_hidden, 3, 14, 14),
    )
    put_normal("vision_model.embeddings.class_embedding", (vision_hidden,))
    # One extra position on top of the patch positions.
    put_normal(
        "vision_model.embeddings.position_embedding.weight",
        (vision_patches + 1, vision_hidden),
    )

    # (sub-module, out_features, in_features); each gets a weight then a bias.
    vision_linears = (
        ("self_attn.q_proj", vision_hidden, vision_hidden),
        ("self_attn.k_proj", vision_hidden, vision_hidden),
        ("self_attn.v_proj", vision_hidden, vision_hidden),
        ("self_attn.out_proj", vision_hidden, vision_hidden),
        ("mlp.fc1", vision_intermediate, vision_hidden),
        ("mlp.fc2", vision_hidden, vision_intermediate),
    )
    for layer_idx in range(vision_layers):
        base = f"vision_model.encoder.layers.{layer_idx}"
        for module, out_features, in_features in vision_linears:
            put_normal(f"{base}.{module}.weight", (out_features, in_features))
            weights[f"{base}.{module}.bias"] = torch.zeros(out_features)
        for norm in ("layer_norm1", "layer_norm2"):
            weights[f"{base}.{norm}.weight"] = torch.ones(vision_hidden)
            weights[f"{base}.{norm}.bias"] = torch.zeros(vision_hidden)

        done = layer_idx + 1
        if done % 5 == 0:
            print(f" ✓ Completed {done}/{vision_layers} vision layers")

    # Vision post-layernorm
    weights["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    weights["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    print("[4/4] Creating vision-language projection...")
    # ============ MULTI-MODAL PROJECTOR ============
    # Two linear layers taking vision_hidden-wide features to hidden_size.
    projector_linears = (
        ("linear_1", hidden_size, vision_hidden),
        ("linear_2", hidden_size, hidden_size),
    )
    for module, out_features, in_features in projector_linears:
        put_normal(
            f"multi_modal_projector.{module}.weight",
            (out_features, in_features),
        )
        weights[f"multi_modal_projector.{module}.bias"] = torch.zeros(out_features)

    return weights
def save_sharded_safetensors(state_dict, output_dir="./mineral-nano-weights",
                             max_shard_size=4.5 * 1024 * 1024 * 1024):
    """Write ``state_dict`` as sharded SafeTensors files plus an index file.

    Tensors are packed greedily, in ``state_dict`` iteration order, into
    shards of at most ``max_shard_size`` bytes.  Shards are planned before
    anything is written so that every file name carries the real shard count
    (``model-0000i-of-0000N.safetensors``).

    Args:
        state_dict: Mapping of parameter name to ``torch.Tensor``.
        output_dir: Directory to write into; created (with parents) if missing.
        max_shard_size: Upper bound, in bytes, for the tensor payload of one
            shard.  A single tensor larger than this gets a shard of its own.

    Returns:
        None.  Writes the shard files and ``model.safetensors.index.json``
        into ``output_dir`` and prints a summary.
    """
    output_path = Path(output_dir)
    # parents=True so nested output directories work as well.
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"\nSaving SafeTensors files to {output_dir}/")

    # ---- Pass 1: decide which keys go into which shard (no I/O yet) ----
    planned = []  # list of (keys, payload size in bytes)
    current_keys = []
    current_size = 0
    total_params = 0
    total_size = 0
    for key, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()
        total_params += tensor.numel()
        total_size += tensor_size

        # Close the running shard if this tensor would overflow it; never
        # emit an empty shard.
        if current_keys and current_size + tensor_size > max_shard_size:
            planned.append((current_keys, current_size))
            current_keys = []
            current_size = 0

        current_keys.append(key)
        current_size += tensor_size

    if current_keys:
        planned.append((current_keys, current_size))

    # ---- Pass 2: write shards, now that the total count is known ----
    num_shards = len(planned)
    shard_files = []
    weight_map = {}
    for shard_idx, (keys, shard_size) in enumerate(planned, start=1):
        shard_file = f"model-{shard_idx:05d}-of-{num_shards:05d}.safetensors"
        print(f" Saving {shard_file} ({shard_size / 1e9:.2f} GB)...")
        # The per-shard dict only references existing tensors; nothing is
        # copied.  The "format" metadata marks the file as a PyTorch
        # checkpoint, matching what Transformers' own saver writes.
        save_file(
            {key: state_dict[key] for key in keys},
            output_path / shard_file,
            metadata={"format": "pt"},
        )
        shard_files.append(shard_file)
        for key in keys:
            weight_map[key] = shard_file

    # ---- Index file mapping every tensor name to its shard ----
    index = {
        "metadata": {"total_size": total_size},
        "weight_map": weight_map,
    }
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)
    print(" ✓ Saved index file: model.safetensors.index.json")

    # ---- Summary ----
    print(f"\n{'='*60}")
    print("✓ SUCCESS! Model weights created:")
    print(f" Total Parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    print(f" Total Size: {total_size/1e9:.2f} GB")
    print(f" Number of Shards: {num_shards}")
    print(f" Output Directory: {output_path.absolute()}")
    print(f"{'='*60}")
    print("\nNext steps:")
    print("1. Upload these files to Hugging Face:")
    # List the files that were actually written rather than assuming two.
    for shard_file in shard_files:
        print(f" - {shard_file}")
    print(" - model.safetensors.index.json")
    print("2. Your model will be ready to load!")
    print("\nNote: These are randomly initialized weights.")
    print("For a working model, you need to train or fine-tune.")
if __name__ == "__main__":
    # Script entry point: print a banner, generate the weights in memory,
    # then write them to the default output directory.
    rule = "=" * 60
    banner = (
        rule,
        "Mineral Nano 1 Vision - SafeTensors Generator",
        "Mid-Range 4.1B Parameter Model",
        rule,
    )
    for banner_line in banner:
        print(banner_line)

    # Create the weights, then save them to disk.
    state_dict = create_mineral_nano_weights()
    save_sharded_safetensors(state_dict)

    print("\n✓ All done! Your SafeTensors files are ready.")