Mineral-Nano-1 / create_safetensors.py

Create create_safetensors.py

6c759b5 verified 8 months ago

10.2 kB

	#!/usr/bin/env python3
	"""
	Create SafeTensors files for Mineral Nano 1 Vision (Mid-Range 4.1B)
	This creates randomly initialized weights ready for upload to Hugging Face
	"""

	import torch
	import json
	from safetensors.torch import save_file
	from pathlib import Path
	import math

	def initialize_weights(shape, init_type="normal", std=0.02):
	    """Create a freshly initialized tensor of the given shape.

	    Args:
	        shape: Tensor shape, passed straight to the torch factory function.
	        init_type: ``"normal"`` draws from a zero-mean normal distribution
	            scaled by ``std``; ``"uniform"`` draws from U(-limit, limit)
	            with ``limit`` chosen so the standard deviation equals ``std``;
	            any other value yields an all-zero tensor.
	        std: Target standard deviation for the random initializers.

	    Returns:
	        A new ``torch.Tensor`` of the requested shape.
	    """
	    if init_type == "normal":
	        return torch.randn(shape) * std
	    if init_type == "uniform":
	        # U(-a, a) has variance a^2 / 3, so a = sqrt(3) * std gives the
	        # requested standard deviation. (sqrt(3 * std) does not: for
	        # std=0.02 it would produce a spread of about 0.14.)
	        limit = math.sqrt(3.0) * std
	        return torch.rand(shape) * 2 * limit - limit
	    # Unknown init types fall back to zeros, as before.
	    return torch.zeros(shape)

	def create_mineral_nano_weights(
	    *,
	    vocab_size=64000,
	    hidden_size=2048,
	    intermediate_size=8192,
	    num_layers=28,
	    num_heads=32,
	    num_kv_heads=8,
	    vision_hidden=1536,
	    vision_intermediate=6144,
	    vision_layers=24,
	    vision_patches=1024,
	    patch_size=14,
	):
	    """Create all model weights for Mineral Nano 1 Vision.

	    Builds, in this order, the language model (``model.*`` and
	    ``lm_head.weight``), the vision encoder (``vision_model.*``) and the
	    two-layer projector (``multi_modal_projector.*``). Weight matrices come
	    from ``initialize_weights(shape, "normal", 0.02)``, norm scales are
	    ones and every bias is zeros.

	    All arguments are keyword-only and default to the values this script
	    previously hard-coded, so a call with no arguments builds the same
	    tensors in the same order as before.

	    Args:
	        vocab_size: Rows of the token embedding and of ``lm_head``.
	        hidden_size: Language model width.
	        intermediate_size: Language model MLP width.
	        num_layers: Number of language model layers.
	        num_heads: Number of query heads; only used to derive the per-head
	            size ``hidden_size // num_heads``.
	        num_kv_heads: Number of key/value heads; the k/v projections have
	            ``num_kv_heads * (hidden_size // num_heads)`` output rows.
	        vision_hidden: Vision encoder width.
	        vision_intermediate: Vision encoder MLP width.
	        vision_layers: Number of vision encoder layers.
	        vision_patches: Number of image patches; the position embedding
	            has ``vision_patches + 1`` rows. The default 1024 corresponds
	            to 448 / 14 = 32 patches per side, 32 * 32 = 1024.
	        patch_size: Side length of the square patch-embedding kernel.

	    Returns:
	        dict mapping parameter names to ``torch.Tensor`` objects, in
	        creation order.

	    Raises:
	        ValueError: If ``hidden_size`` is not divisible by ``num_heads``.
	    """
	    if hidden_size % num_heads != 0:
	        raise ValueError(
	            f"hidden_size ({hidden_size}) must be divisible by "
	            f"num_heads ({num_heads})"
	        )

	    # NOTE: with the default dimensions the tensors built below total
	    # 2,654,930,944 parameters (about 10.6 GB at 4 bytes per element).
	    # The "4.1B" / "~16GB" wording is the script's original banner text
	    # and is left untouched.
	    print("Creating Mineral Nano 1 Vision SafeTensors (4.1B parameters)...")
	    print("This will take a few minutes and use ~16GB RAM temporarily...")

	    state_dict = {}

	    def add_normal(name, *shape):
	        # Every random tensor in this model uses the same N(0, 0.02) init.
	        state_dict[name] = initialize_weights(shape, "normal", 0.02)

	    def add_linear(name, out_features, in_features):
	        # Linear layer with a zero bias (vision encoder and projector).
	        add_normal(f"{name}.weight", out_features, in_features)
	        state_dict[f"{name}.bias"] = torch.zeros(out_features)

	    def add_layer_norm(name, size):
	        # LayerNorm with identity scale and zero shift.
	        state_dict[f"{name}.weight"] = torch.ones(size)
	        state_dict[f"{name}.bias"] = torch.zeros(size)

	    def report_progress(done, total, label):
	        # Report every fifth layer, plus the last one so the log always
	        # ends on "total/total" even when total is not a multiple of 5.
	        if done % 5 == 0 or done == total:
	            print(f" ✓ Completed {done}/{total} {label}")

	    # ============ LANGUAGE MODEL WEIGHTS ============

	    print("\n[1/4] Creating language model embeddings...")
	    add_normal("model.embed_tokens.weight", vocab_size, hidden_size)

	    print(f"[2/4] Creating language model layers ({num_layers} layers)...")
	    # Key/value projections are narrower than the query projection because
	    # there are fewer key/value heads than query heads.
	    kv_dim = num_kv_heads * (hidden_size // num_heads)
	    language_projections = (
	        ("self_attn.q_proj", hidden_size, hidden_size),
	        ("self_attn.k_proj", kv_dim, hidden_size),
	        ("self_attn.v_proj", kv_dim, hidden_size),
	        ("self_attn.o_proj", hidden_size, hidden_size),
	        ("mlp.gate_proj", intermediate_size, hidden_size),
	        ("mlp.up_proj", intermediate_size, hidden_size),
	        ("mlp.down_proj", hidden_size, intermediate_size),
	    )
	    for layer_idx in range(num_layers):
	        prefix = f"model.layers.{layer_idx}"
	        for name, out_features, in_features in language_projections:
	            add_normal(f"{prefix}.{name}.weight", out_features, in_features)

	        # These norms carry a scale only (no bias tensor is created).
	        state_dict[f"{prefix}.input_layernorm.weight"] = torch.ones(hidden_size)
	        state_dict[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(
	            hidden_size
	        )
	        report_progress(layer_idx + 1, num_layers, "layers")

	    # Final layer norm and LM head
	    state_dict["model.norm.weight"] = torch.ones(hidden_size)
	    add_normal("lm_head.weight", vocab_size, hidden_size)

	    print(f"[3/4] Creating vision encoder ({vision_layers} layers)...")

	    # ============ VISION ENCODER WEIGHTS ============

	    add_normal(
	        "vision_model.embeddings.patch_embedding.weight",
	        vision_hidden, 3, patch_size, patch_size,
	    )
	    add_normal("vision_model.embeddings.class_embedding", vision_hidden)
	    # One extra position for the class embedding.
	    add_normal(
	        "vision_model.embeddings.position_embedding.weight",
	        vision_patches + 1, vision_hidden,
	    )

	    vision_linears = (
	        ("self_attn.q_proj", vision_hidden, vision_hidden),
	        ("self_attn.k_proj", vision_hidden, vision_hidden),
	        ("self_attn.v_proj", vision_hidden, vision_hidden),
	        ("self_attn.out_proj", vision_hidden, vision_hidden),
	        ("mlp.fc1", vision_intermediate, vision_hidden),
	        ("mlp.fc2", vision_hidden, vision_intermediate),
	    )
	    for layer_idx in range(vision_layers):
	        prefix = f"vision_model.encoder.layers.{layer_idx}"
	        for name, out_features, in_features in vision_linears:
	            add_linear(f"{prefix}.{name}", out_features, in_features)
	        add_layer_norm(f"{prefix}.layer_norm1", vision_hidden)
	        add_layer_norm(f"{prefix}.layer_norm2", vision_hidden)
	        report_progress(layer_idx + 1, vision_layers, "vision layers")

	    # Vision post-layernorm
	    add_layer_norm("vision_model.post_layernorm", vision_hidden)

	    print("[4/4] Creating vision-language projection...")

	    # ============ MULTI-MODAL PROJECTOR ============

	    # Project vision features to the language model dimension.
	    add_linear("multi_modal_projector.linear_1", hidden_size, vision_hidden)
	    add_linear("multi_modal_projector.linear_2", hidden_size, hidden_size)

	    return state_dict

	def save_sharded_safetensors(
	    state_dict,
	    output_dir="./mineral-nano-weights",
	    max_shard_size=4.5 * 1024 * 1024 * 1024,
	):
	    """Save a state dict as sharded SafeTensors files plus an index.

	    Tensors are packed greedily, in ``state_dict`` order, into shards of at
	    most ``max_shard_size`` bytes (a single tensor larger than the limit
	    gets a shard of its own). The shards are planned before anything is
	    written so every file name carries the real shard count
	    (``model-00001-of-0000N.safetensors``) instead of a hard-coded total.
	    A ``model.safetensors.index.json`` mapping each tensor name to its
	    shard file is written next to the shards.

	    Args:
	        state_dict: Mapping of parameter name to ``torch.Tensor``.
	        output_dir: Directory to write into; created (including missing
	            parents) if it does not exist.
	        max_shard_size: Upper bound on shard size in bytes. Defaults to
	            4.5 GiB.

	    Returns:
	        None. Files are written to ``output_dir`` and a summary is printed.
	    """
	    output_path = Path(output_dir)
	    output_path.mkdir(parents=True, exist_ok=True)

	    print(f"\nSaving SafeTensors files to {output_dir}/")

	    # Pass 1: decide which tensors go into which shard. Nothing is written
	    # yet because the file names need the final shard count.
	    planned_shards = []  # list of (tensor names, shard size in bytes)
	    current_keys = []
	    current_size = 0
	    total_params = 0
	    total_size = 0
	    for key, tensor in state_dict.items():
	        tensor_size = tensor.numel() * tensor.element_size()
	        total_params += tensor.numel()
	        total_size += tensor_size

	        # Close the current shard if this tensor would overflow it; the
	        # "current_keys" guard keeps an oversized tensor from producing an
	        # empty shard.
	        if current_keys and current_size + tensor_size > max_shard_size:
	            planned_shards.append((current_keys, current_size))
	            current_keys = []
	            current_size = 0

	        current_keys.append(key)
	        current_size += tensor_size

	    if current_keys:
	        planned_shards.append((current_keys, current_size))

	    # Pass 2: write the shards and build the name -> file map.
	    num_shards = len(planned_shards)
	    shard_files = []
	    weight_map = {}
	    for shard_idx, (keys, shard_size) in enumerate(planned_shards, start=1):
	        shard_file = f"model-{shard_idx:05d}-of-{num_shards:05d}.safetensors"
	        print(f" Saving {shard_file} ({shard_size / 1e9:.2f} GB)...")
	        # The "format" metadata entry marks the file as a PyTorch
	        # checkpoint for loaders that inspect it.
	        save_file(
	            {key: state_dict[key] for key in keys},
	            output_path / shard_file,
	            metadata={"format": "pt"},
	        )
	        shard_files.append(shard_file)
	        for key in keys:
	            weight_map[key] = shard_file

	    # Create index file
	    index = {
	        "metadata": {"total_size": total_size},
	        "weight_map": weight_map,
	    }
	    index_path = output_path / "model.safetensors.index.json"
	    with open(index_path, "w", encoding="utf-8") as f:
	        json.dump(index, f, indent=2)

	    print(" ✓ Saved index file: model.safetensors.index.json")

	    print(f"\n{'='*60}")
	    print("✓ SUCCESS! Model weights created:")
	    print(f" Total Parameters: {total_params:,} ({total_params/1e9:.2f}B)")
	    print(f" Total Size: {total_size/1e9:.2f} GB")
	    print(f" Number of Shards: {num_shards}")
	    print(f" Output Directory: {output_path.absolute()}")
	    print(f"{'='*60}")
	    print("\nNext steps:")
	    print("1. Upload these files to Hugging Face:")
	    # List the files that were actually written rather than assuming two.
	    for shard_file in shard_files:
	        print(f" - {shard_file}")
	    print(" - model.safetensors.index.json")
	    print("2. Your model will be ready to load!")
	    print("\nNote: These are randomly initialized weights.")
	    print("For a working model, you need to train or fine-tune.")

	if __name__ == "__main__":
	    # Script entry point: print a banner, build the weights in memory,
	    # then write them to the default output directory.
	    rule = "=" * 60
	    banner_lines = (
	        rule,
	        "Mineral Nano 1 Vision - SafeTensors Generator",
	        "Mid-Range 4.1B Parameter Model",
	        rule,
	    )
	    for banner_line in banner_lines:
	        print(banner_line)

	    # Generate every tensor first; saving needs the complete dict to
	    # split it into shards.
	    state_dict = create_mineral_nano_weights()
	    save_sharded_safetensors(state_dict)

	    print("\n✓ All done! Your SafeTensors files are ready.")