File size: 10,171 Bytes

6c759b5

#!/usr/bin/env python3
"""
Create SafeTensors files for Mineral Nano 1 Vision (Mid-Range 4.1B)
This creates randomly initialized weights ready for upload to Hugging Face
"""

import torch
import json
from safetensors.torch import save_file
from pathlib import Path
import math

def initialize_weights(shape, init_type="normal", std=0.02):
    """Return a freshly initialized tensor of the requested shape.

    Args:
        shape: Tensor shape (tuple of ints) forwarded to the torch factory.
        init_type: Initialization scheme.
            ``"normal"``  - samples from N(0, std**2).
            ``"uniform"`` - samples from U(-limit, limit), with ``limit``
            chosen so that the resulting standard deviation equals ``std``.
            Any other value yields an all-zero tensor.
        std: Target standard deviation for the two random schemes.

    Returns:
        A ``torch.Tensor`` of shape ``shape``.
    """
    if init_type == "normal":
        return torch.randn(shape) * std
    if init_type == "uniform":
        # U(-a, a) has variance a**2 / 3, so a = sqrt(3) * std yields the
        # requested standard deviation. (sqrt(3 * std) mixes units and gives
        # a far wider range for small std.)
        limit = math.sqrt(3.0) * std
        # torch.rand samples [0, 1); rescale to [-limit, limit).
        return torch.rand(shape) * 2 * limit - limit
    return torch.zeros(shape)

def create_mineral_nano_weights():
    """Create all model weights for Mineral Nano 1 Vision.

    Builds, entirely in memory, a state dict with three groups of tensors:

    * ``model.*`` / ``lm_head.weight`` - the language model: token embeddings,
      ``num_layers`` decoder layers (q/k/v/o projections without biases, a
      gate/up/down MLP, two norm weights each), a final norm and the LM head.
    * ``vision_model.*`` - the vision encoder: patch/class/position
      embeddings, ``vision_layers`` encoder layers (attention and MLP with
      biases, two layer norms each) and a post layer norm.
    * ``multi_modal_projector.*`` - two linear layers mapping vision features
      to the language model's hidden size.

    Projection and embedding weights are drawn from N(0, 0.02**2) through
    ``initialize_weights``; biases are zeros and norm weights are ones.
    Keys are inserted in a fixed order, which downstream sharding relies on
    when it walks the dict.

    NOTE: with the configuration below the tensors sum to roughly 2.65 billion
    parameters (about 10.6 GB at 4 bytes per element), not the 4.1B quoted in
    the progress banner.

    Returns:
        dict mapping parameter name (str) to ``torch.Tensor``.
    """

    print("Creating Mineral Nano 1 Vision SafeTensors (4.1B parameters)...")
    print("This will take a few minutes and use ~16GB RAM temporarily...")

    state_dict = {}

    # Configuration - language model
    vocab_size = 64000
    hidden_size = 2048
    intermediate_size = 8192
    num_layers = 28
    num_heads = 32
    num_kv_heads = 8
    # K/V projections use fewer heads than Q, so their output dimension is
    # num_kv_heads * head_dim rather than hidden_size.
    kv_size = num_kv_heads * (hidden_size // num_heads)

    # Configuration - vision encoder
    vision_hidden = 1536
    vision_intermediate = 6144
    vision_layers = 24
    vision_heads = 24
    image_size = 448
    patch_size = 14
    # Derived so the position table always matches the patch kernel:
    # 448 / 14 = 32 patches per side, 32 * 32 = 1024 patches.
    vision_patches = (image_size // patch_size) ** 2

    # Print a progress line every this many layers (and after the last one).
    progress_every = 5

    def normal(*shape):
        """Sample an N(0, 0.02**2) tensor of the given shape."""
        return initialize_weights(shape, "normal", 0.02)

    # ============ LANGUAGE MODEL WEIGHTS ============

    print("\n[1/4] Creating language model embeddings...")
    # Token embeddings
    state_dict["model.embed_tokens.weight"] = normal(vocab_size, hidden_size)

    print(f"[2/4] Creating language model layers ({num_layers} layers)...")
    # Transformer layers
    for layer_idx in range(num_layers):
        prefix = f"model.layers.{layer_idx}"

        # Self attention (weights are stored as (out_features, in_features))
        state_dict[f"{prefix}.self_attn.q_proj.weight"] = normal(hidden_size, hidden_size)
        state_dict[f"{prefix}.self_attn.k_proj.weight"] = normal(kv_size, hidden_size)
        state_dict[f"{prefix}.self_attn.v_proj.weight"] = normal(kv_size, hidden_size)
        state_dict[f"{prefix}.self_attn.o_proj.weight"] = normal(hidden_size, hidden_size)

        # MLP
        state_dict[f"{prefix}.mlp.gate_proj.weight"] = normal(intermediate_size, hidden_size)
        state_dict[f"{prefix}.mlp.up_proj.weight"] = normal(intermediate_size, hidden_size)
        state_dict[f"{prefix}.mlp.down_proj.weight"] = normal(hidden_size, intermediate_size)

        # Layer norms (weight only, initialized to the identity scale)
        state_dict[f"{prefix}.input_layernorm.weight"] = torch.ones(hidden_size)
        state_dict[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(hidden_size)

        completed = layer_idx + 1
        # Also report the last layer, which need not be a multiple of the
        # reporting interval.
        if completed % progress_every == 0 or completed == num_layers:
            print(f"  ✓ Completed {completed}/{num_layers} layers")

    # Final layer norm and LM head
    state_dict["model.norm.weight"] = torch.ones(hidden_size)
    state_dict["lm_head.weight"] = normal(vocab_size, hidden_size)

    print(f"[3/4] Creating vision encoder ({vision_layers} layers)...")

    # ============ VISION ENCODER WEIGHTS ============

    # Vision embeddings: a conv-style patch kernel over 3 input channels, one
    # class embedding, and one position per patch plus one for the class slot.
    state_dict["vision_model.embeddings.patch_embedding.weight"] = normal(
        vision_hidden, 3, patch_size, patch_size
    )
    state_dict["vision_model.embeddings.class_embedding"] = normal(vision_hidden)
    state_dict["vision_model.embeddings.position_embedding.weight"] = normal(
        vision_patches + 1, vision_hidden
    )

    # Vision transformer layers
    for layer_idx in range(vision_layers):
        prefix = f"vision_model.encoder.layers.{layer_idx}"

        # Self attention: four square projections, each with a zero bias.
        for proj in ("q_proj", "k_proj", "v_proj", "out_proj"):
            state_dict[f"{prefix}.self_attn.{proj}.weight"] = normal(
                vision_hidden, vision_hidden
            )
            state_dict[f"{prefix}.self_attn.{proj}.bias"] = torch.zeros(vision_hidden)

        # MLP
        state_dict[f"{prefix}.mlp.fc1.weight"] = normal(vision_intermediate, vision_hidden)
        state_dict[f"{prefix}.mlp.fc1.bias"] = torch.zeros(vision_intermediate)

        state_dict[f"{prefix}.mlp.fc2.weight"] = normal(vision_hidden, vision_intermediate)
        state_dict[f"{prefix}.mlp.fc2.bias"] = torch.zeros(vision_hidden)

        # Layer norms
        for norm_name in ("layer_norm1", "layer_norm2"):
            state_dict[f"{prefix}.{norm_name}.weight"] = torch.ones(vision_hidden)
            state_dict[f"{prefix}.{norm_name}.bias"] = torch.zeros(vision_hidden)

        completed = layer_idx + 1
        if completed % progress_every == 0 or completed == vision_layers:
            print(f"  ✓ Completed {completed}/{vision_layers} vision layers")

    # Vision post-layernorm
    state_dict["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    state_dict["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    print("[4/4] Creating vision-language projection...")

    # ============ MULTI-MODAL PROJECTOR ============

    # Project vision features to language model dimension
    state_dict["multi_modal_projector.linear_1.weight"] = normal(hidden_size, vision_hidden)
    state_dict["multi_modal_projector.linear_1.bias"] = torch.zeros(hidden_size)

    state_dict["multi_modal_projector.linear_2.weight"] = normal(hidden_size, hidden_size)
    state_dict["multi_modal_projector.linear_2.bias"] = torch.zeros(hidden_size)

    return state_dict

def save_sharded_safetensors(state_dict, output_dir="./mineral-nano-weights"):
    """Save model in sharded format for Hugging Face.

    Tensors are packed greedily, in ``state_dict`` iteration order, into
    shards of at most 4.5 GiB each (a single tensor larger than the limit
    gets a shard of its own). Each shard is written as
    ``model-XXXXX-of-YYYYY.safetensors`` and a
    ``model.safetensors.index.json`` file records the total byte size and
    which shard holds each tensor.

    Sharding is planned before anything is written so that the ``YYYYY`` part
    of every filename reflects the real number of shards.

    Args:
        state_dict: Mapping of parameter name to ``torch.Tensor``.
        output_dir: Directory to write into; created (with parents) if
            missing.

    Returns:
        None. Files are written to ``output_dir`` and a summary is printed.
    """

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"\nSaving SafeTensors files to {output_dir}/")

    max_shard_size = 4.5 * 1024 * 1024 * 1024  # 4.5 GB per shard

    # ---- Pass 1: decide which tensors go into which shard ----
    planned_shards = []  # list of (tensors_dict, size_in_bytes)
    current_shard = {}
    current_size = 0

    for key, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()

        # Close the current shard when this tensor would overflow it. The
        # non-empty check keeps an oversized tensor from producing an empty
        # shard ahead of itself.
        if current_shard and current_size + tensor_size > max_shard_size:
            planned_shards.append((current_shard, current_size))
            current_shard = {}
            current_size = 0

        current_shard[key] = tensor
        current_size += tensor_size

    if current_shard:
        planned_shards.append((current_shard, current_size))

    num_shards = len(planned_shards)

    # ---- Pass 2: write shards, now that the total count is known ----
    shard_files = []
    weight_map = {}
    for shard_idx, (tensors, shard_size) in enumerate(planned_shards, start=1):
        shard_file = f"model-{shard_idx:05d}-of-{num_shards:05d}.safetensors"
        print(f"  Saving {shard_file} ({shard_size / 1e9:.2f} GB)...")
        # Tag the file as PyTorch-format in the safetensors header, as
        # Hugging Face's own checkpoint writer does.
        save_file(tensors, output_path / shard_file, metadata={"format": "pt"})
        shard_files.append(shard_file)
        for key in tensors:
            weight_map[key] = shard_file

    # Calculate totals once; used by both the index and the summary.
    total_params = sum(t.numel() for t in state_dict.values())
    total_size = sum(t.numel() * t.element_size() for t in state_dict.values())

    # Create index file
    index = {
        "metadata": {
            "total_size": total_size
        },
        "weight_map": weight_map
    }

    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)

    print("  ✓ Saved index file: model.safetensors.index.json")

    print(f"\n{'='*60}")
    print("✓ SUCCESS! Model weights created:")
    print(f"  Total Parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    print(f"  Total Size: {total_size/1e9:.2f} GB")
    print(f"  Number of Shards: {num_shards}")
    print(f"  Output Directory: {output_path.absolute()}")
    print(f"{'='*60}")
    print("\nNext steps:")
    print("1. Upload these files to Hugging Face:")
    # List the files actually written rather than assuming two shards.
    for shard_file in shard_files:
        print(f"   - {shard_file}")
    print("   - model.safetensors.index.json")
    print("2. Your model will be ready to load!")
    print("\nNote: These are randomly initialized weights.")
    print("For a working model, you need to train or fine-tune.")

if __name__ == "__main__":
    # Script entry point: print a banner, build the weights, write them out.
    banner = "=" * 60
    print(banner)
    print("Mineral Nano 1 Vision - SafeTensors Generator")
    print("Mid-Range 4.1B Parameter Model")
    print(banner)

    # Generate every tensor in memory first ...
    state_dict = create_mineral_nano_weights()

    # ... then shard and write them using the default output directory.
    save_sharded_safetensors(state_dict)

    print("\n✓ All done! Your SafeTensors files are ready.")