Instructions to use Luke-Bergen/Mineral-Nano-1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- fastText
How to use Luke-Bergen/Mineral-Nano-1 with fastText:
```python
from huggingface_hub import hf_hub_download
import fasttext

model = fasttext.load_model(hf_hub_download("Luke-Bergen/Mineral-Nano-1", "model.bin"))
```
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """ | |
| Create SafeTensors files for Mineral Nano 1 Vision (Mid-Range 4.1B) | |
| This creates randomly initialized weights ready for upload to Hugging Face | |
| """ | |
| import torch | |
| import json | |
| from safetensors.torch import save_file | |
| from pathlib import Path | |
| import math | |
def initialize_weights(shape, init_type="normal", std=0.02):
    """Return a newly allocated tensor of ``shape`` filled per ``init_type``.

    Args:
        shape: Tensor dimensions, forwarded unchanged to the torch factory
            functions.
        init_type: ``"normal"`` samples a standard normal and scales it by
            ``std``; ``"uniform"`` samples U(-limit, limit) with ``limit``
            chosen so the standard deviation equals ``std``; any other value
            produces an all-zero tensor.
        std: Target standard deviation of the sampled values.

    Returns:
        A new ``torch.Tensor`` with the requested shape.
    """
    if init_type == "normal":
        return torch.randn(shape) * std
    if init_type == "uniform":
        # Var[U(-a, a)] = a**2 / 3, so a spread of `std` needs a = sqrt(3) * std.
        # (sqrt(3 * std) would give a far wider range, e.g. 0.245 for std=0.02.)
        limit = math.sqrt(3.0) * std
        # torch.rand is U[0, 1); rescale and shift it onto [-limit, limit).
        return torch.rand(shape) * 2 * limit - limit
    # Unknown init types fall back to zeros rather than raising.
    return torch.zeros(shape)
def create_mineral_nano_weights():
    """Build the randomly initialized state dict for Mineral Nano 1 Vision.

    Tensors are inserted in a fixed order: token embeddings, the language
    model layers, the final norm and LM head, the vision encoder, and finally
    the two-layer multimodal projector. Every weight matrix comes from
    ``initialize_weights(..., "normal", 0.02)``; norm scales are ones and all
    biases are zeros.

    NOTE(review): the shapes below add up to roughly 2.65B elements, not the
    4.1B mentioned in the progress message; the message is kept verbatim.

    Returns:
        dict mapping parameter names to ``torch.Tensor`` objects.
    """
    print("Creating Mineral Nano 1 Vision SafeTensors (4.1B parameters)...")
    print("This will take a few minutes and use ~16GB RAM temporarily...")

    # Language model dimensions.
    vocab_size, hidden_size, intermediate_size = 64000, 2048, 8192
    num_layers, num_heads, num_kv_heads = 28, 32, 8
    # Vision encoder dimensions.
    vision_hidden, vision_intermediate = 1536, 6144
    vision_layers = 24
    vision_heads = 24  # recorded for reference; no tensor shape below uses it
    vision_patches = 1024  # 448/14 = 32, 32*32 = 1024

    def gaussian(*dims):
        # Every random matrix in this model uses the same normal(std=0.02) init.
        return initialize_weights(dims, "normal", 0.02)

    weights = {}

    # ---------------- language model ----------------
    print("\n[1/4] Creating language model embeddings...")
    weights["model.embed_tokens.weight"] = gaussian(vocab_size, hidden_size)

    print("[2/4] Creating language model layers (28 layers)...")
    # K/V projections are narrower than Q: num_kv_heads heads of
    # (hidden_size // num_heads) channels each.
    kv_dim = num_kv_heads * (hidden_size // num_heads)
    decoder_matrices = (
        ("self_attn.q_proj", (hidden_size, hidden_size)),
        ("self_attn.k_proj", (kv_dim, hidden_size)),
        ("self_attn.v_proj", (kv_dim, hidden_size)),
        ("self_attn.o_proj", (hidden_size, hidden_size)),
        ("mlp.gate_proj", (intermediate_size, hidden_size)),
        ("mlp.up_proj", (intermediate_size, hidden_size)),
        ("mlp.down_proj", (hidden_size, intermediate_size)),
    )
    for finished in range(1, num_layers + 1):
        base = f"model.layers.{finished - 1}"
        for module, dims in decoder_matrices:
            weights[f"{base}.{module}.weight"] = gaussian(*dims)
        for norm in ("input_layernorm", "post_attention_layernorm"):
            weights[f"{base}.{norm}.weight"] = torch.ones(hidden_size)
        if finished % 5 == 0:
            print(f" ✓ Completed {finished}/{num_layers} layers")

    weights["model.norm.weight"] = torch.ones(hidden_size)
    weights["lm_head.weight"] = gaussian(vocab_size, hidden_size)

    # ---------------- vision encoder ----------------
    print("[3/4] Creating vision encoder (24 layers)...")
    weights["vision_model.embeddings.patch_embedding.weight"] = gaussian(
        vision_hidden, 3, 14, 14
    )
    weights["vision_model.embeddings.class_embedding"] = gaussian(vision_hidden)
    # One position per patch plus one extra row for the class embedding.
    weights["vision_model.embeddings.position_embedding.weight"] = gaussian(
        vision_patches + 1, vision_hidden
    )

    # (module name, output features, input features); each also gets a bias.
    encoder_linears = (
        ("self_attn.q_proj", vision_hidden, vision_hidden),
        ("self_attn.k_proj", vision_hidden, vision_hidden),
        ("self_attn.v_proj", vision_hidden, vision_hidden),
        ("self_attn.out_proj", vision_hidden, vision_hidden),
        ("mlp.fc1", vision_intermediate, vision_hidden),
        ("mlp.fc2", vision_hidden, vision_intermediate),
    )
    for finished in range(1, vision_layers + 1):
        base = f"vision_model.encoder.layers.{finished - 1}"
        for module, out_dim, in_dim in encoder_linears:
            weights[f"{base}.{module}.weight"] = gaussian(out_dim, in_dim)
            weights[f"{base}.{module}.bias"] = torch.zeros(out_dim)
        for norm in ("layer_norm1", "layer_norm2"):
            weights[f"{base}.{norm}.weight"] = torch.ones(vision_hidden)
            weights[f"{base}.{norm}.bias"] = torch.zeros(vision_hidden)
        if finished % 5 == 0:
            print(f" ✓ Completed {finished}/{vision_layers} vision layers")

    weights["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    weights["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    # ---------------- multimodal projector ----------------
    print("[4/4] Creating vision-language projection...")
    # linear_1 maps vision features to the language width; linear_2 keeps it.
    for name, in_dim in (("linear_1", vision_hidden), ("linear_2", hidden_size)):
        weights[f"multi_modal_projector.{name}.weight"] = gaussian(hidden_size, in_dim)
        weights[f"multi_modal_projector.{name}.bias"] = torch.zeros(hidden_size)

    return weights
def _plan_shards(state_dict, max_shard_size):
    """Greedily group tensors, in insertion order, into shards of at most ``max_shard_size`` bytes."""
    planned = []
    current_shard = {}
    current_size = 0
    for key, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()
        # Only roll over when the shard already holds something, so a single
        # tensor larger than the limit still gets a shard of its own.
        if current_shard and current_size + tensor_size > max_shard_size:
            planned.append((current_shard, current_size))
            current_shard, current_size = {}, 0
        current_shard[key] = tensor
        current_size += tensor_size
    if current_shard:
        planned.append((current_shard, current_size))
    return planned


def save_sharded_safetensors(
    state_dict,
    output_dir="./mineral-nano-weights",
    max_shard_size=4.5 * 1024 * 1024 * 1024,
):
    """Write ``state_dict`` as sharded SafeTensors files plus an index JSON.

    The shard layout is computed before anything is written so that every
    file name carries the real shard count (``model-0000i-of-0000N``); the
    count is not assumed to be two.

    Args:
        state_dict: Mapping of parameter name to ``torch.Tensor``; insertion
            order decides which shard each tensor lands in.
        output_dir: Directory for the shard files and
            ``model.safetensors.index.json``; created if missing, including
            parent directories.
        max_shard_size: Upper bound, in bytes, on the summed tensor sizes of
            one shard (default 4.5 GiB). A single tensor above the bound is
            stored alone in its own shard.

    Returns:
        None. Files are written to ``output_dir`` and progress is printed.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving SafeTensors files to {output_dir}/")

    planned = _plan_shards(state_dict, max_shard_size)
    shard_total = len(planned)

    # Write each shard, recording which file every tensor name ended up in.
    weight_map = {}
    shard_files = []
    for shard_idx, (shard_tensors, shard_size) in enumerate(planned, start=1):
        shard_file = f"model-{shard_idx:05d}-of-{shard_total:05d}.safetensors"
        print(f" Saving {shard_file} ({shard_size / 1e9:.2f} GB)...")
        # Tag the file as a PyTorch checkpoint; loaders that inspect the
        # "format" metadata entry (e.g. Transformers) expect it to be set.
        save_file(shard_tensors, output_path / shard_file, metadata={"format": "pt"})
        shard_files.append(shard_file)
        for key in shard_tensors:
            weight_map[key] = shard_file

    total_params = sum(t.numel() for t in state_dict.values())
    total_size = sum(t.numel() * t.element_size() for t in state_dict.values())

    # Index file: total byte size plus the tensor-name -> shard-file map.
    index = {
        "metadata": {"total_size": total_size},
        "weight_map": weight_map,
    }
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(index, f, indent=2)
    print(" ✓ Saved index file: model.safetensors.index.json")

    rule = "=" * 60
    print(f"\n{rule}")
    print("✓ SUCCESS! Model weights created:")
    print(f" Total Parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    print(f" Total Size: {total_size/1e9:.2f} GB")
    print(f" Number of Shards: {shard_total}")
    print(f" Output Directory: {output_path.absolute()}")
    print(rule)
    print("\nNext steps:")
    print("1. Upload these files to Hugging Face:")
    # List the files that were actually written rather than a fixed pair.
    for shard_file in shard_files:
        print(f" - {shard_file}")
    print(" - model.safetensors.index.json")
    print("2. Your model will be ready to load!")
    print("\nNote: These are randomly initialized weights.")
    print("For a working model, you need to train or fine-tune.")
if __name__ == "__main__":
    # Script entry point: show the banner, build every tensor in memory, then
    # write the shards and the index to the default output directory.
    banner_rule = "=" * 60
    banner_lines = (
        banner_rule,
        "Mineral Nano 1 Vision - SafeTensors Generator",
        "Mid-Range 4.1B Parameter Model",
        banner_rule,
    )
    print("\n".join(banner_lines))

    # Generate the weights first; saving needs the complete dict to plan shards.
    state_dict = create_mineral_nano_weights()
    save_sharded_safetensors(state_dict)

    print("\n✓ All done! Your SafeTensors files are ready.")