#!/usr/bin/env python3
"""
Vocabulary Expansion Script for Model Distillation

This script expands the vocabulary of an existing model to match a larger
tokenizer from a teacher model, enabling distillation between models with
different vocabularies.

The core architectural problem: A model's vocabulary is fixed in its embedding
layer and output projection. This script expands these layers (via
``resize_token_embeddings``) while keeping every existing row, then saves the
model together with the new tokenizer.

Note:
    Only the *size* of the vocabulary matrices is changed. Rows are not
    remapped, so if the new tokenizer assigns different IDs to tokens than the
    original tokenizer did, the preserved rows have to be re-learned during
    distillation / fine-tuning.

Author: Transfer-First LLM Framework
"""

import argparse
import logging
import os
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def _locate_vocab_layers(model):
    """Return ``(input_embedding, output_projection)`` for *model*.

    Uses the generic ``get_input_embeddings()`` / ``get_output_embeddings()``
    accessors instead of architecture-specific attribute paths, so any model
    exposing them is supported. ``output_projection`` may be ``None`` when the
    model exposes no output head.

    Raises:
        ValueError: If no input embedding layer with a ``weight`` can be found.
    """
    try:
        embed_layer = model.get_input_embeddings()
    except (AttributeError, NotImplementedError):
        embed_layer = None

    if embed_layer is None or getattr(embed_layer, "weight", None) is None:
        logger.error("Could not identify embedding layer. Model architecture not supported.")
        raise ValueError("Unsupported model architecture")

    try:
        lm_head = model.get_output_embeddings()
    except (AttributeError, NotImplementedError):
        lm_head = None
    if lm_head is None:
        # Fall back to the conventional attribute name used by causal LMs.
        lm_head = getattr(model, "lm_head", None)

    return embed_layer, lm_head


def _describe_shape(layer) -> str:
    """Return the printable weight shape of *layer*, or ``"n/a"`` if it has none."""
    weight = getattr(layer, "weight", None)
    return str(weight.shape) if weight is not None else "n/a"


def expand_model_vocabulary(model_repo_id: str, new_tokenizer_repo_id: str, output_dir: str) -> None:
    """
    Expand a model's vocabulary to match a new, larger tokenizer.

    The embedding matrix is only ever grown: if it already has at least as
    many rows as the new tokenizer has tokens, it is left untouched so that no
    trained rows are discarded. The model, the new tokenizer and a short text
    summary are then written to ``output_dir``.

    Args:
        model_repo_id: HuggingFace repo ID of the student model to expand
        new_tokenizer_repo_id: HuggingFace repo ID of the teacher model's tokenizer
        output_dir: Local directory to save the expanded model

    Raises:
        ValueError: If the embedding layer cannot be located, or the embedding
            matrix does not have the expected size after resizing.
        Exception: Any error raised while loading, resizing or saving is
            logged and re-raised unchanged.
    """
    logger.info("=" * 60)
    logger.info("VOCABULARY EXPANSION FOR DISTILLATION")
    logger.info("=" * 60)

    # Step 1: Load original model and tokenizer
    logger.info(f"Loading original model from: {model_repo_id}")
    try:
        # SECURITY NOTE: trust_remote_code=True lets the model repository run
        # its own Python code on this machine. Only use repos you trust.
        original_model = AutoModelForCausalLM.from_pretrained(
            model_repo_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        original_tokenizer = AutoTokenizer.from_pretrained(model_repo_id)
    except Exception as e:
        logger.error(f"Failed to load original model: {e}")
        raise
    logger.info("✓ Original model loaded successfully")
    logger.info(f"  Model type: {original_model.__class__.__name__}")
    logger.info(f"  Parameters: {sum(p.numel() for p in original_model.parameters()):,}")

    # Step 2: Load new tokenizer (from teacher model)
    logger.info(f"Loading new tokenizer from: {new_tokenizer_repo_id}")
    try:
        new_tokenizer = AutoTokenizer.from_pretrained(new_tokenizer_repo_id)
    except Exception as e:
        logger.error(f"Failed to load new tokenizer: {e}")
        raise
    logger.info("✓ New tokenizer loaded successfully")

    # Step 3: Log initial state
    original_vocab_size = len(original_tokenizer)
    new_vocab_size = len(new_tokenizer)
    tokens_to_add = new_vocab_size - original_vocab_size
    # Guard against an empty original tokenizer so reporting cannot crash.
    if original_vocab_size:
        ratio_text = f"{new_vocab_size / original_vocab_size:.2f}x"
    else:
        ratio_text = "n/a"

    logger.info("=" * 40)
    logger.info("VOCABULARY ANALYSIS")
    logger.info("=" * 40)
    logger.info(f"Original vocabulary size: {original_vocab_size:,}")
    logger.info(f"New vocabulary size: {new_vocab_size:,}")
    logger.info(f"Tokens to add: {tokens_to_add:,}")
    logger.info(f"Expansion ratio: {ratio_text}")

    if tokens_to_add <= 0:
        logger.warning("New vocabulary is not larger than original. No expansion needed.")
        logger.info("Saving model with new tokenizer anyway...")
    else:
        logger.info(f"Will expand model by {tokens_to_add:,} tokens")

    # Step 4: Get model's current embedding dimensions
    embed_layer, lm_head = _locate_vocab_layers(original_model)
    original_embed_size = embed_layer.weight.shape[0]
    embed_dim = embed_layer.weight.shape[1]

    logger.info(f"Current embedding matrix: {original_embed_size} x {embed_dim}")
    logger.info(f"Current LM head: {_describe_shape(lm_head)}")

    # Step 5: Resize model embeddings using HuggingFace's built-in method
    logger.info("=" * 40)
    logger.info("RESIZING MODEL EMBEDDINGS")
    logger.info("=" * 40)

    # Decide on the real matrix size, not the tokenizer length: the matrix may
    # be padded beyond the tokenizer, and resizing to a smaller value would
    # truncate trained rows.
    if new_vocab_size > original_embed_size:
        # resize_token_embeddings() builds a larger embedding matrix, copies
        # the existing rows into it and updates the output head accordingly.
        # How the *new* rows are initialized depends on the installed
        # transformers version.
        logger.info("Calling model.resize_token_embeddings()...")
        try:
            original_model.resize_token_embeddings(new_vocab_size)
        except Exception as e:
            logger.error(f"Failed to resize embeddings: {e}")
            raise
        logger.info("✓ Model embeddings resized successfully")

        # Re-fetch the layers: resizing may have replaced the module objects.
        new_embed_layer, new_lm_head = _locate_vocab_layers(original_model)
        final_embed_size = new_embed_layer.weight.shape[0]

        logger.info(f"New embedding matrix: {final_embed_size} x {embed_dim}")
        logger.info(f"New LM head: {_describe_shape(new_lm_head)}")

        # Verify the sizes match expectations
        if final_embed_size != new_vocab_size:
            logger.error(f"Resize verification failed: expected {new_vocab_size}, got {final_embed_size}")
            raise ValueError("Embedding resize verification failed")
        logger.info("✓ Embedding resize verification passed")
    else:
        final_embed_size = original_embed_size
        logger.warning(
            f"Embedding matrix already has {original_embed_size:,} rows, which covers the "
            f"new vocabulary of {new_vocab_size:,} tokens. Skipping resize so no existing "
            "rows are discarded."
        )

    # Step 6: Update model config
    logger.info("Updating model configuration...")
    # Keep the config in sync with the actual number of embedding rows.
    original_model.config.vocab_size = final_embed_size
    logger.info(f"✓ Model config updated: vocab_size = {final_embed_size}")

    # Step 7: Save everything
    logger.info("=" * 40)
    logger.info("SAVING EXPANDED MODEL")
    logger.info("=" * 40)

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f"Output directory: {output_dir}")

    try:
        # Save the resized model
        logger.info("Saving expanded model...")
        original_model.save_pretrained(output_dir)
        logger.info("✓ Model saved successfully")

        # Save the new tokenizer (CRITICAL!)
        logger.info("Saving new tokenizer...")
        new_tokenizer.save_pretrained(output_dir)
        logger.info("✓ Tokenizer saved successfully")

        # Save a summary file
        summary_path = os.path.join(output_dir, "vocab_expansion_summary.txt")
        summary_lines = [
            "Vocabulary Expansion Summary\n",
            "=" * 30 + "\n",
            f"Original model: {model_repo_id}\n",
            f"New tokenizer source: {new_tokenizer_repo_id}\n",
            f"Original vocab size: {original_vocab_size:,}\n",
            f"New vocab size: {new_vocab_size:,}\n",
            f"Tokens added: {tokens_to_add:,}\n",
            f"Expansion ratio: {ratio_text}\n",
            f"Embedding rows: {original_embed_size:,} -> {final_embed_size:,}\n",
            f"Output directory: {output_dir}\n",
        ]
        # Explicit encoding: repo IDs and paths must not depend on the locale.
        with open(summary_path, "w", encoding="utf-8") as f:
            f.writelines(summary_lines)
        logger.info(f"✓ Summary saved to: {summary_path}")
    except Exception as e:
        logger.error(f"Failed to save model: {e}")
        raise

    # Step 8: Final verification and success message
    logger.info("=" * 60)
    logger.info("VOCABULARY EXPANSION COMPLETED SUCCESSFULLY!")
    logger.info("=" * 60)
    logger.info(f"✓ Original vocabulary: {original_vocab_size:,} tokens")
    logger.info(f"✓ Expanded vocabulary: {new_vocab_size:,} tokens")
    logger.info(f"✓ Added tokens: {tokens_to_add:,}")
    logger.info(f"✓ Model saved to: {output_dir}")
    logger.info("")
    logger.info("The expanded model is now ready for:")
    logger.info("  • Knowledge distillation from teacher models")
    logger.info("  • Fine-tuning with the new vocabulary")
    logger.info("  • Direct inference with the new tokenizer")
    logger.info("")
    logger.info("Next steps:")
    logger.info("  1. Use this model as the student in distillation")
    logger.info(f"  2. Use tokenizer from: {new_tokenizer_repo_id}")
    logger.info("  3. The model will now understand the teacher's full vocabulary")


def main() -> int:
    """Parse command-line arguments and run the vocabulary expansion.

    Returns:
        Process exit status: 0 on success, 1 if the expansion raised.
    """
    parser = argparse.ArgumentParser(
        description="Expand a model's vocabulary to match a larger tokenizer for distillation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Expand vocabulary to match Qwen2 tokenizer
  python expand_vocab.py \\
    --model_repo_id "shivash/MyAwesome-299M-Model" \\
    --new_tokenizer_repo_id "Qwen/Qwen2-1.5B" \\
    --output_dir "./MyAwesome-299M-Model-Qwen-Vocab"

  # Expand vocabulary to match Llama 3 tokenizer
  python expand_vocab.py \\
    --model_repo_id "shivash/MyAwesome-299M-Model" \\
    --new_tokenizer_repo_id "meta-llama/Meta-Llama-3-8B" \\
    --output_dir "./MyAwesome-299M-Model-Llama3-Vocab"
        """
    )

    parser.add_argument(
        "--model_repo_id",
        type=str,
        required=True,
        help="HuggingFace repository ID of the student model to expand (e.g., 'shivash/MyAwesome-299M-Model')"
    )

    parser.add_argument(
        "--new_tokenizer_repo_id",
        type=str,
        required=True,
        help="HuggingFace repository ID of the teacher model whose tokenizer to adopt (e.g., 'Qwen/Qwen2-1.5B')"
    )

    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Local directory where the expanded model will be saved"
    )

    args = parser.parse_args()

    # Top-level boundary: report any failure and turn it into an exit status.
    try:
        expand_model_vocabulary(
            model_repo_id=args.model_repo_id,
            new_tokenizer_repo_id=args.new_tokenizer_repo_id,
            output_dir=args.output_dir
        )
        return 0
    except Exception as e:
        logger.error(f"Vocabulary expansion failed: {e}")
        return 1


if __name__ == "__main__":
    # sys.exit rather than the site-provided exit(), which is not guaranteed
    # to exist (e.g. under `python -S`).
    sys.exit(main())

#
# EXAMPLE USAGE:
#
# python expand_vocab.py \
#   --model_repo_id "shivash/MyAwesome-299M-Model" \
#   --new_tokenizer_repo_id "Qwen/Qwen2-1.5B" \
#   --output_dir "./MyAwesome-299M-Model-Qwen-Vocab"
#
# python expand_vocab.py \
#   --model_repo_id "shivash/MyAwesome-299M-Model" \
#   --new_tokenizer_repo_id "meta-llama/Meta-Llama-3-8B" \
#   --output_dir "./MyAwesome-299M-Model-Llama3-Vocab"
#