#!/usr/bin/env python3
"""
Train a Hugging Face native tokenizer (BPE + Metaspace) for Sanskrit-English.

Compatible with Axolotl. Uses ▁ internally for word boundaries but hides it
on decode.
"""

import json
import os
from typing import Iterable, List

from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders
from transformers import PreTrainedTokenizerFast

# Special tokens. Their position in SPECIAL_TOKENS is the id the BPE trainer
# is expected to assign (special tokens are placed first in the vocabulary),
# so the order below must not change: pad=0, im_start=1, im_end=2, unk=3.
# NOTE(review): the pad/unk literals were empty strings in the previous
# revision (which made the two tokens collide); "<pad>"/"<unk>" are the
# assumed intended names -- confirm against any already-published tokenizer.
PAD_TOKEN = "<pad>"
IM_START_TOKEN = "<|im_start|>"
IM_END_TOKEN = "<|im_end|>"
UNK_TOKEN = "<unk>"
SPECIAL_TOKENS = [PAD_TOKEN, IM_START_TOKEN, IM_END_TOKEN, UNK_TOKEN]

DEFAULT_VOCAB_SIZE = 120000
MAX_CONTEXT_LENGTH = 131072


def prepare_bilingual_corpus() -> List[str]:
    """Load the Sanskrit and English datasets and return them as one list.

    The full Sanskrit shlok collection is used together with the first
    100000 TinyStories samples. The two lists are simply concatenated
    (Sanskrit first); no resampling is performed.

    Returns:
        The ``"text"`` field of every loaded example.
    """
    print("📚 Loading datasets...")

    sanskrit_dataset = load_dataset(
        "diabolic6045/Sanskrit-shlok-collection", split="train"
    )
    sanskrit_texts = [item["text"] for item in sanskrit_dataset]

    english_dataset = load_dataset("roneneldan/TinyStories", split="train[:100000]")
    english_texts = [item["text"] for item in english_dataset]

    print(f"✅ Loaded {len(sanskrit_texts)} Sanskrit texts")
    print(f"✅ Loaded {len(english_texts)} English texts")

    balanced_texts = sanskrit_texts + english_texts
    print(f"✅ Total balanced corpus: {len(balanced_texts)} texts")
    return balanced_texts


def train_native_hf_tokenizer(
    texts: Iterable[str],
    output_dir: str = "native_hf_tokenizer",
    vocab_size: int = DEFAULT_VOCAB_SIZE,
) -> PreTrainedTokenizerFast:
    """Train a Hugging Face BPE tokenizer with Metaspace pre-tokenizer.

    Trains on ``texts``, wraps the result in a ``PreTrainedTokenizerFast``,
    and writes both the tokenizer files and a Qwen2-style ``config.json``
    into ``output_dir`` (created if missing).

    Args:
        texts: Training corpus, one string per example.
        output_dir: Directory that receives the tokenizer files and config.
        vocab_size: Target vocabulary size passed to the BPE trainer and
            recorded in ``config.json``.

    Returns:
        The wrapped, trained tokenizer.

    Raises:
        RuntimeError: If the trained vocabulary did not assign the special
            tokens the ids listed in ``SPECIAL_TOKENS``.
    """
    print("🤖 Training native Hugging Face tokenizer...")
    os.makedirs(output_dir, exist_ok=True)

    # Give the model an explicit unknown token so characters missing from the
    # learned alphabet map to it.
    tokenizer = Tokenizer(models.BPE(unk_token=UNK_TOKEN))

    # Pre-tokenizer: SentencePiece-style word boundary marker
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement="▁", prepend_scheme="always"
    )

    # Post-processor: pass-through templates. No special tokens are inserted
    # automatically; the chat markers are only registered here with their ids.
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A",
        pair="$A $B:1",
        special_tokens=[
            (token, token_id) for token_id, token in enumerate(SPECIAL_TOKENS)
        ],
    )

    # Decoder: strips ▁ back to normal spaces
    tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always")

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=list(SPECIAL_TOKENS),
        continuing_subword_prefix="",
        end_of_word_suffix="",
        show_progress=True,
    )

    print("🔥 Training tokenizer on bilingual corpus...")
    tokenizer.train_from_iterator(texts, trainer=trainer)

    # The post-processor above was configured with ids before training; make
    # sure the trained vocabulary actually agrees with them.
    for expected_id, token in enumerate(SPECIAL_TOKENS):
        actual_id = tokenizer.token_to_id(token)
        if actual_id != expected_id:
            raise RuntimeError(
                f"Special token {token!r} got id {actual_id}, expected {expected_id}"
            )

    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token=UNK_TOKEN,
        eos_token=IM_END_TOKEN,
        pad_token=PAD_TOKEN,
        model_max_length=MAX_CONTEXT_LENGTH,
    )
    wrapped_tokenizer.clean_up_tokenization_spaces = True
    wrapped_tokenizer.save_pretrained(output_dir)

    # Model config written next to the tokenizer. Token ids are read back
    # from the tokenizer so the two files cannot disagree.
    config = {
        "model_type": "qwen2",
        "architectures": ["Qwen2ForCausalLM"],
        "vocab_size": vocab_size,
        "hidden_size": 3584,
        "intermediate_size": 8960,
        "num_hidden_layers": 28,
        "num_attention_heads": 28,
        "num_key_value_heads": 2,
        "hidden_act": "silu",
        "max_position_embeddings": MAX_CONTEXT_LENGTH,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-06,
        "use_cache": True,
        "tie_word_embeddings": False,
        "rope_theta": 1000000.0,
        "attention_dropout": 0.0,
        "eos_token_id": wrapped_tokenizer.eos_token_id,
        "pad_token_id": wrapped_tokenizer.pad_token_id,
        "unk_token_id": wrapped_tokenizer.unk_token_id,
    }

    config_path = os.path.join(output_dir, "config.json")
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print(f"✅ Tokenizer + config saved to: {output_dir}")
    return wrapped_tokenizer


def test_tokenizer(tokenizer: PreTrainedTokenizerFast) -> None:
    """Quick sanity check: encode a Sanskrit sample and decode it again.

    Prints the tokens, ids and decoded text, then reports whether the decoded
    text matches the input once all spaces are removed from both.
    """
    print("\n🧪 Testing tokenizer...")
    test_text = "हरे कृष्ण हरे कृष्ण कृष्ण कृष्ण हरे हरे"

    tokens = tokenizer.tokenize(test_text)
    ids = tokenizer.encode(test_text)
    decoded = tokenizer.decode(ids)

    print(f"Input: '{test_text}'")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {ids}")
    print(f"Decoded: '{decoded}'")

    # Spaces are ignored on purpose: only the characters themselves are
    # compared, so spacing differences do not fail the check.
    if decoded.replace(" ", "") == test_text.replace(" ", ""):
        print("✅ Clean decode, no ▁ issues!")
    else:
        print("❌ Something is off with decoding!")


def main() -> None:
    """Build the corpus, train and save the tokenizer, then sanity-check it."""
    print("🌟 Training Native Hugging Face Tokenizer 🌟")
    texts = prepare_bilingual_corpus()
    tokenizer = train_native_hf_tokenizer(texts)
    test_tokenizer(tokenizer)
    print("\n🎯 Done! Use `tokenizer_config: ./native_hf_tokenizer` in Axolotl.")


if __name__ == "__main__":
    main()