| |
| """ |
| Train a Hugging Face native tokenizer (BPE + Metaspace) for Sanskrit-English. |
| Compatible with Axolotl. Uses ▁ internally for word boundaries but hides it on decode. |
| """ |
|
|
| import os |
| import json |
| from datasets import load_dataset |
| from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders |
| from transformers import PreTrainedTokenizerFast |
|
|
def prepare_bilingual_corpus():
    """Load the Sanskrit and English corpora and return them as one list.

    Reads the ``text`` field of every record in the Sanskrit shloka
    collection (full ``train`` split) and in the first 100000 records of
    TinyStories, then concatenates them with the Sanskrit texts first.

    Returns:
        A list of the ``text`` values, Sanskrit followed by English.
    """

    def _text_column(dataset_name, split):
        # Keep only the "text" field of each record in the requested split.
        records = load_dataset(dataset_name, split=split)
        return [record["text"] for record in records]

    print("📚 Loading datasets...")

    sanskrit_texts = _text_column("diabolic6045/Sanskrit-shlok-collection", "train")
    english_texts = _text_column("roneneldan/TinyStories", "train[:100000]")

    print(f"✅ Loaded {len(sanskrit_texts)} Sanskrit texts")
    print(f"✅ Loaded {len(english_texts)} English texts")

    # Plain concatenation: no resampling or shuffling is applied here.
    balanced_texts = [*sanskrit_texts, *english_texts]
    print(f"✅ Total balanced corpus: {len(balanced_texts)} texts")

    return balanced_texts
|
|
def train_native_hf_tokenizer(texts, output_dir="native_hf_tokenizer", *, vocab_size=120000):
    """Train a Hugging Face BPE tokenizer with a Metaspace pre-tokenizer.

    Trains on ``texts``, wraps the result in a ``PreTrainedTokenizerFast``,
    saves it to ``output_dir`` and writes a Qwen2-style ``config.json``
    alongside it.

    Args:
        texts: Iterable of raw training strings.
        output_dir: Directory that receives the tokenizer files and
            ``config.json``; created if it does not exist.
        vocab_size: Target vocabulary size handed to the BPE trainer. The
            ``vocab_size`` written to ``config.json`` is never smaller than
            this value or than the trained tokenizer's real size.

    Returns:
        The trained tokenizer wrapped as ``PreTrainedTokenizerFast``.
    """
    print("🤖 Training native Hugging Face tokenizer...")
    os.makedirs(output_dir, exist_ok=True)

    # The trainer hands out IDs to special tokens in list order, so this
    # order yields <pad>=0, <|im_start|>=1, <|im_end|>=2, <unk>=3 -- the IDs
    # the post-processor and config.json advertise. (The previous order put
    # <unk> at 0 and <pad> at 3, contradicting both.)
    special_tokens = ["<pad>", "<|im_start|>", "<|im_end|>", "<unk>"]

    # Give the model an explicit unknown token so unseen symbols map to <unk>.
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

    # "▁" marks word boundaries internally; the matching decoder turns it
    # back into spaces so it never shows up in decoded text.
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement="▁", prepend_scheme="always"
    )
    tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always")

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=special_tokens,
        continuing_subword_prefix="",
        end_of_word_suffix="",
        show_progress=True,
    )

    print("🔥 Training tokenizer on bilingual corpus...")
    tokenizer.train_from_iterator(texts, trainer=trainer)

    # Read the IDs back from the trained tokenizer instead of hard-coding
    # them, so every consumer below agrees with the actual vocabulary.
    token_ids = {token: tokenizer.token_to_id(token) for token in special_tokens}

    # The templates insert no special tokens; they only assign type id 1 to
    # the second sequence of a pair.
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A",
        pair="$A $B:1",
        special_tokens=list(token_ids.items()),
    )

    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="<unk>",
        eos_token="<|im_end|>",
        pad_token="<pad>",
        model_max_length=131072,
    )
    wrapped_tokenizer.clean_up_tokenization_spaces = True

    wrapped_tokenizer.save_pretrained(output_dir)

    config = {
        "model_type": "qwen2",
        "architectures": ["Qwen2ForCausalLM"],
        # Must cover every token id the tokenizer can emit.
        "vocab_size": max(vocab_size, len(wrapped_tokenizer)),
        "hidden_size": 3584,
        "intermediate_size": 8960,
        "num_hidden_layers": 28,
        "num_attention_heads": 28,
        "num_key_value_heads": 2,
        "hidden_act": "silu",
        "max_position_embeddings": 131072,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-06,
        "use_cache": True,
        "tie_word_embeddings": False,
        "rope_theta": 1000000.0,
        "attention_dropout": 0.0,
        "eos_token_id": token_ids["<|im_end|>"],
        "pad_token_id": token_ids["<pad>"],
        "unk_token_id": token_ids["<unk>"],
    }

    with open(os.path.join(output_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print(f"✅ Tokenizer + config saved to: {output_dir}")
    return wrapped_tokenizer
|
|
def test_tokenizer(tokenizer):
    """Round-trip a sample Sanskrit line through ``tokenizer`` and report it.

    Prints the tokens, the token IDs and the decoded text, then compares the
    decoded text with the input after removing spaces from both sides.

    Args:
        tokenizer: Object exposing ``tokenize``, ``encode`` and ``decode``.
    """
    print("\n🧪 Testing tokenizer...")
    sample = "हरे कृष्ण हरे कृष्ण कृष्ण कृष्ण हरे हरे"

    pieces = tokenizer.tokenize(sample)
    token_ids = tokenizer.encode(sample)
    round_trip = tokenizer.decode(token_ids)

    print(f"Input: '{sample}'")
    print(f"Tokens: {pieces}")
    print(f"Token IDs: {token_ids}")
    print(f"Decoded: '{round_trip}'")

    # Spaces are ignored on both sides; a stray "▁" would still cause a mismatch.
    round_trip_ok = round_trip.replace(" ", "") == sample.replace(" ", "")
    if not round_trip_ok:
        print("❌ Something is off with decoding!")
        return
    print("✅ Clean decode, no ▁ issues!")
|
|
def main():
    """Run the whole pipeline: build the corpus, train, then sanity-check."""
    print("🌟 Training Native Hugging Face Tokenizer 🌟")
    # Evaluated inside-out: corpus first, then training, then the check.
    test_tokenizer(train_native_hf_tokenizer(prepare_bilingual_corpus()))
    print("\n🎯 Done! Use `tokenizer_config: ./native_hf_tokenizer` in Axolotl.")
|
|
# Script entry point: run the pipeline only on direct execution, not on import.
if __name__ == "__main__":
    main()
|
|