# Sanskrit-English-qwen2-tokenizer / train_tokenizer.py
# Avinyaa
# updated tokenizer
# 81f3642
#!/usr/bin/env python3
"""
Train a Hugging Face native tokenizer (BPE + Metaspace) for Sanskrit-English.
Compatible with Axolotl. Uses ▁ internally for word boundaries but hides it on decode.
"""
import os
import json
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders
from transformers import PreTrainedTokenizerFast
def prepare_bilingual_corpus(english_samples=100000):
    """Load the Sanskrit and English training texts and return them as one list.

    The Sanskrit texts come from the full ``train`` split of
    ``diabolic6045/Sanskrit-shlok-collection``; the English texts are the
    first ``english_samples`` rows of ``roneneldan/TinyStories``.

    Note that the two corpora are simply concatenated (Sanskrit first, then
    English); no re-sampling or re-weighting is applied even though the log
    message calls the result "balanced".

    Args:
        english_samples: Number of leading TinyStories rows to include.

    Returns:
        A list containing every Sanskrit text followed by every English text.
    """
    print("📚 Loading datasets...")

    sanskrit_dataset = load_dataset(
        "diabolic6045/Sanskrit-shlok-collection", split="train"
    )
    # Column access pulls the whole "text" column at once instead of
    # materialising one dict per row just to read a single field.
    sanskrit_texts = list(sanskrit_dataset["text"])

    english_dataset = load_dataset(
        "roneneldan/TinyStories", split=f"train[:{english_samples}]"
    )
    english_texts = list(english_dataset["text"])

    print(f"✅ Loaded {len(sanskrit_texts)} Sanskrit texts")
    print(f"✅ Loaded {len(english_texts)} English texts")

    balanced_texts = sanskrit_texts + english_texts
    print(f"✅ Total balanced corpus: {len(balanced_texts)} texts")
    return balanced_texts
def train_native_hf_tokenizer(texts, output_dir="native_hf_tokenizer", vocab_size=120000):
    """Train a BPE + Metaspace tokenizer and save it next to a config.json.

    Args:
        texts: Iterable of training strings handed to the BPE trainer.
        output_dir: Directory that receives the tokenizer files and
            ``config.json``; it is created when missing.
        vocab_size: Upper bound on the vocabulary size given to the trainer.
            The value written to ``config.json`` is the size actually reached.

    Returns:
        The trained tokenizer wrapped in ``PreTrainedTokenizerFast``.

    Raises:
        RuntimeError: If a special token is absent from the trained vocabulary.
    """
    print("🤖 Training native Hugging Face tokenizer...")
    os.makedirs(output_dir, exist_ok=True)

    pad_token = "<pad>"
    eos_token = "<|im_end|>"
    unk_token = "<unk>"
    # The trainer assigns IDs to special tokens in list order, so this order
    # yields <pad>=0, <|im_start|>=1, <|im_end|>=2, <unk>=3.
    special_tokens = [pad_token, "<|im_start|>", eos_token, unk_token]

    # Giving the model an unk_token lets unseen symbols map to <unk>.
    tokenizer = Tokenizer(models.BPE(unk_token=unk_token))

    # Pre-tokenizer: SentencePiece-style word boundary marker
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement="▁", prepend_scheme="always"
    )
    # Decoder: strips ▁ back to normal spaces
    tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always")

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=special_tokens,
        continuing_subword_prefix="",
        end_of_word_suffix="",
        show_progress=True,
    )

    print("🔥 Training tokenizer on bilingual corpus...")
    tokenizer.train_from_iterator(texts, trainer=trainer)

    # Read the IDs back from the trained vocabulary so the post-processor and
    # config.json can never disagree with the tokenizer itself.
    token_ids = {token: tokenizer.token_to_id(token) for token in special_tokens}
    missing = [token for token, token_id in token_ids.items() if token_id is None]
    if missing:
        raise RuntimeError(f"Special tokens missing from trained vocabulary: {missing}")

    # Post-processor: the templates contain only $A / $B, so no special token
    # is inserted during encoding; the tokens are registered with their real IDs.
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A",
        pair="$A $B:1",
        special_tokens=list(token_ids.items()),
    )

    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token=unk_token,
        eos_token=eos_token,
        pad_token=pad_token,
        model_max_length=131072,
    )
    wrapped_tokenizer.clean_up_tokenization_spaces = True
    wrapped_tokenizer.save_pretrained(output_dir)

    # NOTE(review): the architecture hyper-parameters below are carried over
    # unchanged; confirm they match the model this tokenizer is paired with.
    config = {
        "model_type": "qwen2",
        "architectures": ["Qwen2ForCausalLM"],
        # BPE training can stop below the requested size, so record the
        # vocabulary size that was actually produced.
        "vocab_size": tokenizer.get_vocab_size(),
        "hidden_size": 3584,
        "intermediate_size": 8960,
        "num_hidden_layers": 28,
        "num_attention_heads": 28,
        "num_key_value_heads": 2,
        "hidden_act": "silu",
        "max_position_embeddings": 131072,
        "initializer_range": 0.02,
        "rms_norm_eps": 1e-06,
        "use_cache": True,
        "tie_word_embeddings": False,
        "rope_theta": 1000000.0,
        "attention_dropout": 0.0,
        "eos_token_id": token_ids[eos_token],
        "pad_token_id": token_ids[pad_token],
        "unk_token_id": token_ids[unk_token],
    }
    with open(os.path.join(output_dir, "config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print(f"✅ Tokenizer + config saved to: {output_dir}")
    return wrapped_tokenizer
def test_tokenizer(tokenizer):
    """Round-trip a Sanskrit sample through the tokenizer and report the result.

    Args:
        tokenizer: Object exposing ``tokenize(text)``, ``encode(text)`` and
            ``decode(ids)``.

    Returns:
        True when the decoded text equals the input, ignoring only leading and
        trailing whitespace; False otherwise.
    """
    print("\n🧪 Testing tokenizer...")
    test_text = "हरे कृष्ण हरे कृष्ण कृष्ण कृष्ण हरे हरे"

    tokens = tokenizer.tokenize(test_text)
    ids = tokenizer.encode(test_text)
    decoded = tokenizer.decode(ids)

    print(f"Input: '{test_text}'")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {ids}")
    print(f"Decoded: '{decoded}'")

    # Interior spaces are compared as-is: removing them from both sides would
    # hide lost or duplicated word boundaries. A leaked ▁ marker also fails
    # this comparison because the sample text contains none.
    round_trip_ok = decoded.strip() == test_text.strip()
    if round_trip_ok:
        print("✅ Clean decode, no ▁ issues!")
    else:
        print("❌ Something is off with decoding!")
    return round_trip_ok


# This is a script helper that needs a tokenizer argument, not a pytest test
# case; opt out of pytest's name-based collection.
test_tokenizer.__test__ = False
def main():
    """Run the pipeline end to end: build corpus, train tokenizer, sanity-check it."""
    print("🌟 Training Native Hugging Face Tokenizer 🌟")
    # The corpus is built first, then passed straight into training.
    trained = train_native_hf_tokenizer(prepare_bilingual_corpus())
    test_tokenizer(trained)
    print("\n🎯 Done! Use `tokenizer_config: ./native_hf_tokenizer` in Axolotl.")


# Only run the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()