Upload Thai tokenizer v1.0.0

Browse files

Files changed (4) hide show

README.md +62 -0
special_tokens_map.json +6 -0
tokenizer_config.json +12 -0
vocab.json +114 -0

README.md ADDED Viewed

	@@ -0,0 +1,62 @@

+---
+language: th
+tags:
+- thai
+- tokenizer
+- nlp
+- text-processing
+license: mit
+---
+# ZombitX64 Thai Tokenizer
+A simple Thai language tokenizer that properly handles newlines and Thai text segmentation.
+## Features
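+- **Newline Preservation**: Correctly handles and preserves newlines in tokenized text; `\n`, `\r\n`, and `\r` each have a dedicated vocabulary entry
+- **Thai Character Support**: Recognizes and processes Thai Unicode characters; the vocabulary covers Thai consonants, vowels, tone marks, and Thai digits (๐–๙) as individual character tokens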
+- **Hugging Face Compatible**: Works with transformers library
+- **Simple API**: Easy to use tokenize and detokenize methods
+## Usage
+```python
+from transformers import AutoTokenizer
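+# Load the tokenizer
+# NOTE: AutoTokenizer picks the tokenizer implementation from the "tokenizer_class"
+# key in tokenizer_config.json (or from the model's config.json). The
+# tokenizer_config.json shipped with v1.0.0 does not set that key, so this call
+# may fail until it is added.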
+tokenizer = AutoTokenizer.from_pretrained("ZombitX64/zombitx64-thaitokenizer")
+# Tokenize text
+text = "สวัสดีครับ\nนี่คือตัวอย่าง"
+tokens = tokenizer.tokenize(text)
+print(tokens)
+# Encode to IDs
+token_ids = tokenizer.encode(text)
+print(token_ids)
+# Decode back
+decoded = tokenizer.decode(token_ids)
+print(decoded)
+```
+## Model Details
+- **Model Type**: Thai Tokenizer
+- **Language**: Thai (th)
+- **Vocab Size**: 112
+- **Max Length**: 512
+## Training Data
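+This tokenizer was built from basic Thai character sets and common patterns rather than learned from a text corpus: its 112-entry vocabulary consists of the four special tokens, newline and whitespace variants, Thai consonants, vowels, tone marks and digits, and common ASCII punctuation.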
+## Limitations
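+- Basic Thai word segmentation (can be improved with PyThaiNLP)
+- Simple vocabulary (expandable for specific use cases): it has no entries for Latin letters, ASCII digits (0–9), or the Thai signs ฯ, ๏, ๚, and ๛
+- Vocabulary IDs 68–71 are assigned to U+0E3B–U+0E3E, which are unassigned code points in the Unicode Thai block and do not occur in valid Thai text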
+## Contact
+For questions or issues, please visit the [GitHub repository](https://github.com/ZombitX64/ZombitX64-Thaitokenizer).

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "pad_token": "[PAD]",
+  "unk_token": "[UNK]",
+  "bos_token": "[BOS]",
+  "eos_token": "[EOS]"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "name": "ZombitX64-Thaitokenizer",
+  "version": "1.0.0",
+  "description": "Thai language tokenizer with newline preservation",
+  "vocab_size": 112,
+  "max_length": 512,
+  "pad_token": "[PAD]",
+  "unk_token": "[UNK]",
+  "bos_token": "[BOS]",
+  "eos_token": "[EOS]",
+  "model_type": "thai_tokenizer"
+}

vocab.json ADDED Viewed

	@@ -0,0 +1,114 @@

+{
+  "[PAD]": 0,
+  "[UNK]": 1,
+  "[BOS]": 2,
+  "[EOS]": 3,
+  "\n": 4,
+  "\r\n": 5,
+  "\r": 6,
+  " ": 7,
+  "\t": 8,
+  "  ": 9,
+  "   ": 10,
+  "ก": 11,
+  "ข": 12,
+  "ฃ": 13,
+  "ค": 14,
+  "ฅ": 15,
+  "ฆ": 16,
+  "ง": 17,
+  "จ": 18,
+  "ฉ": 19,
+  "ช": 20,
+  "ซ": 21,
+  "ฌ": 22,
+  "ญ": 23,
+  "ฎ": 24,
+  "ฏ": 25,
+  "ฐ": 26,
+  "ฑ": 27,
+  "ฒ": 28,
+  "ณ": 29,
+  "ด": 30,
+  "ต": 31,
+  "ถ": 32,
+  "ท": 33,
+  "ธ": 34,
+  "น": 35,
+  "บ": 36,
+  "ป": 37,
+  "ผ": 38,
+  "ฝ": 39,
+  "พ": 40,
+  "ฟ": 41,
+  "ภ": 42,
+  "ม": 43,
+  "ย": 44,
+  "ร": 45,
+  "ฤ": 46,
+  "ล": 47,
+  "ฦ": 48,
+  "ว": 49,
+  "ศ": 50,
+  "ษ": 51,
+  "ส": 52,
+  "ห": 53,
+  "ฬ": 54,
+  "อ": 55,
+  "ฮ": 56,
+  "ะ": 57,
+  "ั": 58,
+  "า": 59,
+  "ำ": 60,
+  "ิ": 61,
+  "ี": 62,
+  "ึ": 63,
+  "ื": 64,
+  "ุ": 65,
+  "ู": 66,
+  "ฺ": 67,
+  "฻": 68,
+  "฼": 69,
+  "฽": 70,
+  "฾": 71,
+  "฿": 72,
+  "เ": 73,
+  "แ": 74,
+  "โ": 75,
+  "ใ": 76,
+  "ไ": 77,
+  "ๅ": 78,
+  "ๆ": 79,
+  "็": 80,
+  "่": 81,
+  "้": 82,
+  "๊": 83,
+  "๋": 84,
+  "์": 85,
+  "ํ": 86,
+  "๎": 87,
+  "๐": 88,
+  "๑": 89,
+  "๒": 90,
+  "๓": 91,
+  "๔": 92,
+  "๕": 93,
+  "๖": 94,
+  "๗": 95,
+  "๘": 96,
+  "๙": 97,
+  ".": 98,
+  ",": 99,
+  "!": 100,
+  "?": 101,
+  ";": 102,
+  ":": 103,
+  "\"": 104,
+  "'": 105,
+  "(": 106,
+  ")": 107,
+  "[": 108,
+  "]": 109,
+  "{": 110,
+  "}": 111
+}