CompactAIModelRunner / tokenizer_glint.json
CompactAI's picture
Upload 3 files
1895b22 verified
{
"vocab_size": 500,
"hf_tokenizer": {
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<PAD>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<BOS>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "<EOS>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "<UNK>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": null,
"decoder": {
"type": "ByteLevel",
"add_prefix_space": true,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<UNK>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<PAD>": 0,
"<BOS>": 1,
"<EOS>": 2,
"<UNK>": 3,
"!": 4,
"\"": 5,
"#": 6,
"$": 7,
"%": 8,
"&": 9,
"'": 10,
"(": 11,
")": 12,
"*": 13,
"+": 14,
",": 15,
"-": 16,
".": 17,
"/": 18,
"0": 19,
"1": 20,
"2": 21,
"3": 22,
"4": 23,
"5": 24,
"6": 25,
"7": 26,
"8": 27,
"9": 28,
":": 29,
";": 30,
"<": 31,
"=": 32,
">": 33,
"?": 34,
"@": 35,
"A": 36,
"B": 37,
"C": 38,
"D": 39,
"E": 40,
"F": 41,
"G": 42,
"H": 43,
"I": 44,
"J": 45,
"K": 46,
"L": 47,
"M": 48,
"N": 49,
"O": 50,
"P": 51,
"Q": 52,
"R": 53,
"S": 54,
"T": 55,
"U": 56,
"V": 57,
"W": 58,
"X": 59,
"Y": 60,
"Z": 61,
"[": 62,
"\\": 63,
"]": 64,
"^": 65,
"_": 66,
"`": 67,
"a": 68,
"b": 69,
"c": 70,
"d": 71,
"e": 72,
"f": 73,
"g": 74,
"h": 75,
"i": 76,
"j": 77,
"k": 78,
"l": 79,
"m": 80,
"n": 81,
"o": 82,
"p": 83,
"q": 84,
"r": 85,
"s": 86,
"t": 87,
"u": 88,
"v": 89,
"w": 90,
"x": 91,
"y": 92,
"z": 93,
"{": 94,
"|": 95,
"}": 96,
"~": 97,
"¡": 98,
"¢": 99,
"£": 100,
"¤": 101,
"¥": 102,
"¦": 103,
"§": 104,
"¨": 105,
"©": 106,
"ª": 107,
"«": 108,
"¬": 109,
"®": 110,
"¯": 111,
"°": 112,
"±": 113,
"²": 114,
"³": 115,
"´": 116,
"µ": 117,
"¶": 118,
"·": 119,
"¸": 120,
"¹": 121,
"º": 122,
"»": 123,
"¼": 124,
"½": 125,
"¾": 126,
"¿": 127,
"À": 128,
"Á": 129,
"Â": 130,
"Ã": 131,
"Ä": 132,
"Å": 133,
"Æ": 134,
"Ç": 135,
"È": 136,
"É": 137,
"Ê": 138,
"Ë": 139,
"Ì": 140,
"Í": 141,
"Î": 142,
"Ï": 143,
"Ð": 144,
"Ñ": 145,
"Ò": 146,
"Ó": 147,
"Ô": 148,
"Õ": 149,
"Ö": 150,
"×": 151,
"Ø": 152,
"Ù": 153,
"Ú": 154,
"Û": 155,
"Ü": 156,
"Ý": 157,
"Þ": 158,
"ß": 159,
"à": 160,
"á": 161,
"â": 162,
"ã": 163,
"ä": 164,
"å": 165,
"æ": 166,
"ç": 167,
"è": 168,
"é": 169,
"ê": 170,
"ë": 171,
"ì": 172,
"í": 173,
"î": 174,
"ï": 175,
"ð": 176,
"ñ": 177,
"ò": 178,
"ó": 179,
"ô": 180,
"õ": 181,
"ö": 182,
"÷": 183,
"ø": 184,
"ù": 185,
"ú": 186,
"û": 187,
"ü": 188,
"ý": 189,
"þ": 190,
"ÿ": 191,
"Ā": 192,
"ā": 193,
"Ă": 194,
"ă": 195,
"Ą": 196,
"ą": 197,
"Ć": 198,
"ć": 199,
"Ĉ": 200,
"ĉ": 201,
"Ċ": 202,
"ċ": 203,
"Č": 204,
"č": 205,
"Ď": 206,
"ď": 207,
"Đ": 208,
"đ": 209,
"Ē": 210,
"ē": 211,
"Ĕ": 212,
"ĕ": 213,
"Ė": 214,
"ė": 215,
"Ę": 216,
"ę": 217,
"Ě": 218,
"ě": 219,
"Ĝ": 220,
"ĝ": 221,
"Ğ": 222,
"ğ": 223,
"Ġ": 224,
"ġ": 225,
"Ģ": 226,
"ģ": 227,
"Ĥ": 228,
"ĥ": 229,
"Ħ": 230,
"ħ": 231,
"Ĩ": 232,
"ĩ": 233,
"Ī": 234,
"ī": 235,
"Ĭ": 236,
"ĭ": 237,
"Į": 238,
"į": 239,
"İ": 240,
"ı": 241,
"IJ": 242,
"ij": 243,
"Ĵ": 244,
"ĵ": 245,
"Ķ": 246,
"ķ": 247,
"ĸ": 248,
"Ĺ": 249,
"ĺ": 250,
"Ļ": 251,
"ļ": 252,
"Ľ": 253,
"ľ": 254,
"Ŀ": 255,
"ŀ": 256,
"Ł": 257,
"ł": 258,
"Ń": 259,
"Ġt": 260,
"Ġa": 261,
"he": 262,
"in": 263,
"re": 264,
"Ġthe": 265,
"on": 266,
"er": 267,
"Ġo": 268,
"at": 269,
"Ġs": 270,
"en": 271,
"Ġc": 272,
"es": 273,
"Ġw": 274,
"is": 275,
"nd": 276,
"or": 277,
"it": 278,
"Ġp": 279,
"al": 280,
"ed": 281,
"Ġof": 282,
"Ġb": 283,
"an": 284,
"Ġf": 285,
"Ġin": 286,
"ar": 287,
"ing": 288,
"ou": 289,
"Ġm": 290,
"ic": 291,
"Ġand": 292,
"Ġto": 293,
"ion": 294,
"Ġd": 295,
"ro": 296,
"le": 297,
"as": 298,
"Ġh": 299,
"ent": 300,
"Ġth": 301,
"il": 302,
"ct": 303,
"Ġe": 304,
"Ġre": 305,
"om": 306,
"ve": 307,
"Ġn": 308,
"st": 309,
"Ġl": 310,
"ly": 311,
"Ġbe": 312,
"Ġis": 313,
"ĠT": 314,
"se": 315,
"ol": 316,
"ation": 317,
"Ġg": 318,
"id": 319,
"im": 320,
"et": 321,
"ĠA": 322,
"ut": 323,
"ce": 324,
"ot": 325,
"ur": 326,
"ra": 327,
"ch": 328,
"ĠS": 329,
"Ġfor": 330,
"ow": 331,
"ig": 332,
"Ġon": 333,
"ĠC": 334,
"Ġthat": 335,
"Ġu": 336,
"ver": 337,
"âĢ": 338,
"Ġst": 339,
"ĠI": 340,
"ir": 341,
"am": 342,
"ay": 343,
"ul": 344,
"ad": 345,
"el": 346,
"her": 347,
"Ġas": 348,
"ith": 349,
"Ġy": 350,
"Ġpro": 351,
"Ġcon": 352,
"ĠM": 353,
"Ġan": 354,
"Ġare": 355,
"Ġ(": 356,
"Ġwith": 357,
"Ġor": 358,
"Ġ1": 359,
"Ġwh": 360,
"ter": 361,
"if": 362,
"ĠP": 363,
"od": 364,
"Ġit": 365,
"ĠThe": 366,
"Ġal": 367,
"ment": 368,
"th": 369,
"ist": 370,
"ge": 371,
"op": 372,
"ate": 373,
"ers": 374,
"ĠB": 375,
"Ġde": 376,
"ies": 377,
"ab": 378,
"Ġhe": 379,
"ill": 380,
"Ġsu": 381,
"Ġyou": 382,
"Ġex": 383,
"res": 384,
"ĠH": 385,
"us": 386,
"ere": 387,
"est": 388,
"ect": 389,
"ess": 390,
"Ġby": 391,
"ore": 392,
"rom": 393,
"ac": 394,
"Ġcom": 395,
"os": 396,
"ity": 397,
"ld": 398,
"um": 399,
"and": 400,
"ri": 401,
"Ġat": 402,
"ĠW": 403,
"ĠD": 404,
"em": 405,
"Ġv": 406,
"ive": 407,
"ain": 408,
"Ġwas": 409,
"ĠR": 410,
"qu": 411,
"nt": 412,
"ant": 413,
"ĠE": 414,
"igh": 415,
"ke": 416,
"pp": 417,
"Ġfrom": 418,
"Ġha": 419,
"ud": 420,
"ĠF": 421,
"Ġ2": 422,
"ĠN": 423,
"oc": 424,
"Ġch": 425,
"iv": 426,
"ort": 427,
"Ġse": 428,
"Ġne": 429,
"Ġr": 430,
"ĠG": 431,
"Ġnot": 432,
"Ġcan": 433,
"00": 434,
"art": 435,
"ical": 436,
"ure": 437,
"un": 438,
"ĠL": 439,
"Ġhave": 440,
"ial": 441,
"Ġle": 442,
"og": 443,
"Ġsp": 444,
"Ġsh": 445,
"all": 446,
"ight": 447,
"'s": 448,
"ich": 449,
"ther": 450,
"Ġen": 451,
"pt": 452,
"Ġthis": 453,
"rou": 454,
"Ġab": 455,
"The": 456,
"ould": 457,
"gh": 458,
"âĢĻ": 459,
"ost": 460,
"our": 461,
"ions": 462,
"ated": 463,
"ome": 464,
"ear": 465,
"ĠJ": 466,
"ine": 467,
"Ġwor": 468,
"ag": 469,
"ĠO": 470,
"du": 471,
"ĠU": 472,
"ard": 473,
"out": 474,
"Ġwe": 475,
"ell": 476,
"ff": 477,
"ast": 478,
"ap": 479,
"Ġim": 480,
"ec": 481,
"Ġpl": 482,
"Ġus": 483,
"ans": 484,
"Ġint": 485,
"ew": 486,
"Ġtheir": 487,
"Ġwhich": 488,
"pl": 489,
"act": 490,
"ust": 491,
"age": 492,
"ĠIn": 493,
"Ġ\"": 494,
"ous": 495,
"ĠâĢ": 496,
"are": 497,
"ak": 498,
"Ġwhe": 499
},
"merges": [
[
"Ġ",
"t"
],
[
"Ġ",
"a"
],
[
"h",
"e"
],
[
"i",
"n"
],
[
"r",
"e"
],
[
"Ġt",
"he"
],
[
"o",
"n"
],
[
"e",
"r"
],
[
"Ġ",
"o"
],
[
"a",
"t"
],
[
"Ġ",
"s"
],
[
"e",
"n"
],
[
"Ġ",
"c"
],
[
"e",
"s"
],
[
"Ġ",
"w"
],
[
"i",
"s"
],
[
"n",
"d"
],
[
"o",
"r"
],
[
"i",
"t"
],
[
"Ġ",
"p"
],
[
"a",
"l"
],
[
"e",
"d"
],
[
"Ġo",
"f"
],
[
"Ġ",
"b"
],
[
"a",
"n"
],
[
"Ġ",
"f"
],
[
"Ġ",
"in"
],
[
"a",
"r"
],
[
"in",
"g"
],
[
"o",
"u"
],
[
"Ġ",
"m"
],
[
"i",
"c"
],
[
"Ġa",
"nd"
],
[
"Ġt",
"o"
],
[
"i",
"on"
],
[
"Ġ",
"d"
],
[
"r",
"o"
],
[
"l",
"e"
],
[
"a",
"s"
],
[
"Ġ",
"h"
],
[
"en",
"t"
],
[
"Ġt",
"h"
],
[
"i",
"l"
],
[
"c",
"t"
],
[
"Ġ",
"e"
],
[
"Ġ",
"re"
],
[
"o",
"m"
],
[
"v",
"e"
],
[
"Ġ",
"n"
],
[
"s",
"t"
],
[
"Ġ",
"l"
],
[
"l",
"y"
],
[
"Ġb",
"e"
],
[
"Ġ",
"is"
],
[
"Ġ",
"T"
],
[
"s",
"e"
],
[
"o",
"l"
],
[
"at",
"ion"
],
[
"Ġ",
"g"
],
[
"i",
"d"
],
[
"i",
"m"
],
[
"e",
"t"
],
[
"Ġ",
"A"
],
[
"u",
"t"
],
[
"c",
"e"
],
[
"o",
"t"
],
[
"u",
"r"
],
[
"r",
"a"
],
[
"c",
"h"
],
[
"Ġ",
"S"
],
[
"Ġf",
"or"
],
[
"o",
"w"
],
[
"i",
"g"
],
[
"Ġ",
"on"
],
[
"Ġ",
"C"
],
[
"Ġth",
"at"
],
[
"Ġ",
"u"
],
[
"v",
"er"
],
[
"â",
"Ģ"
],
[
"Ġs",
"t"
],
[
"Ġ",
"I"
],
[
"i",
"r"
],
[
"a",
"m"
],
[
"a",
"y"
],
[
"u",
"l"
],
[
"a",
"d"
],
[
"e",
"l"
],
[
"he",
"r"
],
[
"Ġa",
"s"
],
[
"it",
"h"
],
[
"Ġ",
"y"
],
[
"Ġp",
"ro"
],
[
"Ġc",
"on"
],
[
"Ġ",
"M"
],
[
"Ġa",
"n"
],
[
"Ġa",
"re"
],
[
"Ġ",
"("
],
[
"Ġw",
"ith"
],
[
"Ġo",
"r"
],
[
"Ġ",
"1"
],
[
"Ġw",
"h"
],
[
"t",
"er"
],
[
"i",
"f"
],
[
"Ġ",
"P"
],
[
"o",
"d"
],
[
"Ġ",
"it"
],
[
"ĠT",
"he"
],
[
"Ġa",
"l"
],
[
"m",
"ent"
],
[
"t",
"h"
],
[
"is",
"t"
],
[
"g",
"e"
],
[
"o",
"p"
],
[
"at",
"e"
],
[
"er",
"s"
],
[
"Ġ",
"B"
],
[
"Ġd",
"e"
],
[
"i",
"es"
],
[
"a",
"b"
],
[
"Ġ",
"he"
],
[
"il",
"l"
],
[
"Ġs",
"u"
],
[
"Ġy",
"ou"
],
[
"Ġe",
"x"
],
[
"re",
"s"
],
[
"Ġ",
"H"
],
[
"u",
"s"
],
[
"e",
"re"
],
[
"es",
"t"
],
[
"e",
"ct"
],
[
"es",
"s"
],
[
"Ġb",
"y"
],
[
"o",
"re"
],
[
"ro",
"m"
],
[
"a",
"c"
],
[
"Ġc",
"om"
],
[
"o",
"s"
],
[
"it",
"y"
],
[
"l",
"d"
],
[
"u",
"m"
],
[
"a",
"nd"
],
[
"r",
"i"
],
[
"Ġa",
"t"
],
[
"Ġ",
"W"
],
[
"Ġ",
"D"
],
[
"e",
"m"
],
[
"Ġ",
"v"
],
[
"i",
"ve"
],
[
"a",
"in"
],
[
"Ġw",
"as"
],
[
"Ġ",
"R"
],
[
"q",
"u"
],
[
"n",
"t"
],
[
"an",
"t"
],
[
"Ġ",
"E"
],
[
"ig",
"h"
],
[
"k",
"e"
],
[
"p",
"p"
],
[
"Ġf",
"rom"
],
[
"Ġh",
"a"
],
[
"u",
"d"
],
[
"Ġ",
"F"
],
[
"Ġ",
"2"
],
[
"Ġ",
"N"
],
[
"o",
"c"
],
[
"Ġc",
"h"
],
[
"i",
"v"
],
[
"or",
"t"
],
[
"Ġs",
"e"
],
[
"Ġn",
"e"
],
[
"Ġ",
"r"
],
[
"Ġ",
"G"
],
[
"Ġn",
"ot"
],
[
"Ġc",
"an"
],
[
"0",
"0"
],
[
"ar",
"t"
],
[
"ic",
"al"
],
[
"u",
"re"
],
[
"u",
"n"
],
[
"Ġ",
"L"
],
[
"Ġha",
"ve"
],
[
"i",
"al"
],
[
"Ġ",
"le"
],
[
"o",
"g"
],
[
"Ġs",
"p"
],
[
"Ġs",
"h"
],
[
"al",
"l"
],
[
"igh",
"t"
],
[
"'",
"s"
],
[
"ic",
"h"
],
[
"t",
"her"
],
[
"Ġ",
"en"
],
[
"p",
"t"
],
[
"Ġth",
"is"
],
[
"r",
"ou"
],
[
"Ġa",
"b"
],
[
"T",
"he"
],
[
"ou",
"ld"
],
[
"g",
"h"
],
[
"âĢ",
"Ļ"
],
[
"o",
"st"
],
[
"ou",
"r"
],
[
"ion",
"s"
],
[
"at",
"ed"
],
[
"om",
"e"
],
[
"e",
"ar"
],
[
"Ġ",
"J"
],
[
"in",
"e"
],
[
"Ġw",
"or"
],
[
"a",
"g"
],
[
"Ġ",
"O"
],
[
"d",
"u"
],
[
"Ġ",
"U"
],
[
"ar",
"d"
],
[
"ou",
"t"
],
[
"Ġw",
"e"
],
[
"el",
"l"
],
[
"f",
"f"
],
[
"as",
"t"
],
[
"a",
"p"
],
[
"Ġ",
"im"
],
[
"e",
"c"
],
[
"Ġp",
"l"
],
[
"Ġu",
"s"
],
[
"an",
"s"
],
[
"Ġin",
"t"
],
[
"e",
"w"
],
[
"Ġthe",
"ir"
],
[
"Ġwh",
"ich"
],
[
"p",
"l"
],
[
"a",
"ct"
],
[
"u",
"st"
],
[
"a",
"ge"
],
[
"ĠI",
"n"
],
[
"Ġ",
"\""
],
[
"ou",
"s"
],
[
"Ġ",
"âĢ"
],
[
"a",
"re"
],
[
"a",
"k"
],
[
"Ġw",
"he"
]
]
}
}
}