#!/usr/bin/env python3
# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
#
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Language name to language ID code mapping.

Auto-generated from ``docs/lang_id_name_map.tsv``.

Provides:

* ``LANG_NAME_TO_ID`` -- maps a lowercase language name to its language ID
  code. The codes are a mix of two-letter identifiers (e.g. ``"en"``) and
  three-letter identifiers (e.g. ``"yue"``).
* ``LANG_NAMES`` -- the set of supported lowercase language names.
* ``LANG_IDS`` -- the set of supported language ID codes.
* ``lang_display_name()`` -- converts a language name to display casing.

Used by ``OmniVoice.generate()`` to resolve user-provided language names.
"""

# Auto-generated from docs/lang_id_name_map.tsv
# Maps lowercase language name -> language ID code
LANG_NAME_TO_ID = {
    "abadi": "kbt",
    "abkhazian": "ab",
    "abron": "abr",
    "abua": "abn",
    "adamawa fulfulde": "fub",
    "adyghe": "ady",
    "afade": "aal",
    "afrikaans": "af",
    "agwagwune": "yay",
    "aja (benin)": "ajg",
    "akebu": "keu",
    "alago": "ala",
    "albanian": "sq",
    "algerian arabic": "arq",
    "algerian saharan arabic": "aao",
    "ambo-pasco quechua": "qva",
    "ambonese malay": "abs",
    "amdo tibetan": "adx",
    "amharic": "am",
    "anaang": "anw",
    "angika": "anp",
    "antankarana malagasy": "xmv",
    "aragonese": "an",
    "arbëreshë albanian": "aae",
    "arequipa-la unión quechua": "qxu",
    "armenian": "hy",
    "ashe": "ahs",
    "ashéninka perené": "prq",
    "askopan": "eiv",
    "assamese": "as",
    "asturian": "ast",
    "atayal": "tay",
    "awak": "awo",
    "ayacucho quechua": "quy",
    "azerbaijani": "az",
    "baatonum": "bba",
    "bacama": "bcy",
    "bade": "bde",
    "bafia": "ksf",
    "bafut": "bfd",
    "bagirmi fulfulde": "fui",
    "bago-kusuntu": "bqg",
    "baharna arabic": "abv",
    "bakoko": "bkh",
    "balanta-ganja": "bjt",
    "balti": "bft",
    "bamenyam": "bce",
    "bamun": "bax",
    "bangwinji": "bsj",
    "banjar": "bjn",
    "bankon": "abb",
    "baoulé": "bci",
    "bara malagasy": "bhr",
    "barok": "bjk",
    "basa (cameroon)": "bas",
    "basa (nigeria)": "bzw",
    "bashkir": "ba",
    "basque": "eu",
    "batak mandailing": "btm",
    "batanga": "bnm",
    "bateri": "btv",
    "bats": "bbl",
    "bayot": "bda",
    "bebele": "beb",
    "belarusian": "be",
    "bengali": "bn",
    "betawi": "bew",
    "bhili": "bhb",
    "bhojpuri": "bho",
    "bilur": "bxf",
    "bima": "bhp",
    "bodo": "brx",
    "boghom": "bux",
    "bokyi": "bky",
    "bomu": "bmq",
    "bondei": "bou",
    "borgu fulfulde": "fue",
    "bosnian": "bs",
    "brahui": "brh",
    "braj": "bra",
    "breton": "br",
    "buduma": "bdm",
    "buginese": "bug",
    "bukharic": "bhh",
    "bulgarian": "bg",
    "bulu (cameroon)": "bum",
    "bundeli": "bns",
    "bunun": "bnn",
    "bura-pabir": "bwr",
    "burak": "bys",
    "burmese": "my",
    "burushaski": "bsk",
    "cacaloxtepec mixtec": "miu",
    "cajatambo north lima quechua": "qvl",
    "cakfem-mushere": "cky",
    "cameroon pidgin": "wes",
    "campidanese sardinian": "sro",
    "cantonese": "yue",
    "catalan": "ca",
    "cebuano": "ceb",
    "cen": "cen",
    "central kurdish": "ckb",
    "central nahuatl": "nhn",
    "central pame": "pbs",
    "central pashto": "pst",
    "central puebla nahuatl": "ncx",
    "central tarahumara": "tar",
    "central yupik": "esu",
    "central-eastern niger fulfulde": "fuq",
    "chadian arabic": "shu",
    "chichewa": "ny",
    "chichicapan zapotec": "zpv",
    "chiga": "cgg",
    "chimalapa zoque": "zoh",
    "chimborazo highland quichua": "qug",
    "chinese": "zh",
    "chiquián ancash quechua": "qxa",
    "chitwania tharu": "the",
    "chokwe": "cjk",
    "chuvash": "cv",
    "cibak": "ckl",
    "coastal konjo": "kjc",
    "copainalá zoque": "zoc",
    "cornish": "kw",
    "corongo ancash quechua": "qwa",
    "croatian": "hr",
    "cross river mbembe": "mfn",
    "cuyamecalco mixtec": "xtu",
    "czech": "cs",
    "dadiya": "dbd",
    "dagbani": "dag",
    "dameli": "dml",
    "danish": "da",
    "dargwa": "dar",
    "dazaga": "dzg",
    "deccan": "dcc",
    "degema": "deg",
    "dera (nigeria)": "kna",
    "dghwede": "dgh",
    "dhatki": "mki",
    "dhivehi": "dv",
    "dhofari arabic": "adf",
    "dijim-bwilim": "cfa",
    "dogri": "dgo",
    "domaaki": "dmk",
    "dotyali": "dty",
    "duala": "dua",
    "dutch": "nl",
    "dũya": "ldb",
    "dyula": "dyu",
    "eastern balochi": "bgp",
    "eastern bolivian guaraní": "gui",
    "eastern egyptian bedawi arabic": "avl",
    "eastern krahn": "kqo",
    "eastern mari": "mhr",
    "eastern yiddish": "ydd",
    "ebrié": "ebr",
    "eggon": "ego",
    "egyptian arabic": "arz",
    "ejagham": "etu",
    "eleme": "elm",
    "eloyi": "afo",
    "embu": "ebu",
    "english": "en",
    "erzya": "myv",
    "esan": "ish",
    "esperanto": "eo",
    "estonian": "et",
    "eton (cameroon)": "eto",
    "ewondo": "ewo",
    "extremaduran": "ext",
    "fang (equatorial guinea)": "fan",
    "fanti": "fat",
    "farefare": "gur",
    "fe'fe'": "fmp",
    "filipino": "fil",
    "filomena mata-coahuitlán totonac": "tlp",
    "finnish": "fi",
    "fipa": "fip",
    "french": "fr",
    "fulah": "ff",
    "galician": "gl",
    "gambian wolof": "wof",
    "ganda": "lg",
    "garhwali": "gbm",
    "gawar-bati": "gwt",
    "gawri": "gwc",
    "gbagyi": "gbr",
    "gbari": "gby",
    "geji": "gyz",
    "gen": "gej",
    "georgian": "ka",
    "german": "de",
    "geser-gorom": "ges",
    "gheg albanian": "aln",
    "ghomálá'": "bbj",
    "gidar": "gid",
    "glavda": "glw",
    "goan konkani": "gom",
    "goaria": "gig",
    "goemai": "ank",
    "gola": "gol",
    "greek": "el",
    "guarani": "gn",
    "guduf-gava": "gdf",
    "guerrero amuzgo": "amu",
    "gujarati": "gu",
    "gujari": "gju",
    "gulf arabic": "afb",
    "gurgula": "ggg",
    "gusii": "guz",
    "gusilay": "gsl",
    "gweno": "gwe",
    "güilá zapotec": "ztu",
    "hadothi": "hoj",
    "hahon": "hah",
    "haitian": "ht",
    "hakha chin": "cnh",
    "hakö": "hao",
    "halia": "hla",
    "hausa": "ha",
    "hawaiian": "haw",
    "hazaragi": "haz",
    "hebrew": "he",
    "hemba": "hem",
    "herero": "hz",
    "highland konjo": "kjk",
    "hijazi arabic": "acw",
    "hindi": "hi",
    "huarijio": "var",
    "huautla mazatec": "mau",
    "huaxcaleca nahuatl": "nhq",
    "huba": "hbb",
    "huitepec mixtec": "mxs",
    "hula": "hul",
    "hungarian": "hu",
    "hunjara-kaina ke": "hkk",
    "hwana": "hwo",
    "ibibio": "ibb",
    "icelandic": "is",
    "idakho-isukha-tiriki": "ida",
    "idoma": "idu",
    "igbo": "ig",
    "igo": "ahl",
    "ikposo": "kpo",
    "ikwere": "ikw",
    "imbabura highland quichua": "qvi",
    "indonesian": "id",
    "indus kohistani": "mvy",
    "interlingua (international auxiliary language association)": "ia",
    "inupiaq": "ik",
    "irish": "ga",
    "iron ossetic": "os",
    "isekiri": "its",
    "isoko": "iso",
    "italian": "it",
    "ito": "itw",
    "itzá": "itz",
    "ixtayutla mixtec": "vmj",
    "izon": "ijc",
    "jambi malay": "jax",
    "japanese": "ja",
    "jaqaru": "jqr",
    "jauja wanca quechua": "qxw",
    "jaunsari": "jns",
    "javanese": "jv",
    "jiba": "juo",
    "jju": "kaj",
    "judeo-moroccan arabic": "aju",
    "juxtlahuaca mixtec": "vmc",
    "kabardian": "kbd",
    "kabras": "lkb",
    "kabuverdianu": "kea",
    "kabyle": "kab",
    "kachi koli": "gjk",
    "kairak": "ckr",
    "kalabari": "ijn",
    "kalasha": "kls",
    "kalenjin": "kln",
    "kalkoti": "xka",
    "kamba": "kam",
    "kamo": "kcq",
    "kanauji": "bjj",
    "kanembu": "kbl",
    "kannada": "kn",
    "karekare": "kai",
    "kashmiri": "ks",
    "kathoriya tharu": "tkt",
    "kati": "bsh",
    "kazakh": "kk",
    "keiyo": "eyo",
    "khams tibetan": "khg",
    "khana": "ogo",
    "khetrani": "xhe",
    "khmer": "km",
    "khowar": "khw",
    "kinga": "zga",
    "kinnauri": "kfk",
    "kinyarwanda": "rw",
    "kirghiz": "ky",
    "kirya-konzəl": "fkk",
    "kochila tharu": "thq",
    "kohistani shina": "plk",
    "kohumono": "bcs",
    "kok borok": "trp",
    "kol (papua new guinea)": "kol",
    "kom (cameroon)": "bkm",
    "koma": "kmy",
    "konkani": "knn",
    "konzo": "koo",
    "korean": "ko",
    "korwa": "kfp",
    "kota (india)": "kfe",
    "koti": "eko",
    "kuanua": "ksd",
    "kuanyama": "kj",
    "kui (india)": "uki",
    "kulung (nigeria)": "bbu",
    "kuot": "kto",
    "kushi": "kuh",
    "kwambi": "kwm",
    "kwasio": "nmg",
    "lala-roba": "lla",
    "lamang": "hia",
    "lao": "lo",
    "larike-wakasihu": "alo",
    "lasi": "lss",
    "latgalian": "ltg",
    "latvian": "lv",
    "levantine arabic": "apc",
    "liana-seti": "ste",
    "liberia kpelle": "xpe",
    "liberian english": "lir",
    "libyan arabic": "ayl",
    "ligurian": "lij",
    "lijili": "mgi",
    "lingala": "ln",
    "lithuanian": "lt",
    "loarki": "lrk",
    "logooli": "rag",
    "logudorese sardinian": "src",
    "loja highland quichua": "qvj",
    "loloda": "loa",
    "longuda": "lnu",
    "loxicha zapotec": "ztp",
    "luba-lulua": "lua",
    "luo": "luo",
    "lushai": "lus",
    "luxembourgish": "lb",
    "maasina fulfulde": "ffm",
    "maba (chad)": "mde",
    "macedo-romanian": "rup",
    "macedonian": "mk",
    "mada (cameroon)": "mxu",
    "mafa": "maf",
    "maithili": "mai",
    "malay": "ms",
    "malayalam": "ml",
    "mali": "gcc",
    "malinaltepec me'phaa": "tcf",
    "maltese": "mt",
    "mandara": "tbf",
    "mandjak": "mfv",
    "manggarai": "mqy",
    "manipuri": "mni",
    "mansoanka": "msw",
    "manx": "gv",
    "maori": "mi",
    "marathi": "mr",
    "marghi central": "mrt",
    "marghi south": "mfm",
    "maria (india)": "mrr",
    "marwari (pakistan)": "mve",
    "masana": "mcn",
    "masikoro malagasy": "msh",
    "matsés": "mcf",
    "mazaltepec zapotec": "zpy",
    "mazatlán mazatec": "vmz",
    "mazatlán mixe": "mzl",
    "mbe": "mfo",
    "mbo (cameroon)": "mbo",
    "mbum": "mdd",
    "medumba": "byv",
    "mekeo": "mek",
    "meru": "mer",
    "mesopotamian arabic": "acm",
    "mewari": "mtr",
    "min nan chinese": "nan",
    "mingrelian": "xmf",
    "mitlatongo mixtec": "vmm",
    "miya": "mkf",
    "mokpwe": "bri",
    "moksha": "mdf",
    "mom jango": "ver",
    "mongolian": "mn",
    "moroccan arabic": "ary",
    "motu": "meu",
    "mpiemo": "mcx",
    "mpumpong": "mgg",
    "mundang": "mua",
    "mungaka": "mhk",
    "musey": "mse",
    "musgu": "mug",
    "musi": "mui",
    "naba": "mne",
    "najdi arabic": "ars",
    "nalik": "nal",
    "nawdm": "nmz",
    "ndonga": "ng",
    "neapolitan": "nap",
    "nepali": "npi",
    "ngamo": "nbh",
    "ngas": "anc",
    "ngiemboon": "nnh",
    "ngizim": "ngi",
    "ngomba": "jgo",
    "ngombale": "nla",
    "nigerian fulfulde": "fuv",
    "nigerian pidgin": "pcm",
    "nimadi": "noe",
    "nobiin": "fia",
    "north mesopotamian arabic": "ayp",
    "north moluccan malay": "max",
    "northern betsimisaraka malagasy": "bmm",
    "northern hindko": "hno",
    "northern kurdish": "kmr",
    "northern pame": "pmq",
    "northern pashto": "pbu",
    "northern uzbek": "uzn",
    "northwest gbaya": "gya",
    "norwegian": "no",
    "norwegian bokmål": "nb",
    "norwegian nynorsk": "nn",
    "notsi": "ncf",
    "nyankpa": "yes",
    "nyungwe": "nyu",
    "nzanyi": "nja",
    "nüpode huitoto": "hux",
    "occitan": "oc",
    "od": "odk",
    "odia": "ory",
    "odual": "odu",
    "omani arabic": "acx",
    "orizaba nahuatl": "nlv",
    "orma": "orc",
    "ormuri": "oru",
    "oromo": "om",
    "pahari-potwari": "phr",
    "paiwan": "pwn",
    "panjabi": "pa",
    "papuan malay": "pmy",
    "parkari koli": "kvx",
    "pedi": "nso",
    "pero": "pip",
    "persian": "fa",
    "petats": "pex",
    "phalura": "phl",
    "piemontese": "pms",
    "piya-kwonci": "piy",
    "plateau malagasy": "plt",
    "polish": "pl",
    "poqomam": "poc",
    "portuguese": "pt",
    "pulaar": "fuc",
    "pular": "fuf",
    "puno quechua": "qxp",
    "pushto": "ps",
    "pökoot": "pko",
    "qaqet": "byx",
    "quiotepec chinantec": "chq",
    "rana tharu": "thr",
    "rangi": "lag",
    "rapoisi": "kyx",
    "ratahan": "rth",
    "rayón zoque": "zor",
    "romanian": "ro",
    "romansh": "rm",
    "rombo": "rof",
    "rotokas": "roo",
    "rukai": "dru",
    "russian": "ru",
    "sacapulteco": "quv",
    "saidi arabic": "aec",
    "sakalava malagasy": "skg",
    "sakizaya": "szy",
    "saleman": "sau",
    "samba daka": "ccg",
    "samba leko": "ndi",
    "san felipe otlaltepec popoloca": "pow",
    "san francisco del mar huave": "hue",
    "san juan atzingo popoloca": "poe",
    "san martín itunyoso triqui": "trq",
    "san miguel el grande mixtec": "mig",
    "sansi": "ssi",
    "sanskrit": "sa",
    "santa ana de tusi pasco quechua": "qxt",
    "santa catarina albarradas zapotec": "ztn",
    "santali": "sat",
    "santiago del estero quichua": "qus",
    "saposa": "sps",
    "saraiki": "skr",
    "sardinian": "sc",
    "saya": "say",
    "sediq": "trv",
    "serbian": "sr",
    "seri": "sei",
    "shina": "scl",
    "shona": "sn",
    "siar-lak": "sjr",
    "sibe": "nco",
    "sicilian": "scn",
    "sihuas ancash quechua": "qws",
    "sikkimese": "sip",
    "sinaugoro": "snc",
    "sindhi": "sd",
    "sindhi bhil": "sbn",
    "sinhala": "si",
    "sinicahua mixtec": "xti",
    "sipacapense": "qum",
    "siwai": "siw",
    "slovak": "sk",
    "slovenian": "sl",
    "solos": "sol",
    "somali": "so",
    "soninke": "snk",
    "south giziga": "giz",
    "south ucayali ashéninka": "cpy",
    "southeastern nochixtlán mixtec": "mxy",
    "southern betsimisaraka malagasy": "bzc",
    "southern pashto": "pbt",
    "southern pastaza quechua": "qup",
    "soyaltepec mazatec": "vmp",
    "spanish": "es",
    "standard arabic": "arb",
    "standard moroccan tamazight": "zgh",
    "sudanese arabic": "apd",
    "sulka": "sua",
    "svan": "sva",
    "swahili": "sw",
    "swedish": "sv",
    "tae'": "rob",
    "tahaggart tamahaq": "thv",
    "taita": "dav",
    "tajik": "tg",
    "tamil": "ta",
    "tandroy-mahafaly malagasy": "tdx",
    "tangale": "tan",
    "tanosy malagasy": "txy",
    "tarok": "yer",
    "tatar": "tt",
    "tedaga": "tuq",
    "telugu": "te",
    "tem": "kdh",
    "teop": "tio",
    "tepeuxila cuicatec": "cux",
    "tepinapa chinantec": "cte",
    "tera": "ttr",
    "terei": "buo",
    "termanu": "twu",
    "tesaka malagasy": "tkg",
    "tetelcingo nahuatl": "nhg",
    "teutila cuicatec": "cut",
    "thai": "th",
    "tibetan": "bo",
    "tidaá mixtec": "mtx",
    "tidore": "tvo",
    "tigak": "tgc",
    "tigre": "tig",
    "tigrinya": "ti",
    "tilquiapan zapotec": "zts",
    "tinputz": "tpz",
    "tlacoapa me'phaa": "tpl",
    "tlacoatzintepec chinantec": "ctl",
    "tlingit": "tli",
    "toki pona": "tok",
    "tomoip": "tqp",
    "tondano": "tdn",
    "tonsea": "txs",
    "tooro": "ttj",
    "torau": "ttu",
    "torwali": "trw",
    "tsimihety malagasy": "xmw",
    "tsotso": "lto",
    "tswana": "tn",
    "tugen": "tuy",
    "tuki": "bag",
    "tula": "tul",
    "tulu": "tcy",
    "tunen": "tvu",
    "tungag": "lcm",
    "tunisian arabic": "aeb",
    "tupuri": "tui",
    "turkana": "tuv",
    "turkish": "tr",
    "turkmen": "tk",
    "tututepec mixtec": "mtu",
    "twi": "tw",
    "ubaghara": "byc",
    "uighur": "ug",
    "ukrainian": "uk",
    "umbundu": "umb",
    "upper sorbian": "hsb",
    "urdu": "ur",
    "ushojo": "ush",
    "uzbek": "uz",
    "vai": "vai",
    "vietnamese": "vi",
    "votic": "vot",
    "võro": "vro",
    "waci gbe": "wci",
    "wadiyara koli": "kxp",
    "waja": "wja",
    "wakhi": "wbl",
    "wanga": "lwg",
    "wapan": "juk",
    "warji": "wji",
    "welsh": "cy",
    "wemale": "weo",
    "western frisian": "fy",
    "western highland purepecha": "pua",
    "western juxtlahuaca mixtec": "jmx",
    "western maninkakan": "mlq",
    "western mari": "mrj",
    "western niger fulfulde": "fuh",
    "western panjabi": "pnb",
    "wolof": "wo",
    "wuzlam": "udl",
    "xanaguía zapotec": "ztg",
    "xhosa": "xh",
    "yace": "ekr",
    "yakut": "sah",
    "yalahatan": "jal",
    "yanahuanca pasco quechua": "qur",
    "yangben": "yav",
    "yaqui": "yaq",
    "yauyos quechua": "qux",
    "yekhee": "ets",
    "yiddish": "yi",
    "yidgha": "ydg",
    "yoruba": "yo",
    "yutanduchi mixtec": "mab",
    "zacatlán-ahuacatlán-tepetzintla nahuatl": "nhi",
    "zarma": "dje",
    "zaza": "zza",
    "zulu": "zu",
    "ömie": "aom",
}

# Derived lookup sets: every supported lowercase name and every supported code.
LANG_NAMES = set(LANG_NAME_TO_ID)
LANG_IDS = set(LANG_NAME_TO_ID.values())

# Exceptions where .title() doesn't match the canonical casing from the TSV.
_TITLE_EXCEPTIONS = {
    "fe'fe'": "Fe'fe'",
    "dũya": "Dũya",
    "santiago del estero quichua": "Santiago del Estero Quichua",
    "santa ana de tusi pasco quechua": "Santa Ana de Tusi Pasco Quechua",
    "malinaltepec me'phaa": "Malinaltepec Me'phaa",
    "tlacoapa me'phaa": "Tlacoapa Me'phaa",
}


def lang_display_name(name: str) -> str:
    """Return a display-friendly version of a language name.

    Most names are simply passed through ``str.title()``. Names listed in
    ``_TITLE_EXCEPTIONS`` use a hand-written form instead, because
    ``str.title()`` upper-cases the letter after an apostrophe and the small
    words ``de`` / ``del`` that should stay lowercase.

    Args:
        name: A language name, normally one of the lowercase keys of
            ``LANG_NAME_TO_ID``. Other casings are accepted as well.

    Returns:
        The name in display casing.
    """
    # The exception table is keyed by lowercase names; lower-casing the
    # lookup key keeps the hand-written forms reachable for any input casing.
    canonical = _TITLE_EXCEPTIONS.get(name.lower())
    if canonical is not None:
        return canonical
    return name.title()