| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import gradio as gr |
| import os, csv, re, base64, unicodedata, gzip |
| import torch |
| from transformers import AutoProcessor, VitsModel |
| import numpy as np |
| from html import escape |
|
|
| |
# Cache locations for Hugging Face downloads: keep whatever is already set in
# the environment, otherwise fall back to a path under /tmp.
# NOTE(review): `transformers` is imported above this point; confirm these
# variables are still honoured when set after the import.
os.environ.setdefault('TRANSFORMERS_CACHE', '/tmp/cache')
os.environ.setdefault('HF_HOME', '/tmp/hf')
|
|
DEBUG_MODE = False


def debug_print(msg):
    """Print *msg* prefixed with ``[DEBUG]``, but only while DEBUG_MODE is on."""
    if not DEBUG_MODE:
        return
    print(f"[DEBUG] {msg}")
|
|
| |
| def _open_maybe_gzip(path): |
| if str(path).endswith(".gz"): |
| |
| return gzip.open(path, "rt", encoding="utf-8", newline="") |
| return open(path, "r", encoding="utf-8", newline="") |
|
|
def norm(x):
    """Return ``str(x)`` without surrounding whitespace; ``None`` becomes ""."""
    if x is None:
        return ""
    return str(x).strip()


def lower(x):
    """Return the lower-cased :func:`norm` of *x*."""
    return norm(x).lower()


def fold(s: str) -> str:
    """Return *s* with combining marks (Unicode category ``Mn``) removed.

    The text is decomposed with NFD first, so accented letters lose their
    diacritics ("canción" -> "cancion"). A falsy *s* yields "".
    """
    decomposed = unicodedata.normalize('NFD', s or "")
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return ''.join(kept)
|
|
| |
| def _cand(*names): |
| for n in names: |
| if os.path.exists(n): return n |
| p = os.path.join("salida", n) |
| if os.path.exists(p): return p |
| return names[0] |
|
|
| |
# Candidate bilingual lexicon files in priority order. `_cand` picks the first
# one that exists (as given or under "salida/"); if none exists it falls back
# to the first name in the list.
_CSV_BI_CANDIDATES = (
    "LEXICON_v152_IBERIAN.csv.gz",
    "LEXICON_v86_IBERIAN.csv.gz",
    "LEXICON_v85_IBERIAN.csv.gz",
    "LEXICON_v84_IBERIAN.csv.gz",
    "LEXICON_v83_IBERIAN.csv.gz",
    "LEXICON_v82_IBERIAN.csv.gz",
    "LEXICON_v81_IBERIAN.csv.gz",
    "LEXICON_v80_IBERIAN.csv.gz",
    "LEXICON_v79_IBERIAN.csv.gz",
    "LEXICON_v78_IBERIAN.csv.gz",
    "LEXICON_v77_IBERIAN.csv.gz",
    "LEXICON_v76_IBERIAN.csv.gz",
    "LEXICON_v75_IBERIAN.csv.gz",
    "LEXICON_v74_IBERIAN.csv.gz",
    "LEXICON_v73_IBERIAN.csv.gz",
    "LEXICON_v72_IBERIAN.csv.gz",
    "LEXICON_v71_IBERIAN.csv.gz",
    "LEXICON_v70_IBERIAN.csv.gz",
    "LEXICON_v68_IBERIAN.csv.gz",
    "LEXICON_v67_IBERIAN.csv.gz",
    "LEXICON_v66_IBERIAN.csv.gz",
    "LEXICON_v65_IBERIAN.csv.gz",
    "LEXICON_v64_IBERIAN.csv.gz",
    "LEXICON_v63_IBERIAN.csv.gz",
    "LEXICON_v60_FINAL.csv.gz",
    "LEXICON_v59_PATCHED.csv.gz",
    "LEXICON_UNICO_1a1_v43_all_verbs.csv.gz",
    "LEXICON_UNICO_1a1_v42_verbs_fix.csv.gz",
    "LEXICON_UNICO_1a1_v41_family_fix.csv.gz",
    "LEXICON_UNICO_1a1_v40_accent_fix.csv.gz",
    "MASTER_SURFACE_READY.csv.gz",
    "MASTER_REEXTENDED.csv.gz",
    "BI_SURFACE_READY.csv.gz",
    "HF_Pairs_BI_REEXTENDED.csv.gz",
    "HF_Pairs_BI_EXPANDED1_EXTENDED_FILLED.csv.gz",
    "HF_Pairs_BI_EXPANDED1.csv.gz",
)
CSV_BI = _cand(*_CSV_BI_CANDIDATES)
|
|
| |
| |
| ES2NI = {} |
| NI2ES = {} |
| ES2NI_VERB = {} |
| ES2NI_POS = {} |
| ES2NI_MORPH = {} |
|
|
| |
| ESPHRASE2NI = {} |
| NIPHRASE2ES = {} |
| MAX_NGRAM = 3 |
|
|
| |
| ES_FOLD = {} |
| NI_FOLD = {} |
|
|
| |
| VISIBLE_PUNCT = set(list(",.;:!?¡¿…()[]{}\"'«»—–“”‘’")) |
|
|
| |
| |
| |
| |
| |
| NEWLINE_TOK = "⏎" |
| VISIBLE_PUNCT.add(NEWLINE_TOK) |
| _num_re = re.compile(r"^\d+([.,:]\d+)?$") |
| def is_number(tok:str)->bool: return bool(_num_re.fullmatch(tok or "")) |
|
|
| |
# Numeral words for 0-10 and for the multiples of twenty (20, 40, 60, 80).
_NI_UNITS = {0:'',1:'ban',2:'bi',3:'irur',4:'laur',5:'borste',
             6:'sei',7:'sisbi',8:'sorse',9:'bedar',10:'abar'}
_NI_TWENTIES = {1:'orkei',2:'binorkei',3:'irurokei',4:'laurokei'}


def digit_to_ni(tok: str) -> str:
    """Convert an integer given as a digit string into a Neo-Iberian numeral.

    Values 1-999 are spelled out on a base-20 scheme joined with ``-ke-``.
    Anything that is not an integer, is <= 0 or is > 999 is returned as is.
    """
    try:
        value = int(tok)
    except (ValueError, TypeError):
        return tok
    if value <= 0 or value > 999:
        return tok
    if value <= 10:
        return _NI_UNITS[value]
    if value < 20:
        return f"abar-ke-{_NI_UNITS[value - 10]}"
    if value < 100:
        # 20..99: count of twenties plus a remainder in 0..19.
        score, rest = divmod(value, 20)
        base = _NI_TWENTIES.get(score, tok)
        if rest == 0:
            return base
        if rest == 10:
            return f"{base}-abar"
        if rest > 10:
            return f"{base}-abar-ke-{_NI_UNITS[rest - 10]}"
        return f"{base}-ke-{_NI_UNITS[rest]}"
    # 100..999: hundreds word, then the remainder spelled recursively.
    hundreds, rest = divmod(value, 100)
    head = "atun" if hundreds == 1 else f"{_NI_UNITS[hundreds]}-atun"
    if rest == 0:
        return head
    return f"{head}-ke-{digit_to_ni(str(rest))}"
|
|
| |
# Punctuation tokens that may separate clauses inside a sentence.
CLAUSE_BREAKS = {",", ";", "—", "–", ":"}
# A whole token of the form "[...]" with at least one character inside.
PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")


def is_placeholder(tok: str) -> bool:
    """Return True when *tok* is a bracketed placeholder such as ``[name]``."""
    return PLACEHOLDER_RE.match(tok or "") is not None
|
|
# Placeholder written in place of a protected span while tokenizing.
_BRK_RE = re.compile(r"__BRK(\d+)__")


def _restore_brk(tok, protected):
    """Replace every ``__BRKn__`` placeholder in *tok* with ``protected[n]``.

    Placeholders are restored wherever they occur in the token, so text glued
    to a protected span ("3.14kg", "[x]-na", "a[b]c") keeps its original
    form. A placeholder whose index is out of range is left untouched.
    """
    if not tok:
        return tok

    def _original(m):
        idx = int(m.group(1))
        return protected[idx] if idx < len(protected) else m.group(0)

    return _BRK_RE.sub(_original, tok)


def simple_tokenize(text: str):
    """Minimal tokenizer that never breaks ``[ ... ]`` nor ``[ ... ]-na/-ba``.

    Bracketed spans and numbers with an inner ".", "," or ":" are swapped for
    placeholders first, punctuation is then split off as separate tokens, and
    finally the placeholders are expanded back to the original text.

    Returns a list of token strings ([] for empty input).
    """
    if not text:
        return []
    protected = []

    def _repl(m):
        key = f"__BRK{len(protected)}__"
        protected.append(m.group(0))
        return key

    t = re.sub(r"\[[^\]]*\]", _repl, text.strip())
    # Keep decimals and times (3.14, 1,5, 12:30) in one piece.
    t = re.sub(r"\d+[.,:]\d+", _repl, t)
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
    # Restore on any token that contains a placeholder, not only on tokens
    # that consist of one: otherwise "__BRK0__kg" would leak to the caller.
    return [
        _restore_brk(tok, protected) if "__BRK" in tok else tok
        for tok in t.split()
    ]
|
|
| |
| |
| _ENCLITICS = ('los','las','les','nos','me','te','lo','la','le','se','os') |
| _ACCENTED_VOWELS = str.maketrans('áéíóú', 'aeiou') |
|
|
| def _strip_accents(s): |
| return s.translate(_ACCENTED_VOWELS) |
|
|
| def expand_enclitics(toks): |
| """ |
| Split verb + enclitic-pronoun forms into separate tokens when ES2NI |
| does NOT contain the combined form. If the whole form IS in the lexicon |
| (thanks to 044/053/059), it is respected and not decomposed — this |
| preserves the strict 1:1 mapping promised by the patches. |
| |
| Decomposition is only a fallback for rare words that did not make it |
| into the enclitic patches. In that case plain tokens (no special marks) |
| are produced — two, or three for a double clitic — which the engine |
| translates separately. |
| |
| Fallback examples (the outcome depends on what the lexicon contains): |
| 'ayudarme' → ['ayudar', 'me'] |
| 'tocarlo' → ['tocar', 'lo'] |
| 'ayudándome'→ ['ayudando', 'me'] |
| 'dárselo' → ['darse', 'lo'] |
| """ |
| # Nothing to do for empty input or while the lexicon is not loaded yet. |
| if not toks: |
| return toks |
| if not ES2NI or not ES2NI_POS: |
| return toks |
|
|
| def _stem_is_verb_or_has_infinitive(stem): |
| """Is this stem a known verb, or does it have an infinitive in the lexicon?""" |
| if stem not in ES2NI: |
| pass |
| elif ES2NI_POS.get(stem, "") == "V": |
| return True |
| # Try stem + r/er/ir as an infinitive tagged "V". |
| for suf in ("r", "er", "ir"): |
| inf_candidate = stem + suf |
| if inf_candidate in ES2NI and ES2NI_POS.get(inf_candidate, "") == "V": |
| return True |
| # Undo a ue->o / ie->e diphthong (last occurrence), drop a final a/e and |
| # retry with the -ar/-er/-ir infinitive endings. |
| for diph, base in (("ue", "o"), ("ie", "e")): |
| idx = stem.rfind(diph) |
| if idx < 0: |
| continue |
| stem_undiph = stem[:idx] + base + stem[idx+2:] |
| stem_root = stem_undiph |
| if stem_root and stem_root[-1] in "ae": |
| stem_root = stem_root[:-1] |
| for suf in ("ar", "er", "ir"): |
| inf_candidate = stem_root + suf |
| if inf_candidate in ES2NI and ES2NI_POS.get(inf_candidate, "") == "V": |
| return True |
| return False |
|
|
| out = [] |
| for tok in toks: |
| tok_l = tok.lower() |
| |
| |
| # Keep non-alphabetic tokens and tokens the lexicon already knows |
| # (as written, or with the acute accents removed). |
| if not tok_l.isalpha() or tok_l in ES2NI: |
| out.append(tok) |
| continue |
| tok_noacc = _strip_accents(tok_l) |
| if tok_noacc != tok_l and tok_noacc in ES2NI: |
| out.append(tok) |
| continue |
|
|
| # split = (stem, first clitic, second clitic or None) once a valid cut is found. |
| split = None |
| for clit in _ENCLITICS: |
| if not tok_l.endswith(clit): |
| continue |
| stem = tok_l[:-len(clit)] |
| # Stems shorter than 2 letters are rejected; 2-letter stems must be lexicon entries. |
| if len(stem) < 2: |
| continue |
| if len(stem) == 2 and stem not in ES2NI: |
| continue |
| |
| |
| |
| |
| |
| # A stem that the lexicon tags as something other than a verb is not split here. |
| if stem in ES2NI: |
| stem_pos = ES2NI_POS.get(stem, "") |
| if stem_pos and stem_pos != "V": |
| |
| continue |
| |
| if _stem_is_verb_or_has_infinitive(stem): |
| split = (stem, clit, None) |
| break |
| |
| # Same check with the accents stripped from the stem; the unaccented stem is emitted. |
| stem_noacc = _strip_accents(stem) |
| if stem_noacc != stem and _stem_is_verb_or_has_infinitive(stem_noacc): |
| split = (stem_noacc, clit, None) |
| break |
| |
| |
| |
| |
| # Double clitic: peel a second pronoun off the stem (remaining stem >= 3 letters). |
| for clit2 in _ENCLITICS: |
| if not stem.endswith(clit2): |
| continue |
| stem2 = stem[:-len(clit2)] |
| if len(stem2) < 3: |
| continue |
| if _stem_is_verb_or_has_infinitive(stem2): |
| split = (stem2, clit2, clit) |
| break |
| stem2_noacc = _strip_accents(stem2) |
| if stem2_noacc != stem2 and _stem_is_verb_or_has_infinitive(stem2_noacc): |
| split = (stem2_noacc, clit2, clit) |
| break |
| if split: |
| break |
|
|
| if split: |
| stem, clit1, clit2 = split |
| # The stem was taken from the lower-cased token: restore an initial capital. |
| if tok[0].isupper(): |
| stem = stem[0].upper() + stem[1:] |
| |
| |
| out.append(stem) |
| out.append(clit1) |
| if clit2: |
| out.append(clit2) |
| else: |
| out.append(tok) |
| return out |
|
|
def detokenize(tokens):
    """Join *tokens* with spaces and tidy the spacing around punctuation.

    Closing marks attach to the previous word, opening marks to the next one,
    and runs of whitespace collapse to a single space.
    """
    spacing_rules = (
        (r"\s+([,.;:!?])", r"\1"),   # no space before closing punctuation
        (r"([¿¡])\s+", r"\1"),       # no space after inverted openers
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"([«“‘\[])\s+", r"\1"),    # opening quotes / bracket
        (r"\s+([»”’\]])", r"\1"),    # closing quotes / bracket
        (r"\s{2,}", " "),
    )
    text = " ".join(tokens)
    for pattern, replacement in spacing_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
|
| |
| |
| |
| # Regex building blocks used by fuse_enclitics_es to glue "verb + pronoun" |
| # written as two words back into a single word. |
| # Pronouns fused without looking at what follows them. |
| _SAFE_CLITICS_RE = r"(me|te|nos|os|se)" |
| # Pronouns fused only when followed by one of the contexts in _SAFE_AFTER_AMBIG |
| # (presumably because several of these forms are also articles — confirm). |
| _AMBIG_CLITICS_RE = r"(lo|la|le|los|las|les)" |
| # Contexts accepted after an ambiguous pronoun; embedded below in a lookahead, |
| # so the context itself is not consumed. Alternatives: closing punctuation, a |
| # listed function word, any "-mente" word, "otra vez"/"otras veces", an opening |
| # "[", a "cada/todos los/todas las (+ quantity) + time noun" phrase, or the end |
| # of the text / line. |
| _SAFE_AFTER_AMBIG = ( |
| r"(?:\s*[.,;:!?)\]»\"”—–]" |
| r"|\s+(?:y|o|pero|sino|aunque|mientras|porque|si|cuando|donde|que|" |
| r"también|tampoco|ya|no|todavía|después|antes|ahora|luego|aquí|allí|" |
| r"ahí|así|sólo|solo|nunca|jamás|siempre|" |
| r"bien|mal|mucho|poco|muy|más|menos|tan|todo|todos|nada|algo|" |
| r"hoy|ayer|mañana|pronto|tarde|" |
| r"fuerte|fuertemente|suavemente|fijamente|atentamente|" |
| |
| r"el|la|los|las|" |
| |
| r"de|en|a|por|para|con|sin|sobre|bajo|tras|entre|hasta|hacia|desde|" |
| r"un|una|algún|alguna)\b" |
| |
| |
| |
| r"|\s+[a-záéíóúñü]+mente\b" |
| r"|\s+otra\s+vez\b" |
| r"|\s+otras\s+veces\b" |
| |
| r"|\s+\[" |
| r"|\s+(?:cada|todos\s+los|todas\s+las)\s+(?:\d+\s+|(?:dos|tres|cuatro|cinco|seis|siete|ocho|nueve|diez|once|doce|trece|catorce|quince|dieciséis|diecisiete|dieciocho|diecinueve|veinte|treinta|cuarenta|cincuenta|sesenta|setenta|ochenta|noventa|cien|mil|pocos|pocas|muchos|muchas|tantos|tantas|varios|varias|cuantos|cuantas|algunos|algunas)\s+)?(?:día|días|mañana|mañanas|tarde|tardes|noche|noches|mes|meses|año|años|semana|semanas|hora|horas|momento|momentos|vez|veces)\b" |
| r"|\s*$|\s*\n)" |
| ) |
| # Each pattern captures (word, pronoun). INF: word ends in -ar/-er/-ir. |
| _FUSE_INF_SAFE_RE = re.compile( |
| r"\b([a-záéíóúñü]+(?:ar|er|ir))\s+" + _SAFE_CLITICS_RE + r"\b", |
| re.IGNORECASE) |
| _FUSE_INF_AMBIG_RE = re.compile( |
| r"\b([a-záéíóúñü]+(?:ar|er|ir))\s+" + _AMBIG_CLITICS_RE + r"(?=" + _SAFE_AFTER_AMBIG + r")", |
| re.IGNORECASE) |
| # GER: word ends in -ando/-iendo/-yendo. |
| _FUSE_GER_SAFE_RE = re.compile( |
| r"\b([a-záéíóúñü]+(?:ando|iendo|yendo))\s+" + _SAFE_CLITICS_RE + r"\b", |
| re.IGNORECASE) |
| _FUSE_GER_AMBIG_RE = re.compile( |
| r"\b([a-záéíóúñü]+(?:ando|iendo|yendo))\s+" + _AMBIG_CLITICS_RE + r"(?=" + _SAFE_AFTER_AMBIG + r")", |
| re.IGNORECASE) |
| # VERB: any word; the callbacks in fuse_enclitics_es decide whether it qualifies. |
| _FUSE_VERB_SAFE_RE = re.compile( |
| r"\b([a-záéíóúñü]+)\s+" + _SAFE_CLITICS_RE + r"\b", |
| re.IGNORECASE) |
| _FUSE_VERB_AMBIG_RE = re.compile( |
| r"\b([a-záéíóúñü]+)\s+" + _AMBIG_CLITICS_RE + r"(?=" + _SAFE_AFTER_AMBIG + r")", |
| re.IGNORECASE) |
| # Gerund ending -> the same ending carrying the written accent used once a pronoun is attached. |
| _GER_ACCENT_MAP = (("ando","ándo"), ("iendo","iéndo"), ("yendo","yéndo")) |
| |
| |
| |
| # Morph tags for which _verb_admits_enclitic accepts a lexicon verb. |
| _MORPH_ADMITS_ENCLITIC = {"IMP", "INF", "GER"} |
|
|
| # Decide whether the Spanish word `verb` may have an enclitic pronoun glued to it. |
| # Returns True/False; the checks below run in order and the first decisive one wins. |
| def _verb_admits_enclitic(verb): |
| v = (verb or "").lower() |
| # A pronoun is never a host for another pronoun. |
| if v in {"me","te","se","nos","os","lo","la","le","los","las","les"}: |
| return False |
| # Hard-coded words that are always rejected. |
| if v in {"como", "mientras", "para", "sobre", "luego", "casi", "según", |
| "salvo", "bajo", "sin", "pasada", "vista", "puesto", "dada", |
| "siendo", "habiendo"}: |
| return False |
| |
| |
| |
| |
| |
| |
| |
| |
| # Second rejection list (words ending in -a/-u that would otherwise pass the |
| # "word + r is an infinitive" test further down — presumably nouns/adjectives). |
| if v in {"agua", "cinta", "clara", "copa", "fina", "goma", "justa", |
| "maja", "manga", "mata", "misa", "mosca", "nada", "novela", |
| "novia", "obra", "pena", "perla", "punta", "raya", "recta", |
| "rima", "rosa", "sopa", "tabla", "taza", "tela", "tinta", |
| "trenza", "tribu", "tumba", "vaca", "venta", "visa"}: |
| return False |
| |
| |
| |
| |
| |
| # Whitelist of forms that are always accepted, independent of the lexicon. |
| if v in {"abra","abran","aprenda","aprendan","arregle","arreglen", |
| "ayude","ayuden","baje","bajen","beba","beban","cierre","cierren", |
| "coma","coman","continúe","continúen","cuente","cuenten", |
| "decida","decidan","deje","dejen","diga","digan", |
| "disculpe","disculpen","empiece","empiecen","encuentre","encuentren", |
| "envíe","envíen","escriba","escriban","escuche","escuchen", |
| "espere","esperen","firme","firmen","hable","hablen", |
| "haga","hagan","intente","intenten","limpie","limpien", |
| "llame","llamen","llegue","lleguen","llene","llenen", |
| "mire","miren","muestre","muestren","oiga","oigan", |
| "pase","pasen","perdone","perdonen","permita","permitan", |
| "piense","piensen","pinte","pinten","ponga","pongan", |
| "pregunte","pregunten","prepare","preparen","prometa","prometan", |
| "pruebe","prueben","quede","queden","reciba","reciban", |
| "recuerde","recuerden","responda","respondan","sea","sean", |
| "sepa","sepan","siga","sigan","sirva","sirvan", |
| "suba","suban","tenga","tengan","termine","terminen", |
| "tome","tomen","traiga","traigan","use","usen", |
| "vaya","vayan","vea","vean","venga","vengan", |
| "vuelva","vuelvan"}: |
| return True |
| |
| # Lexicon verb whose morph tag is one of _MORPH_ADMITS_ENCLITIC. |
| if v in ES2NI and ES2NI_POS.get(v, "") == "V": |
| morph = ES2NI_MORPH.get(v, "") |
| if morph in _MORPH_ADMITS_ENCLITIC: |
| return True |
| |
| |
| # Word ending in a/e/i/í for which word + r/er/ir is a lexicon verb. |
| if len(v) >= 3 and v[-1] in 'aeií': |
| for suf in ('r','er','ir'): |
| if v+suf in ES2NI and ES2NI_POS.get(v+suf, "") == "V": |
| return True |
| |
| |
| |
| |
| |
| # NOTE(review): this check is textually identical to the one just above. |
| if len(v) >= 3 and v[-1] in 'aeií': |
| for suf in ('r','er','ir'): |
| if v+suf in ES2NI and ES2NI_POS.get(v+suf, "") == "V": |
| return True |
| |
| |
| |
| |
| |
| # Undo a ue->o / ie->e diphthong, drop a final a/e and look for the infinitive. |
| if len(v) >= 4 and v[-1] in 'aeií': |
| for diph, base in (("ue","o"),("ie","e")): |
| idx = v.rfind(diph) |
| if idx < 0: continue |
| stem_undiph = v[:idx] + base + v[idx+2:] |
| |
| stem_root = stem_undiph[:-1] if stem_undiph[-1] in 'ae' else stem_undiph |
| for suf in ('ar','er','ir'): |
| if stem_root+suf in ES2NI and ES2NI_POS.get(stem_root+suf, "") == "V": |
| return True |
| |
| # A lexicon entry with a non-verb POS tag is rejected. |
| if v in ES2NI: |
| pos_actual = ES2NI_POS.get(v, "") |
| if pos_actual and pos_actual != "V": |
| return False |
| |
| # NOTE(review): the same r/er/ir infinitive test already ran above for words |
| # ending in a/e, so this branch looks unreachable with a True result — confirm. |
| if v not in ES2NI and len(v) >= 3 and v[-1] in 'ae': |
| for suf in ('r', 'er', 'ir'): |
| inf_candidate = v + suf |
| if inf_candidate in ES2NI and ES2NI_POS.get(inf_candidate, "") == "V": |
| return True |
| return False |
|
|
| def _fuse_imp_with_accent(verb, clit): |
| VOWELS = "aeiouáéíóú" |
| fused = verb + clit |
| positions = [i for i,c in enumerate(fused) if c.lower() in VOWELS] |
| if len(positions) < 3: |
| return fused |
| verb_positions = [i for i,c in enumerate(verb) if c.lower() in VOWELS] |
| if not verb_positions: |
| return fused |
| if len(verb_positions) == 1: |
| tonic_idx = verb_positions[0] |
| else: |
| tonic_idx = verb_positions[-2] |
| if fused[tonic_idx] in 'áéíóú': |
| return fused |
| vowels_after = sum(1 for p in positions if p > tonic_idx) |
| if vowels_after >= 2: |
| accent_map = {'a':'á','e':'é','i':'í','o':'ó','u':'ú'} |
| ch = fused[tonic_idx].lower() |
| if ch in accent_map: |
| new_ch = accent_map[ch] |
| if fused[tonic_idx].isupper(): |
| new_ch = new_ch.upper() |
| return fused[:tonic_idx] + new_ch + fused[tonic_idx+1:] |
| return fused |
|
|
def _is_real_infinitive(word):
    """Return True when *word* (case-insensitive) is a lexicon entry tagged "V".

    Despite the name, only the POS tag is checked, not the verb form.
    """
    key = (word or "").lower()
    return key in ES2NI and ES2NI_POS.get(key, "") == "V"
|
|
| # Rewrite Spanish text so that "verb pronoun" written as two words becomes one |
| # word (e.g. an infinitive or gerund followed by "me"). The passes run in a |
| # fixed order — infinitives, gerunds, other verb forms, then a second pronoun |
| # after an already fused one — and each pass works on the output of the previous one. |
| # Returns the rewritten string; empty/None input is returned unchanged. |
| def fuse_enclitics_es(es_text): |
| if not es_text: |
| return es_text |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| # True when the pronoun is lo/la/los/las and the text right after the match |
| # continues with "que"; callers then leave the match untouched. |
| def _is_relative_lo_que(clit, m): |
| if (clit or "").lower() not in ("lo","la","los","las"): |
| return False |
| rest = m.string[m.end():] |
| return bool(re.match(r"\s+que\b", rest, re.IGNORECASE)) |
|
|
| # Infinitive callbacks: fuse only when the word is a lexicon entry tagged "V". |
| def _inf_safe(m): |
| verb, clit = m.group(1), m.group(2) |
| if _is_real_infinitive(verb): |
| return verb + clit |
| return m.group(0) |
| def _inf_ambig(m): |
| verb, clit = m.group(1), m.group(2) |
| if _is_real_infinitive(verb): |
| if _is_relative_lo_que(clit, m): |
| return m.group(0) |
| return verb + clit |
| return m.group(0) |
|
|
| # Gerund callbacks: swap the ending for its accented form from _GER_ACCENT_MAP |
| # and append the pronoun; if no ending matches, the two words stay separate. |
| def _ger_accent(verb, clit): |
| for plain, accented in _GER_ACCENT_MAP: |
| if verb.lower().endswith(plain): |
| return verb[:-len(plain)] + accented + clit |
| return verb + " " + clit |
| def _ger_safe(m): |
| verb, clit = m.group(1), m.group(2) |
| if not _is_real_infinitive(verb): |
| return m.group(0) |
| return _ger_accent(verb, clit) |
| def _ger_ambig(m): |
| verb, clit = m.group(1), m.group(2) |
| if not _is_real_infinitive(verb): |
| return m.group(0) |
| if _is_relative_lo_que(clit, m): |
| return m.group(0) |
| return _ger_accent(verb, clit) |
|
|
| es_text = _FUSE_INF_SAFE_RE.sub(_inf_safe, es_text) |
| es_text = _FUSE_INF_AMBIG_RE.sub(_inf_ambig, es_text) |
| es_text = _FUSE_GER_SAFE_RE.sub(_ger_safe, es_text) |
| es_text = _FUSE_GER_AMBIG_RE.sub(_ger_ambig, es_text) |
|
|
| # Generic verb callbacks: skip words with infinitive/gerund endings (handled |
| # above), require _verb_admits_enclitic, and add the written accent if needed. |
| def _verb_safe(m): |
| verb, clit = m.group(1), m.group(2) |
| v = verb.lower() |
| if v.endswith(("ar","er","ir","ando","iendo","yendo")): |
| return m.group(0) |
| if not _verb_admits_enclitic(v): |
| return m.group(0) |
| return _fuse_imp_with_accent(verb, clit) |
| def _verb_ambig(m): |
| verb, clit = m.group(1), m.group(2) |
| v = verb.lower() |
| if v.endswith(("ar","er","ir","ando","iendo","yendo")): |
| return m.group(0) |
| if not _verb_admits_enclitic(v): |
| return m.group(0) |
| if _is_relative_lo_que(clit, m): |
| return m.group(0) |
| return _fuse_imp_with_accent(verb, clit) |
|
|
| es_text = _FUSE_VERB_SAFE_RE.sub(_verb_safe, es_text) |
| es_text = _FUSE_VERB_AMBIG_RE.sub(_verb_ambig, es_text) |
|
|
| |
| |
| |
| |
| |
| |
| |
| # Double-clitic pass: a word already ending in me/te/se/nos/os followed by |
| # lo/la/le/los/las/les. This context list is narrower than _SAFE_AFTER_AMBIG |
| # (no "que", no articles, no prepositions, no "[", no quantities). |
| # NOTE(review): if these two assignments sit inside the function, the pattern |
| # is rebuilt on every call — confirm and consider hoisting to module level. |
| _SAFE_AFTER_AMBIG_DOUBLE = ( |
| r"(?:\s*[.,;:!?)\]»\"”—–]" |
| r"|\s+(?:y|o|pero|sino|aunque|mientras|porque|si|cuando|donde|" |
| r"también|tampoco|ya|no|todavía|después|antes|ahora|luego|aquí|allí|" |
| r"ahí|así|sólo|solo|nunca|jamás|siempre|" |
| r"bien|mal|mucho|poco|muy|más|menos|tan|todo|todos|nada|algo|" |
| r"hoy|ayer|mañana|pronto|tarde|" |
| r"fuerte|fuertemente|suavemente|fijamente|atentamente|" |
| r"un|una|algún|alguna)\b" |
| r"|\s+[a-záéíóúñü]+mente\b" |
| r"|\s+otra\s+vez\b" |
| r"|\s+otras\s+veces\b" |
| r"|\s+(?:cada|todos\s+los|todas\s+las)\s+(?:día|días|mañana|mañanas|tarde|tardes|noche|noches|mes|meses|año|años|semana|semanas|hora|horas|momento|momentos|vez|veces)\b" |
| r"|\s*$|\s*\n)" |
| ) |
| _DOUBLE_CLIT_RE = re.compile( |
| r"\b([a-záéíóúñü]+(?:me|te|se|nos|os))\s+" + _AMBIG_CLITICS_RE + |
| r"(?=" + _SAFE_AFTER_AMBIG_DOUBLE + r")", |
| re.IGNORECASE) |
| def _double_clit(m): |
| word, clit2 = m.group(1), m.group(2) |
| clit1_endings = ("me","te","se","nos","os") |
| # Recover the verb under the first pronoun: the first ending whose remainder |
| # (as written or without lower-case accents) is a lexicon entry wins. |
| verb_orig = None |
| for end in clit1_endings: |
| if word.lower().endswith(end): |
| cand = word[:-len(end)] |
| cand_noacc = (cand.replace('á','a').replace('é','e') |
| .replace('í','i').replace('ó','o').replace('ú','u')) |
| if (cand.lower() in ES2NI or cand_noacc.lower() in ES2NI): |
| verb_orig = cand_noacc |
| break |
| if not verb_orig: |
| return m.group(0) |
| if not _verb_admits_enclitic(verb_orig): |
| return m.group(0) |
| if _is_relative_lo_que(clit2, m): |
| return m.group(0) |
| return _fuse_imp_with_accent(word, clit2) |
|
|
| es_text = _DOUBLE_CLIT_RE.sub(_double_clit, es_text) |
| return es_text |
|
|
| |
# Switches for the -na / -ba modal suffix handling.
MODAL_SUFFIX_ENABLE = True
MODAL_ONLY_ON_FINITE = True
MODAL_STRIP_QE_IN_NI = True

SENT_END = {".", "!", "?", "…"}            # sentence-final tokens
OPEN_FOR = {"?": "¿", "!": "¡"}            # closing mark -> inverted opener
WRAP_PREFIX = set("«“‘([{\"'—–")           # wrappers an opener goes after
PERS_ENDINGS = ("-n","-śe","-ek","-śek","-k")
TAM_FINITE = ("-ke","-bo","-ta","-ni","-ir")


def looks_like_finite_ni(tok: str) -> bool:
    """Return True when *tok* ends like a finite Neo-Iberian verb form.

    After dropping a trailing ``-na``/``-ba``, the token must end in one of
    TAM_FINITE, optionally followed by one of PERS_ENDINGS. Empty tokens and
    tokens starting with "[" never qualify.
    """
    word = (tok or "").lower()
    if not word or word.startswith("["):
        return False
    stem = re.sub(r"-(na|ba)$", "", word)
    with_person = tuple(tam + pe for tam in TAM_FINITE for pe in PERS_ENDINGS)
    return stem.endswith(TAM_FINITE + with_person)
|
|
def last_content_index(tokens, start, end_exclusive):
    """Return the index of the last non-punctuation token in a slice.

    Scans ``tokens[start:end_exclusive]`` from the right, skipping members of
    VISIBLE_PUNCT. Returns -1 when the range holds no content token.
    """
    for idx in range(end_exclusive - 1, start - 1, -1):
        if tokens[idx] not in VISIBLE_PUNCT:
            return idx
    return -1
|
|
def strip_qe_punct(tokens):
    """Return a new list without the closing ``?`` and ``!`` tokens."""
    kept = []
    for tok in tokens:
        if tok == "?" or tok == "!":
            continue
        kept.append(tok)
    return kept
|
|
def _is_numeric_comma(tokens, i):
    """Return True when ``tokens[i]`` is a "," sitting between two numbers."""
    if not 0 < i < len(tokens) - 1:
        return False
    if tokens[i] != ",":
        return False
    return is_number(tokens[i - 1]) and is_number(tokens[i + 1])
|
|
def _is_time_colon(tokens, i):
    """Return True when ``tokens[i]`` is a ":" sitting between two numbers."""
    if not 0 < i < len(tokens) - 1:
        return False
    if tokens[i] != ":":
        return False
    return is_number(tokens[i - 1]) and is_number(tokens[i + 1])
|
|
def _is_true_clause_break(tokens, i):
    """Return True when ``tokens[i]`` is clause punctuation, not a numeric separator.

    A token from CLAUSE_BREAKS counts unless it is a comma or colon between
    two numbers (as in "1 , 5" or "12 : 30").
    """
    if tokens[i] not in CLAUSE_BREAKS:
        return False
    inside_number = _is_numeric_comma(tokens, i) or _is_time_colon(tokens, i)
    return not inside_number
|
|
def add_modal_suffixes_es2ni(tokens):
    """Mark questions and exclamations with ``-na`` / ``-ba`` suffixes.

    For each closing "?" or "!", the last content token of that sentence gets
    "-na" (question) or "-ba" (exclamation) appended. When
    MODAL_STRIP_QE_IN_NI is on, the "?"/"!" tokens are then removed.
    The input list is not modified; with MODAL_SUFFIX_ENABLE off it is
    returned as is.
    """
    if not MODAL_SUFFIX_ENABLE:
        return tokens
    marked = tokens[:]
    sentence_start = 0
    for pos in range(len(marked)):
        tok = marked[pos]
        if tok in ("?", "!"):
            anchor = last_content_index(marked, sentence_start, pos)
            if anchor != -1:
                suf = "na" if tok == "?" else "ba"
                marked[anchor] = marked[anchor] + "-" + suf
            sentence_start = pos + 1
        elif tok in SENT_END:
            sentence_start = pos + 1
    if MODAL_STRIP_QE_IN_NI:
        marked = strip_qe_punct(marked)
    return marked
|
|
| # Inverse of the -na/-ba marking: walk Neo-Iberian tokens, remove a trailing |
| # "-na"/"-ba" from tokens and emit "?" / "!" as sentence closers instead. |
| # Returns a new token list; with MODAL_SUFFIX_ENABLE off the input is returned as is. |
| def strip_modal_suffixes_ni(tokens): |
| if not MODAL_SUFFIX_ENABLE: |
| return tokens |
|
|
| # out: finished tokens; buf: tokens of the sentence being collected; |
| # pending_end: closer seen in the input; mode: "?" / "!" once a suffix or opener set it. |
| out = [] |
| buf = [] |
| pending_end = None |
| mode = None |
|
|
| # Flush buf into out (dropping ¿ ? ¡ ! tokens) and append one closing token, |
| # then reset the per-sentence state. Both parameters keep their defaults in |
| # every call visible in this function. |
| def _emit(end_override=None, also_append=None): |
| nonlocal buf, mode, pending_end, out |
| local = [t for t in buf if t not in ("¿","?","¡","!")] |
| if local: |
| # Closer priority: explicit override, then mode, then the closer seen, then ".". |
| end_tok = end_override or ("?" if mode == "?" else "!" if mode == "!" else pending_end or ".") |
| out.extend(local) |
| |
| |
| |
| |
| last = local[-1] |
| is_punct_already = last in {".", "!", "?", "…", ":", ";"} |
| |
| |
| |
| |
| |
| |
| # No closer is added after a sentence that already ends in punctuation |
| # or that contains no alphabetic character at all. |
| has_alpha = any(any(c.isalpha() for c in t) for t in local) |
| if is_punct_already or not has_alpha: |
| pass |
| else: |
| |
| |
| |
| |
| |
| |
| |
| out.append(end_tok) |
| buf.clear(); mode = None; pending_end = None |
| if also_append: |
| out.append(also_append) |
|
|
| |
| # Make sure the stream ends with a sentence closer so the last buffer is flushed. |
| if tokens and tokens[-1] in SENT_END: |
| toks = list(tokens) |
| else: |
| toks = tokens + ["."] |
| for i, t in enumerate(toks): |
| # An inverted opener flushes what was buffered (without a closer), is copied |
| # to the output and fixes the mode for the tokens that follow. |
| if t in ("¿", "¡"): |
| |
| |
| |
| |
| |
| |
| local = [x for x in buf if x not in ("¿","?","¡","!")] |
| out.extend(local) |
| buf.clear() |
| out.append(t) |
| mode = "?" if t == "¿" else "!" |
| pending_end = None |
| continue |
| if t in ("?", "!"): |
| pending_end = t; _emit(); continue |
| if t in SENT_END: |
| pending_end = t; _emit(); continue |
|
|
| # Inside a question/exclamation, clause punctuation stays in the buffer. |
| if t in CLAUSE_BREAKS and mode in ("?","!"): |
| buf.append(t) |
| continue |
|
|
| # Token ending in -na / -ba: a lexicon entry (NI2ES) is kept verbatim; |
| # otherwise the suffix is removed and the sentence is closed right here. |
| m = re.search(r"-(na|ba)$", (t or "").lower()) |
| if m: |
| if (t or "").lower() in NI2ES: |
| buf.append(t) |
| continue |
| t = t[:-len(m.group(0))] |
| if t: buf.append(t) |
| mode = "?" if m.group(1) == "na" else "!" |
| _emit() |
| continue |
|
|
| if t: |
| buf.append(t) |
|
|
| # Collapse a doubled final period. |
| if len(out) >= 2 and out[-1] == "." and out[-2] == ".": out.pop() |
| return out |
|
|
|
|
|
|
| |
| # Unaccented -> accented interrogatives replaced anywhere inside a "¿ ... ?" span. |
| _DIACR_ALWAYS = { |
| "cuando":"cuándo", "donde":"dónde", "como":"cómo", |
| "cuanto":"cuánto", "cuanta":"cuánta", |
| "cuantos":"cuántos", "cuantas":"cuántas", |
| "cuan":"cuán", "cual":"cuál", "cuales":"cuáles", |
| "adonde":"adónde", |
| } |
| # Replaced only as the first word of the span or right after "," ";" ":". |
| _DIACR_HEAD_ONLY = {"que":"qué", "quien":"quién", "quienes":"quiénes"} |
|
|
| # Add the written accent to interrogative words inside "¿ ... ?" spans. |
| # Returns a new token list; tokens outside such spans are left untouched. |
| def apply_interrogative_tildes(tokens): |
| out = list(tokens) |
| i = 0 |
| n = len(out) |
| while i < n: |
| if out[i] != "¿": |
| i += 1 |
| continue |
| # Find the "?" that closes this "¿", counting nested openers; any other |
| # sentence-final token aborts the search. |
| j = i + 1 |
| depth = 0 |
| end = -1 |
| while j < n: |
| t = out[j] |
| if t == "¿": |
| depth += 1 |
| elif t == "?": |
| if depth == 0: |
| end = j |
| break |
| depth -= 1 |
| elif t in SENT_END: |
| break |
| j += 1 |
| if end < 0: |
| i += 1 |
| continue |
| # head_pending is True until the first word of the current clause was seen. |
| head_pending = True |
| for k in range(i+1, end): |
| tok = out[k] |
| if not tok or not tok[0].isalpha(): |
| if tok in (",", ";", ":"): |
| head_pending = True |
| continue |
| tok_l = tok.lower() |
| replacement = None |
| if tok_l in _DIACR_ALWAYS: |
| replacement = _DIACR_ALWAYS[tok_l] |
| elif head_pending and tok_l in _DIACR_HEAD_ONLY: |
| replacement = _DIACR_HEAD_ONLY[tok_l] |
| |
| |
| |
| |
| |
| # cuando/donde/como/adonde that are NOT the first word of the span: the |
| # accent is withheld when one of the next three tokens is a lexicon verb |
| # tagged SBJ, IPFV or PST. |
| if replacement and tok_l in ('cuando','donde','como','adonde'): |
| |
| |
| |
| |
| is_first_alpha = True |
| for prev_k in range(i+1, k): |
| pt = out[prev_k] |
| if pt and pt[0].isalpha(): |
| is_first_alpha = False |
| break |
| if not is_first_alpha: |
| for lookahead in range(1, 4): |
| if k + lookahead >= end: break |
| next_tok = out[k + lookahead] |
| if not next_tok or not next_tok[0].isalpha(): continue |
| next_l = next_tok.lower() |
| if next_l in ES2NI: |
| morph = ES2NI_MORPH.get(next_l, '') |
| pos = ES2NI_POS.get(next_l, '') |
| |
| if pos == 'V' and morph in ('SBJ', 'IPFV', 'PST'): |
| replacement = None |
| break |
| if replacement is not None: |
| # Keep the capitalisation of the original token. |
| if tok and tok[0].isupper(): |
| replacement = replacement[0].upper() + replacement[1:] |
| out[k] = replacement |
| head_pending = False |
| i = end + 1 |
| return out |
|
|
| # Insert the Spanish inverted opener ("¿" / "¡") that matches each closing |
| # "?" / "!" token. Returns a new token list. |
| def add_inverted_openers(tokens): |
| out = tokens[:] |
| # NOTE(review): START_BREAKS is not referenced again in this function. |
| START_BREAKS = SENT_END | CLAUSE_BREAKS |
|
|
| |
| |
| |
| |
| # Question/exclamation words, with and without the written accent. |
| EXCL_ACCENTED = {'qué','cuán','cuánto','cuánta','cuántos','cuántas','cómo'} |
| EXCL_PLAIN = {'que','cuan','cuanto','cuanta','cuantos','cuantas','como'} |
| INTERR_ACCENTED = {'qué','quién','quiénes','cuándo','dónde','cómo','cuál', |
| 'cuáles','cuán','cuánto','cuánta','cuántos','cuántas'} |
| |
| |
| INTERR_PLAIN = {'que','quien','quienes','cuando','donde','como','cual', |
| 'cuales','cuan','cuanto','cuanta','cuantos','cuantas'} |
|
|
| # NOTE(review): this helper is defined but not called in this function. |
| def _is_true_start_break(idx): |
| if out[idx] in SENT_END: return True |
| if out[idx] == NEWLINE_TOK: return True |
| if out[idx] in CLAUSE_BREAKS: return _is_true_clause_break(out, idx) |
| return False |
|
|
| |
| |
| |
| |
| # A scope ends at sentence punctuation, a line break, ";" or a non-time ":". |
| # Commas and dashes do not end it. |
| def _is_scope_break(idx): |
| if out[idx] in SENT_END: return True |
| if out[idx] == NEWLINE_TOK: return True |
| if out[idx] == ':' and not _is_time_colon(out, idx): return True |
| if out[idx] == ';': return True |
| return False |
|
|
| i = 0 |
| while i < len(out): |
| if out[i] in ("?", "!"): |
| closer = out[i]; opener = OPEN_FOR[closer] |
| accented_set = EXCL_ACCENTED if closer == "!" else INTERR_ACCENTED |
| plain_set = EXCL_PLAIN if closer == "!" else INTERR_PLAIN |
|
|
| |
| # Walk left to the nearest scope break; the scope starts right after it. |
| j = i - 1 |
| while j >= 0: |
| if _is_scope_break(j): |
| break |
| j -= 1 |
| scope_start = j + 1 |
|
|
| |
| |
| |
| # Look for the question word: the first accented one from the left, |
| # otherwise the unaccented one closest to the closer. |
| qword_pos = -1 |
| for k in range(scope_start, i): |
| if out[k].lower() in accented_set: |
| qword_pos = k |
| break |
| if qword_pos == -1: |
| for k in range(i - 1, scope_start - 1, -1): |
| if out[k].lower() in plain_set: |
| qword_pos = k |
| break |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| if qword_pos != -1: |
| |
| |
| # Open at the start of the clause holding the question word: after the |
| # nearest true clause break (or sentence end / line break) to its left. |
| cl_start = scope_start |
| for k in range(qword_pos - 1, scope_start - 1, -1): |
| if out[k] in CLAUSE_BREAKS: |
| if _is_true_clause_break(out, k): |
| cl_start = k + 1 |
| break |
| if out[k] in SENT_END or out[k] == NEWLINE_TOK: |
| cl_start = k + 1 |
| break |
| start = cl_start |
| else: |
| |
| start = scope_start |
|
|
| |
| # Place the opener after any leading quotes, brackets or dashes. |
| k = start |
| while k < i and out[k] in WRAP_PREFIX: |
| k += 1 |
| |
| |
| |
| |
| |
| # Do not insert a second opener if one already exists between the previous |
| # sentence end / ";" / ":" and this closer. |
| anti_dup_start = 0 |
| for jj in range(i - 1, -1, -1): |
| if out[jj] in SENT_END or out[jj] in (';', ':'): |
| anti_dup_start = jj + 1 |
| break |
| has_opener_already = any(out[kk] == opener for kk in range(anti_dup_start, i)) |
| if not has_opener_already: |
| # The insertion shifts the closer one position to the right. |
| out.insert(k, opener); i += 1 |
| i += 1 |
| return out |
|
|
| |
| EXPANSION_ENABLE = True |
| FLAG_COLNAMES = ("flags","FLAGS","expand","EXPAND","tags","TAGS","morph","MORPH") |
| FLAG_PLURAL = ("S",) |
| FLAG_3PL = ("3","V3") |
|
|
| VOWELS = "aeiouáéíóúüAEIOUÁÉÍÓÚÜ" |
|
|
| def _has_flag(cell:str, wanted:tuple)->bool: |
| c = (cell or "") |
| return any(w in c for w in wanted) |
|
|
| def _pluralize_es_form(s: str) -> str: |
| if not s: return s |
| sl = s.lower() |
| if sl.endswith("z"): |
| return s[:-1] + ("ces" if s[-1].islower() else "CES") |
| if s[-1] not in VOWELS: |
| return s + ("es" if s[-1].islower() else "ES") |
| return s + ("s" if s[-1].islower() else "S") |
|
|
| def _present_3pl_from_3sg(s: str) -> str: |
| if not s: return s |
| return s + ("n" if s[-1].islower() else "N") |
|
|
| |
# Optional text-to-speech model. A failure to load is reported and leaves
# `model` (and possibly `processor`) as None; synthesis then returns None.
print("Cargando modelo de voz (opcional)…")
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = None
model = None
try:
    processor = AutoProcessor.from_pretrained("facebook/mms-tts-spa")
    model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)
except Exception as e:
    print(f"AVISO TTS: {e}")
else:
    print("Modelo de voz cargado.")
|
|
def add_reading_pauses(text: str, level: int = 3) -> str:
    """Lengthen pauses for TTS by doubling commas and periods.

    Every "," becomes ", , " and every "." becomes ". . "; whitespace is then
    collapsed. With ``level <= 1`` the text is returned untouched.

    Args:
        text: text to be read aloud.
        level: pause level; only the ``<= 1`` cutoff is used.

    Returns:
        The text with the extra pause marks.
    """
    if level <= 1:
        return text
    t = re.sub(r",\s*", ", , ", text)
    # Chain on `t`: running this on `text` would throw the comma pauses away.
    t = re.sub(r"\.\s*", ". . ", t)
    return re.sub(r'\s+', ' ', t).strip()
|
|
def hispanize_for_tts(ni_text: str) -> str:
    """Respell Neo-Iberian text so a Spanish TTS voice can read it.

    The text is lower-cased and NFC-normalised, special letters are mapped to
    Spanish spellings, hyphens become spaces, bracketed spans are dropped and
    reading pauses are added.
    """
    text = unicodedata.normalize('NFC', (ni_text or "").lower())
    # Applied in order. NOTE(review): 'eś' can no longer occur once 'ś' has
    # been replaced, so the third pair appears to be a no-op.
    respellings = (('ŕ', 'rr'), ('ś', 's'), ('eś', 'es'), ('-', ' '))
    for old, new in respellings:
        text = text.replace(old, new)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return add_reading_pauses(text, 3)
|
|
def synthesize_speech(text):
    """Synthesize *text* with the loaded TTS model.

    Returns:
        ``(sample_rate, waveform)`` with a float32 NumPy waveform scaled to a
        peak of 0.9, or ``None`` when the text is blank, the model or
        processor is not loaded, or synthesis fails.
    """
    if not text or not text.strip() or model is None or processor is None:
        return None
    try:
        inputs = processor(text=hispanize_for_tts(text), return_tensors="pt").to(device)
        with torch.no_grad():
            output = model(**inputs).waveform
        speech_np = output.cpu().numpy().squeeze()
        mx = max(abs(speech_np.min()), abs(speech_np.max()))
        if mx > 0:
            speech_np = speech_np / mx * 0.9
        # Use the model's own output rate instead of a hard-coded value, so
        # swapping the checkpoint cannot silently change pitch and speed;
        # 16000 remains the fallback.
        config = getattr(model, "config", None)
        rate = getattr(config, "sampling_rate", None) or 16000
        return (rate, speech_np.astype(np.float32))
    except Exception as e:
        # Best effort: the UI keeps working without audio.
        print(f"Error TTS: {e}")
        return None
|
|
| |
# Tables for rendering Latin-script Neo-Iberian as sign tokens in ‹ ›.
V = "aeiou"
# Stop consonant -> its five syllable signs, in the vowel order of V.
SYL_FOR = {
    "b":["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"],
    "d":["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"],
    "t":["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"],
    "g":["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"],
    "k":["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]
}
# Single-letter signs (vowels and continuants).
ALPHA_FOR={"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›",
           "l":"‹L›","r":"‹R›","ŕ":"‹Ŕ›","n":"‹N›","m":"‹M›"}
# Signs appended when the letter closes a stop+vowel syllable.
CODA_FOR={"":"","n":"‹N›","s":"‹S›","ś":"‹Ś›","r":"‹R›","ŕ":"‹Ŕ›","l":"‹L›","m":"‹M›","k":"‹K›","t":"‹T›"}


def tokens_from_latin(ni: str) -> str:
    """Render Latin-script text as a string of ``‹SIGN›`` tokens.

    "p" is treated as "b", "-" becomes "—", a stop followed by a vowel
    becomes one syllable sign (plus the next letter as a coda sign when it
    is listed in CODA_FOR), and every other character is looked up in
    ALPHA_FOR or upper-cased.
    """
    text = unicodedata.normalize('NFC', (ni or "").lower())
    size = len(text)
    pieces = []
    pos = 0
    while pos < size:
        ch = "b" if text[pos] == "p" else text[pos]
        if ch == "-":
            pieces.append("—")
            pos += 1
            continue
        following = text[pos + 1] if pos + 1 < size else ""
        if ch in SYL_FOR and following and following in V:
            sign = SYL_FOR[ch][V.index(following)]
            coda = text[pos + 2] if pos + 2 < size else ""
            # NOTE(review): the coda is consumed even when a vowel follows
            # it, so "baka" yields BA+K+A rather than BA+KA — confirm intent.
            if coda and coda in CODA_FOR:
                sign += CODA_FOR[coda]
                pos += 3
            else:
                pos += 2
            pieces.append(sign)
            continue
        pieces.append(ALPHA_FOR.get(ch, ch.upper()))
        pos += 1
    return "".join(pieces)
|
|
KEYS_MODE = "full"      # "full" keeps whole sign names; "compact" shortens them
KEYS_OVERRIDE = {}      # lower-cased word -> fixed key string, checked first


def georgeos_keys(token_str: str, ni_plain: str) -> str:
    """Return the key sequence for the ``‹SIGN›`` tokens in *token_str*.

    A KEYS_OVERRIDE entry for the lower-cased, NFC-normalised *ni_plain* wins.
    Otherwise each sign name is mapped: "Ś" -> "X", "Ŕ" -> "r"; in "compact"
    mode syllable signs keep only their consonant and any other non-vowel
    sign keeps its upper-cased first letter; in any other mode the sign name
    is used unchanged.
    """
    word = unicodedata.normalize('NFC', (ni_plain or "").lower())
    if word in KEYS_OVERRIDE:
        return KEYS_OVERRIDE[word]

    compact = KEYS_MODE == "compact"

    def _key_for(sign):
        is_syllable = len(sign) == 2 and sign[0] in "BDTGK"
        if is_syllable:
            return sign[0] if compact else sign
        if compact and sign in ("A","E","I","O","U"):
            return sign
        if sign == "Ś":
            return "X"
        if sign == "Ŕ":
            return "r"
        return sign[0].upper() if compact else sign

    return "".join(_key_for(sign) for sign in re.findall(r"‹(.*?)›", token_str))
|
|
TRIDOT = "|"   # word separator placed between two consecutive words


def render_ib_with_tridots(ib_toks):
    """Join sign-script tokens, separating adjacent words with TRIDOT.

    Punctuation tokens (members of VISIBLE_PUNCT) are padded with spaces and
    reset the "previous token was a word" state, so no separator is placed
    next to them. The result is stripped of outer whitespace.
    """
    parts = []
    after_word = False
    for tok in ib_toks:
        if tok in VISIBLE_PUNCT:
            parts.append(" " + tok + " ")
            after_word = False
            continue
        if after_word:
            parts.append(" " + TRIDOT + " ")
        parts.append(tok)
        after_word = True
    return "".join(parts).strip()
|
|
| |
|
|
# State for the bilingual CSV loader.
STRICT_BI_ENFORCE: bool = True
AMBIG_NI: dict = {}   # cleared at the start of every load
# HTML diagnostics shown to the user; replaced by the loader.
BI_DIAG_HTML: str = "<em>Sin CSV cargado.</em>"
|
|
| def load_bi_strict_and_diagnose(): |
| global BI_DIAG_HTML |
| ES2NI.clear(); NI2ES.clear(); ESPHRASE2NI.clear(); NIPHRASE2ES.clear() |
| AMBIG_NI.clear(); ES2NI_VERB.clear() |
| ES2NI_POS.clear() |
| ES2NI_MORPH.clear() |
| NI2ES_LEMMA = {} |
|
|
| if not os.path.exists(CSV_BI): |
| msg=f"[ERROR] No se encontró el CSV bilingüe: {CSV_BI}" |
| print(msg); BI_DIAG_HTML=f"<b>Error:</b> {escape(msg)}" |
| return False |
|
|
| rows=0; dup_es=0; dup_ni=0; empty_pid=0 |
| mismatch_backmap = 0 |
| mismatch_samples = [] |
| pid_seen=set() |
|
|
| print(f"Detectado CSV bilingüe: {CSV_BI}") |
| try: |
| with _open_maybe_gzip(CSV_BI) as f: |
| rd = csv.DictReader(f) |
| flds=set(rd.fieldnames or []) |
| ES_COL = "source_es" if "source_es" in flds else "es_surface" if "es_surface" in flds else "es" |
| NI_COL = "target_ni" if "target_ni" in flds else "ni_surface" if "ni_surface" in flds else "ni" |
| IDCOL = "pair_id" if "pair_id" in flds else "id" if "id" in flds else None |
| FLAGCOL = None |
| for cand in FLAG_COLNAMES: |
| if cand in flds: |
| FLAGCOL = cand; break |
| POS_COL = "pos_es" if "pos_es" in flds else "pos" if "pos" in flds else None |
| LEMMA_COL = "es_lemma" if "es_lemma" in flds else "lemma" if "lemma" in flds else None |
| MORPH_COL = "es_morph" if "es_morph" in flds else "morph" if "morph" in flds else None |
|
|
| base_rows = [] |
| for r in rd: |
| es_orig = (r.get(ES_COL) or "").strip() |
| ni_orig = (r.get(NI_COL) or "").strip() |
| if not (es_orig and ni_orig): continue |
| pid = (r.get(IDCOL) or "").strip() if IDCOL else "" |
| if not pid: empty_pid += 1 |
| else: pid_seen.add(pid) |
| flags = (r.get(FLAGCOL) or "") if FLAGCOL else "" |
|
|
| es = lower(es_orig) |
| ni = lower(ni_orig) |
|
|
| if " " in es: |
| if es not in ESPHRASE2NI: |
| ESPHRASE2NI[es] = (ni_orig, pid) |
| if " " in ni: |
| if ni not in NIPHRASE2ES: |
| NIPHRASE2ES[ni] = (es_orig, pid) |
|
|
| pos = (r.get(POS_COL) or "").strip() if POS_COL else "" |
| morph = (r.get(MORPH_COL) or "").strip() if MORPH_COL else "" |
| _MORPH_PRIO = {"PRS":10,"PST":9,"IPFV":8,"FUT":7,"COND":6, |
| "INF":5,"GER":4,"PART":3,"SBJ":2,"SBJ_IPFV":1,"IMP":0} |
| _POS_PRIO = {"ADJ":3, "N":2, "V":1} |
| if es in ES2NI: |
| dup_es += 1 |
| old_pos = ES2NI_POS.get(es, "") |
| old_morph = ES2NI_MORPH.get(es, "") |
| replace = False |
| new_p = _POS_PRIO.get(pos, 0) |
| old_p = _POS_PRIO.get(old_pos, 0) |
| if new_p > old_p: |
| if old_pos == "V": |
| ES2NI_VERB[es] = ES2NI[es] |
| replace = True |
| elif pos == "V" and old_pos == "V": |
| new_m = _MORPH_PRIO.get(morph, -1) |
| old_m = _MORPH_PRIO.get(old_morph, -1) |
| if new_m > old_m: |
| ES2NI_VERB[es] = ES2NI[es] |
| replace = True |
| elif pos == "V" and old_pos in ("N", "ADJ"): |
| ES2NI_VERB[es] = (ni_orig, pid) |
| if replace: |
| ES2NI[es] = (ni_orig, pid) |
| ES2NI_POS[es] = pos |
| ES2NI_MORPH[es] = morph |
| else: |
| ES2NI[es] = (ni_orig, pid) |
| ES2NI_POS[es] = pos |
| ES2NI_MORPH[es] = morph |
|
|
| lemma = (r.get(LEMMA_COL) or "").strip().lower() if LEMMA_COL else "" |
| if ni in NI2ES: |
| dup_ni += 1 |
| old_lemma = NI2ES_LEMMA.get(ni, "") |
| if lemma and old_lemma and lemma == old_lemma: |
| NI2ES[ni] = (es_orig, pid) |
| else: |
| s = AMBIG_NI.get(ni, set()) |
| s.add(NI2ES[ni][0]); s.add(es_orig) |
| AMBIG_NI[ni] = s |
| if STRICT_BI_ENFORCE: |
| NI2ES.pop(ni, None) |
| else: |
| if STRICT_BI_ENFORCE and ni in AMBIG_NI: |
| pass |
| else: |
| NI2ES[ni] = (es_orig, pid) |
| NI2ES_LEMMA[ni] = lemma |
|
|
| base_rows.append((es_orig, ni_orig, pid, flags)) |
| rows += 1 |
|
|
| if EXPANSION_ENABLE: |
| for es_orig, ni_orig, pid, flags in base_rows: |
| if not flags: continue |
| if _has_flag(flags, FLAG_PLURAL): |
| pl = _pluralize_es_form(es_orig) |
| pl_key = lower(pl) |
| if pl_key not in ES2NI: |
| ES2NI[pl_key] = (ni_orig, pid) |
| if _has_flag(flags, FLAG_3PL): |
| p3 = _present_3pl_from_3sg(es_orig) |
| p3_key = lower(p3) |
| if p3_key not in ES2NI: |
| ES2NI[p3_key] = (ni_orig, pid) |
|
|
| for es_low, (ni_surf, _) in ES2NI.items(): |
| ni_low = lower(ni_surf) |
| back = NI2ES.get(ni_low) |
| if back and lower(back[0]) != es_low: |
| mismatch_backmap += 1 |
| if len(mismatch_samples) < 10: |
| mismatch_samples.append((es_low, ni_low, lower(back[0]))) |
|
|
| except Exception as e: |
| msg=f"[ERROR] Al leer {CSV_BI}: {e}" |
| print(msg); BI_DIAG_HTML=f"<b>Error:</b> {escape(msg)}" |
| return False |
|
|
| ES_FOLD.clear(); NI_FOLD.clear() |
| for es_key in ES2NI: |
| fk = fold(es_key) |
| if fk != es_key and fk not in ES_FOLD: |
| ES_FOLD[fk] = es_key |
| for ni_key in NI2ES: |
| fk = fold(ni_key) |
| if fk != ni_key and fk not in NI_FOLD: |
| NI_FOLD[fk] = ni_key |
| debug_print(f"Fold maps: ES_FOLD={len(ES_FOLD)}, NI_FOLD={len(NI_FOLD)}") |
|
|
| es_unique = len(ES2NI) |
| ni_unique = len(NI2ES) |
| pid_unique = len(pid_seen) |
|
|
| print(f"✓ BI-ONLY ESTRICTO cargado: {rows:,} filas.") |
| if dup_es: print(f"[AVISO] {dup_es:,} duplicados ES (se usó la primera).") |
| if dup_ni: print(f"[AVISO] {dup_ni:,} duplicados NI (bloqueados en modo estricto).") |
| if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.") |
| if mismatch_backmap: |
| print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).") |
|
|
| sam_html = "" |
| if mismatch_samples: |
| sam_rows = "".join( |
| f"<li><code>{escape(es)}</code> → <code>{escape(ni)}</code> → <code>{escape(es2)}</code></li>" |
| for es,ni,es2 in mismatch_samples |
| ) |
| sam_html = f"<details><summary>Muestras</summary><ul>{sam_rows}</ul></details>" |
|
|
| ambN = sum(len(v) > 1 for v in AMBIG_NI.values()) |
| ambList = ", ".join(f"{k}→{sorted(list(v))[:3]}" for k,v in list(AMBIG_NI.items())[:5]) |
|
|
| BI_DIAG_HTML = f""" |
| <div style="font-family:Georgia,serif"> |
| <b>Diagnóstico del CSV BI</b><br> |
| Archivo: <b>{escape(CSV_BI)}</b><br> |
| Filas base (CSV): <b>{rows:,}</b><br> |
| ES únicas (tras expansiones): <b>{es_unique:,}</b> | NI únicas: <b>{ni_unique:,}</b> | pair_id únicos: <b>{pid_unique:,}</b><br> |
| Duplicados ES: <b>{dup_es:,}</b> | Duplicados NI: <b>{dup_ni:,}</b> (bloqueados en estricto) | Sin pair_id: <b>{empty_pid:,}</b><br> |
| Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b> |
| {sam_html} |
| <hr style="border:0;border-top:1px solid #caa"> |
| <small>NI ambiguas bloqueadas: <b>{ambN:,}</b>{(' · ej.: ' + escape(ambList)) if ambN else ''}</small><br> |
| <small>Regla: el motor usa <b>sólo</b> tablas 1:1; NI duplicadas se bloquean y se muestran como <code>[AMB-NI:...]</code>.</small> |
| </div> |
| """ |
| return rows > 0 |
|
|
| print("Cargando léxico/pares (BI-estricto)…") |
| load_bi_strict_and_diagnose() |
|
|
| |
| |
| |
| import glob, re as _re_patches |
|
|
# Patch files are named NNN_<anything>.csv, optionally gzip-compressed.
LEX_PATCH_PATTERN = _re_patches.compile(r"^\d{3}_.+\.csv(\.gz)?$")
# One tuple per processed patch row / lint finding.
LEX_PATCH_LOG = []
# Directory receiving rows removed by delete / replace / retire.
LEX_DEPRECATED_DIR = "deprecated"
# When True a patch with lint errors aborts with RuntimeError.
STRICT_PATCHES = False


# (es, ni) pairs, lower-cased, whose orphaning by a delete is accepted.
_KNOWN_LEGACY_ORPHANS = set()
_LEGACY_ORPHANS_FILE = os.path.join(LEX_DEPRECATED_DIR, "_known_legacy_orphans.csv")


def _load_known_legacy_orphans():
    """Read the accepted-orphans CSV into ``_KNOWN_LEGACY_ORPHANS``.

    Does nothing when the file is absent. Each row contributes the stripped,
    lower-cased (``source_es``, ``target_ni``) pair when both are non-empty.
    Any read error is reported through ``debug_print`` and otherwise ignored.
    """
    if not os.path.exists(_LEGACY_ORPHANS_FILE):
        return
    try:
        with open(_LEGACY_ORPHANS_FILE, "r", encoding="utf-8", newline="") as handle:
            for record in csv.DictReader(handle):
                pair = tuple(
                    (record.get(column) or "").strip().lower()
                    for column in ("source_es", "target_ni")
                )
                if all(pair):
                    _KNOWN_LEGACY_ORPHANS.add(pair)
        debug_print(f"[PATCH] Legacy orphans aceptados: {len(_KNOWN_LEGACY_ORPHANS)}")
    except Exception as e:
        debug_print(f"[PATCH] No se pudo leer {_LEGACY_ORPHANS_FILE}: {e}")
|
|
def _append_to_graveyard(patch_name, row_data):
    """Append one removed lexicon row to the graveyard CSV for *patch_name*.

    The file lives in ``LEX_DEPRECATED_DIR`` and is created on demand with a
    header row. It is always written as plain UTF-8 CSV, so a trailing ``.gz``
    on the patch name is dropped to keep the extension truthful.

    Args:
        patch_name: File name of the patch that removed the row.
        row_data: Mapping with any of the graveyard columns; missing columns
            are written as empty strings.

    Failures are reported through ``debug_print`` and never raised: the
    graveyard is best-effort bookkeeping.
    """
    columns = ["source_es","target_ni","pos_es","es_morph","pair_id","reason"]
    out_name = patch_name[:-3] if patch_name.endswith(".gz") else patch_name
    try:
        os.makedirs(LEX_DEPRECATED_DIR, exist_ok=True)
        path = os.path.join(LEX_DEPRECATED_DIR, out_name)
        with open(path, "a", encoding="utf-8", newline="") as f:
            w = csv.DictWriter(f, fieldnames=columns)
            # Append mode starts at end-of-file: position 0 means the file is
            # new or empty, which is exactly when the header is still missing.
            if f.tell() == 0:
                w.writeheader()
            w.writerow({k: row_data.get(k, "") for k in columns})
    except Exception as e:
        debug_print(f"[PATCH] No se pudo escribir cementerio: {e}")
|
|
def _patch_pre_lint(rows, patch_name, future_targets=None, future_es=None):
    """Check a parsed patch for destructive operations before applying it.

    Args:
        rows: Patch rows as dicts with ``op``, ``source_es``, ``target_ni``
            and ``reason`` keys (iterated twice).
        patch_name: Name of the patch; not used by the checks themselves.
        future_targets: Lower-cased NI forms added by later patches.
        future_es: Lower-cased ES forms added by later patches.

    Returns:
        ``(errors, warns)``: two lists of ``(kind, es, ni, message)`` tuples.
        A ``delete`` whose old NI is not re-targeted by this patch is a
        warning when it is a known legacy orphan or is rescued by a later
        patch, and an error otherwise. ``retire`` without a reason and
        ``replace`` without both ES and NI are errors.
    """
    def _clean(record, column):
        return (record.get(column) or "").strip().lower()

    later_targets = future_targets or set()
    later_sources = future_es or set()

    # First pass: what this same patch (re)introduces.
    added_here = {}
    targets_here = {}
    for record in rows:
        src = _clean(record, "source_es")
        tgt = _clean(record, "target_ni")
        if _clean(record, "op") in ("add", "alias", "replace") and src and tgt:
            added_here[src] = tgt
            targets_here.setdefault(tgt, set()).add(src)

    # Second pass: classify every potentially destructive row.
    problems = []
    notices = []
    for record in rows:
        op = _clean(record, "op")
        src = _clean(record, "source_es")
        tgt = _clean(record, "target_ni")
        why = (record.get("reason") or "").strip()

        if op == "retire":
            if not why:
                problems.append(("retire-no-reason", src, tgt,
                                 "retire requiere campo reason"))
            continue
        if op == "replace":
            if not (src and tgt):
                problems.append(("replace-incomplete", src, tgt,
                                 "replace requiere ES y NI"))
            continue
        if op != "delete" or src not in ES2NI:
            continue

        entry = ES2NI[src]
        prev_ni = entry[0].lower() if isinstance(entry, tuple) else entry
        if prev_ni in targets_here or src in added_here:
            continue
        if (src, prev_ni) in _KNOWN_LEGACY_ORPHANS:
            notices.append(("delete-orphan-known", src, prev_ni,
                            "huérfano legacy aceptado"))
        elif prev_ni in later_targets or src in later_sources:
            notices.append(("delete-orphan-deferred-rescue", src, prev_ni,
                            "rescatado en parche posterior"))
        else:
            problems.append(("delete-orphan-ni", src, prev_ni,
                             f"NI {prev_ni!r} quedaría huérfano sin reemplazo"))

    return problems, notices
|
|
def _cleanup_ambig_after_remove(removed_es_l, old_ni_surf):
    """v102: recompute an NI's ambiguity after one of its ES entries is removed.

    After a delete/retire/replace, if the NI was blocked in AMBIG_NI because
    of a collision involving the removed ES, drop that ES from the candidate
    set. When at most one candidate is left the NI leaves AMBIG_NI, and a
    single survivor that still exists in ES2NI is put back into NI2ES (with
    an empty pair id) so the reverse lookup works again.

    Args:
        removed_es_l: Lower-cased ES key that was just removed.
        old_ni_surf: NI surface form that ES used to map to; falsy is a no-op.
    """
    if not old_ni_surf:
        return
    ni_key = old_ni_surf.lower()
    if ni_key not in AMBIG_NI:
        return

    remaining = {cand for cand in AMBIG_NI[ni_key] if cand.lower() != removed_es_l}
    if len(remaining) > 1:
        # Still genuinely ambiguous: just store the reduced set.
        AMBIG_NI[ni_key] = remaining
        return

    del AMBIG_NI[ni_key]
    if not remaining or ni_key in NI2ES:
        return
    (only_es,) = remaining
    if only_es.lower() in ES2NI:
        NI2ES[ni_key] = (only_es, "")
|
|
def apply_lex_patches():
    """Apply every ``NNN_*.csv[.gz]`` patch in the working directory, in name order.

    Each patch row carries an ``op`` (add, override, alias, delete, replace,
    retire) plus ``source_es``, ``target_ni`` and optional ``pos_es``,
    ``es_morph``, ``pair_id`` and ``reason``. Rows mutate ES2NI / NI2ES and
    their side tables; removed rows are copied to the graveyard. Every row and
    lint finding is appended to ``LEX_PATCH_LOG``. A ``.csv`` file is ignored
    when a ``.csv.gz`` with the same base name exists.

    Each patch is linted first; lint errors are printed and logged, and raise
    ``RuntimeError`` only when ``STRICT_PATCHES`` is True. After all patches
    the accent-folded maps are rebuilt if any operation changed the tables.

    Raises:
        RuntimeError: a patch fails the linter while STRICT_PATCHES is True.
    """
    _load_known_legacy_orphans()

    try:
        all_files = os.listdir(".")
    except Exception as e:
        debug_print(f"[PATCH] No se pudo listar el directorio: {e}")
        return

    patch_files = sorted([f for f in all_files if LEX_PATCH_PATTERN.match(f)])
    # A plain .csv is shadowed by its compressed twin.
    _gz_bases = {f[:-3] for f in patch_files if f.endswith(".csv.gz")}
    _shadowed = [f for f in patch_files if f.endswith(".csv") and f in _gz_bases]
    if _shadowed:
        patch_files = [f for f in patch_files if f not in _shadowed]
        debug_print(f"[PATCH] Ignorando {len(_shadowed)} .csv sombreados por su .csv.gz")
    if not patch_files:
        debug_print("[PATCH] No se encontraron archivos NNN_*.csv")
        return

    print(f"[PATCH] Aplicando {len(patch_files)} archivo(s) de parche...")
    totals = {"add":0,"override":0,"alias":0,"delete":0,"replace":0,"retire":0,"skipped":0}

    # Parse everything up front so each patch can see what later ones add.
    parsed_patches = []
    read_errors = {}
    for pf in patch_files:
        try:
            with _open_maybe_gzip(pf) as f:
                parsed_patches.append((pf, list(csv.DictReader(f))))
        except Exception as e:
            # Keep going with the other patches, but remember why this failed.
            read_errors[pf] = e
            parsed_patches.append((pf, None))

    for idx, (patch_path, rows) in enumerate(parsed_patches):
        patch_name = patch_path
        if rows is None:
            print(f"[PATCH] Error leyendo {patch_name}: {read_errors.get(patch_name)}")
            continue
        # ES / NI forms that later patches will (re)introduce.
        future_ni = set()
        future_es = set()
        for _, frows in parsed_patches[idx+1:]:
            if frows is None: continue
            for r in frows:
                op = (r.get("op") or "").strip().lower()
                es = (r.get("source_es") or "").strip().lower()
                ni = (r.get("target_ni") or "").strip().lower()
                if op in ("add", "alias", "replace") and es and ni:
                    future_ni.add(ni)
                    future_es.add(es)

        ops = {"add":0,"override":0,"alias":0,"delete":0,"replace":0,"retire":0,"skipped":0}

        errors, warns = _patch_pre_lint(rows, patch_name,
                                        future_targets=future_ni,
                                        future_es=future_es)
        if errors:
            print(f"[PATCH] {patch_name}: {len(errors)} error(es) destructivo(s) detectado(s):")
            for kind, es, ni, msg in errors[:5]:
                print(f"  ✗ [{kind}] {es!r}: {msg}")
            if len(errors) > 5:
                print(f"  ... y {len(errors)-5} más")
            if STRICT_PATCHES:
                raise RuntimeError(
                    f"Parche {patch_name} no pasa el linter (STRICT_PATCHES=True). "
                    "Documenta en deprecated/_known_legacy_orphans.csv o usa op=replace.")
            for kind, es, ni, msg in errors:
                LEX_PATCH_LOG.append((patch_name, kind, es, ni, "lint-error", msg))
        if warns:
            for kind, es, ni, msg in warns:
                LEX_PATCH_LOG.append((patch_name, kind, es, ni, "lint-warn", msg))

        for row in rows:
            op = (row.get("op") or "").strip().lower()
            es = (row.get("source_es") or "").strip()
            ni = (row.get("target_ni") or "").strip()
            pos = (row.get("pos_es") or "").strip()
            morph = (row.get("es_morph") or "").strip()
            pid = (row.get("pair_id") or "").strip() or f"patch::{patch_name}"
            reason = (row.get("reason") or "").strip()

            es_l = es.lower()
            ni_l = ni.lower()

            if op == "add":
                if es_l in ES2NI:
                    old_pos = ES2NI_POS.get(es_l, "")
                    if pos == "V" and old_pos in ("ADJ", "N") and es_l not in ES2NI_VERB:
                        # Keep the nominal main reading; store the verb aside.
                        ES2NI_VERB[es_l] = (ni, pid)
                        # If the NI currently reverse-maps to this ES with one
                        # "ie"->"e" or "ue"->"o" substitution applied, point
                        # the reverse mapping at this ES form instead.
                        if ni_l in NI2ES:
                            old_inv = NI2ES[ni_l][0].lower()
                            if old_inv != es_l:
                                replaced = False
                                for diph, base in (('ie','e'), ('ue','o')):
                                    if diph in es_l:
                                        # 'k' (not 'idx') so the patch index of
                                        # the enclosing loop is never clobbered.
                                        for k in range(len(es_l) - 1):
                                            if es_l[k:k+2] == diph:
                                                cand = es_l[:k] + base + es_l[k+2:]
                                                if cand == old_inv:
                                                    NI2ES[ni_l] = (es, pid)
                                                    replaced = True
                                                    break
                                    if replaced: break
                        ops["add"] += 1
                        LEX_PATCH_LOG.append((patch_name, op, es, ni, "ok",
                            f"lectura verbal alternativa (principal {old_pos} preservado)"))
                        continue
                    ops["skipped"] += 1
                    LEX_PATCH_LOG.append((patch_name, op, es, ni, "skip", "ES ya existe"))
                    continue
                # New ES: never steal an NI that already has a reverse owner.
                ni_already = ni_l in NI2ES
                ES2NI[es_l] = (ni, pid)
                if not ni_already:
                    NI2ES[ni_l] = (es, pid)
                if pos: ES2NI_POS[es_l] = pos
                if morph: ES2NI_MORPH[es_l] = morph
                if " " in es_l and es_l not in ESPHRASE2NI:
                    ESPHRASE2NI[es_l] = (ni, pid)
                if " " in ni_l and ni_l not in NIPHRASE2ES and not ni_already:
                    NIPHRASE2ES[ni_l] = (es, pid)
                ops["add"] += 1
                note = "ok (sinónimo de NI existente)" if ni_already else "ok"
                LEX_PATCH_LOG.append((patch_name, op, es, ni, "ok", note))

            elif op == "override":
                prev = ES2NI.get(es_l)
                ES2NI[es_l] = (ni, pid)
                NI2ES[ni_l] = (es, pid)
                if pos: ES2NI_POS[es_l] = pos
                if morph: ES2NI_MORPH[es_l] = morph
                if " " in es_l:
                    ESPHRASE2NI[es_l] = (ni, pid)
                if " " in ni_l:
                    NIPHRASE2ES[ni_l] = (es, pid)
                ops["override"] += 1
                prev_str = f"era {prev[0]}" if prev else "no existía"
                LEX_PATCH_LOG.append((patch_name, op, es, ni, "ok", prev_str))

            elif op == "alias":
                # One-way entry: ES -> NI only, the reverse map is untouched.
                if es_l in ES2NI:
                    ops["skipped"] += 1
                    LEX_PATCH_LOG.append((patch_name, op, es, ni, "skip", "ES ya existe"))
                    continue
                ES2NI[es_l] = (ni, pid)
                if pos: ES2NI_POS[es_l] = pos
                if morph: ES2NI_MORPH[es_l] = morph
                if " " in es_l and es_l not in ESPHRASE2NI:
                    ESPHRASE2NI[es_l] = (ni, pid)
                ops["alias"] += 1
                LEX_PATCH_LOG.append((patch_name, op, es, ni, "ok", "alias ortográfico"))

            elif op == "delete":
                if es_l not in ES2NI:
                    ops["skipped"] += 1
                    LEX_PATCH_LOG.append((patch_name, op, es, ni, "skip", "no existía"))
                    continue
                old_ni_surf, old_pid = ES2NI[es_l]
                old_pos = ES2NI_POS.get(es_l, "")
                old_morph = ES2NI_MORPH.get(es_l, "")
                _append_to_graveyard(patch_name, {
                    "source_es": es, "target_ni": old_ni_surf,
                    "pos_es": old_pos, "es_morph": old_morph,
                    "pair_id": old_pid,
                    "reason": reason or "delete sin reason (legacy)",
                })
                del ES2NI[es_l]
                ES2NI_POS.pop(es_l, None)
                ES2NI_MORPH.pop(es_l, None)
                # Drop the reverse entry only if it pointed at this ES.
                if old_ni_surf.lower() in NI2ES and \
                   NI2ES[old_ni_surf.lower()][0].lower() == es_l:
                    del NI2ES[old_ni_surf.lower()]
                _cleanup_ambig_after_remove(es_l, old_ni_surf)
                ops["delete"] += 1
                LEX_PATCH_LOG.append((patch_name, op, es, ni, "ok", "movido a deprecated/"))

            elif op == "replace":
                if not (es and ni):
                    ops["skipped"] += 1
                    LEX_PATCH_LOG.append((patch_name, op, es, ni, "skip",
                                          "replace requiere ES y NI"))
                    continue
                if ni_l in NI2ES and NI2ES[ni_l][0].lower() != es_l:
                    ops["skipped"] += 1
                    LEX_PATCH_LOG.append((patch_name, op, es, ni, "skip",
                        f"NI nuevo ya pertenece a {NI2ES[ni_l][0]!r}"))
                    continue
                if es_l in ES2NI:
                    old_ni_surf, old_pid = ES2NI[es_l]
                    old_pos = ES2NI_POS.get(es_l, "")
                    old_morph = ES2NI_MORPH.get(es_l, "")
                    _append_to_graveyard(patch_name, {
                        "source_es": es, "target_ni": old_ni_surf,
                        "pos_es": old_pos, "es_morph": old_morph,
                        "pair_id": old_pid,
                        "reason": reason or f"replaced by {ni}",
                    })
                    if old_ni_surf.lower() in NI2ES and \
                       NI2ES[old_ni_surf.lower()][0].lower() == es_l:
                        del NI2ES[old_ni_surf.lower()]
                ES2NI[es_l] = (ni, pid)
                NI2ES[ni_l] = (es, pid)
                if pos: ES2NI_POS[es_l] = pos
                if morph: ES2NI_MORPH[es_l] = morph
                ops["replace"] += 1
                LEX_PATCH_LOG.append((patch_name, op, es, ni, "ok", reason or ""))

            elif op == "retire":
                if not reason:
                    ops["skipped"] += 1
                    LEX_PATCH_LOG.append((patch_name, op, es, ni, "skip",
                                          "retire requiere reason"))
                    continue
                if es_l not in ES2NI:
                    ops["skipped"] += 1
                    LEX_PATCH_LOG.append((patch_name, op, es, ni, "skip",
                                          "no existía"))
                    continue
                old_ni_surf, old_pid = ES2NI[es_l]
                _append_to_graveyard(patch_name, {
                    "source_es": es, "target_ni": old_ni_surf,
                    "pos_es": ES2NI_POS.get(es_l,""),
                    "es_morph": ES2NI_MORPH.get(es_l,""),
                    "pair_id": old_pid, "reason": reason,
                })
                del ES2NI[es_l]
                ES2NI_POS.pop(es_l, None)
                ES2NI_MORPH.pop(es_l, None)
                if old_ni_surf.lower() in NI2ES and \
                   NI2ES[old_ni_surf.lower()][0].lower() == es_l:
                    del NI2ES[old_ni_surf.lower()]
                _cleanup_ambig_after_remove(es_l, old_ni_surf)
                ops["retire"] += 1
                LEX_PATCH_LOG.append((patch_name, op, es, ni, "ok", reason))

            else:
                ops["skipped"] += 1
                LEX_PATCH_LOG.append((patch_name, "?", es, ni, "skip",
                                      f"op desconocida: {op!r}"))

        summary = ", ".join(f"{k}={v}" for k,v in ops.items() if v)
        print(f"[PATCH] {patch_name}: {summary or 'sin cambios'}")
        for k in totals:
            totals[k] += ops[k]

    print("[PATCH] Total: " + ", ".join(f"{k}={v}" for k,v in totals.items() if v))

    # Every op that adds or removes a key invalidates the folded maps; alias
    # adds ES keys too, so it must trigger the rebuild as well.
    if any(totals[k] for k in ("add","override","alias","replace","delete","retire")):
        ES_FOLD.clear(); NI_FOLD.clear()
        for es_key in ES2NI:
            fk = fold(es_key)
            if fk != es_key and fk not in ES_FOLD:
                ES_FOLD[fk] = es_key
        for ni_key in NI2ES:
            fk = fold(ni_key)
            if fk != ni_key and fk not in NI_FOLD:
                NI_FOLD[fk] = ni_key
|
|
def _restore_orphan_ni_after_patches():
    """v112: after all patches are applied, restore orphaned NI2ES entries.

    If an ES→NI entry exists but its NI is neither in NI2ES nor in AMBIG_NI
    (a fully orphaned NI with no collision), register it in NI2ES again.
    Typical case: the DELETE of a spurious feminine form (invierna) empties
    NI2ES[anśutdiś] although invierno→anśutdiś is still valid.
    Nothing is done when the NI is in AMBIG_NI (there is no universal
    criterion to pick a winner).
    """
    restored = 0
    for es_key, (ni_surface, pair_id) in ES2NI.items():
        ni_key = ni_surface.lower()
        if ni_key in NI2ES or ni_key in AMBIG_NI:
            continue
        NI2ES[ni_key] = (es_key, pair_id)
        restored += 1
    if restored:
        debug_print(f"[PATCH] v112: restauradas {restored} entradas NI huérfanas tras deletes")


# Patch the freshly loaded lexicon, then repair reverse entries lost to deletes.
apply_lex_patches()
_restore_orphan_ni_after_patches()
|
|
def _recompute_ambig_ni_after_patches():
    """v133: after all patches, sync AMBIG_NI and NI2ES with the real ES2NI state.

    Two complementary corrections:

    1) Ghost AMBIG entries (v132): NIs flagged as ambiguous during the initial
       load whose contributors were later moved to another NI by an override
       without clearing the flag. E.g. śunleŕśet-ir was AMBIG{abierto,
       abrutado}; patch 127 moved 'abrutado' to sirudata-ir, so only 'abierto'
       still points to śunleŕśet-ir, yet AMBIG kept blocking the reverse lookup.

    2) Stale NI2ES (v133): NI2ES entries pointing to an ES whose current ES2NI
       no longer points back to that NI. This happens because `override`
       writes NI2ES[ni_l] = (es, pid) without checking whether the NI was in
       AMBIG or whether the previous ES is still consistent. Real example:
       NI2ES['bemuŕ-k']='gruesas' after patch 103, but patch 128 moved
       gruesa→ti-bemuŕ-k; the only ES now pointing to bemuŕ-k is 'gruesos',
       so NI2ES['bemuŕ-k'] should be 'gruesos', not 'gruesas'.

    Chosen fix (the conservative one): only correct when there is EXACTLY one
    surviving candidate. Orphans (NI2ES pointing at something with no current
    candidate) are NOT removed: they may come from old NI versions that still
    appear in saved texts. Multi-candidate cases are NOT touched (they would
    go to AMBIG; they are left as they are to avoid unexpected new AMB marks).

    Cost: O(|ES2NI|) + O(|NI2ES|). About 6 s at start-up (measured on 1.5M
    ES2NI + 2M NI2ES after all patches). A single pass at the end; the inner
    override loop is not touched (an earlier attempt to clean up on every
    override pushed start-up time above 30 minutes).
    """
    # NI (lower-cased) -> every (es_key, pair_id) currently pointing at it.
    holders = {}
    for es_key, entry in ES2NI.items():
        holders.setdefault(entry[0].lower(), []).append((es_key, entry[1]))

    # 1) Drop ambiguity marks that no longer have two or more contenders.
    cleaned_amb = 0
    restored_amb = 0
    for ni_key in list(AMBIG_NI):
        current = holders.get(ni_key, [])
        if len(current) > 1:
            continue
        AMBIG_NI.pop(ni_key, None)
        cleaned_amb += 1
        if current and ni_key not in NI2ES:
            NI2ES[ni_key] = current[0]
            restored_amb += 1

    # 2) Re-point reverse entries whose ES no longer maps back to the NI.
    #    Only values of existing keys change, so iterating NI2ES is safe.
    fixed_obsolete = 0
    for ni_key in NI2ES:
        mapped_es = NI2ES[ni_key][0].lower()
        still_valid = mapped_es in ES2NI and ES2NI[mapped_es][0].lower() == ni_key
        if still_valid:
            continue
        current = holders.get(ni_key, [])
        if len(current) == 1:
            NI2ES[ni_key] = current[0]
            fixed_obsolete += 1

    # 3) Register NIs that have exactly one ES but no reverse entry at all.
    restored_orphan = 0
    for ni_key, current in holders.items():
        if len(current) != 1 or ni_key in NI2ES or ni_key in AMBIG_NI:
            continue
        NI2ES[ni_key] = current[0]
        restored_orphan += 1

    if cleaned_amb or fixed_obsolete or restored_orphan:
        debug_print(f"[PATCH] v140: AMBIG limpiados={cleaned_amb} (restaurados={restored_amb}), NI2ES obsoletos corregidos={fixed_obsolete}, huérfanos restaurados={restored_orphan}")


_recompute_ambig_ni_after_patches()
|
|
def _register_ipfv_3s_reverse():
    """Add reverse-lookup entries for forms that drop a final ``-n``.

    For every NI2ES key ending in one of the listed suffixes, the key without
    its last two characters is registered with the same value, unless that
    shorter key is already present in NI2ES. Counts are sent to
    ``debug_print``.
    """
    endings = ('-ska-n', '-tei-n', '-na-n', '-nabo-n')
    pending = []
    skipped = 0
    # Snapshot first: new keys must not take part in this scan.
    for ni_key, val in list(NI2ES.items()):
        if not ni_key.endswith(endings):
            continue
        shortened = ni_key[:-2]
        if shortened in NI2ES:
            skipped += 1
        else:
            pending.append((shortened, val))

    added = 0
    for shortened, val in pending:
        NI2ES[shortened] = val
        added += 1
    debug_print(f"3S reverse map (IPFV/COND/SBJ/SBJ_IPFV): {added} formas registradas, {skipped} ya existían")


_register_ipfv_3s_reverse()
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# Identifier of the engine build, printed once at import time together with
# the lexicon file that was selected.
VERSION_MARKER = "v142_2026_05_19_des_diptongacion_enclitica"
try:
    print(f"[Neoíbero translator] versión cargada: {VERSION_MARKER}", flush=True)
    print(f"[Neoíbero translator] léxico activo: {CSV_BI}", flush=True)
except Exception:
    # Best effort: a failure to print the banner must not abort the import.
    pass
|
|
| |
def _longest_match(tokens, i, phrase_map):
    """Find the longest phrase in *phrase_map* starting at ``tokens[i]``.

    Spans of 1 to ``MAX_NGRAM`` tokens are tried; each is joined with single
    spaces after lower-casing, and, when that key is missing, again after
    accent folding.

    Returns:
        ``(span, surface)`` for the longest hit, where ``surface`` is the
        first element of the mapped value, or ``(0, None)`` when nothing
        matches or the map is empty.
    """
    if not phrase_map:
        return (0, None)

    best = (0, None)
    longest_possible = min(MAX_NGRAM, len(tokens) - i)
    for span in range(1, longest_possible + 1):
        window = [lower(t) for t in tokens[i:i+span]]
        plain = " ".join(window)
        key = plain
        if key not in phrase_map:
            key = " ".join(fold(word) for word in window)
            if key == plain or key not in phrase_map:
                continue
        best = (span, phrase_map[key][0])
    return best
|
|
| |
def sentence_case_spanish(s: str) -> str:
    """Upper-case the first letter of every sentence in *s*.

    A sentence starts at the beginning of the text, after ``. ? ! …``, after
    an opening ``¿`` / ``¡``, and on a line break that follows a colon.
    Whitespace, opening wrappers and closing quotes between the sentence
    boundary and the first letter are skipped. Text inside ``[...]`` is
    copied verbatim and never changes the state.

    A run of terminal marks ("?!", "!!", "..") keeps the sentence-start state,
    so the word after such a run is capitalised regardless of the run length.

    Args:
        s: Text to recase.

    Returns:
        The recased text, same length as the input.
    """
    out = []
    start = True          # waiting for the first letter of a sentence
    in_br = False         # inside a [...] span
    WRAPS = "¿¡\"'«(“‘[—–"
    CLOSERS = '»"”\''
    TERMINALS = ".?!…"
    last_real = None      # last non-whitespace, non-newline-token character

    for ch in s:
        if ch == '[':
            in_br = True

        # A line break right after a colon opens a new sentence.
        if not in_br and last_real == ':' and (ch == '\n' or ch == NEWLINE_TOK):
            start = True

        if not in_br and start:
            if ch.isspace() or ch == NEWLINE_TOK or ch in WRAPS or ch in CLOSERS:
                # Transparent characters: keep waiting for the first letter.
                out.append(ch)
            elif ch.isalpha():
                out.append(ch.upper()); start = False
            else:
                out.append(ch)
                # Another terminal mark keeps the boundary open; anything
                # else (a digit, a symbol) means the sentence has begun.
                start = ch in TERMINALS
        else:
            out.append(ch)
            if not in_br and ch in TERMINALS:
                start = True
            elif not in_br and ch in "¿¡":
                start = True

        if ch == ']':
            in_br = False

        if not ch.isspace() and ch != NEWLINE_TOK:
            last_real = ch

    return "".join(out)
|
|
def postprocess_spanish(s: str) -> str:
    """Normalise spacing around punctuation, then apply sentence casing.

    Steps, in order: tighten ``digit : digit`` and ``digit [.,] digit``;
    remove whitespace before ``, . ; : ! ?``; insert one space after
    ``? . ! ;`` when text follows directly; remove whitespace after ``¿`` /
    ``¡``; collapse runs of whitespace; call ``sentence_case_spanish``.

    The space-insertion step leaves a period between two digits alone, so
    numbers joined by the earlier step ("3.5", "10.000") are not split again.
    """
    s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)
    s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    # No space is inserted before whitespace, end of text, more punctuation or
    # the newline token, nor inside a number (digit '.' digit).
    s = re.sub(
        rf"([?.!;])(?!\s|$|[.,;:!?]|{re.escape(NEWLINE_TOK)})(?!(?<=\d\.)\d)([^\s])",
        r"\1 \2",
        s,
    )
    s = re.sub(r"([¿¡])\s+", r"\1", s)
    s = re.sub(r"\s{2,}", " ", s).strip()
    return sentence_case_spanish(s)
|
|
| |
| def translate_es_to_ni_bi(text:str): |
| |
| |
| |
| |
| text = (text or "").replace("\r\n", "\n").replace("\r", "\n") |
| text = text.replace("\n", f" {NEWLINE_TOK} ") |
|
|
| toks = simple_tokenize(text) |
| toks = expand_enclitics(toks) |
|
|
| _NOUN_CTX = frozenset({'el','la','los','las','un','una','unos','unas','al','del', |
| 'de','en','con','por','para','a','sin','sobre','entre', |
| 'hacia','hasta','desde','contra','según','ante','bajo','tras', |
| 'mi','tu','su','mis','tus','sus','nuestro','nuestra', |
| 'nuestros','nuestras','vuestro','vuestra','vuestros','vuestras', |
| 'este','esta','estos','estas','ese','esa','esos','esas', |
| 'aquel','aquella','aquellos','aquellas','cada','otro','otra', |
| 'mucho','mucha','muchos','muchas','poco','poca','pocos','pocas', |
| 'todo','toda','todos','todas','algún','alguna','ningún','ninguna', |
| 'buen','mal','gran','primer','tercer','qué','cuánto','cuánta'}) |
| _VERB_CTX = frozenset({'yo','tú','él','ella','nosotros','nosotras','vosotros','vosotras', |
| 'ellos','ellas','usted','ustedes', |
| 'se','me','te','nos','os','le','les','lo', |
| 'no','ya','también','tampoco','nunca','siempre','aún','todavía', |
| 'que','quien','quienes','donde','cuando','como','si'}) |
| _INFINITIVE_ENDINGS = ('ar','er','ir') |
| _INFINITIVE_CTX = frozenset({'de','sin','para','por','al','antes','tras','hasta'}) |
| _VERB_ALWAYS = frozenset({'son','es','ha','he','era','fue','fui','van', |
| 'dan','das','den','des','hay','doy','soy','voy', |
| 'iba','di','haya'}) |
|
|
| _TIME_WORDS = frozenset({'año','años','día','días','mes','meses', |
| 'semana','semanas','hora','horas', |
| 'minuto','minutos','segundo','segundos', |
| 'tiempo','rato','siglo','siglos', |
| 'década','décadas','momento','momentos', |
| 'instante','instantes','jornada','jornadas', |
| 'noche','noches','tarde','tardes','mañana','mañanas', |
| 'milenio','milenios'}) |
| _TIME_QUANTIFIERS = frozenset({'mucho','muchos','mucha','muchas', |
| 'poco','pocos','poca','pocas', |
| 'tanto','tantos','tanta','tantas', |
| 'algunos','algunas','varios','varias', |
| 'demasiado','demasiados','demasiada','demasiadas', |
| 'un','una','unos','unas', |
| 'dos','tres','cuatro','cinco','seis','siete', |
| 'ocho','nueve','diez','once','doce','trece', |
| 'catorce','quince','veinte','treinta','cuarenta', |
| 'cincuenta','cien','mil','medio','media'}) |
|
|
| _FIRST_PERSON_SUBJECTS = frozenset({'yo'}) |
| _THIRD_SG_PRON_SUBJECTS = frozenset({'él','ella','ello','usted','esto','eso','aquello'}) |
| _SINGULAR_NP_DETS = frozenset({'el','la','un','una','este','esta','ese','esa','aquel','aquella', |
| 'mi','tu','su','nuestro','nuestra','vuestro','vuestra'}) |
|
|
| _PREP_CTX = frozenset({'a','ante','con','contra','de','desde','en','entre', |
| 'hacia','hasta','para','por','según','sin','sobre','tras'}) |
| _TILDE_MAP = {'mi': 'mí', 'el': 'él', 'si': 'sí', 'tu': 'tú'} |
| _PHRASE_BREAK = frozenset({'y','o','e','u','ni','que','pero','sino','como', |
| 'porque','cuando','donde','aunque','pues'}) |
|
|
| def _accented_lookup(key): |
| if key in _TILDE_MAP and _TILDE_MAP[key] in ES2NI: |
| return ES2NI[_TILDE_MAP[key]][0] |
| return None |
|
|
| def _resolve_forms(raw_key:str): |
| key = lower(raw_key) |
| if key in ES2NI: |
| return key, ES2NI[key][0], ES2NI_VERB.get(key, (None, None))[0] |
| fkey = fold(key) |
| if fkey in ES_FOLD: |
| actual = ES_FOLD[fkey] |
| return actual, ES2NI[actual][0], ES2NI_VERB.get(actual, (None, None))[0] |
| return key, None, None |
|
|
def _choose_es_to_ni(current_tok:str, prev_key:str, next_key:str, next_next_key:str, sent_start:bool, prev_prev_key:str=""):
    """Choose the Neo-Iberian surface for one Spanish token from its context.

    The token is resolved with ``_resolve_forms`` into a nominal and a verbal
    surface. The rules below are tried in order and the first one that fires
    decides the result; the final fallback prefers the nominal surface.

    Args:
        current_tok: Spanish token being translated.
        prev_key: Lower-cased previous token, "" when none is available.
        next_key: Lower-cased next token, "" when none is available.
        next_next_key: Lower-cased token following ``next_key``.
        sent_start: True when the token opens a sentence.
        prev_prev_key: Lower-cased token two positions back.

    Returns:
        The selected surface, or None when the token resolves to neither a
        nominal nor a verbal form.
    """
    # NOTE(review): block nesting was reconstructed from a flattened listing;
    # confirm the extent of the accented-pronoun and infinitive-context rules.

    def _lexicon_surface(word):
        # (found, surface): exact ES2NI entry first, then the folded entry.
        if word in ES2NI:
            return True, ES2NI[word][0]
        folded = fold(word)
        if folded in ES_FOLD:
            return True, ES2NI[ES_FOLD[folded]][0]
        return False, None

    resolved_key, ni_nom, ni_verb = _resolve_forms(current_tok)
    if ni_nom is None and ni_verb is None:
        return None

    key = lower(resolved_key)

    # "hace" directly before a time word, or before a quantifier/number that
    # is itself followed by a time word, is rendered with the "atrás" entry.
    if key == 'hace':
        followed_by_time = next_key in _TIME_WORDS or (
            (next_key in _TIME_QUANTIFIERS or is_number(next_key))
            and next_next_key in _TIME_WORDS
        )
        if followed_by_time and 'atrás' in ES2NI:
            return ES2NI['atrás'][0]

    # Infinitive-shaped keys take their own lexicon entry.
    if key.endswith(_INFINITIVE_ENDINGS):
        found, surface = _lexicon_surface(key)
        if found:
            return surface

    # Keys that are always read as verbs.
    if key in _VERB_ALWAYS and ni_verb:
        return ni_verb

    # Unaccented spellings that have an accented twin (mi/mí, el/él, ...).
    if key in _TILDE_MAP:
        accented_ni = _accented_lookup(key)
        if accented_ni:
            # After a preposition and at the end of the phrase.
            phrase_ends = (
                not next_key
                or next_key in VISIBLE_PUNCT
                or next_key in _PHRASE_BREAK
            )
            if prev_key in _PREP_CTX and phrase_ends:
                return accented_ni
            # 'tu' / 'el' before a word tagged 'V' whose pluralised form is
            # not tagged 'N'.
            if key in ('tu', 'el') and next_key and ES2NI_POS.get(next_key, '') == 'V':
                pluralised = _pluralize_es_form(next_key)
                if ES2NI_POS.get(pluralised, '') != 'N':
                    return accented_ni

    # The previous word selects a nominal or a verbal reading.
    if prev_key in _NOUN_CTX and ni_nom:
        return ni_nom
    if prev_key in _VERB_CTX and ni_verb:
        return ni_verb

    # <noun-context word> <noun> <ambiguous token>: take the verbal reading.
    if ni_verb and ni_nom and prev_prev_key in _NOUN_CTX and ES2NI_POS.get(prev_key, "") == "N":
        return ni_verb

    # After a word that calls for an infinitive.
    if prev_key in _INFINITIVE_CTX:
        inf_key = lower(current_tok)
        if inf_key.endswith(_INFINITIVE_ENDINGS):
            found, surface = _lexicon_surface(inf_key)
            if found:
                return surface
        if ni_verb:
            return ni_verb

    # Sentence-initial position.
    if sent_start:
        if key.endswith(_INFINITIVE_ENDINGS):
            found, surface = _lexicon_surface(key)
            if found:
                return surface
        if ni_verb and not ni_nom:
            return ni_verb

    # A following noun-context word favours the verb unless the key is
    # tagged as an adjective.
    if ni_verb and next_key in _NOUN_CTX and ES2NI_POS.get(key, "") != "ADJ":
        return ni_verb

    # Fallback: nominal first, then verbal.
    if ni_nom is not None:
        return ni_nom
    if ni_verb is not None:
        return ni_verb
    return None
|
|
def _has_explicit_3s_subject(left_context):
    """Decide from the left context whether a third-person-singular reading applies.

    Only the last five non-empty tokens of *left_context* are inspected:

    * any token in ``_FIRST_PERSON_SUBJECTS`` -> False;
    * last token in ``_THIRD_SG_PRON_SUBJECTS`` -> True;
    * otherwise the nearest token in ``_SINGULAR_NP_DETS`` (the last token
      itself is not considered) decides: False when it is preceded by a
      token in ``_PREP_CTX``, True otherwise.

    When none of these applies the answer defaults to True.
    """
    window = [lower(item) for item in (left_context or []) if item][-5:]

    # A first-person subject anywhere in the window wins.
    for word in window:
        if word in _FIRST_PERSON_SUBJECTS:
            return False

    # A third-person pronoun right before the verb.
    if window and window[-1] in _THIRD_SG_PRON_SUBJECTS:
        return True

    # Walk backwards from the second-to-last token looking for a determiner.
    for pos in reversed(range(len(window) - 1)):
        if window[pos] not in _SINGULAR_NP_DETS:
            continue
        if pos > 0 and window[pos - 1] in _PREP_CTX:
            return False
        return True

    return True
|
|
def _adjust_ipfv_ambiguous_person(ni, left_context):
    """Drop the final "-n" of an ambiguous suffix when the context allows it.

    *ni* is returned untouched unless it is a non-empty string ending in one
    of "-ska-n", "-tei-n", "-na-n" or "-nabo-n" and
    ``_has_explicit_3s_subject(left_context)`` is true; in that case its
    last two characters are removed.
    """
    if not ni or not isinstance(ni, str):
        return ni
    if ni.endswith(("-ska-n", "-tei-n", "-na-n", "-nabo-n")) and _has_explicit_3s_subject(left_context):
        return ni[:-2]
    return ni
|
|
| out=[]; ib_toks=[] |
| i=0; prev_key="" |
| sent_start = True |
| left_context=[] |
| while i < len(toks): |
| t = toks[i] |
| if t in VISIBLE_PUNCT: |
| out.append(t); ib_toks.append(t); prev_key=""; i+=1 |
| if t in SENT_END: |
| sent_start = True |
| left_context=[] |
| elif t == NEWLINE_TOK: |
| |
| |
| sent_start = True |
| left_context=[] |
| elif t in CLAUSE_BREAKS: |
| left_context=[] |
| continue |
| if is_placeholder(t): |
| out.append(t); ib_toks.append(t); prev_key=""; i+=1 |
| sent_start = False |
| left_context.append(t) |
| continue |
|
|
| |
|
|
| span, ni_surface = _longest_match(toks, i, ESPHRASE2NI) |
| if span > 1: |
| ni_surface = _adjust_ipfv_ambiguous_person(ni_surface, left_context) |
| out.append(ni_surface) |
| ib_toks.append(georgeos_keys(tokens_from_latin(ni_surface), ni_surface)) |
| prev_key = lower(toks[i+span-1]) if i+span-1 < len(toks) else "" |
| for k_idx in range(i, i+span): |
| left_context.append(toks[k_idx]) |
| i += span |
| sent_start = False |
| continue |
|
|
| next_key = "" |
| next_next_key = "" |
| j = i + 1 |
| while j < len(toks): |
| if toks[j] in VISIBLE_PUNCT: |
| if toks[j] in SENT_END: |
| break |
| j += 1 |
| continue |
| next_key = lower(toks[j]) |
| break |
| if next_key: |
| k = j + 1 |
| while k < len(toks): |
| if toks[k] in VISIBLE_PUNCT: |
| if toks[k] in SENT_END: |
| break |
| k += 1 |
| continue |
| next_next_key = lower(toks[k]) |
| break |
|
|
| key = lower(t) |
| |
| |
| |
| |
| |
| key_in_lex = key in ES2NI or fold(key) in ES_FOLD |
| is_proper_noun = ( |
| len(t) >= 2 |
| and t[0].isupper() |
| and not t.isupper() |
| and t.isalpha() |
| and not key_in_lex |
| ) |
| if is_proper_noun: |
| out.append(t); ib_toks.append(t) |
| prev_key = key |
| left_context.append(t) |
| i += 1 |
| sent_start = False |
| continue |
|
|
| prev_prev_key = lower(left_context[-2]) if len(left_context) >= 2 else "" |
| ni = _choose_es_to_ni(t, prev_key, next_key, next_next_key, sent_start, prev_prev_key) |
| ni = _adjust_ipfv_ambiguous_person(ni, left_context) |
|
|
| if ni is not None: |
| out.append(ni) |
| ib_toks.append(georgeos_keys(tokens_from_latin(ni), ni)) |
| elif is_number(key): |
| |
| |
| |
| is_hour_context = False |
| if i+2 < len(toks) and toks[i+1] == ':' and is_number(toks[i+2]): |
| is_hour_context = True |
| elif i >= 2 and toks[i-1] == ':' and is_number(toks[i-2]): |
| is_hour_context = True |
| if is_hour_context: |
| out.append(key); ib_toks.append(key) |
| else: |
| ni_num = digit_to_ni(key) |
| out.append(ni_num); ib_toks.append(georgeos_keys(tokens_from_latin(ni_num), ni_num)) |
| else: |
| ph = f"[SIN-LEX:{t}]" |
| out.append(ph); ib_toks.append(ph) |
|
|
| prev_key = key |
| left_context.append(t) |
| i += 1 |
| sent_start = False |
|
|
| if MODAL_SUFFIX_ENABLE: |
| out = add_modal_suffixes_es2ni(out) |
| ib_toks = [] |
| for tt in out: |
| if tt in VISIBLE_PUNCT or tt.startswith("["): |
| ib_toks.append(tt) |
| else: |
| ib_toks.append(georgeos_keys(tokens_from_latin(tt), tt)) |
|
|
| ni_text = detokenize(out) |
| |
| ni_text = re.sub(rf"\s*{re.escape(NEWLINE_TOK)}\s*", "\n", ni_text) |
| ib_html = "<div class='ib-line'>" + escape(render_ib_with_tridots(ib_toks)) + "</div>" |
| ib_html = ib_html.replace(NEWLINE_TOK, "\n") |
| return ni_text, ib_html |
|
|
|
|
| |
# Apocope rules as (full form, shortened form, gender) triples. Each base rule
# is listed in lower case and immediately followed by its capitalised variant.
# Gender 'M' and gender 'X' select different next-word checks in
# apply_apocope_es.
_APOCOPE_RULES = [
    (variant_full, variant_short, gender)
    for full, short, gender in (
        ('ninguno', 'ningún', 'M'),
        ('alguno', 'algún', 'M'),
        ('bueno', 'buen', 'M'),
        ('malo', 'mal', 'M'),
        ('primero', 'primer', 'M'),
        ('tercero', 'tercer', 'M'),
        ('grande', 'gran', 'X'),
    )
    for variant_full, variant_short in (
        (full, short),
        (full.capitalize(), short.capitalize()),
    )
]
|
|
| |
| |
| |
# Words that are never accepted as the noun following an apocope candidate
# (checked by the *_noun_candidate helpers).
_APOCOPE_BLACKLIST = {
    # conjunctions and connectors
    "porque", "que", "aunque", "sino", "si", "como", "cuando", "donde",
    "mientras", "pero", "y", "o", "u", "ni",
    # adverbs and quantifiers
    "ya", "no", "sí", "muy", "más", "mas", "menos", "tan", "tanto", "tanta",
    "todo", "toda", "todos", "todas", "nada", "algo",
    "alguno", "alguna", "algunos", "algunas",
    # demonstratives
    "este", "esta", "estos", "estas", "ese", "esa", "esos", "esas",
    "aquel", "aquella",
    # possessives
    "mi", "tu", "su", "mis", "tus", "sus",
    "nuestro", "nuestra", "nuestros", "nuestras",
    "vuestro", "vuestra", "vuestros", "vuestras",
    # prepositions and contractions
    "de", "del", "en", "a", "al", "por", "para", "con", "sin", "sobre",
    "bajo", "tras", "entre", "hacia", "hasta", "desde", "durante",
    "según", "contra", "mediante",
    # object pronouns and clitics
    "lo", "la", "le", "los", "las", "les", "me", "te", "se", "nos", "os",
    # forms of "ser" and "haber"
    "fue", "es", "son", "era", "eran", "fueron", "será", "serán",
    "ha", "han", "había", "habían", "habrá", "habrán",
}
|
|
# Common masculine nouns that end in "-a". Without this list the "-a" filter
# below rejects them and phrases such as "primero día" or "alguno problema"
# are never shortened to "primer día" / "algún problema".
_MASC_NOUNS_ENDING_IN_A = frozenset({
    "día", "mediodía", "mapa", "planeta", "problema", "tema", "sistema",
    "idioma", "clima", "programa", "poema", "drama", "dilema", "esquema",
    "fantasma", "teorema", "lema", "enigma", "aroma", "diploma", "panorama",
    "telegrama", "diagrama", "síntoma", "trauma", "dogma", "axioma",
    "emblema", "tranvía",
})


def _is_masc_sg_noun_candidate(word):
    """Heuristically decide whether *word* can be a masculine singular noun.

    Used to gate the gender 'M' apocope rules. A word is rejected when it:

    * is empty or does not start with a lower-case letter;
    * is listed in ``_APOCOPE_BLACKLIST``;
    * is longer than three characters and ends in "s" (treated as plural);
    * ends in "a", unless it is a known masculine noun in "-a";
    * ends in one of the suffixes "-dad", "-tad", "-ción", "-sión", "-tud",
      "-umbre", "-eza";
    * has an ``ES2NI_POS`` tag other than "N" or "ADJ".

    Words with no POS tag are accepted.
    """
    if not word or not word[0].isalpha():
        return False
    if not word[0].islower():
        return False
    wl = word.lower()
    if wl in _APOCOPE_BLACKLIST:
        return False
    if len(wl) > 3 and wl.endswith('s'):
        return False
    # "-a" is rejected, except for the masculine nouns listed above.
    if wl.endswith('a') and wl not in _MASC_NOUNS_ENDING_IN_A:
        return False
    if wl.endswith(('dad', 'tad', 'ción', 'sión', 'tud', 'umbre', 'eza')):
        return False
    pos = ES2NI_POS.get(wl, "")
    if pos and pos not in ("N", "ADJ"):
        return False
    return True
|
|
def _is_singular_noun_candidate(word):
    """Heuristically decide whether *word* can be a singular noun of any gender.

    Used to gate the gender 'X' apocope rules. The word must start with a
    lower-case letter, must not be in ``_APOCOPE_BLACKLIST``, must not look
    plural (longer than three characters and ending in "s"), and its
    ``ES2NI_POS`` tag, when present, must be "N" or "ADJ".
    """
    if not word:
        return False
    first = word[0]
    if not (first.isalpha() and first.islower()):
        return False
    lowered = word.lower()
    if lowered in _APOCOPE_BLACKLIST:
        return False
    if lowered.endswith('s') and len(lowered) > 3:
        return False
    tag = ES2NI_POS.get(lowered, "")
    return not tag or tag in ("N", "ADJ")
|
|
def apply_apocope_es(text):
    """Shorten Spanish adjectives/determiners that apocopate before a noun.

    The text is split into alternating runs of non-whitespace and whitespace.
    A run is replaced by its shortened form when all of the following hold:

    * it equals the full form of an ``_APOCOPE_RULES`` entry exactly (a run
      with trailing punctuation never matches);
    * the previous word, if any, is not one of más/mas/menos/tan/muy;
    * the next non-blank run starts with a letter and, stripped of trailing
      punctuation, passes the noun check for the rule's gender
      ('M' -> ``_is_masc_sg_noun_candidate``,
      'X' -> ``_is_singular_noun_candidate``).

    Whitespace is preserved. Falsy input is returned as is.
    """
    if not text:
        return text
    pieces = re.findall(r"\S+|\s+", text)
    if len(pieces) < 3:
        # A candidate needs at least "<word><space><word>".
        return ''.join(pieces)

    trailing_punct = '.,;:!?"\''
    degree_words = ('más', 'mas', 'menos', 'tan', 'muy')

    # First rule wins if a full form were ever listed twice.
    rule_for = {}
    for full, short, gender in _APOCOPE_RULES:
        rule_for.setdefault(full, (short, gender))

    def _adjacent_word(start, step):
        # Index of the nearest non-blank run in direction `step`, provided it
        # starts with a letter; None otherwise.
        pos = start + step
        while 0 <= pos < len(pieces):
            piece = pieces[pos]
            if piece.strip() != '':
                return pos if piece[0].isalpha() else None
            pos += step
        return None

    for idx in range(len(pieces) - 2):
        current = pieces[idx]
        if not current or not current[0].isalpha():
            continue

        before = _adjacent_word(idx, -1)
        if before is not None and pieces[before].lower().rstrip(trailing_punct) in degree_words:
            continue

        after = _adjacent_word(idx, 1)
        if after is None:
            continue
        following = pieces[after].rstrip(trailing_punct)
        if not following:
            continue

        rule = rule_for.get(current)
        if rule is None:
            continue
        short, gender = rule
        if gender == 'M':
            accepted = _is_masc_sg_noun_candidate(following)
        elif gender == 'X':
            accepted = _is_singular_noun_candidate(following)
        else:
            accepted = False
        if accepted:
            pieces[idx] = short

    return ''.join(pieces)
|
|
|
|
# Written accent that a gerund ending takes once an enclitic is attached.
_GERUND_ACCENTS = {"ando": "ándo", "iendo": "iéndo", "yendo": "yéndo"}
_GERUND_ENCLITIC_RE = re.compile(
    r"\b([a-záéíóúñü]*?)(ando|iendo|yendo)(me|te|se|nos|os|le|les|lo|la|los|las)\b",
    flags=re.IGNORECASE,
)


def _accent_gerund_enclitics(es_text):
    """Add the written accent to gerund+enclitic forms (cantandole -> cantándole).

    The pattern is case-insensitive, so the matched ending may be capitalised
    ("Yendose") or upper case ("COMIENDOLO"). The accent table is keyed in
    lower case; the match is lower-cased for the lookup and the original
    casing is re-applied to the accented ending.
    """
    def _accent(match):
        stem, gerund, clitic = match.groups()
        accented = _GERUND_ACCENTS[gerund.lower()]
        if gerund.isupper():
            accented = accented.upper()
        elif gerund[0].isupper():
            accented = accented.capitalize()
        return stem + accented + clitic

    return _GERUND_ENCLITIC_RE.sub(_accent, es_text)


def translate_ni_to_es_bi(text:str):
    """Translate Neo-Iberian text to Spanish through the 1:1 lexicon.

    Steps:

    1. Normalise line endings and protect newlines with ``NEWLINE_TOK``.
    2. Tokenise; strip modal suffixes when ``MODAL_SUFFIX_ENABLE`` is set.
    3. Translate token by token: punctuation passes through; bracketed
       placeholders without a colon are unwrapped, the others are kept;
       multi-token phrases use ``NIPHRASE2ES``; 'doge' before a time word
       becomes "hace"; 'galbi-ke' before a token containing "-ir" becomes
       "ha"; everything else goes through ``NI2ES`` / ``NI_FOLD``. Ambiguous
       keys yield an ``[AMB-NI:…]`` marker in strict mode; unknown tokens
       are copied unchanged.
    4. Post-process the Spanish text (openers, apocope, enclitics, gerund
       accents, guillemet spacing, newline restoration).

    Args:
        text: Neo-Iberian input; None is treated as an empty string.

    Returns:
        The Spanish text.
    """
    # NOTE(review): block nesting was reconstructed from a flattened listing;
    # confirm that both opener/tilde steps sit under MODAL_SUFFIX_ENABLE.
    text = (text or "").replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\n", f" {NEWLINE_TOK} ")

    toks = simple_tokenize(text)

    if MODAL_SUFFIX_ENABLE:
        toks = strip_modal_suffixes_ni(toks)

    # Spanish glosses that turn a preceding 'doge' into "hace". Built once
    # per call instead of on every probe.
    es_time_words = frozenset({
        'año','años','día','días','mes','meses','semana','semanas',
        'hora','horas','minuto','minutos','segundo','segundos',
        'momento','momentos','instante','instantes',
        'rato','ratos','tiempo','siglo','siglos',
        'década','décadas','milenio','milenios',
        'jornada','jornadas','noche','noches',
        'tarde','tardes','mañana','mañanas',
    })

    def _spanish_gloss(ni_token):
        # Lower-cased Spanish surface of a NI token, "" when unknown/empty.
        return (NI2ES.get(lower(ni_token), (None,))[0] or "").lower()

    def _is_doge_hace_context(idx, tokens):
        # True when tokens[idx] is 'doge' and one of the next two tokens
        # glosses as a time word.
        if idx >= len(tokens) or lower(tokens[idx]) != 'doge':
            return False
        return any(_spanish_gloss(tok) in es_time_words for tok in tokens[idx+1:idx+3])

    def _is_at_sentence_start(idx, tokens):
        if idx == 0:
            return True
        prev = tokens[idx-1]
        return prev in SENT_END or prev in (".", "!", "?")

    out = []
    i = 0
    while i < len(toks):
        t = toks[i]
        if t in VISIBLE_PUNCT:
            out.append(t)
            i += 1
            continue
        if is_placeholder(t):
            # "[name]" is unwrapped; "[TAG:payload]" markers stay as they are.
            inner = t[1:-1]
            out.append(inner if ':' not in inner else t)
            i += 1
            continue
        span, es_surface = _longest_match(toks, i, NIPHRASE2ES)
        if span > 1:
            out.append(es_surface)
            i += span
            continue

        key = lower(t)
        fkey = fold(key)

        if key == 'doge' and _is_doge_hace_context(i, toks):
            out.append('Hace' if _is_at_sentence_start(i, toks) else 'hace')
            i += 1
            continue

        if key == 'galbi-ke' and i+1 < len(toks):
            nxt = lower(toks[i+1])
            if nxt.endswith('-ir') or '-ir-' in nxt:
                out.append('ha')
                i += 1
                continue

        if key in NI2ES:
            es = NI2ES[key][0] or ""
            out.append(es if es else t)
        elif fkey in NI_FOLD:
            es = NI2ES[NI_FOLD[fkey]][0] or ""
            out.append(es if es else t)
        elif (key in AMBIG_NI or fkey in AMBIG_NI) and STRICT_BI_ENFORCE:
            # Parenthesised on purpose: without the parentheses `and` binds
            # tighter than `or`, so the strict flag only gated the folded
            # match and exact matches were flagged even with it off.
            out.append(f"[AMB-NI:{t}]")
        else:
            # Numbers and unknown tokens are copied through unchanged.
            out.append(t)
        i += 1

    if MODAL_SUFFIX_ENABLE:
        out = add_inverted_openers(out)
        out = apply_interrogative_tildes(out)

    es_text = detokenize(out)
    es_text = postprocess_spanish(es_text)
    es_text = apply_apocope_es(es_text)
    es_text = fuse_enclitics_es(es_text)
    # Fused gerund+clitic forms need a written accent (case-safe helper).
    es_text = _accent_gerund_enclitics(es_text)
    # Move "?"/"!" that precedes a closing guillemet to after it.
    es_text = re.sub(r"([?!])\s*»", r"»\1", es_text)
    # No whitespace before a closing guillemet.
    es_text = re.sub(r"\s+»", "»", es_text)
    # Restore protected newlines.
    es_text = re.sub(rf"\s*{re.escape(NEWLINE_TOK)}\s*", "\n", es_text)
    # Collapse a doubled final period.
    es_text = re.sub(r"\.\s*\.\s*$", ".", es_text)
    return es_text
|
|
| |
def diagnose_text(text, dir_label):
    """Build an HTML coverage report for *text* in the given direction.

    Tokens that are punctuation or numbers are ignored. Every other token
    (or matched multi-token phrase) counts once and is classified as:

    * covered: found in the phrase table or in the lexicon, directly or
      through its accent-folded key;
    * ambiguous (NI→ES only): its key or folded key is in ``AMBIG_NI``;
    * unknown: not found at all.

    Covered single tokens are also checked for round-trip asymmetry: the
    target surface is looked up in the reverse lexicon and reported when it
    does not lead back to the same key.

    Args:
        text: Input text; empty or blank input yields a short hint instead.
        dir_label: Direction label; a value starting with "ES" selects
            ES→NI, anything else NI→ES.

    Returns:
        An HTML fragment.
    """
    if not text or not text.strip():
        return "<em>Introduce texto para diagnosticar.</em>"

    toks = simple_tokenize(text)
    if dir_label.startswith("ES"):
        toks = expand_enclitics(toks)
        head = "ES→NI"
        phrases, forward, folded, reverse = ESPHRASE2NI, ES2NI, ES_FOLD, NI2ES
        ambiguous = None  # no ambiguity table in this direction
    else:
        head = "NI→ES"
        phrases, forward, folded, reverse = NIPHRASE2ES, NI2ES, NI_FOLD, ES2NI
        ambiguous = AMBIG_NI

    unknown, asym, amb = set(), set(), set()
    total_tokens = 0
    covered = 0

    pos = 0
    while pos < len(toks):
        tok = toks[pos]
        if tok in VISIBLE_PUNCT or is_number(tok):
            pos += 1
            continue
        total_tokens += 1
        span, _ = _longest_match(toks, pos, phrases)
        if span > 1:
            # A phrase counts as one covered unit.
            covered += 1
            pos += span
            continue
        pos += 1

        k = lower(tok)
        fk = fold(k)
        if ambiguous is not None and (k in ambiguous or fk in ambiguous):
            amb.add(tok)
            continue
        if k not in forward and fk not in folded:
            unknown.add(tok)
            continue
        if k not in forward:
            k = folded.get(fk, k)
        covered += 1
        target = forward[k][0]
        back = reverse.get(lower(target))
        if back and lower(back[0]) != k:
            asym.add(f"{tok} → {target} → {back[0]}")

    def _bullets(items):
        # One <li> per item; a dash placeholder when there are none.
        rendered = "".join(f"<li><code>{escape(item)}</code></li>" for item in items)
        return rendered or "<li><i>—</i></li>"

    cov_pct = (covered/total_tokens*100) if total_tokens else 100.0
    cov_html = f"<div><b>Tokens (sin puntuación/numéricos):</b> {total_tokens} | <b>Cubiertos:</b> {covered} ({cov_pct:.1f}%)</div>"

    unk_html = _bullets(sorted(unknown, key=lambda x: lower(x)))
    amb_html = _bullets(sorted(amb, key=lambda x: lower(x)))
    asy_html = _bullets(sorted(asym))

    return f"<b>Diagnóstico {head}</b>{cov_html}<b>Ambiguas (NI duplicada):</b><ul>{amb_html}</ul><b>Faltantes:</b><ul>{unk_html}</ul><b>Asimetrías:</b><ul>{asy_html}</ul>"
|
|
| |
# UI strings per interface language ("ES" / "EN"). Both entries expose the
# same keys; "acc_titles" holds the documentation accordion titles in order.
LABELS = {
    "ES": {
        "title": "Traductor Español ↔ Neoíbero",
        "subtitle": "CSV estricto (BI-only 1:1; desambiguación ligera ES→NI; .gz) — determinista",
        "in_label_es": "✏️ Entrada (Español)",
        "in_label_ni": "✏️ Entrada (Neoíbero)",
        "in_ph_es": "Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.",
        "in_ph_ni": "Idatzi hemen. Adib.: nuker-ke ni etxe-ka.",
        "out_lat_esni": "📜 Salida: Neoíbero (latín)",
        "out_lat_nies": "📜 Salida: Español",
        "out_ib": "🗿 Línea ibérica",
        "out_audio": "🔊 Locución (Audio)",
        "btn": "🔄 Traducir",
        "combo": "🌍 Idioma (UI + explicación)",
        "dir": "🔁 Dirección",
        "dir_opts": ["ES → NI", "NI → ES"],
        "doc_header": "📚 Documentación y Referencia",
        "acc_titles": [
            "🌍 ¿Qué es el neoíbero?",
            "🔤 Fonología y escritura",
            "📐 Sistema nominal: género, número y caso",
            "🔄 Sistema verbal: TAM, persona y clíticos",
            "🌿 Derivación y familias de palabras",
            "🔢 Sistema numérico vigesimal",
            "📝 Sintaxis básica y partículas",
            "❓ Modalidad vascoide (-na / -ba)",
            "⚙️ Pipeline del traductor (1:1 estricto)",
            "📚 Bibliografía y créditos",
            "🧾 Glosario técnico",
        ],
    },
    "EN": {
        "title": "Spanish ↔ Neo-Iberian Translator",
        "subtitle": "Strict BI-only (1:1 surfaces; light ES→NI disambiguation; .gz) — deterministic",
        "in_label_es": "✏️ Input (Spanish)",
        "in_label_ni": "✏️ Input (Neo-Iberian)",
        "in_ph_es": "Type here. E.g., Veo a Ana y doy pan a Marta.",
        "in_ph_ni": "Type here. E.g., nuker-ke ni etxe-ka.",
        "out_lat_esni": "📜 Output: Neo-Iberian (Latin)",
        "out_lat_nies": "📜 Output: Spanish",
        "out_ib": "🗿 Iberian line",
        "out_audio": "🔊 Speech (Audio)",
        "btn": "🔄 Translate",
        "combo": "🌍 Language (UI + docs)",
        "dir": "🔁 Direction",
        "dir_opts": ["ES → NI", "NI → ES"],
        "doc_header": "📚 Documentation & Reference",
        "acc_titles": [
            "🌍 What is Neo-Iberian?",
            "🔤 Phonology and writing",
            "📐 Nominal system: gender, number & case",
            "🔄 Verbal system: TAM, person & clitics",
            "🌿 Derivation and word families",
            "🔢 Vigesimal number system",
            "📝 Basic syntax and particles",
            "❓ Vascoid modality (-na / -ba)",
            "⚙️ Translator pipeline (strict 1:1)",
            "📚 Bibliography and credits",
            "🧾 Technical glossary",
        ],
    },
}
|
|
| |
def build_css():
    """Return the stylesheet passed to ``gr.Blocks``.

    When "Iberia-Georgeos.ttf" exists in the working directory it is embedded
    as a base64 ``data:`` URL for the 'IberiaGeorgeos' font face; otherwise
    (or when the file is empty) the face falls back to
    ``local('sans-serif')``.
    """
    font_path = "Iberia-Georgeos.ttf"
    font_src = "local('sans-serif')"
    if os.path.exists(font_path):
        with open(font_path, "rb") as font_file:
            encoded = base64.b64encode(font_file.read()).decode("ascii")
        if encoded:
            font_src = f"url(data:font/ttf;base64,{encoded}) format('truetype')"
    # Literal CSS braces are doubled because this is an f-string.
    return f"""
@font-face {{
font-family: 'IberiaGeorgeos';
src: {font_src};
font-weight: normal; font-style: normal;
}}
:root {{
--iberian-clay:#8B4513; --iberian-ochre:#CC7722; --iberian-stone:#5C5C5C;
--iberian-sand:#D2B48C; --iberian-rust:#A0522D; --iberian-bronze:#CD7F32;
}}
.gradio-container {{ background:transparent!important;
font-family:'Georgia','Times New Roman',serif!important; }}
html, body {{ background: transparent !important; }}
.gradio-container h1, .gradio-container h2, .gradio-container h3 {{
color:var(--iberian-clay)!important; text-shadow:2px 2px 4px rgba(139,69,19,.15)!important;
border-bottom:3px solid var(--iberian-bronze)!important; padding-bottom:.5rem!important; letter-spacing:.5px!important;
}}
.gradio-container .gr-group {{ background:linear-gradient(to bottom,#f9f6f0,#ede6dc)!important;
border:2px solid var(--iberian-sand)!important; border-radius:8px!important; box-shadow:0 4px 12px rgba(139,69,19,.2), inset 0 1px 0 rgba(255,255,255,.5)!important;
padding:1.5rem!important; margin-bottom:0.2rem!important; }}
.gradio-container .gr-accordion {{ background:linear-gradient(145deg,#ebe3d5,#d9cec0)!important;
border:2px solid var(--iberian-rust)!important; border-radius:6px!important; margin-bottom:.8rem!important; box-shadow:2px 2px 6px rgba(0,0,0,.15)!important; }}
.gradio-container .gr-accordion .label-wrap {{ background:linear-gradient(to right,var(--iberian-ochre),var(--iberian-rust))!important;
color:#fff!important; font-weight:600!important; padding:.8rem 1rem!important; border-radius:4px!important; text-shadow:1px 1px 2px rgba(0,0,0,.3)!important; }}
.gradio-container .gr-textbox textarea, .gradio-container .gr-textbox input {{ background:linear-gradient(to bottom,#faf8f3,#f5f0e8)!important;
border:2px solid var(--iberian-sand)!important; border-radius:6px!important; color:#000!important;
font-family:'Georgia',serif!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1)!important; }}
.gradio-container .gr-textbox textarea:focus, .gradio-container .gr-textbox input:focus {{
border-color:var(--iberian-bronze)!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1), 0 0 8px rgba(205,127,50,.3)!important; }}
.gradio-container .gr-button.gr-button-primary {{ background:linear-gradient(145deg,var(--iberian-bronze),var(--iberian-rust))!important;
border:2px solid var(--iberian-clay)!important; color:#fff!important; font-weight:bold!important; text-shadow:1px 2px 2px rgba(0,0,0,.4)!important;
box-shadow:0 4px 8px rgba(139,69,19,.3), inset 0 1px 0 rgba(255,255,255,.2)!important; border-radius:8px!important; padding:.8rem 1.5rem!important; transition:all .3s ease!important; }}
.gradio-container .gr-button.gr-button-primary:hover {{ background:linear-gradient(145deg,var(--iberian-rust),var(--iberian-bronze))!important;
transform:translateY(-2px)!important; box-shadow:0 6px 12px rgba(139,69,19,.4)!important; }}
.gradio-container, gradio-app {{
--button-primary-background-fill: linear-gradient(145deg,#CD7F32,#A0522D) !important;
--button-primary-background-fill-hover: linear-gradient(145deg,#A0522D,#CD7F32) !important;
--button-primary-text-color: #fff !important;
--button-primary-border-color: #8B4513 !important;
--checkbox-label-background-fill-selected: linear-gradient(145deg,#CD7F32,#A0522D) !important;
--checkbox-label-text-color-selected: #fff !important;
--checkbox-label-border-color-selected: #8B4513 !important;
--block-label-text-color: #4a2e15 !important;
--block-title-text-color: #4a2e15 !important;
--block-info-text-color: #4a2e15 !important;
--button-secondary-text-color: #4a2e15 !important;
--checkbox-label-text-color: #4a2e15 !important;
--input-placeholder-color: #8a6f4a !important;
--input-text-color: #000 !important;
--input-text-size: inherit !important;
}}
.gradio-container label.selected,
.gradio-container [data-testid$="-radio-label"].selected {{
background: linear-gradient(145deg,#CD7F32,#A0522D) !important;
color: #fff !important;
border-color: #8B4513 !important;
}}
.gradio-container label.selected *,
.gradio-container [data-testid$="-radio-label"].selected * {{
color: #fff !important;
}}
.gradio-container .gr-button.gr-button-secondary,
.gradio-container button.secondary {{
color: #4a2e15 !important;
}}
.gradio-container span[data-testid="block-info"],
.gradio-container .block-title,
.gradio-container .block-label,
.gradio-container label > span:not(.selected),
.gradio-container .gr-form > label,
.gradio-container .gr-block label {{
color: #4a2e15 !important;
}}
.ib-line {{ font-family:'IberiaGeorgeos',monospace,sans-serif!important; font-size:1.9rem!important; line-height:2.4rem!important; white-space:pre-wrap!important;
background:linear-gradient(135deg,#e8dcc8 0%,#d4c4a8 50%,#c4b098 100%)!important; padding:24px!important; border-radius:10px!important;
border:3px solid var(--iberian-rust)!important; border-left:6px solid var(--iberian-bronze)!important;
box-shadow:0 4px 15px rgba(139,69,19,.25), inset 0 2px 4px rgba(0,0,0,.1)!important; color:var(--iberian-clay)!important; position:relative!important; }}
.ib-line::before {{ content:''!important; position:absolute!important; inset:0!important;
background-image:repeating-linear-gradient(0deg,transparent,transparent 2px, rgba(139,69,19,.03) 2px, rgba(139,69,19,.03) 4px)!important;
pointer-events:none!important; border-radius:10px!important; }}
@media (max-width:768px) {{
.ib-line {{ font-size:1.5rem!important; line-height:2rem!important; padding:16px!important; }}
.gradio-container .gr-group {{ padding:1rem!important; }}
.gradio-container h1 {{ font-size:1.8rem!important; }}
}}
@media (max-width:480px) {{
.ib-line {{ font-size:1.3rem!important; line-height:1.8rem!important; padding:12px!important; }}
.gradio-container h1 {{ font-size:1.5rem!important; }}
}}
.gradio-container button[role="tab"] {{
background:linear-gradient(145deg,#ebe3d5,#d9cec0)!important;
border:2px solid var(--iberian-sand)!important;
border-bottom:none!important;
color:var(--iberian-clay)!important;
font-weight:600!important;
font-family:'Georgia','Times New Roman',serif!important;
font-size:1.05rem!important;
padding:0.8rem 2rem!important;
margin:0 0.3rem 0 0!important;
border-radius:8px 8px 0 0!important;
transition:all .25s ease!important;
box-shadow:2px 2px 6px rgba(0,0,0,.12)!important;
text-shadow:1px 1px 2px rgba(139,69,19,.08)!important;
}}
.gradio-container button[role="tab"]:hover {{
background:linear-gradient(145deg,var(--iberian-ochre),#CC7722)!important;
color:#ffffff!important;
transform:translateY(-3px)!important;
box-shadow:0 5px 10px rgba(139,69,19,.25)!important;
text-shadow:1px 1px 3px rgba(0,0,0,.3)!important;
}}
.gradio-container button[role="tab"][aria-selected="true"] {{
background:linear-gradient(145deg,var(--iberian-bronze),var(--iberian-rust))!important;
border:3px solid var(--iberian-clay)!important;
border-bottom:none!important;
color:#ffffff!important;
font-weight:700!important;
box-shadow:0 6px 12px rgba(139,69,19,.35), inset 0 1px 0 rgba(255,255,255,.25)!important;
text-shadow:1px 2px 3px rgba(0,0,0,.45)!important;
transform:translateY(0px)!important;
}}
.gradio-container div[role="tablist"] {{
background:linear-gradient(145deg,#e8dcc8,#d9c4b0)!important;
border-bottom:4px solid var(--iberian-bronze)!important;
padding:0.5rem 1rem 0 1rem!important;
border-radius:10px 10px 0 0!important;
box-shadow:0 2px 8px rgba(139,69,19,.15)!important;
}}
"""


# Stylesheet computed once at import time.
CSS = build_css()
|
|
| |
# Minimal Leaflet page served when no pre-built map file is found.
_FALLBACK_MAP_HTML = """<!doctype html><meta charset=utf-8>
<title>Mapa</title>
<style>html,body,#m{height:100%;margin:0}#m{height:100vh}</style>
<link rel=stylesheet href="https://unpkg.com/leaflet@1.9.4/dist/leaflet.css">
<div id=m></div>
<script src="https://unpkg.com/leaflet@1.9.4/dist/leaflet.js"></script>
<script>var map=L.map('m').setView([40,-2],6);
L.tileLayer('https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png',{maxZoom:18,attribution:'© OpenStreetMap'}).addTo(map);
L.circle([39,-0.3],{radius:70000}).addTo(map);</script>"""


def _load_map_html() -> str:
    """Return the map page HTML.

    The first existing file among "mapa_iberos_neoibero.html" and
    "salida/mapa_iberos_neoibero.html" is read as UTF-8; when neither exists
    the built-in Leaflet fallback page is returned.
    """
    for candidate in ("mapa_iberos_neoibero.html", "salida/mapa_iberos_neoibero.html"):
        if not os.path.exists(candidate):
            continue
        with open(candidate, "r", encoding="utf-8") as handle:
            return handle.read()
    return _FALLBACK_MAP_HTML


# Map page and its base64 data URL, computed once at import time.
MAP_SRC = _load_map_html()
MAP_DATA_URL = "data:text/html;base64," + base64.b64encode(MAP_SRC.encode("utf-8")).decode("ascii")
|
|
| |
# Gradio UI: language/direction selectors, input box, action buttons, outputs
# and the event wiring between them.
# NOTE(review): the layout nesting below was reconstructed from a flattened
# listing — confirm which container holds loc_btn, audio_out and diag_out.
with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="stone", secondary_hue="stone", neutral_hue="stone")) as demo:
    with gr.Group():
        with gr.Row():
            combo = gr.Dropdown(choices=["ES","EN"], value="ES", label=LABELS["ES"]["combo"])
            direction = gr.Radio(choices=LABELS["ES"]["dir_opts"], value="ES → NI", label=LABELS["ES"]["dir"])

    with gr.Group():
        es_in = gr.Textbox(label=LABELS["ES"]["in_label_es"], placeholder=LABELS["ES"]["in_ph_es"], lines=5, elem_id="ni_es_input")
        with gr.Row():
            btn_tr = gr.Button(LABELS["ES"]["btn"], variant="primary")
            btn_diag = gr.Button("🔎 Diagnosticar BI con este texto", variant="secondary")
            btn_clear_in = gr.Button("🗑️ Borrar entrada", variant="secondary")
        with gr.Row():
            with gr.Column(scale=2):
                ni_out = gr.Textbox(label=LABELS["ES"]["out_lat_esni"], lines=5, interactive=False, elem_id="ni_es_output", show_copy_button=True)
                with gr.Row():
                    btn_copy_out = gr.Button("📋 Copiar salida", variant="secondary", size="sm")
                    btn_cut_out = gr.Button("✂️ Cortar salida", variant="secondary", size="sm")
                    btn_clear_out = gr.Button("🗑️ Borrar salida", variant="secondary", size="sm")
                loc_btn = gr.Button("🔊 Locutar", variant="secondary", visible=True)
                audio_out = gr.Audio(label=LABELS["ES"]["out_audio"], type="numpy")
            with gr.Column(scale=1):
                ib_out = gr.HTML(label=LABELS["ES"]["out_ib"])
        diag_out = gr.HTML(value="")

    def _ui_strings(sel_lang, dir_label):
        # Label table for the selected UI language plus the input label,
        # input placeholder and output label for the current direction.
        # Unknown languages fall back to Spanish.
        L = LABELS.get(sel_lang, LABELS["ES"])
        if dir_label.startswith("ES"):
            return L, L["in_label_es"], L["in_ph_es"], L["out_lat_esni"]
        return L, L["in_label_ni"], L["in_ph_ni"], L["out_lat_nies"]

    def do_translate(text, dir_label, sel_lang="ES"):
        """Translate *text* and refresh the output widgets.

        The output label follows the selected UI language (it used to be
        taken from the Spanish table unconditionally, so an English UI
        reverted to a Spanish label after every translation).

        Returns updates for ni_out, ib_out, loc_btn, audio_out, diag_out.
        """
        if not text or not text.strip():
            return (gr.update(value=""),
                    gr.update(value="<div class='ib-line'></div>"),
                    gr.update(visible=False),
                    gr.update(value=None),
                    gr.update(value=""))
        _, _, _, out_label = _ui_strings(sel_lang, dir_label)
        if dir_label.startswith("ES"):
            latin, ib = translate_es_to_ni_bi(text)
            return (gr.update(label=out_label, value=latin),
                    gr.update(value=ib),
                    gr.update(visible=True),
                    gr.update(value=None),
                    gr.update(value=""))
        es_text = translate_ni_to_es_bi(text)
        # NI→ES has no Iberian-script line and no speech button.
        return (gr.update(label=out_label, value=es_text),
                gr.update(value="<div class='ib-line'></div>"),
                gr.update(visible=False),
                gr.update(value=None),
                gr.update(value=""))

    btn_tr.click(do_translate, [es_in, direction, combo], [ni_out, ib_out, loc_btn, audio_out, diag_out])

    def run_locution(latin_text, dir_label):
        # Speech is only synthesised for the ES→NI direction.
        if dir_label.startswith("ES"):
            return synthesize_speech(latin_text)
        return None
    loc_btn.click(run_locution, [ni_out, direction], audio_out)

    def do_diagnose(text, dir_label):
        return gr.update(value=diagnose_text(text, dir_label))
    btn_diag.click(do_diagnose, [es_in, direction], [diag_out])

    def switch_lang(sel_lang, dir_label):
        # Relabel the widgets for the newly selected UI language.
        L, in_label, in_ph, out_lab = _ui_strings(sel_lang, dir_label)
        return (
            gr.update(label=L["combo"], value=sel_lang),
            gr.update(label=L["dir"], choices=L["dir_opts"], value=dir_label),
            gr.update(label=in_label, placeholder=in_ph),
            gr.update(label=out_lab),
            gr.update(label=L["out_ib"]),
            gr.update(label=L["out_audio"]),
            gr.update(value=L["btn"])
        )
    combo.change(
        switch_lang,
        [combo, direction],
        [combo, direction,
         es_in, ni_out, ib_out, audio_out, btn_tr]
    )

    def switch_direction(dir_label, sel_lang):
        # Relabel input/output and clear every output when the direction flips.
        _, in_label, in_ph, out_lab = _ui_strings(sel_lang, dir_label)
        loc_vis = dir_label.startswith("ES")
        return (gr.update(label=in_label, placeholder=in_ph),
                gr.update(label=out_lab, value=""),
                gr.update(value="<div class='ib-line'></div>"),
                gr.update(visible=loc_vis),
                gr.update(value=None),
                gr.update(value=""))
    direction.change(
        switch_direction,
        [direction, combo],
        [es_in, ni_out, ib_out, loc_btn, audio_out, diag_out]
    )

    # Clear the input box.
    btn_clear_in.click(
        fn=lambda: "",
        inputs=None,
        outputs=[es_in],
    )

    # Clear every output widget.
    def _clear_output_block():
        return ("", "<div class='ib-line'></div>", None, "")
    btn_clear_out.click(
        fn=_clear_output_block,
        inputs=None,
        outputs=[ni_out, ib_out, audio_out, diag_out],
    )

    # Copy the output text to the clipboard (browser side only).
    btn_copy_out.click(
        fn=None,
        inputs=[ni_out],
        outputs=None,
        js="(text) => { if (text) { navigator.clipboard.writeText(text); } return []; }",
    )

    # Cut: copy in the browser first, then clear the outputs on the server.
    def _cut_output_block(_text):
        return ("", "<div class='ib-line'></div>", None, "")
    btn_cut_out.click(
        fn=_cut_output_block,
        inputs=[ni_out],
        outputs=[ni_out, ib_out, audio_out, diag_out],
        js="(text) => { if (text) { navigator.clipboard.writeText(text); } return text; }",
    )
|
|
| |
def _symmetry_smoketest():
    """Print a NI→ES→NI round trip for a few fixed probe strings.

    Each probe is passed through ``translate_ni_to_es_bi`` and the result
    back through ``translate_es_to_ni_bi``; nothing is asserted, the three
    stages are only printed.
    """
    # NOTE(review): nesting reconstructed from a flattened listing — confirm
    # that the "---" separator is printed once per probe.
    print("\n[SMOKE] Prueba ES↔NI (BI-estricto, determinista)…")
    for probe in (
        "nuker-ke ni etxe-ka ?",
        "¿Pagaste 12,75 en la cafetería?",
        "Marta llega a las 18:30.",
        "[SIN-LEX:Tomás]-na euŕak-ke !",
    ):
        spanish = translate_ni_to_es_bi(probe)
        round_trip, _ = translate_es_to_ni_bi(spanish)
        print(" IN:", probe)
        print(" ES:", spanish)
        print(" NI:", round_trip)
        print("---")


# Run the smoke test at import time only when debugging is switched on.
if DEBUG_MODE:
    _symmetry_smoketest()


# Script entry point: start the Gradio app with request queuing enabled.
if __name__ == "__main__":
    demo.queue().launch()
|
|