# app.py — Traductor Español ↔ Neoíbero # UI clásica (v2.2) + motor bidireccional (v3), con Línea Ibérica y Locución # 2025-10 – unificación y limpieza (parches: enclíticos, números, cobertura mínima) # 2025-10 (rev): Superficie estricta + NI→ES desde CSV dedicado si existe (fallback inversor en memoria) # 2025-11 (rev2): prioridad absoluta de CSV, contador de evidencias, doc vascoide ampliada, fallbacks solo si faltan en CSV import gradio as gr import os, csv, re, base64, unicodedata import torch from transformers import AutoProcessor, VitsModel import numpy as np from html import escape # ← para escapar la línea ibérica en HTML # Caches locales (si existen) os.environ['TRANSFORMERS_CACHE'] = os.environ.get('TRANSFORMERS_CACHE', '/tmp/cache') os.environ['HF_HOME'] = os.environ.get('HF_HOME', '/tmp/hf') DEBUG_MODE = False def debug_print(msg): if DEBUG_MODE: print(f"[DEBUG] {msg}") # ========================= # LÉXICO Y ESTRUCTURAS # ========================= # << RUTAS EN RAÍZ DEL REPO >> CSV_ES_NI = "HF_Pairs_ES_NI_RICH.csv" # ES→NI (rico) CSV_NI_ES = "HF_Pairs_NI_ES_Translator.csv" # NI→ES (determinista si existe) # Superficie estricta: no tocar superficies que vengan del CSV STRICT_SURFACE = True # ES→NI SURF_RICH = {} # (es_lower, tag) -> ni_surface LEX_FORM = {} # es_form -> ni_lemma/surface LEX_LEMMA = {} # es_lemma -> ni_lemma FOLD_FORM = {} # es_form_no_diacritics -> ni_lemma LEX_META = {} # es_form/lemma -> {"pos":..., "tam_ok":...} FORCE_KEYS = set() # NI→ES (se carga de CSV si existe; si no, se genera como inverso en memoria) NI_TO_ES_SURF = {} # (ni_surface_lower_or_fold, tag) -> es_surface (exacto observado) NI_TO_ES_FORM = {} # ni_form/root (lower/fold) -> es_surface o es_lemma NI_TO_ES_LEMMA= {} # ni_root (lower/fold) -> es_lemma # Contador informativo de evidencias (solo logging) EVIDENCE_COUNTS = {} # p.ej.: {"conjetural": 2039780, "vascoide": 9, ...} # ========================= # MORFOLOGÍA – ESPAÑOL # ========================= RE_GER = re.compile(r"(ando|iendo|yendo)$", re.I) RE_PART = re.compile(r"(ado|ido|to|so|cho)$", re.I) FUT_END = ("é","ás","á","emos","éis","án") COND_END = ("ía","ías","ía","íamos","íais","ían") PRET_AR = ("é","aste","ó","amos","asteis","aron") PRET_ERIR = ("í","iste","ió","imos","isteis","ieron") IMPF_AR = ("aba","abas","ábamos","abais","aban") IMPF_ERIR = ("ía","ías","íamos","íais","ían") SUBJ_AR = ("e","es","e","emos","éis","en") SUBJ_ERIR = ("a","as","a","amos","áis","an") SUBJ_PAST_AR = ("ara","aras","ara","áramos","arais","aran","ase","ases","ase","ásemos","aseis","asen") SUBJ_PAST_ERIR = ("iera","ieras","iera","iéramos","ierais","ieran","iese","ieses","iese","iésemos","ieseis","iesen") PRS_AR = ("o","as","a","amos","áis","an") PRS_ER = ("o","es","e","emos","éis","en") PRS_IR = ("o","es","e","imos","ís","en") def _strip_any(w, ends): for s in sorted(ends, key=len, reverse=True): if w.endswith(s): return w[:-len(s)], s return None, None def _guess_class_from_ending(ending): if ending in PRET_AR or ending in IMPF_AR or ending in SUBJ_AR or ending in PRS_AR: return "ar" return "er" # Irregularidades para **adivinar lema** y **tag** (TAM de rescate) IRREG_LEMMA = { "fui":"ir","fuiste":"ir","fue":"ir","fuimos":"ir","fuisteis":"ir","fueron":"ir", "voy":"ir","vas":"ir","va":"ir","vamos":"ir","vais":"ir","van":"ir", "soy":"ser","eres":"ser","es":"ser","somos":"ser","sois":"ser","son":"ser", "era":"ser","eras":"ser","éramos":"ser","erais":"ser","eran":"ser", "he":"haber","has":"haber","ha":"haber","hemos":"haber","habéis":"haber","han":"haber", "hube":"haber","hubo":"haber","hubimos":"haber","hubiste":"haber","hubisteis":"haber","hubieron":"haber", "estoy":"estar","estás":"estar","está":"estar","estamos":"estar","estáis":"estar","están":"estar", "estuve":"estar","estuviste":"estar","estuvo":"estar","estuvimos":"estar","estuvisteis":"estar","estuvieron":"estar", "estaba":"estar","estabas":"estar","estábamos":"estar","estabais":"estar","estaban":"estar", "tuve":"tener","tuviste":"tener","tuvo":"tener","tuvimos":"tener","tuvisteis":"tener","tuvieron":"tener", "vine":"venir","viniste":"venir","vino":"venir","vinimos":"venir","vinisteis":"venir","vinieron":"venir", "hice":"hacer","hiciste":"hacer","hizo":"hacer","hicimos":"hacer","hicisteis":"hacer","hicieron":"hacer", "puse":"poner","pusiste":"poner","puso":"poner","pusimos":"poner","pusisteis":"poner","pusieron":"poner", "pude":"poder","pudiste":"poder","pudo":"poder","pudimos":"poder","pudisteis":"poder","pudieron":"poder", "quise":"querer","quisiste":"querer","quiso":"querer","quisimos":"querer","quisisteis":"querer","quisieron":"querer", "supe":"saber","supiste":"saber","supo":"saber","supimos":"saber","supisteis":"saber","supieron":"saber", "traje":"traer","trajiste":"traer","trajo":"traer","trajimos":"traer","trajisteis":"traer","trajeron":"traer", "dije":"decir","dijiste":"decir","dijo":"decir","dijimos":"decir","dijisteis":"decir","dijeron":"decir", "conduje":"conducir","condujiste":"conducir","condujo":"conducir","condujimos":"conducir","condujisteis":"conducir","condujeron":"conducir", "anduve":"andar","anduviste":"andar","anduvo":"andar","anduvimos":"andar","anduvisteis":"andar","anduvieron":"andar", "cupe":"caber","cupiste":"caber","cupo":"caber","cupimos":"caber","cupisteis":"caber","cupieron":"caber", "di":"dar","diste":"dar","dio":"dar","dimos":"dar","disteis":"dar","dieron":"dar", "vi":"ver","viste":"ver","vio":"ver","vimos":"ver","visteis":"ver","vieron":"ver", "tengo":"tener","vengo":"venir","pongo":"poner","salgo":"salir","traigo":"traer","caigo":"caer","hago":"hacer","oigo":"oír","digo":"decir","valgo":"valer","sigo":"seguir", "tienes":"tener","tiene":"tener","tienen":"tener", "vienes":"venir","viene":"venir","vienen":"venir", "pienso":"pensar","piensas":"pensar","piensa":"pensar","piensan":"pensar", "quiero":"querer","quieres":"querer","quiere":"querer","quieren":"querer", "prefiero":"preferir","prefieres":"preferir","prefiere":"preferir","prefieren":"preferir", "vaya":"ir","vayas":"ir","vayamos":"ir","vayáis":"ir","vayan":"ir", "sea":"ser","seas":"ser","seamos":"ser","seáis":"ser","sean":"ser", "haya":"haber","hayas":"haber","hayamos":"haber","hayáis":"haber","hayan":"haber", "dé":"dar","des":"dar","demos":"dar","deis":"dar","den":"dar", "esté":"estar","estés":"estar","estemos":"estar","estéis":"estar","estén":"estar", "tenga":"tener","tengas":"tener","tengamos":"tener","tengáis":"tener","tengan":"tener", "venga":"venir","vengas":"venir","vengamos":"venir","vayáis":"ir","vengan":"venir", "ve":"ir","id":"ir","sé":"ser","sed":"ser","haz":"hacer","haced":"hacer","pon":"poner","poned":"poner", "ven":"venir","venid":"venir","ten":"tener","tened":"tener","sal":"salir","salid":"salir","di":"decir","decid":"decir", "doy":"dar","das":"dar","da":"dar","damos":"dar","dais":"dar","dan":"dar", "veo":"ver","ves":"ver","vemos":"ver","veis":"ver","ven":"ver", "oí":"oír","oíste":"oír","oyó":"oír","oímos":"oír","oísteis":"oír","oyeron":"oír", "iba":"ir","ibas":"ir","íbamos":"ir","ibais":"ir","iban":"ir", "veía":"ver","veías":"ver","veíamos":"ver","veíais":"ver","veían":"ver", "vinieras":"venir","lloviera":"llover", # Futuro de subjuntivo (arcaico) "viniere":"venir","vinieres":"venir","vinieren":"venir", "hiciere":"hacer","hicieres":"hacer","hicieren":"hacer", "hubiere":"haber","hubieres":"haber","hubieren":"haber", # "fuere" ambiguo (ser/ir) → omitido a propósito } IRREG_MORPH_TAGS = { # Subjuntivos/imperativos/etc (para tag de rescate) "vaya":"SBJV","vayas":"SBJV","vayamos":"SBJV","vayáis":"SBJV","vayan":"SBJV", "sea":"SBJV","seas":"SBJV","seamos":"SBJV","seáis":"SBJV","sean":"SBJV", "haya":"SBJV","hayas":"SBJV","hayamos":"SBJV","hayáis":"SBJV","hayan":"SBJV", "dé":"SBJV","des":"SBJV","demos":"SBJV","deis":"SBJV","den":"SBJV", "esté":"SBJV","estés":"SBJV","estemos":"SBJV","estéis":"SBJV","estén":"SBJV", "tenga":"SBJV","tengas":"SBJV","tengamos":"SBJV","tengáis":"SBJV","tengan":"SBJV", "venga":"SBJV","vengas":"SBJV","vengamos":"SBJV","vengáis":"SBKV","vengan":"SBJV", "haga":"SBJV","hagas":"SBJV","hagamos":"SBJV","hagáis":"SBJV","hagan":"SBJV", "pueda":"SBJV","puedas":"SBJV","podamos":"SBJV","podáis":"SBJV","puedan":"SBJV", "id":"IMP","sed":"IMP","haz":"IMP","haced":"IMP","pon":"IMP","poned":"IMP","ven":"IMP","venid":"IMP", "ten":"IMP","tened":"IMP","sal":"IMP","salid":"IMP","decid":"IMP", "llámame":"IMP","llámalo":"IMP","llámala":"IMP","llámanos":"IMP","llámalos":"IMP","llámalas":"IMP", "dime":"IMP","dímelo":"IMP","dinos":"IMP","dínoslo":"IMP", "hazme":"IMP","hazlo":"IMP","hazla":"IMP","haznos":"IMP", "ponme":"IMP","ponlo":"IMP","ponla":"IMP","ponnos":"IMP", "dame":"IMP","dámelo":"IMP","danos":"IMP","dánoslo":"IMP", "tráeme":"IMP","tráelo":"IMP","tráela":"IMP","tráenos":"IMP", "díselo":"IMP","pónselo":"IMP","házselo":"IMP", "viniere":"FUT_SBJV","vinieres":"FUT_SBJV","vinieren":"FUT_SBJV", "hiciere":"FUT_SBJV","hicieres":"FUT_SBJV","hicieren":"FUT_SBJV", "fuere":"FUT_SBJV","fueres":"FUT_SBJV","fueren":"FUT_SBJV", "hubiere":"FUT_SBJV","hubieres":"FUT_SBJV","hubieren":"FUT_SBJV", "creísteis":"PST","dijisteis":"PST","hicisteis":"PST","pusisteis":"PST", "supisteis":"PST","quisisteis":"PST","trajisteis":"PST","vi":"PST","dio":"PST","fue":"PST","fui":"PST", "iba":"IPFV","ibas":"IPFV","íbamos":"IPFV","ibais":"IPFV","iban":"IPFV", "veía":"IPFV","veías":"IPFV","veíamos":"IPFV","veíais":"IPFV","veían":"IPFV", } def looks_like_verb_form_strict(w: str) -> bool: w = (w or "").lower() if w.endswith(("ar","er","ir")): return True if RE_GER.search(w) or RE_PART.search(w): return True if re.search(r"(á|ás|áis|és|éis|ís)$", w): return True if _strip_any(w, FUT_END+COND_END)[0] is not None: return True if _strip_any(w, PRET_AR+PRET_ERIR)[0] is not None: return True if _strip_any(w, IMPF_AR+IMPF_ERIR)[0] is not None: return True if _strip_any(w, SUBJ_PAST_AR+SUBJ_PAST_ERIR)[0] is not None: return True if re.search(r"(anduve|anduviste|anduvo|anduvimos|anduvieron|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$", w): return True return False def _zco_guess(w:str)->str: if w.endswith("uzco"): return w[:-4] + "ucir" if w.endswith("ezco"): return w[:-4] + "ecer" if w.endswith("ozco"): return w[:-4] + "ocer" if w.endswith("azco"): return w[:-4] + "acer" return "" def guess_infinitive_es(w: str) -> str: w = (w or "").lower() if w in IRREG_LEMMA: return IRREG_LEMMA[w] if w in ("vámonos","vamonos"): return "ir" if w.endswith("zco"): z = _zco_guess(w) if z: return z if w.endswith("go"): base = w[:-2] map_go = {"ten":"tener","ven":"venir","pon":"poner","sal":"salir","tra":"traer","ca":"caer","ha":"hacer","oi":"oír","di":"decir","val":"valer","si":"seguir"} for k,v in map_go.items(): if base.startswith(k): return v if w.endswith(("ar","er","ir")): return w m = RE_GER.search(w) if m: base = w[:m.start()] return base + ("ar" if m.group(0)=="ando" else "er") m = RE_PART.search(w) if m: base = w[:m.start()] part_irreg = { "hecho":"hacer","dicho":"decir","visto":"ver","puesto":"poner","escrito":"escribir", "abierto":"abrir","cubierto":"cubrir","muerto":"morir","roto":"romper", "vuelto":"volver","resuelto":"resolver","frito":"freír","impreso":"imprimir", "satisfecho":"satisfacer","provisto":"proveer" } if w in part_irreg: return part_irreg[w] return base + "er" base, end = _strip_any(w, FUT_END+COND_END) if base is not None: irreg = {"saldr":"salir","vendr":"venir","tendr":"tener","pondr":"poner","valdr":"valer","podr":"poder", "habr":"haber","sabr":"saber","cabr":"caber","querr":"querer","dir":"decir","har":"hacer"} if base in irreg: return irreg[base] return base if w.endswith("áis"): return w[:-3] + "ar" if w.endswith("éis"): return w[:-3] + "er" if w.endswith("ís"): return w[:-2] + "ir" if w.endswith("ás"): return w[:-2] + "ar" if w.endswith("és"): return w[:-2] + "er" if w.endswith("á"): return w[:-1] + "ar" for group in (PRET_AR+PRET_ERIR, IMPF_AR+IMPF_ERIR, SUBJ_AR+SUBJ_ERIR, PRS_AR+PRS_ER+PRS_IR): base, end = _strip_any(w, group) if base is not None: return base + _guess_class_from_ending(end) base, end = _strip_any(w, SUBJ_PAST_AR) if base is not None: return base + "ar" base, end = _strip_any(w, SUBJ_PAST_ERIR) if base is not None: return base + "er" return "" def es_morph_tag(w: str) -> str: w = (w or "").lower() if w in IRREG_MORPH_TAGS: return IRREG_MORPH_TAGS[w] if re.search(r"^(llám|dím|házm|pónm|vén|dám|tén|tráe)(a|e)?(me|te|lo|la|nos|os|les|se|melo|telo|selo)$", w): return "IMP" if re.search(r"(adme|edme|idme|adlo|edle|idle|adnos|ednos)$", w): return "IMP" if re.search(r"(?:ad|ed|id|ád|éd|íd)(?:me|te|se|lo|la|nos|os|les|melo|telo|selo|noslo|oslo|sela|selas|selos)$", w): return "IMP" if re.search(r"^.*[áéí]ndo(me|te|se|lo|la|nos|os|les|melo|telo|selo)$", w): return "IPFV" if re.search(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", w): base = re.sub(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", "", w) if base and len(base) > 2: return "IMP" if w.endswith(("ar","er","ir")): return "INF" if RE_GER.search(w): return "IPFV" if RE_PART.search(w): return "PST" if _strip_any(w, PRET_AR+PRET_ERIR)[0] is not None: return "PST" if _strip_any(w, IMPF_AR+IMPF_ERIR)[0] is not None: return "IPFV" if _strip_any(w, FUT_END)[0] is not None: return "FUT" if _strip_any(w, COND_END)[0] is not None: return "COND" if re.search(r"(á|ás|áis|és|éis|ís)$", w): return "PRS" if _strip_any(w, SUBJ_AR+SUBJ_ERIR)[0] is not None: return "SBJV" if _strip_any(w, PRS_AR+PRS_ER+PRS_IR)[0] is not None: return "PRS" if _strip_any(w, SUBJ_PAST_AR+SUBJ_PAST_ERIR)[0] is not None: return "SBJV" if re.search(r"(anduve|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$", w): return "PST" if re.search(r"^.+[aei]d$", w): return "IMP" return "UNK" # ========================= # MORFOLOGÍA – NEOÍBERO # ========================= NI_TAM_SUFFIXES = {"-ke":"PRS","-bo":"PST","-ta":"FUT","-ri":"IPFV","-ni":"COND","-tu":"IMP","-ra":"FUT_SBJV"} def detect_ni_tam(word: str): word = (word or "").lower().strip() for suf, tag in NI_TAM_SUFFIXES.items(): if word.endswith(suf): return word[:-len(suf)], tag, suf return word, "INF", "" # ========================= # UTILIDADES # ========================= def fold(s:str)->str: return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c)!="Mn") def has_diacritic(s:str)->bool: return bool(re.search(r"[áéíóúüÁÉÍÓÚÜ]", s or "")) def _canon_pos(p: str) -> str: p = (p or "").strip().upper() MAP = {"V":"V","VERB":"V","N":"N","NOUN":"N","ADJ":"ADJ","ADJECTIVE":"ADJ","ADV":"ADV","ADVERB":"ADV", "INTJ":"INTJ","INTERJ":"INTJ","INTERJECTION":"INTJ","PRON":"PRON","PRONOUN":"PRON", "PART":"PART","PARTICLE":"PART","POSTP":"POSTP","ADP":"POSTP","ADPOSITION":"POSTP"} return MAP.get(p, "") def _boolish(x): if x is None: return None s = str(x).strip().lower() if s in ("1","true","t","yes","y","si","sí"): return True if s in ("0","false","f","no","n"): return False return None def _meta_set(form_es:str, pos:str=None, tam_ok=None): if not form_es: return d = LEX_META.setdefault(form_es, {}) if pos and not d.get("pos"): d["pos"] = pos if tam_ok is not None and d.get("tam_ok") is None: d["tam_ok"] = bool(tam_ok) def pos_of_es(token_low:str) -> str: m = LEX_META.get(token_low, {}) if m.get("pos"): return m["pos"] return "V" if looks_like_verb_form_strict(token_low) else "" def tam_allowed_for_es(token_low:str) -> bool: m = LEX_META.get(token_low, {}) if m.get("tam_ok") is not None: return bool(m.get("tam_ok")) return pos_of_es(token_low) == "V" # ========================= # TTS (Meta MMS) # ========================= print("Cargando modelo de voz...") device = "cuda" if torch.cuda.is_available() else "cpu" processor = model = None try: processor = AutoProcessor.from_pretrained("facebook/mms-tts-spa") model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device) print("Modelo de voz cargado.") except Exception as e: print(f"ERROR TTS: {e}") PAUSE_LEVEL=3 def add_reading_pauses(text: str, level:int=3) -> str: if level <= 1: return text t = text if level >= 2: t = re.sub(r",\s*", ", , ", t) if level >= 3: t = re.sub(r"\.\s*", ". . ", t); t = re.sub(r";\s*", "; ; ", t) return re.sub(r'\s+',' ',t).strip() def hispanize_for_tts(ni_text: str) -> str: text=(ni_text or "").lower() text=text.replace('ŕ','rr').replace('ś','s').replace('eś','es') text=text.replace('ŕa','rra').replace('aŕe','arre').replace('-', ' ') text=re.sub(r'\[.*?\]','',text) text=re.sub(r'\s+',' ',text).strip() return add_reading_pauses(text, PAUSE_LEVEL) def synthesize_speech(text): if not text or not text.strip() or model is None or processor is None: return None try: inputs = processor(text=hispanize_for_tts(text), return_tensors="pt").to(device) with torch.no_grad(): output = model(**inputs).waveform speech_np = output.cpu().numpy().squeeze() mx = max(abs(speech_np.min()), abs(speech_np.max())) if mx>0: speech_np = speech_np/mx*0.9 return (16000, speech_np.astype(np.float32)) except Exception as e: print(f"Error TTS: {e}"); return None # ========================= # LÍNEA IBÉRICA (claves Georgeos) # ========================= KEYS_MODE = "explicit" V = "aeiou" SYL_FOR={"b":["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"], "d":["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"], "t":["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"], "g":["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"], "k":["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]} ALPHA_FOR={"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›","l":"‹L›","r":"‹R›","ŕ":"‹Ŕ›","n":"‹N›","m":"‹M›"} CODA_FOR={"":"","n":"‹N›","s":"‹S›","ś":"‹Ś›","r":"‹R›","ŕ":"‹Ŕ›","l":"‹L›","m":"‹M›","k":"‹K›","t":"‹T›"} def tokens_from_latin(ni:str)->str: out=[]; i=0; ni=(ni or "").lower() while istr: low=(ni_plain or "").lower() if low in KEYS_OVERRIDE: return KEYS_OVERRIDE[low] m=re.findall(r"‹(.*?)›", token_str) out=[] for t in m: if KEYS_MODE == "compact": if len(t)==2 and t[0] in "BDTGK": out.append(t[0]) elif t in ("A","E","I","O","U"): out.append(t) elif t=="Ś": out.append("X") elif t=="Ŕ": out.append("r") else: out.append(t[0].upper()) else: if len(t)==2 and t[0] in "BDTGK": out.append(t) elif t=="Ś": out.append("X") elif t=="Ŕ": out.append("r") else: out.append(t.upper()) return "".join(out) TRIDOT = "/" VISIBLE_PUNCT = {",",".",";","; ",":","…","(",")","[","]","{","}","\"","'","«","»","—","–","“","”","‘","’"} HARD_BOUND = {".",";","—","–",":","(",")","«","»"} def render_ib_with_tridots(toks): res=[]; prev_word=False for tk in toks: is_punct = tk in VISIBLE_PUNCT if is_punct: res.append(" "+tk+" "); prev_word=False else: if prev_word: res.append(" "+TRIDOT+" ") res.append(tk); prev_word=True return "".join(res).strip() # ========================= # TRADUCTOR ES→NI # ========================= TAM_SUFFIX={"PRS":"-ke","PST":"-bo","FUT":"-ta","IPFV":"-ri","COND":"-ni","SBJV":"-ni","IMP":"-tu","INF":"","FUT_SBJV":"-ra","UNK":"-ke"} VERB_TAM = ("-ke","-ta","-bo","-ri","-ni","-tu","-ra") def strip_ni_tam(lemma: str): lemma = lemma or "" for s in sorted(VERB_TAM, key=len, reverse=True): if lemma.endswith(s): return lemma[:-len(s)], s return lemma, "" STOP=set(""" el la los las lo un una unos unas al del de en con sin por sobre entre hasta desde hacia según tras pero aunque sino que como si porque cuando donde mientras muy ya sí no también solo sólo aún aun más menos mi mis tu tus su sus nuestro nuestra nuestros nuestras esto eso aquello ese esa esos esas aquel aquella aquellos aquellas quien quién quiénes cual cuál cuales cuáles cuyo cuya cuyos cuyas eh ay oh uy ah aja jeje jaja aah ahh ohh uhh """.split()) # --- Reglas "a" → ka/mi/te def rule_a(prev_tok:str, token:str, next_tok:str)->str: verbs={"dar","decir","contar","enviar","ofrecer","mostrar","prestar","regalar","entregar"} if prev_tok in verbs: return "mi" nombres={"ana","marta","juan","pedro","luis","maría","jose","carlos","laura"} if next_tok in nombres: return "te" return "ka" Q_ENCLITIC_INT = "-na" Q_ENCLITIC_EXC = "-ba" WH_WORDS = { "qué","quien","quién","quienes","quiénes","cual","cuál","cuales","cuáles", "donde","dónde","cuando","cuándo","como","cómo", "cuanto","cuánto","cuanta","cuánta","cuantos","cuántos","cuantas","cuántas" } def is_wh_token(t: str) -> bool: low = (t or "").lower() if low in WH_WORDS: return True f = fold(low) return f in {"que","quien","quienes","cual","cuales","donde","cuando","como","cuanto","cuanta","cuantos","cuantas"} def has_wh_outside_parens(toks) -> bool: depth = 0 for tk in toks: if tk in {"(", "«", "“", "‘"}: depth += 1 elif tk in {")", "»", "”", "’"}: depth = max(0, depth-1) elif depth == 0 and is_wh_token(tk): return True return False ESTAR_SET={"estoy","estás","está","estamos","estáis","están","estaba","estabas","estábamos","estabais","estaban"} HABER_SET={"he","has","ha","hemos","habéis","han","había","habías","habíamos","habíais","habían"} def detect_tam_with_context(toks, i, sentence_start=False): t=toks[i].lower() prev=toks[i-1].lower() if i>0 else "" prev2=toks[i-2].lower() if i>1 else "" nxt=toks[i+1].lower() if i+1str: if low=="visto" and nxt=="de": return "vestir" return "" def has_tilde_equiv_lookup(low:str)->str: if has_diacritic(low) and not looks_like_verb_form_strict(low): f=fold(low) if f in LEX_FORM: return LEX_FORM[f] if f in FOLD_FORM: return FOLD_FORM[f] return "" # ==== quitar enclíticos españoles para el "guesser" ==== ENCL_RE = re.compile(r"(?:(?:me|te|se|nos|os|le|les)(?:lo|la|los|las)?|(?:lo|la|los|las))$", re.I) def strip_es_enclitics(w:str)->str: if not w: return w w = w.replace("-", "") return ENCL_RE.sub("", w) def lookup_form_lemma(token:str, prev:str, nxt:str): if not token: return "", False low=token.lower() fl=forced_lemma_with_context(low, prev, nxt) if fl and fl in LEX_LEMMA: return LEX_LEMMA[fl], True if low in LEX_FORM: return LEX_FORM[low], True til=has_tilde_equiv_lookup(low) if til: return til, True base_no_clit = strip_es_enclitics(low) if base_no_clit != low: lem = guess_infinitive_es(base_no_clit) if lem and lem in LEX_LEMMA: return LEX_LEMMA[lem], True if looks_like_verb_form_strict(low): lem=guess_infinitive_es(low) if lem and lem in LEX_LEMMA: return LEX_LEMMA[lem], True return "", False def attach_enclitic(out_words, ib_keys, plain, attach_idx, encl): if attach_idx is None or attach_idx < 0 or attach_idx >= len(out_words): return cur = out_words[attach_idx] or "" if cur.endswith(encl): return out_words[attach_idx] = cur + encl plain[attach_idx] = (plain[attach_idx] or "") + encl ib_keys[attach_idx] = georgeos_keys(tokens_from_latin(plain[attach_idx]), plain[attach_idx]) def ensure_terminal_qmark(out_words, ib_keys, plain): if not out_words: out_words.append("?"); ib_keys.append(""); plain.append("?"); return j = len(out_words) - 1 while j >= 0 and (out_words[j] == "" or out_words[j] is None): j -= 1 if j < 0: out_words.append("?"); ib_keys.append(""); plain.append("?"); return if out_words[j] == ".": out_words[j] = "?"; ib_keys[j] = ""; plain[j] = "?" elif out_words[j] not in {"?","!"}: out_words.append("?"); ib_keys.append(""); plain.append("?") def normalize_surface_by_pos(ni_surface:str, pos:str) -> str: # Modo superficie estricta: devolvemos exactamente lo que venga del CSV. return ni_surface def translate_sentence(sent:str): toks = re.sub(r"\s+"," ", (sent or "").strip()) toks = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", toks) toks = [t for t in toks.split() if t] out_words=[]; ib_keys=[]; plain=[] neg_next=False; last_finite_idx=None; has_qmark=False saw_wh = has_wh_outside_parens(toks) sentence_start=True for i,t in enumerate(toks): if t in {"¿","¡"}: sentence_start=True; continue if t in {"?","!"}: if t=="?": has_qmark=True encl = Q_ENCLITIC_INT if t=="?" else Q_ENCLITIC_EXC attach_idx = last_finite_idx if attach_idx is None: for j in range(len(out_words)-1, -1, -1): if out_words[j] and out_words[j] not in VISIBLE_PUNCT: attach_idx = j; break if attach_idx is not None: attach_enclitic(out_words, ib_keys, plain, attach_idx, encl) out_words.append(t); ib_keys.append(""); plain.append(t) sentence_start=True; continue if t in VISIBLE_PUNCT: out_words.append(t); ib_keys.append(t); plain.append(t) if t in HARD_BOUND: last_finite_idx=None sentence_start = (t in {".",":",";","—","–"}) continue low=t.lower() prev = toks[i-1].lower() if i>0 else "" nxt = toks[i+1].lower() if i+1 str: t = (text or "").replace("/", " ") t = re.sub(r"\[SIN-LEX:([^\]]+)\]", r"\1", t) return re.sub(r"\s+", " ", t.strip()) def tokenize_ni(text: str): text = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", text) return [t for t in text.split() if t] def _ni_fold(s: str) -> str: return (s or "").replace("ś","s").replace("ŕ","r") def translate_ni_to_es(sent: str): toks = tokenize_ni(normalize_ni(sent)) out = [] i = 0 while i < len(toks): t = toks[i] if i + 2 < len(toks) and re.fullmatch(r"\d{1,2}", toks[i]) and toks[i+1] == ":" and re.fullmatch(r"\d{2}", toks[i+2]): out.append(f"{toks[i]}:{toks[i+2]}"); i += 3; continue if t in VISIBLE_PUNCT or t in {"?", "!", "¿", "¡"}: out.append(t); i += 1; continue low = t.lower() lookup = low[:-3] if (low.endswith("-na") or low.endswith("-ba")) else low root, tag, _ = detect_ni_tam(lookup) es_direct = NI_TO_ES_SURF.get((lookup, tag)) or NI_TO_ES_SURF.get((_ni_fold(lookup), tag)) if es_direct: out.append(es_direct); i += 1; continue form = NI_TO_ES_FORM.get(lookup) or NI_TO_ES_FORM.get(_ni_fold(lookup)) if form: out.append(form); i += 1; continue if root: lem = NI_TO_ES_LEMMA.get(root) or NI_TO_ES_LEMMA.get(_ni_fold(root)) if lem: out.append(lem); i += 1; continue if re.fullmatch(r"\d+([.,]\d+)?", low): out.append(t); i += 1; continue out.append(f"[?:{t}]"); i += 1 s = " ".join(out) s = re.sub(r"\s+([,.;:!?])", r"\1", s) s = re.sub(r"\(\s+", "(", s) s = re.sub(r"\s+\)", ")", s) s = re.sub(r"\s{2,}", " ", s).strip() s = s.replace("a a ", " a ") return s # ========================= # CARGA DE LÉXICO ES→NI # ========================= def load_lexicon_es_ni(): """ Carga léxico ES→NI desde CSV rico, priorizando SIEMPRE lo que venga en el CSV. Rellena: SURF_RICH, LEX_FORM, LEX_LEMMA, LEX_META y EVIDENCE_COUNTS (si hay 'evidencia_es'). """ loaded = False total_rich = total_simple = 0 EVIDENCE_COUNTS.clear() p = CSV_ES_NI if not os.path.exists(p): print(f"[WARN] No se encontró {p} (ES→NI).") return False try: with open(p, encoding="utf-8") as f: rd = csv.DictReader(f) flds = set(rd.fieldnames or []) # CSV rico (preferente) if {"source_es", "es_morph"}.issubset(flds): for r in rd: es = (r.get("source_es") or "").strip().lower() tag = (r.get("es_morph") or "").strip().upper() surf = (r.get("ni_surface") or "").strip() if not surf: root = (r.get("ni_root") or "").strip() suf = (r.get("ni_suffix") or "").strip() if root or suf: surf = f"{root}{suf}" if es and tag and surf: SURF_RICH[(es, tag)] = surf total_rich += 1 ni = (r.get("target_ni") or "").strip() es_lem = (r.get("es_lemma") or "").strip().lower() pos = _canon_pos(r.get("pos") or r.get("pos_es") or r.get("pos_ni") or "") tam_ok = _boolish(r.get("tam_ok")) if es: _meta_set(es, pos=pos, tam_ok=(tam_ok if tam_ok is not None else (pos == "V" if pos else None))) if es_lem: _meta_set( es_lem, pos=("V" if es_lem.endswith(("ar","er","ir")) else (pos or "")), tam_ok=(tam_ok if tam_ok is not None else (pos == "V" if pos else None)) ) if es and ni != "": LEX_FORM.setdefault(es, ni) if es_lem and ni != "": LEX_LEMMA.setdefault(es_lem, ni) ev = (r.get("evidencia_es") or "").strip().lower() if ev: EVIDENCE_COUNTS[ev] = EVIDENCE_COUNTS.get(ev, 0) + 1 loaded = True # CSV simple (soporte) elif {"source_es", "target_ni"}.issubset(flds): for r in rd: es = (r.get("source_es") or "").strip().lower() ni = (r.get("target_ni") or "").strip() if not es: continue LEX_FORM.setdefault(es, ni) total_simple += 1 _meta_set(es, pos="", tam_ok=None) if looks_like_verb_form_strict(es): lem = guess_infinitive_es(es) if lem: LEX_LEMMA.setdefault(lem, ni) _meta_set(lem, pos="V", tam_ok=True) loaded = True else: print(f"[WARN] Campos no reconocidos en {p}: {sorted(flds)}") except Exception as e: print(f"[WARN] No se pudo leer {p}: {e}") # FOLD_FORM: variantes sin diacríticos (solo para *lookup*; no pisa) global FOLD_FORM FOLD_FORM = {} for k, v in LEX_FORM.items(): fk = fold(k) if fk != k and len(k) >= 5 and not looks_like_verb_form_strict(k): FOLD_FORM.setdefault(fk, v) # ---- Fallbacks mínimos (SOLO si faltan en CSV) ---- # Se eliminan redundancias: numerales, vascoides y otros deben venir ya en el CSV. MIN_FALLBACK_FORM = { # Coordinantes y negación "y": "ne", "o": "o", "no": "eś", # DOM 'a' (si no hay mapeo explícito, se resolverá por regla rule_a) "a": "ka", # Artículos indefinidos (si faltaran) "un": "ban", "una": "ban", } for k, v in MIN_FALLBACK_FORM.items(): if k not in LEX_FORM: LEX_FORM[k] = v # Lemas “core” (SI faltan): solo verbos auxiliares/núcleo del motor MIN_FALLBACK_LEMMA = { "ir": "nitus", "ser": "izan", "estar": "egon", "haber": "ukan", "venir": "nuker", "hacer": "giotael", "ver": "giŕok", "decir": "siśnesiŕ", "poder": "giokk", "tener": "giokk", "poner": "pusen", "salir": "salku", "dar": "buś", "llamar": "lankur", "llover": "xemmo", } for k, v in MIN_FALLBACK_LEMMA.items(): if k not in LEX_LEMMA: LEX_LEMMA[k] = v _meta_set(k, pos="V", tam_ok=True) # Formas forzadas mínimas (imperativos muy frecuentes) — solo si faltan FORCE_FORMS_MIN = { "ven": "nuker-tu", "haz": "giotael-tu", "pon": "pusen-tu", "di": "siśnesir-tu", "llámame": "lankur-tu" } for form, ni in FORCE_FORMS_MIN.items(): if form not in LEX_FORM: LEX_FORM[form] = ni _meta_set(form, pos="V", tam_ok=True) global FORCE_KEYS FORCE_KEYS = set(FORCE_FORMS_MIN.keys()) # ---- LOG RESUMEN ---- if total_rich or total_simple or loaded: print(f"✓ ES→NI: {total_rich} superficies ricas, {total_simple} pares simples") if EVIDENCE_COUNTS: print("\nEvidencias (RICH):") for k, v in sorted(EVIDENCE_COUNTS.items(), key=lambda kv: (-kv[1], kv[0])): print(f" {k}: {v}") return loaded # ========================= # CARGA DE LÉXICO NI→ES # ========================= def load_lexicon_ni_es_from_csv(): """ Carga NI→ES desde CSV dedicado (HF_Pairs_NI_ES_Translator.csv) ✅ Usa 'source_ni' directamente (NO 'ni_surface' reconstruida). """ if not os.path.exists(CSV_NI_ES): return False c_surf = c_form = c_lemma = 0 try: with open(CSV_NI_ES, encoding="utf-8") as f: rd = csv.DictReader(f) for r in rd: ni_surface = (r.get("source_ni") or "").strip() if not ni_surface: continue es_surface = (r.get("target_es") or "").strip() if not es_surface: continue tag = (r.get("es_morph") or "").strip().upper() if not tag or tag == "INF": _, tag0, _ = detect_ni_tam(ni_surface.lower()) tag = tag or tag0 ni_lower = ni_surface.lower() if tag: NI_TO_ES_SURF[(ni_lower, tag)] = es_surface NI_TO_ES_SURF[(_ni_fold(ni_lower), tag)] = es_surface c_surf += 1 NI_TO_ES_FORM[ni_lower] = es_surface NI_TO_ES_FORM[_ni_fold(ni_lower)] = es_surface c_form += 1 es_lemma = (r.get("es_lemma") or "").strip().lower() if es_lemma: root, _, _ = detect_ni_tam(ni_lower) if root: NI_TO_ES_LEMMA[root] = es_lemma NI_TO_ES_LEMMA[_ni_fold(root)] = es_lemma c_lemma += 1 except Exception as e: print(f"[WARN] No se pudo leer {CSV_NI_ES}: {e}") return False if c_surf + c_form + c_lemma > 0: print(f"✓ NI→ES (CSV): {c_surf} superficies+tag, {c_form} formas, {c_lemma} lemas") return True return False def build_inverse_from_esni(): """Construye NI→ES invirtiendo ES→NI cargado (fallback si no hay CSV).""" NI_TO_ES_SURF.clear(); NI_TO_ES_FORM.clear(); NI_TO_ES_LEMMA.clear() c_surf = c_form = c_lemma = 0 for (es_low, tag), ni_surface in SURF_RICH.items(): ni_low = (ni_surface or "").strip().lower() if not ni_low or not es_low or not tag: continue if (ni_low, tag) not in NI_TO_ES_SURF: NI_TO_ES_SURF[(ni_low, tag)] = es_low NI_TO_ES_SURF[(_ni_fold(ni_low), tag)] = es_low c_surf += 1 NI_TO_ES_FORM.setdefault(ni_low, es_low) NI_TO_ES_FORM.setdefault(_ni_fold(ni_low), es_low) for es_form, ni_form in LEX_FORM.items(): ni_low = (ni_form or "").strip().lower() if not ni_low: continue NI_TO_ES_FORM.setdefault(ni_low, es_form) NI_TO_ES_FORM.setdefault(_ni_fold(ni_low), es_form) c_form += 1 root, tag, _ = detect_ni_tam(ni_low) if tag and tag != "INF" and (ni_low, tag) not in NI_TO_ES_SURF: NI_TO_ES_SURF[(ni_low, tag)] = es_form NI_TO_ES_SURF[(_ni_fold(ni_low), tag)] = es_form for es_lemma, ni_lemma in LEX_LEMMA.items(): ni_low = (ni_lemma or "").strip().lower() if not ni_low: continue NI_TO_ES_LEMMA.setdefault(ni_low, es_lemma) NI_TO_ES_FORM.setdefault(ni_low, es_lemma) NI_TO_ES_LEMMA.setdefault(_ni_fold(ni_low), es_lemma) NI_TO_ES_FORM.setdefault(_ni_fold(ni_low), es_lemma) c_lemma += 1 print(f"✓ NI→ES (inversor): {c_surf} superficies+tag, {c_form} formas, {c_lemma} lemas") return True def load_lexicon_ni_es(): """Carga NI→ES: primero intenta CSV, si no existe invierte ES→NI.""" loaded_csv = load_lexicon_ni_es_from_csv() if not loaded_csv: build_inverse_from_esni() # Cobertura mínima (no pisa lo existente) — solo conectores clave KEEP_MIN_NI = {"ne":"y","o":"o","eś":"no","ka":"a","mi":"a","te":"a","ban":"un","ni":"yo","zu":"tú","nar":"él"} for k,v in KEEP_MIN_NI.items(): NI_TO_ES_FORM.setdefault(k.lower(), v) NI_TO_ES_FORM.setdefault(_ni_fold(k.lower()), v) return True print("Cargando léxico ES→NI..."); load_lexicon_es_ni() print("Cargando léxico NI→ES..."); load_lexicon_ni_es() # ========================= # UI CLÁSICA (con dirección) # ========================= LABELS={ "ES":{ "title":"Traductor Español ↔ Neoíbero", "subtitle":"Explora el renacimiento ibérico con tecnología moderna", "in_label_es":"✏️ Entrada (Español)", "in_label_ni":"✏️ Entrada (Neoíbero)", "in_ph_es":"Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.", "in_ph_ni":"Idatzi hemen. Adib.: nuker-ke ni etxe-ka.", "out_lat_esni":"📜 Salida: Neoíbero (latín)", "out_lat_nies":"📜 Salida: Español", "out_ib":"🗿 Línea ibérica", "out_audio":"🔊 Locución (Audio)", "btn":"🔄 Traducir", "combo":"🌍 Idioma (UI + explicación)", "dir":"🔁 Dirección", "dir_opts":["ES → NI","NI → ES"], "doc_header":"📚 Documentación y Referencia", "acc_titles":[ "🎓 Marco académico y decisiones del neoíbero", "🏛️ Herencia posible del íbero histórico", "🎨 Diseño de la conlang (neoíbero)", "⚙️ Pipeline del traductor (paso a paso)", "🔤 Ortografía, línea ibérica y claves", "❓/❗ Modalidad presunto vascoide (-na / -ba)", "📖 Gramática de referencia (v1.2)", "📚 Bibliografía de base", "🧾 Siglas y glosario", "🪶 Léxico vascoide, evidencias y prioridad del CSV" ] }, "EN":{ "title":"Spanish ↔ Neo-Iberian Translator", "subtitle":"Explore the revival of Neo-Iberian with modern tech", "in_label_es":"✏️ Input (Spanish)", "in_label_ni":"✏️ Input (Neo-Iberian)", "in_ph_es":"Type here. E.g., Veo a Ana y doy pan a Marta.", "in_ph_ni":"Type here. E.g., nuker-ke ni etxe-ka.", "out_lat_esni":"📜 Output: Neo-Iberian (Latin)", "out_lat_nies":"📜 Output: Spanish", "out_ib":"🗿 Iberian line", "out_audio":"🔊 Speech (Audio)", "btn":"🔄 Translate", "combo":"🌍 Language (UI + docs)", "dir":"🔁 Direction", "dir_opts":["ES → NI","NI → ES"], "doc_header":"📚 Documentation & Reference", "acc_titles":[ "🎓 Background & design choices", "🏛️ Possible inheritance from ancient Iberian", "🎨 Conlang design (Neo-Iberian)", "⚙️ Translator pipeline (step-by-step)", "🔤 Orthography, Iberian line & keys", "❓/❗ ‘Vascoid’ mood (-na / -ba)", "📖 Reference grammar (v1.2)", "📚 Core references", "🧾 Acronyms & glossary", "🪶 Vascoid lexicon, evidences & CSV priority" ] } } DOC_ES_0 = """**Escritura y datos (visión general).** **Objetivo.** Traducir ES↔NI con salida latina y línea ibérica "visual". **Alcance.** Lengua construida (neoíbero) inspirada en rasgos ibéricos/vascoides; no es reconstrucción histórica. **Datos.** - CSV "ricos" (`HF_Pairs_ES_NI_RICH.csv`): superficies condicionadas por morfología (`source_es`, `es_morph`, `ni_surface`, `ni_root`, `ni_suffix`, `es_lemma`, `pos`, `tam_ok`, `ni_tam`…). - NI→ES se **construye automáticamente** como inverso del ES→NI si no hay CSV dedicado (`HF_Pairs_NI_ES_Translator.csv`). **Motor.** - Analizador ES: adivina **TAM** (tiempo/aspecto/modo) + excepciones irregulares; sirve de *rescate* si el CSV no marca el TAM. - Generador NI: compone raíz + sufijo TAM y añade clíticos modales (-na / -ba). - Reverso NI→ES: *lookup* determinista (superficie+TAM → ES; sin conjugar). **Licencia/datos.** Ficheros CSV locales; puedes ampliarlos sin tocar el código. **Versión.** v2.4 (doc ampliada + prioridad CSV + evidencias).""" DOC_ES_1 = """**Herencia plausible del íbero (resumen no paleográfico).** - **Fonotaxis** preferente **CV(C)**; *p* marginal → **b** en línea ibérica. - **Vibrantes**: /r/ simple, **ŕ** (fuerte) no inicial en grafía latina NI. - **Casos/postposiciones** productivos en NI: `-k` (plural), `-te` (agente/instrumental), `-ar/-en` (genitivo/origen), `-ka` (dativo/locativo/distal), `-i` (acusativo PN). - **Partículas**: **ne** 'y', **o** 'o', **eś** 'no'. - **Numerales** (base 10/20 visual): *ban, bi, irur, laur, borste, sei, sisbi, sorse, lauŕbi, abar, orkei…* **Nota.** Diseño coherente interno > exactitud histórica literal.""" DOC_ES_2 = """**Diseño del neoíbero (fonología + morfología).** **TAM verbal** (sufijos): **PRS** `-ke`, **PST** `-bo`, **FUT** `-ta`, **IPFV** `-ri`, **IMP** `-tu`, **COND/SBJV** `-ni`, **FUT_SBJV** `-ra`. **Derivación**: `-ar`, `-en`, `-la`, `-ŕa`, `-tu` (agente), `-si` (adjetival). **Ortografía latina NI.** /p/→**b**; **ś/ŕ**; guiones `-` visibles para TAM/clíticos. **Orden** preferente **SOV**.""" DOC_ES_3 = """**Pipeline del traductor (ES→NI).** 1) Tokeniza y separa signos (incluye comillas curvas). 2) Elimina artículos/contracciones frecuentes. 3) DOM `a` → **ka/mi/te** (transferencia y PN comunes). 4) Gating POS/TAM: solo verbos reciben TAM. 5) Detección TAM (perífrasis+irregulares) como **rescate** si el CSV no trae `es_morph`. 6) Negación: **eś** antes del último finito. 7) ¿?/¡! → **-na/-ba**; WH sin ¿? → inyecta **-na** + `?`. 8) Línea ibérica: tokens BA/BE/… y **tridots** `/`. 9) Números: pasan tal cual (o según CSV si existen).""" DOC_ES_4 = """**Ortografía, línea ibérica y claves.** Modo `explicit` (BA/BE/BI/BO/BU + A/E/I/O/U). Atajos: `ka`→K, `mi`→MI, `te`→TE, `ne`→N, `o`→O, `eś`→X. Separador de palabra = **tridots `/`**. `-` para TAM/clíticos. Puntuación visible: , . ; : … ( ) [ ] « » — – “ ” ‘ ’. `p`→**b**; codas N/S/Ś/R/Ŕ/L/M/K/T.""" DOC_ES_5 = """**Modalidad -na (¿?) / -ba (¡!).** - `?` → **-na** al último finito (o último constituyente). - `!` → **-ba** idem. - WH fuera de paréntesis sin `?` → inyecta **-na** + `?`. - Evita duplicados de -na/-ba. `¿ ¡` se ignoran en la lógica.""" DOC_ES_6 = """**Gramática mínima (v1.2).** Verbo = raíz + TAM; negación **eś**. Casos: `-k, -te, -ka, -ar/-en, -i`. Pronombres: `ni, zu, nar, gu, zuek, narek`. Orden SOV; coordinaciones **ne/o**. Irregularidades: listados y atajos contextuales (`FORCE_KEYS`).""" DOC_ES_7 = """**Bibliografía / fuentes (selección).** Untermann; de Hoz; Ferrer i Jané; Correa; gramáticas del español para heurísticas. App: decisiones de diseño (no reconstrucción).""" DOC_ES_8 = """**Glosario y datasets.** TAM, DOM, superficie, lema, enclítico, tridots, clave. **CSV ricos (ES→NI)**: `source_es`, `es_morph`, `ni_surface` o (`ni_root`+`ni_suffix`), `es_lemma`, `pos`, `tam_ok`, `ni_tam`, `evidencia_es` (opcional). **NI→ES**: `HF_Pairs_NI_ES_Translator.csv` **o** inversor automático desde ES→NI. **Troubleshooting**: - Regex enclíticos: corregido. - Audio: MMS CPU si no hay CUDA; locución oculta en NI→ES. - `[SIN-LEX:…]` / `[?:…]` exponen huecos para completar el CSV. """ DOC_ES_9 = """**Léxico vascoide, evidencias y prioridad del CSV.** - **Vascoide**: voces marcadas como `evidencia_es = "vascoide"` en el CSV **no se tocan**; se usan tal cual (superficie estricta). - **Conjetural / familia / irregular / blindaje**: se registran y **solo** informan; no alteran la traducción si la superficie viene dada. - **Prioridad CSV**: siempre gana `ni_surface` (o `ni_root`+`ni_suffix`) frente a reglas internas. Las reglas solo actúan como **rescate** si falta entrada o `es_morph`. - **Limpieza**: listas internas (numerales, vascoides, atajos) se reducen al mínimo **solo si faltan** en CSV. - **Simetría**: NI→ES toma el CSV dedicado; si falta, invierte ES→NI conservando TAM.""" DOC = { "ES": [DOC_ES_0, DOC_ES_1, DOC_ES_2, DOC_ES_3, DOC_ES_4, DOC_ES_5, DOC_ES_6, DOC_ES_7, DOC_ES_8, DOC_ES_9], "EN": [ "Script & data (overview).", "Possible inheritance (non-palaeographic).", "Neo-Iberian design (phonology & morphology).", "Translator pipeline (ES→NI).", "Orthography, Iberian line & keys.", "‘Vascoid’ mood (-na / -ba).", "Minimal grammar (v1.2).", "Selected references.", "Glossary & datasets.", "Vascoid lexicon, evidences & CSV priority." ] } def build_css(): b64=None if os.path.exists("Iberia-Georgeos.ttf"): with open("Iberia-Georgeos.ttf","rb") as f: b64=base64.b64encode(f.read()).decode("ascii") font_src = f"url(data:font/ttf;base64,{b64}) format('truetype')" if b64 else "local('sans-serif')" return f""" @font-face {{ font-family: 'IberiaGeorgeos'; src: {font_src}; font-weight: normal; font-style: normal; }} :root {{ --iberian-clay:#8B4513; --iberian-ochre:#CC7722; --iberian-stone:#5C5C5C; --iberian-sand:#D2B48C; --iberian-rust:#A0522D; --iberian-bronze:#CD7F32; }} .gradio-container {{ background:linear-gradient(135deg,#f4e8d8 0%,#e8d5c4 50%,#d4c4b0 100%)!important; font-family:'Georgia','Times New Roman',serif!important; }} .gradio-container h1,.gradio-container h2,.gradio-container h3 {{ color:var(--iberian-clay)!important; text-shadow:2px 2px 4px rgba(139,69,19,.15)!important; border-bottom:3px solid var(--iberian-bronze)!important; padding-bottom:.5rem!important; letter-spacing:.5px!important; }} .gradio-container .gr-group {{ background:linear-gradient(to bottom,#f9f6f0,#ede6dc)!important; border:2px solid var(--iberian-sand)!important; border-radius:8px!important; box-shadow:0 4px 12px rgba(139,69,19,.2), inset 0 1px 0 rgba(255,255,255,.5)!important; padding:1.5rem!important; margin-bottom:1.5rem!important; }} .gradio-container .gr-accordion {{ background:linear-gradient(145deg,#ebe3d5,#d9cec0)!important; border:2px solid var(--iberian-rust)!important; border-radius:6px!important; margin-bottom:.8rem!important; box-shadow:2px 2px 6px rgba(0,0,0,.15)!important; }} .gradio-container .gr-accordion .label-wrap {{ background:linear-gradient(to right,var(--iberian-ochre),var(--iberian-rust))!important; color:#fff!important; font-weight:600!important; padding:.8rem 1rem!important; border-radius:4px!important; text-shadow:1px 1px 2px rgba(0,0,0,.3)!important; }} .gradio-container .gr-textbox textarea,.gradio-container .gr-textbox input {{ background:linear-gradient(to bottom,#faf8f3,#f5f0e8)!important; border:2px solid var(--iberian-sand)!important; border-radius:6px!important; color:var(--iberian-stone)!important; font-family:'Georgia',serif!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1)!important; }} .gradio-container .gr-textbox textarea:focus,.gradio-container .gr-textbox input:focus {{ border-color:var(--iberian-bronze)!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1), 0 0 8px rgba(205,127,50,.3)!important; }} .gradio-container .gr-button.gr-button-primary {{ background:linear-gradient(145deg,var(--iberian-bronze),var(--iberian-rust))!important; border:2px solid var(--iberian-clay)!important; color:#fff!important; font-weight:bold!important; text-shadow:1px 1px 2px rgba(0,0,0,.4)!important; box-shadow:0 4px 8px rgba(139,69,19,.3), inset 0 1px 0 rgba(255,255,255,.2)!important; border-radius:8px!important; padding:.8rem 1.5rem!important; transition:all .3s ease!important; }} .gradio-container .gr-button.gr-button-primary:hover {{ background:linear-gradient(145deg,var(--iberian-rust),var(--iberian-bronze))!important; transform:translateY(-2px)!important; box-shadow:0 6px 12px rgba(139,69,19,.4)!important; }} .ib-line {{ font-family:'IberiaGeorgeos',monospace,sans-serif!important; font-size:1.9rem!important; line-height:2.4rem!important; white-space:pre-wrap!important; background:linear-gradient(135deg,#e8dcc8 0%,#d4c4a8 50%,#c4b098 100%)!important; padding:24px!important; border-radius:10px!important; border:3px solid var(--iberian-rust)!important; border-left:6px solid var(--iberian-bronze)!important; box-shadow:0 4px 15px rgba(139,69,19,.25), inset 0 2px 4px rgba(0,0,0,.1)!important; color:var(--iberian-clay)!important; position:relative!important; }} .ib-line::before {{ content:''!important; position:absolute!important; inset:0!important; background-image:repeating-linear-gradient(0deg,transparent,transparent 2px, rgba(139,69,19,.03) 2px, rgba(139,69,19,.03) 4px)!important; pointer-events:none!important; border-radius:10px!important; }} @media (max-width:768px) {{ .ib-line {{ font-size:1.5rem!important; line-height:2rem!important; padding:16px!important; }} .gradio-container .gr-group {{ padding:1rem!important; }} .gradio-container h1 {{ font-size:1.8rem!important; }} }} @media (max-width:480px) {{ .ib-line {{ font-size:1.3rem!important; line-height:1.8rem!important; padding:12px!important; }} .gradio-container h1 {{ font-size:1.5rem!important; }} }} """ CSS = build_css() with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")) as demo: with gr.Group(): title = gr.Markdown(f"# {LABELS['ES']['title']}") subtitle = gr.Markdown(f"*{LABELS['ES']['subtitle']}*") with gr.Row(): combo = gr.Dropdown(choices=["ES","EN"], value="ES", label=LABELS["ES"]["combo"]) direction = gr.Radio(choices=LABELS["ES"]["dir_opts"], value="ES → NI", label=LABELS["ES"]["dir"]) with gr.Group(): doc_header = gr.Markdown(f"## {LABELS['ES']['doc_header']}") acc_titles = LABELS["ES"]["acc_titles"] with gr.Accordion(acc_titles[0], open=False) as acc1: md1 = gr.Markdown(DOC["ES"][0]) with gr.Accordion(acc_titles[1], open=False) as acc2: md2 = gr.Markdown(DOC["ES"][1]) with gr.Accordion(acc_titles[2], open=False) as acc3: md3 = gr.Markdown(DOC["ES"][2]) with gr.Accordion(acc_titles[3], open=False) as acc4: md4 = gr.Markdown(DOC["ES"][3]) with gr.Accordion(acc_titles[4], open=False) as acc5: md5 = gr.Markdown(DOC["ES"][4]) with gr.Accordion(acc_titles[5], open=False) as acc6: md6 = gr.Markdown(DOC["ES"][5]) with gr.Accordion(acc_titles[6], open=False) as acc7: md7 = gr.Markdown(DOC["ES"][6]) with gr.Accordion(acc_titles[7], open=False) as acc8: md8 = gr.Markdown(DOC["ES"][7]) with gr.Accordion(acc_titles[8], open=False) as acc9: md9 = gr.Markdown(DOC["ES"][8]) with gr.Accordion(acc_titles[9], open=False) as acc10: md10 = gr.Markdown(DOC["ES"][9]) with gr.Group(): es_in = gr.Textbox(label=LABELS["ES"]["in_label_es"], placeholder=LABELS["ES"]["in_ph_es"], lines=5) btn_tr = gr.Button(LABELS["ES"]["btn"], variant="primary") with gr.Row(): with gr.Column(scale=2): ni_out = gr.Textbox(label=LABELS["ES"]["out_lat_esni"], lines=5, interactive=False) loc_btn = gr.Button("🔊 Locutar", variant="secondary", visible=False) audio_out = gr.Audio(label=LABELS["ES"]["out_audio"], type="numpy") with gr.Column(scale=1): ib_out = gr.HTML(label=LABELS["ES"]["out_ib"]) def do_translate(text, dir_label): if not text or not text.strip(): return (gr.update(value=""), gr.update(value="
"), gr.update(visible=False), gr.update(value=None)) if dir_label.startswith("ES"): latin, ib = translate(text) ib_html = "
" + escape(ib) + "
" return (gr.update(label=LABELS["ES"]["out_lat_esni"], value=latin), gr.update(value=ib_html), gr.update(visible=True), gr.update(value=None)) else: es_text = translate_ni_to_es(text) return (gr.update(label=LABELS["ES"]["out_lat_nies"], value=es_text), gr.update(value="
"), gr.update(visible=False), gr.update(value=None)) btn_tr.click(do_translate, [es_in, direction], [ni_out, ib_out, loc_btn, audio_out]) def run_locution(latin_text, dir_label): if dir_label.startswith("ES"): return synthesize_speech(latin_text) return None loc_btn.click(run_locution, [ni_out, direction], audio_out) def switch_lang(sel_lang, dir_label): L=LABELS[sel_lang]; T=L["acc_titles"]; D=DOC[sel_lang] in_label = L["in_label_es"] if dir_label.startswith("ES") else L["in_label_ni"] in_ph = L["in_ph_es"] if dir_label.startswith("ES") else L["in_ph_ni"] out_lab = L["out_lat_esni"] if dir_label.startswith("ES") else L["out_lat_nies"] return ( gr.update(value=f"# {L['title']}"), gr.update(value=f"*{L['subtitle']}*"), gr.update(label=L["combo"], value=sel_lang), gr.update(label=L["dir"], choices=L["dir_opts"], value=dir_label), gr.update(value=f"## {L['doc_header']}"), gr.update(label=T[0]), gr.update(value=D[0]), gr.update(label=T[1]), gr.update(value=D[1]), gr.update(label=T[2]), gr.update(value=D[2]), gr.update(label=T[3]), gr.update(value=D[3]), gr.update(label=T[4]), gr.update(value=D[4]), gr.update(label=T[5]), gr.update(value=D[5]), gr.update(label=T[6]), gr.update(value=D[6]), gr.update(label=T[7]), gr.update(value=D[7]), gr.update(label=T[8]), gr.update(value=D[8]), gr.update(label=T[9]), gr.update(value=D[9]), gr.update(label=in_label, placeholder=in_ph), gr.update(label=out_lab), gr.update(label=L["out_ib"]), gr.update(label=L["out_audio"]), gr.update(value=L["btn"]) ) combo.change( switch_lang, [combo, direction], [title, subtitle, combo, direction, doc_header, acc1, md1, acc2, md2, acc3, md3, acc4, md4, acc5, md5, acc6, md6, acc7, md7, acc8, md8, acc9, md9, acc10, md10, es_in, ni_out, ib_out, audio_out, btn_tr] ) def switch_direction(dir_label, sel_lang): L=LABELS[sel_lang] in_label = L["in_label_es"] if dir_label.startswith("ES") else L["in_label_ni"] in_ph = L["in_ph_es"] if dir_label.startswith("ES") else L["in_ph_ni"] out_lab = L["out_lat_esni"] if dir_label.startswith("ES") else L["out_lat_nies"] loc_vis = True if dir_label.startswith("ES") else False return (gr.update(label=in_label, placeholder=in_ph), gr.update(label=out_lab, value=""), gr.update(value="
"), gr.update(visible=loc_vis), gr.update(value=None)) direction.change( switch_direction, [direction, combo], [es_in, ni_out, ib_out, loc_btn, audio_out] ) if __name__ == "__main__": demo.queue().launch()