# app.py — Traductor Español ↔ Neoíbero v4.4.1 FIXED # UI clásica (v2.3 LTS) + motor bidireccional v4.4, con Línea Ibérica y Locución # 2025-01 – Actualizado para CSVs v4.4 ULTRA-DEFINITIVO + FIXES CRÍTICOS # Cambios v4.4: # - Compatible con HF_Pairs_ES_NI_RICH_v4.csv (783K pares) # - Compatible con HF_Pairs_NI_ES_Translator_v4.csv (783K pares) # - Usa campos nuevos: ni_surface, ni_tam, ni_pn # - Números 1-100 invariables funcionando # - Subjuntivos irregulares corregidos (vengas ✅) # - RESPETA caracteres especiales (ŕ, ś) completamente # Cambios v4.4.1 FIXED: # - FIX: Carga correcta de CSV NI→ES (índices y minúsculas corregidos) # - FIX: Sistema de conjugación con persona/número completo (1S,2S,3S,1P,2P,3P) # - FIX: Interrogativas y exclamativas (-na/-ba) mejoradas # - FIX: Preservación de nombres propios en traducción NI→ES import gradio as gr import os, csv, re, base64, unicodedata import torch from transformers import AutoProcessor, VitsModel import numpy as np from html import escape # ← para escapar la línea ibérica en HTML # Caches locales (si existen) os.environ['TRANSFORMERS_CACHE'] = os.environ.get('TRANSFORMERS_CACHE', '/tmp/cache') os.environ['HF_HOME'] = os.environ.get('HF_HOME', '/tmp/hf') DEBUG_MODE = False def debug_print(msg): if DEBUG_MODE: print(f"[DEBUG] {msg}") # ========================= # LÉXICO Y ESTRUCTURAS v4.4 # ========================= # << RUTAS EN RAÍZ DEL REPO >> CSV_CANDIDATES = [ "HF_Pairs_ES_NI_RICH_v4.csv", # ← NUEVO v4.4 "HF_Pairs_ES_NI_RICH.csv", "HF_Pairs_ES_NI.csv", "Diccionario_ES_Neoibero.csv", ] CSV_NI_ES = [ "HF_Pairs_NI_ES_Translator_v4.csv", # ← NUEVO v4.4 "HF_Pairs_NI_ES_Translator.csv", ] # ES→NI SURF_RICH = {} # (es_lower, tag) -> ni_surface LEX_FORM = {} # es_form -> ni_lemma/surface LEX_LEMMA = {} # es_lemma -> ni_lemma FOLD_FORM = {} # es_form_no_diacritics -> ni_lemma LEX_META = {} # es_form/lemma -> {"pos":..., "tam_ok":...} FORCE_KEYS = set() # NI→ES NI_TO_ES_SURF = {} # (ni_surface, ni_tam) -> es_surface NI_TO_ES_FORM = {} # ni_form/root -> es_form NI_TO_ES_LEMMA= {} # ni_root -> es_lemma # Mapeo de sufijos de persona en neoíbero → español NI_PERSON_MAP = { "-mu": "1S", # yo "-su": "2S", # tú "-i": "3S", # él/ella "-gu": "1P", # nosotros "-zu": "2P", # vosotros "-te": "3P", # ellos/ellas } # ========================= # MORFOLOGÍA – ESPAÑOL # ========================= RE_GER = re.compile(r"(ando|iendo|yendo)$", re.I) RE_PART = re.compile(r"(ado|ido|to|so|cho)$", re.I) FUT_END = ("é","ás","á","emos","éis","án") COND_END = ("ía","ías","ía","íamos","íais","ían") PRET_AR = ("é","aste","ó","amos","asteis","aron") PRET_ERIR = ("í","iste","ió","imos","isteis","ieron") IMPF_AR = ("aba","abas","ábamos","abais","aban") IMPF_ERIR = ("ía","ías","íamos","íais","ían") SUBJ_AR = ("e","es","e","emos","éis","en") SUBJ_ERIR = ("a","as","a","amos","áis","an") SUBJ_PAST_AR = ("ara","aras","ara","áramos","arais","aran","ase","ases","ase","ásemos","aseis","asen") SUBJ_PAST_ERIR = ("iera","ieras","iera","iéramos","ierais","ieran","iese","ieses","iese","iésemos","ieseis","iesen") PRS_AR = ("o","as","a","amos","áis","an") PRS_ER = ("o","es","e","emos","éis","en") PRS_IR = ("o","es","e","imos","ís","en") # ——— Parches FUT/COND sin tilde + tallos irregulares ——— RE_COND_NT_REG = re.compile(r"(?:ar|er|ir)(?:ia|ias|iamos|iais|ian)$", re.I) # hablaria, comerias... RE_COND_NT_IRR = re.compile(r"(tendr|vendr|pondr|saldr|valdr|podr|habr|sabr|cabr|querr|dir|har)(?:ia|ias|iamos|iais|ian)$", re.I) RE_FUT_NT_IRR = re.compile(r"(tendr|vendr|pondr|saldr|valdr|podr|habr|sabr|cabr|querr|dir|har)(?:re|ras|ra|remos|reis|ran)$", re.I) def _strip_any(w, ends): for s in sorted(ends, key=len, reverse=True): if w.endswith(s): return w[:-len(s)], s return None, None def _guess_class_from_ending(ending): if ending in PRET_AR or ending in IMPF_AR or ending in SUBJ_AR or ending in PRS_AR: return "ar" return "er" IRREG_LEMMA = { "fui":"ir","fuiste":"ir","fue":"ir","fuimos":"ir","fuisteis":"ir","fueron":"ir", "voy":"ir","vas":"ir","va":"ir","vamos":"ir","vais":"ir","van":"ir", "soy":"ser","eres":"ser","es":"ser","somos":"ser","sois":"ser","son":"ser", "era":"ser","eras":"ser","éramos":"ser","erais":"ser","eran":"ser", "he":"haber","has":"haber","ha":"haber","hemos":"haber","habéis":"haber","han":"haber", "hube":"haber","hubo":"haber","hubimos":"haber","hubiste":"haber","hubisteis":"haber","hubieron":"haber", "estoy":"estar","estás":"estar","está":"estar","estamos":"estar","estáis":"estar","están":"estar", "estuve":"estar","estuviste":"estar","estuvo":"estar","estuvimos":"estar","estuvisteis":"estar","estuvieron":"estar", "estaba":"estar","estabas":"estar","estábamos":"estar","estabais":"estar","estaban":"estar", "tuve":"tener","tuviste":"tener","tuvo":"tener","tuvimos":"tener","tuvisteis":"tener","tuvieron":"tener", "vine":"venir","viniste":"venir","vino":"venir","vinimos":"venir","vinisteis":"venir","vinieron":"venir", "hice":"hacer","hiciste":"hacer","hizo":"hacer","hicimos":"hacer","hicisteis":"hacer","hicieron":"hacer", "puse":"poner","pusiste":"poner","puso":"poner","pusimos":"poner","pusisteis":"poner","pusieron":"poner", "pude":"poder","pudiste":"poder","pudo":"poder","pudimos":"poder","pudisteis":"poder","pudieron":"poder", "quise":"querer","quisiste":"querer","quiso":"querer","quisimos":"querer","quisisteis":"querer","quisieron":"querer", "supe":"saber","supiste":"saber","supo":"saber","supimos":"saber","supisteis":"saber","supieron":"saber", "traje":"traer","trajiste":"traer","trajo":"traer","trajimos":"traer","trajisteis":"traer","trajeron":"traer", "dije":"decir","dijiste":"decir","dijo":"decir","dijimos":"decir","dijisteis":"decir","dijeron":"decir", "conduje":"conducir","condujiste":"conducir","condujo":"conducir","condujimos":"conducir","condujisteis":"conducir","condujeron":"conducir", "anduve":"andar","anduviste":"andar","anduvo":"andar","anduvimos":"andar","anduvisteis":"andar","anduvieron":"andar", "cupe":"caber","cupiste":"caber","cupo":"caber","cupimos":"caber","cupisteis":"caber","cupieron":"caber", "di":"dar","diste":"dar","dio":"dar","dimos":"dar","disteis":"dar","dieron":"dar", "vi":"ver","viste":"ver","vio":"ver","vimos":"ver","visteis":"ver","vieron":"ver", "tengo":"tener","vengo":"venir","pongo":"poner","salgo":"salir","traigo":"traer","caigo":"caer", "hago":"hacer","oigo":"oír","digo":"decir","valgo":"valer","sigo":"seguir", "tienes":"tener","tiene":"tener","tienen":"tener", "vienes":"venir","viene":"venir","vienen":"venir", "pienso":"pensar","piensas":"pensar","piensa":"pensar","piensan":"pensar", "quiero":"querer","quieres":"querer","quiere":"querer","quieren":"querer", "prefiero":"preferir","prefieres":"preferir","prefiere":"preferir","prefieren":"preferir", "vaya":"ir","vayas":"ir","vayamos":"ir","vayáis":"ir","vayan":"ir", "sea":"ser","seas":"ser","seamos":"ser","seáis":"ser","sean":"ser", "haya":"haber","hayas":"haber","hayamos":"haber","hayáis":"haber","hayan":"haber", "dé":"dar","des":"dar","demos":"dar","deis":"dar","den":"dar", "esté":"estar","estés":"estar","estemos":"estar","estéis":"estar","estén":"estar", "tenga":"tener","tengas":"tener","tengamos":"tener","tengáis":"tener","tengan":"tener", "venga":"venir","vengas":"venir","vengamos":"venir","vengáis":"venir","vengan":"venir", # ← FIX v4.4 "ve":"ir","id":"ir", "sé":"ser","sed":"ser", "haz":"hacer","haced":"hacer", "pon":"poner","poned":"poner", "ven":"venir","venid":"venir", "ten":"tener","tened":"tener", "sal":"salir","salid":"salir", "di":"decir","decid":"decir", "doy":"dar","das":"dar","da":"dar","damos":"dar","dais":"dar","dan":"dar", "veo":"ver","ves":"ver","vemos":"ver","veis":"ver","ven":"ver", "oí":"oír","oíste":"oír","oyó":"oír","oímos":"oír","oísteis":"oír","oyeron":"oír", "iba":"ir","ibas":"ir","íbamos":"ir","ibais":"ir","iban":"ir", "veía":"ver","veías":"ver","veíamos":"ver","veíais":"ver","veían":"ver", "vinieras":"venir","lloviera":"llover", } # NUEVO: lemas para futuro de subjuntivo arcaico IRREG_LEMMA.update({ "viniere":"venir","vinieres":"venir","vinieren":"venir", "hiciere":"hacer","hicieres":"hacer","hicieren":"hacer", "tuviere":"tener","tuvieres":"tener","tuvieren":"tener", }) IRREG_MORPH_TAGS = { # Subjuntivos "vaya":"SBJV","vayas":"SBJV","vayamos":"SBJV","vayáis":"SBJV","vayan":"SBJV", "sea":"SBJV","seas":"SBJV","seamos":"SBJV","seáis":"SBJV","sean":"SBJV", "haya":"SBJV","hayas":"SBJV","hayamos":"SBJV","hayáis":"SBJV","hayan":"SBJV", "dé":"SBJV","des":"SBJV","demos":"SBJV","deis":"SBJV","den":"SBJV", "esté":"SBJV","estés":"SBJV","estemos":"SBJV","estéis":"SBJV","estén":"SBJV", "tenga":"SBJV","tengas":"SBJV","tengamos":"SBJV","tengáis":"SBJV","tengan":"SBJV", "venga":"SBJV","vengas":"SBJV","vengamos":"SBJV","vayáis":"SBJV","vengan":"SBJV", "haga":"SBJV","hagas":"SBJV","hagamos":"SBJV","hagáis":"SBJV","hagan":"SBJV", "pueda":"SBJV","puedas":"SBJV","podamos":"SBJV","podáis":"SBJV","puedan":"SBJV", # Imperativos "id":"IMP","sed":"IMP", "haz":"IMP","haced":"IMP","pon":"IMP","poned":"IMP","ven":"IMP","venid":"IMP", "ten":"IMP","tened":"IMP","sal":"IMP","salid":"IMP","decid":"IMP", # Imperativos con clíticos "llámame":"IMP","llámalo":"IMP","llámala":"IMP","llámanos":"IMP","llámalos":"IMP","llámalas":"IMP", "dime":"IMP","dímelo":"IMP","dinos":"IMP","dínoslo":"IMP", "hazme":"IMP","hazlo":"IMP","hazla":"IMP","haznos":"IMP", "ponme":"IMP","ponlo":"IMP","ponla":"IMP","ponnos":"IMP", "dame":"IMP","dámelo":"IMP","danos":"IMP","dánoslo":"IMP", "tráeme":"IMP","tráelo":"IMP","tráela":"IMP","tráenos":"IMP", "díselo":"IMP","pónselo":"IMP","házselo":"IMP", # Futuro de subjuntivo (arcaico) "viniere":"FUT_SBJV","vinieres":"FUT_SBJV","vinieren":"FUT_SBJV", "hiciere":"FUT_SBJV","hicieres":"FUT_SBJV","hicieren":"FUT_SBJV", "fuere":"FUT_SBJV","fueres":"FUT_SBJV","fueren":"FUT_SBJV", "hubiere":"FUT_SBJV","hubieres":"FUT_SBJV","hubieren":"FUT_SBJV", # Pretéritos "creísteis":"PST","dijisteis":"PST","hicisteis":"PST","pusisteis":"PST", "supisteis":"PST","quisisteis":"PST","trajisteis":"PST", "vi":"PST","dio":"PST","fue":"PST","fui":"PST", # Imperfectos "iba":"IPFV","ibas":"IPFV","íbamos":"IPFV","ibais":"IPFV","iban":"IPFV", "veía":"IPFV","veías":"IPFV","veíamos":"IPFV","veíais":"IPFV","veían":"IPFV", } def looks_like_verb_form_strict(w: str) -> bool: w = (w or "").lower() if w.endswith(("ar","er","ir")): return True if RE_GER.search(w) or RE_PART.search(w): return True if re.search(r"(á|ás|áis|és|éis|ís)$", w): return True if _strip_any(w, FUT_END+COND_END)[0] is not None: return True if _strip_any(w, PRET_AR+PRET_ERIR)[0] is not None: return True if _strip_any(w, IMPF_AR+IMPF_ERIR)[0] is not None: return True if _strip_any(w, SUBJ_PAST_AR+SUBJ_PAST_ERIR)[0] is not None: return True # Irregulares sin tilde (FUT/COND) + COND regular sin tilde if RE_COND_NT_REG.search(w): return True if RE_COND_NT_IRR.search(w): return True if RE_FUT_NT_IRR.search(w): return True # Irregulares pretéritos y compañía if re.search(r"(anduve|anduviste|anduvo|anduvimos|anduvieron|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$", w): return True return False def _zco_guess(w:str)->str: if w.endswith("uzco"): return w[:-4] + "ucir" if w.endswith("ezco"): return w[:-4] + "ecer" if w.endswith("ozco"): return w[:-4] + "ocer" if w.endswith("azco"): return w[:-4] + "acer" return "" def guess_infinitive_es(w: str) -> str: w = (w or "").lower() if w in IRREG_LEMMA: return IRREG_LEMMA[w] if w in ("vámonos","vamonos"): return "ir" if w.endswith("zco"): z = _zco_guess(w) if z: return z if w.endswith("go"): base = w[:-2] map_go = {"ten":"tener","ven":"venir","pon":"poner","sal":"salir","tra":"traer","ca":"caer","ha":"hacer","oi":"oír","di":"decir","val":"valer","si":"seguir"} for k,v in map_go.items(): if base.startswith(k): return v if w.endswith(("ar","er","ir")): return w m = RE_GER.search(w) if m: base = w[:m.start()] return base + ("ar" if m.group(0)=="ando" else "er") m = RE_PART.search(w) if m: base = w[:m.start()] part_irreg = { "hecho":"hacer","dicho":"decir","visto":"ver","puesto":"poner","escrito":"escribir", "abierto":"abrir","cubierto":"cubrir","muerto":"morir","roto":"romper", "vuelto":"volver","resuelto":"resolver","frito":"freír","impreso":"imprimir", "satisfecho":"satisfacer","provisto":"proveer" } if w in part_irreg: return part_irreg[w] return base + "er" # FUT/COND con tilde base, end = _strip_any(w, FUT_END+COND_END) if base is not None: irreg = {"saldr":"salir","vendr":"venir","tendr":"tener","pondr":"poner","valdr":"valer","podr":"poder", "habr":"haber","sabr":"saber","cabr":"caber","querr":"querer","dir":"decir","har":"hacer"} if base in irreg: return irreg[base] return base # COND sin tilde (regular e irregular) m = RE_COND_NT_IRR.search(w) if m: irreg = {"saldr":"salir","vendr":"venir","tendr":"tener","pondr":"poner","valdr":"valer","podr":"poder", "habr":"haber","sabr":"saber","cabr":"caber","querr":"querer","dir":"decir","har":"hacer"} stem = m.group(1) return irreg.get(stem, "") m = RE_COND_NT_REG.search(w) if m: suf = m.group(0).replace("ar","",1).replace("er","",1).replace("ir","",1) # "ia" / "ias" / ... return w[:-len(suf)] # quita "ia/ias/..." → deja el infinitivo # FUT sin tilde (solo irregulares para evitar ambigüedades) m = RE_FUT_NT_IRR.search(w) if m: irreg = {"saldr":"salir","vendr":"venir","tendr":"tener","pondr":"poner","valdr":"valer","podr":"poder", "habr":"haber","sabr":"saber","cabr":"caber","querr":"querer","dir":"decir","har":"hacer"} stem = m.group(1) return irreg.get(stem, "") # Otras heurísticas if w.endswith("áis"): return w[:-3] + "ar" if w.endswith("éis"): return w[:-3] + "er" if w.endswith("ís"): return w[:-2] + "ir" if w.endswith("ás"): return w[:-2] + "ar" if w.endswith("és"): return w[:-2] + "er" if w.endswith("á"): return w[:-1] + "ar" for group in (PRET_AR+PRET_ERIR, IMPF_AR+IMPF_ERIR, SUBJ_AR+SUBJ_ERIR, PRS_AR+PRS_ER+PRS_IR): base, end = _strip_any(w, group) if base is not None: return base + _guess_class_from_ending(end) base, end = _strip_any(w, SUBJ_PAST_AR) if base is not None: return base + "ar" base, end = _strip_any(w, SUBJ_PAST_ERIR) if base is not None: return base + "er" return "" def es_morph_tag(w: str) -> str: w = (w or "").lower() if w in IRREG_MORPH_TAGS: return IRREG_MORPH_TAGS[w] # Imperativos con clíticos / perífrasis if re.search(r"^(llám|dím|házm|pónm|vén|dám|tén|tráe)(a|e)?(me|te|lo|la|nos|os|les|se|melo|telo|selo)$", w): return "IMP" if re.search(r"(adme|edme|idme|adlo|edle|idle|adnos|ednos)$", w): return "IMP" if re.search(r"(?:ad|ed|id|ád|éd|íd)(?:me|te|se|lo|la|nos|os|les|melo|telo|selo|noslo|oslo|sela|selas|selos)$", w): return "IMP" if re.search(r"^.*[áéí]ndo(me|te|se|lo|la|nos|os|les|melo|telo|selo)$", w): return "IPFV" if re.search(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", w): base = re.sub(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", "", w) if base and len(base) > 2: return "IMP" # FUT/COND sin tilde (prioridad antes de otras reglas) if RE_FUT_NT_IRR.search(w): return "FUT" if RE_COND_NT_IRR.search(w): return "COND" if RE_COND_NT_REG.search(w): return "COND" if w.endswith(("ar","er","ir")): return "INF" if RE_GER.search(w): return "IPFV" if RE_PART.search(w): return "PST" if _strip_any(w, PRET_AR+PRET_ERIR)[0] is not None: return "PST" if _strip_any(w, IMPF_AR+IMPF_ERIR)[0] is not None: return "IPFV" if _strip_any(w, FUT_END)[0] is not None: return "FUT" if _strip_any(w, COND_END)[0] is not None: return "COND" if re.search(r"(á|ás|áis|és|éis|ís)$", w): return "PRS" if _strip_any(w, SUBJ_AR+SUBJ_ERIR)[0] is not None: return "SBJV" if _strip_any(w, PRS_AR+PRS_ER+PRS_IR)[0] is not None: return "PRS" if _strip_any(w, SUBJ_PAST_AR+SUBJ_PAST_ERIR)[0] is not None: return "SBJV" if re.search(r"(anduve|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$", w): return "PST" if re.search(r"^.+[aei]d$", w): return "IMP" return "UNK" # ========================= # MORFOLOGÍA – NEOÍBERO # ========================= NI_TAM_SUFFIXES = { "-ke": "PRS","-ei": "PST","-ta": "IPFV","-na": "FUT", "-ne": "COND","-ni": "SBJV","-tu": "IMP","-ra":"FUT_SBJV" } def detect_ni_tam(word: str): # Parche: tolerar colas pronominales tras TAM (p. ej., -i, -mu) word = (word or "").lower().strip() for pn in ("-i", "-mu", "-su", "-gu", "-zu", "-te"): if word.endswith(pn): cand = word[:-len(pn)] # solo aceptamos cortar PN si entonces aparece un TAM conocido if any(cand.endswith(suf) for suf in NI_TAM_SUFFIXES.keys()): word = cand break for suf, tag in NI_TAM_SUFFIXES.items(): if word.endswith(suf): return word[:-len(suf)], tag, suf return word, "INF", "" # ========================= # UTILIDADES # ========================= def fold(s:str)->str: return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c)!="Mn") def has_diacritic(s:str)->bool: return bool(re.search(r"[áéíóúüÁÉÍÓÚÜ]", s or "")) def _canon_pos(p: str) -> str: p = (p or "").strip().upper() MAP = {"V":"V","VERB":"V","N":"N","NOUN":"N","ADJ":"ADJ","ADJECTIVE":"ADJ","ADV":"ADV","ADVERB":"ADV", "INTJ":"INTJ","INTERJ":"INTJ","INTERJECTION":"INTJ","PRON":"PRON","PRONOUN":"PRON", "PART":"PART","PARTICLE":"PART","POSTP":"POSTP","ADP":"POSTP","ADPOSITION":"POSTP", "NUM":"NUM","DET":"DET"} return MAP.get(p, "") def _boolish(x): if x is None: return None s = str(x).strip().lower() if s in ("1","true","t","yes","y","si","sí"): return True if s in ("0","false","f","no","n"): return False return None def _meta_set(form_es:str, pos:str=None, tam_ok=None): if not form_es: return d = LEX_META.setdefault(form_es, {}) if pos and not d.get("pos"): d["pos"] = pos if tam_ok is not None and d.get("tam_ok") is None: d["tam_ok"] = bool(tam_ok) def pos_of_es(token_low:str) -> str: m = LEX_META.get(token_low, {}) if m.get("pos"): return m["pos"] return "V" if looks_like_verb_form_strict(token_low) else "" def tam_allowed_for_es(token_low:str) -> bool: m = LEX_META.get(token_low, {}) if m.get("tam_ok") is not None: return bool(m["tam_ok"]) return pos_of_es(token_low) == "V" # ========================= # TTS (Meta MMS) # ========================= print("Cargando modelo de voz...") device = "cuda" if torch.cuda.is_available() else "cpu" processor = model = None try: processor = AutoProcessor.from_pretrained("facebook/mms-tts-spa") model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device) print("Modelo de voz cargado.") except Exception as e: print(f"ERROR TTS: {e}") PAUSE_LEVEL=3 def add_reading_pauses(text: str, level:int=3) -> str: if level <= 1: return text t = text if level >= 2: t = re.sub(r",\s*", ", , ", t) if level >= 3: t = re.sub(r"\.\s*", ". . ", t); t = re.sub(r";\s*", "; ; ", t) return re.sub(r"\s+"," ",t).strip() def hispanize_for_tts(ni_text: str) -> str: text=(ni_text or "").lower() # CRÍTICO: Respetar caracteres iberos text=text.replace('ŕ','rr').replace('ś','s').replace('eś','es') text=text.replace('ŕa','rra').replace('aŕe','arre').replace('-', ' ') text=re.sub(r'\[.*?\]','',text) text=re.sub(r'\s+',' ',text).strip() return add_reading_pauses(text, PAUSE_LEVEL) def synthesize_speech(text): if not text or not text.strip() or model is None or processor is None: return None try: inputs = processor(text=hispanize_for_tts(text), return_tensors="pt").to(device) with torch.no_grad(): output = model(**inputs).waveform speech_np = output.cpu().numpy().squeeze() mx = max(abs(speech_np.min()), abs(speech_np.max())) if mx>0: speech_np = speech_np/mx*0.9 return (16000, speech_np.astype(np.float32)) except Exception as e: print(f"Error TTS: {e}"); return None # ========================= # LÍNEA IBÉRICA (claves Georgeos) # ========================= KEYS_MODE = "explicit" V = "aeiou" SYL_FOR={"b":["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"], "d":["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"], "t":["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"], "g":["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"], "k":["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]} ALPHA_FOR={"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›","l":"‹L›","r":"‹R›","ŕ":"‹Ŕ›","n":"‹N›","m":"‹M›"} CODA_FOR={"":"","n":"‹N›","s":"‹S›","ś":"‹Ś›","r":"‹R›","ŕ":"‹Ŕ›","l":"‹L›","m":"‹M›","k":"‹K›","t":"‹T›"} def tokens_from_latin(ni:str)->str: out=[]; i=0; ni=(ni or "").lower() while istr: low=(ni_plain or "").lower() if low in KEYS_OVERRIDE: return KEYS_OVERRIDE[low] m=re.findall(r"‹(.*?)›", token_str) out=[] for t in m: if KEYS_MODE == "compact": if len(t)==2 and t[0] in "BDTGK": out.append(t[0]) elif t in ("A","E","I","O","U"): out.append(t) elif t=="Ś": out.append("X") elif t=="Ŕ": out.append("r") else: out.append(t[0].upper()) else: if len(t)==2 and t[0] in "BDTGK": out.append(t) elif t=="Ś": out.append("X") elif t=="Ŕ": out.append("r") else: out.append(t.upper()) return "".join(out) TRIDOT = "/" VISIBLE_PUNCT = {",",".",";","; ",":","…","(",")","[","]","{","}","\"","'","«","»","—","–",""",""","'","'"} HARD_BOUND = {".",";","—","–",":","(",")","«","»"} def render_ib_with_tridots(toks): res=[]; prev_word=False for tk in toks: is_punct = tk in VISIBLE_PUNCT if is_punct: res.append(" "+tk+" "); prev_word=False else: if prev_word: res.append(" "+TRIDOT+" ") res.append(tk); prev_word=True return "".join(res).strip() # ========================= # TRADUCTOR ES→NI # ========================= TAM_SUFFIX={"PRS":"-ke","PST":"-ei","FUT":"-na","IPFV":"-ta", "COND":"-ne","SBJV":"-ni","IMP":"-tu","INF":"","FUT_SBJV":"-ra","UNK":"-ke"} VERB_TAM = ("-ke","-na","-ei","-ta","-ni","-ne","-tu","-ra") def strip_ni_tam(lemma: str): lemma = lemma or "" for s in sorted(VERB_TAM, key=len, reverse=True): if lemma.endswith(s): return lemma[:-len(s)], s return lemma, "" STOP=set(""" el la los las lo un una unos unas al del de en con sin por sobre entre hasta desde hacia según tras pero aunque sino que como si porque cuando donde mientras muy ya sí no también solo sólo aún aun más menos mi mis tu tus su sus nuestro nuestra nuestros nuestras esto eso aquello ese esa esos esas aquel aquella aquellos aquellas quien quién quiénes cual cuál cuales cuáles cuyo cuya cuyos cuyas eh ay oh uy ah aja jeje jaja aah ahh ohh uhh """.split()) # --- Reglas "a" → ka/mi/te def rule_a(prev_tok:str, token:str, next_tok:str)->str: verbs={"dar","decir","contar","enviar","ofrecer","mostrar","prestar","regalar","entregar"} if prev_tok in verbs: return "mi" nombres={"ana","marta","juan","pedro","luis","maría","jose","carlos","laura"} if next_tok in nombres: return "te" return "ka" Q_ENCLITIC_INT = "-na" Q_ENCLITIC_EXC = "-ba" WH_WORDS = { "qué","quien","quién","quienes","quiénes","cual","cuál","cuales","cuáles", "donde","dónde","cuando","cuándo","como","cómo", "cuanto","cuánto","cuanta","cuánta","cuantos","cuántos","cuantas","cuántas" } def is_wh_token(t: str) -> bool: low = (t or "").lower() if low in WH_WORDS: return True f = fold(low) return f in {"que","quien","quienes","cual","cuales","donde","cuando","como","cuanto","cuanta","cuantos","cuantas"} def has_wh_outside_parens(toks) -> bool: depth = 0 for tk in toks: if tk in {"(", "«", """, "'"}: depth += 1 elif tk in {")", "»", """, "'"}: depth = max(0, depth-1) elif depth == 0 and is_wh_token(tk): return True return False ESTAR_SET={"estoy","estás","está","estamos","estáis","están","estaba","estabas","estábamos","estabais","estaban"} HABER_SET={"he","has","ha","hemos","habéis","han","había","habías","habíamos","habíais","habían"} def detect_tam_with_context(toks, i, sentence_start=False): t=toks[i].lower() prev=toks[i-1].lower() if i>0 else "" prev2=toks[i-2].lower() if i>1 else "" nxt=toks[i+1].lower() if i+1str: if low=="visto" and nxt=="de": return "vestir" return "" def has_tilde_equiv_lookup(low:str)->str: if has_diacritic(low) and not looks_like_verb_form_strict(low): f=fold(low) if f in LEX_FORM: return LEX_FORM[f] if f in FOLD_FORM: return FOLD_FORM[f] return "" def lookup_form_lemma(token:str, prev:str, nxt:str): if not token: return "", False low=token.lower() fl=forced_lemma_with_context(low, prev, nxt) if fl and fl in LEX_LEMMA: return LEX_LEMMA[fl], True if low in LEX_FORM: return LEX_FORM[low], True til=has_tilde_equiv_lookup(low) if til: return til, True if looks_like_verb_form_strict(low): lem=guess_infinitive_es(low) if lem and lem in LEX_LEMMA: return LEX_LEMMA[lem], True return "", False def attach_enclitic(out_words, ib_keys, plain, attach_idx, encl): if attach_idx is None or attach_idx < 0 or attach_idx >= len(out_words): return cur = out_words[attach_idx] or "" if cur.endswith(encl): return out_words[attach_idx] = cur + encl plain[attach_idx] = (plain[attach_idx] or "") + encl ib_keys[attach_idx] = georgeos_keys(tokens_from_latin(plain[attach_idx]), plain[attach_idx]) def ensure_terminal_qmark(out_words, ib_keys, plain): if not out_words: out_words.append("?"); ib_keys.append(""); plain.append("?"); return j = len(out_words) - 1 while j >= 0 and (out_words[j] == "" or out_words[j] is None): j -= 1 if j < 0: out_words.append("?"); ib_keys.append(""); plain.append("?"); return if out_words[j] == ".": out_words[j] = "?"; ib_keys[j] = ""; plain[j] = "?" elif out_words[j] not in {"?","!"}: out_words.append("?"); ib_keys.append(""); plain.append("?") def normalize_surface_by_pos(ni_surface:str, pos:str) -> str: if not ni_surface: return ni_surface if pos != "V": root, _ = strip_ni_tam(ni_surface) return root return ni_surface def translate_sentence(sent:str): toks = re.sub(r"\s+"," ", (sent or "").strip()) # Separamos también comillas curvas toks = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–""''])", r" \1 ", toks) toks = [t for t in toks.split() if t] out_words=[]; ib_keys=[]; plain=[] neg_next=False; last_finite_idx=None; has_qmark=False saw_wh = has_wh_outside_parens(toks) sentence_start=True for i,t in enumerate(toks): if t in {"¿","¡"}: sentence_start=True; continue if t in {"?","!"}: if t=="?": has_qmark=True encl = Q_ENCLITIC_INT if t=="?" else Q_ENCLITIC_EXC attach_idx = last_finite_idx if attach_idx is None: for j in range(len(out_words)-1, -1, -1): if out_words[j] and out_words[j] not in VISIBLE_PUNCT: attach_idx = j; break if attach_idx is not None: attach_enclitic(out_words, ib_keys, plain, attach_idx, encl) out_words.append(t); ib_keys.append(""); plain.append(t) sentence_start=True; continue if t in VISIBLE_PUNCT: out_words.append(t); ib_keys.append(t); plain.append(t) if t in HARD_BOUND: last_finite_idx=None sentence_start = (t in {".",":",";","—","–"}) continue low=t.lower() prev = toks[i-1].lower() if i>0 else "" nxt = toks[i+1].lower() if i+1 str: # Normaliza espacios, permite "tridots" (/) y desencapsula [SIN-LEX:…] t = (text or "").replace("/", " ") t = re.sub(r"\[SIN-LEX:([^\]]+)\]", r"\1", t) return re.sub(r"\s+", " ", t.strip()) def tokenize_ni(text: str): # Separar también comillas curvas text = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–""''])", r" \1 ", text) return [t for t in text.split() if t] # --- NUEVO: solo conjugar si el lema ES termina en -ar/-er/-ir def _is_spanish_verb_lemma(lemma: str) -> bool: return isinstance(lemma, str) and re.search(r"(ar|er|ir)$", lemma) def _detect_ni_person(ni_form: str): """Detecta persona/número del sufijo neoíbero y devuelve (person, root_limpia)""" for suf, pn in NI_PERSON_MAP.items(): if ni_form.endswith(suf): return pn, ni_form[:-len(suf)] return "3S", ni_form # default def _conj_es_from_lemma(lemma: str, tag: str, person: str = "3S"): """Conjuga un verbo español según TAM y persona/número""" if not _is_spanish_verb_lemma(lemma): return lemma lemma = lemma.lower() # Verbos irregulares completos (clave: lemma, tag, person) IRREG_FULL = { # SER ("ser", "PRS", "1S"): "soy", ("ser", "PRS", "2S"): "eres", ("ser", "PRS", "3S"): "es", ("ser", "PRS", "1P"): "somos", ("ser", "PRS", "2P"): "sois", ("ser", "PRS", "3P"): "son", ("ser", "PST", "1S"): "fui", ("ser", "PST", "2S"): "fuiste", ("ser", "PST", "3S"): "fue", ("ser", "PST", "1P"): "fuimos", ("ser", "PST", "2P"): "fuisteis", ("ser", "PST", "3P"): "fueron", ("ser", "IPFV", "1S"): "era", ("ser", "IPFV", "2S"): "eras", ("ser", "IPFV", "3S"): "era", ("ser", "IPFV", "1P"): "éramos", ("ser", "IPFV", "2P"): "erais", ("ser", "IPFV", "3P"): "eran", ("ser", "SBJV", "1S"): "sea", ("ser", "SBJV", "2S"): "seas", ("ser", "SBJV", "3S"): "sea", ("ser", "SBJV", "1P"): "seamos", ("ser", "SBJV", "2P"): "seáis", ("ser", "SBJV", "3P"): "sean", # IR ("ir", "PRS", "1S"): "voy", ("ir", "PRS", "2S"): "vas", ("ir", "PRS", "3S"): "va", ("ir", "PRS", "1P"): "vamos", ("ir", "PRS", "2P"): "vais", ("ir", "PRS", "3P"): "van", ("ir", "PST", "1S"): "fui", ("ir", "PST", "2S"): "fuiste", ("ir", "PST", "3S"): "fue", ("ir", "PST", "1P"): "fuimos", ("ir", "PST", "2P"): "fuisteis", ("ir", "PST", "3P"): "fueron", ("ir", "IPFV", "1S"): "iba", ("ir", "IPFV", "2S"): "ibas", ("ir", "IPFV", "3S"): "iba", ("ir", "IPFV", "1P"): "íbamos", ("ir", "IPFV", "2P"): "ibais", ("ir", "IPFV", "3P"): "iban", ("ir", "SBJV", "1S"): "vaya", ("ir", "SBJV", "2S"): "vayas", ("ir", "SBJV", "3S"): "vaya", ("ir", "SBJV", "1P"): "vayamos", ("ir", "SBJV", "2P"): "vayáis", ("ir", "SBJV", "3P"): "vayan", # ESTAR ("estar", "PRS", "1S"): "estoy", ("estar", "PRS", "2S"): "estás", ("estar", "PRS", "3S"): "está", ("estar", "PRS", "1P"): "estamos", ("estar", "PRS", "2P"): "estáis", ("estar", "PRS", "3P"): "están", ("estar", "SBJV", "1S"): "esté", ("estar", "SBJV", "2S"): "estés", ("estar", "SBJV", "3S"): "esté", ("estar", "SBJV", "1P"): "estemos", ("estar", "SBJV", "2P"): "estéis", ("estar", "SBJV", "3P"): "estén", # TENER ("tener", "PRS", "1S"): "tengo", ("tener", "PRS", "2S"): "tienes", ("tener", "PRS", "3S"): "tiene", ("tener", "PRS", "1P"): "tenemos", ("tener", "PRS", "2P"): "tenéis", ("tener", "PRS", "3P"): "tienen", ("tener", "SBJV", "1S"): "tenga", ("tener", "SBJV", "2S"): "tengas", ("tener", "SBJV", "3S"): "tenga", ("tener", "SBJV", "1P"): "tengamos", ("tener", "SBJV", "2P"): "tengáis", ("tener", "SBJV", "3P"): "tengan", # VENIR ("venir", "PRS", "1S"): "vengo", ("venir", "PRS", "2S"): "vienes", ("venir", "PRS", "3S"): "viene", ("venir", "PRS", "1P"): "venimos", ("venir", "PRS", "2P"): "venís", ("venir", "PRS", "3P"): "vienen", ("venir", "SBJV", "1S"): "venga", ("venir", "SBJV", "2S"): "vengas", ("venir", "SBJV", "3S"): "venga", ("venir", "SBJV", "1P"): "vengamos", ("venir", "SBJV", "2P"): "vengáis", ("venir", "SBJV", "3P"): "vengan", # HACER ("hacer", "PRS", "1S"): "hago", ("hacer", "PRS", "2S"): "haces", ("hacer", "PRS", "3S"): "hace", ("hacer", "PRS", "1P"): "hacemos", ("hacer", "PRS", "2P"): "hacéis", ("hacer", "PRS", "3P"): "hacen", ("hacer", "SBJV", "1S"): "haga", ("hacer", "SBJV", "2S"): "hagas", ("hacer", "SBJV", "3S"): "haga", ("hacer", "SBJV", "1P"): "hagamos", ("hacer", "SBJV", "2P"): "hagáis", ("hacer", "SBJV", "3P"): "hagan", ("hacer", "PST", "1S"): "hice", ("hacer", "PST", "3S"): "hizo", # PONER ("poner", "PRS", "1S"): "pongo", ("poner", "PRS", "2S"): "pones", ("poner", "PRS", "3S"): "pone", ("poner", "PRS", "1P"): "ponemos", ("poner", "PRS", "2P"): "ponéis", ("poner", "PRS", "3P"): "ponen", ("poner", "SBJV", "1S"): "ponga", ("poner", "SBJV", "2S"): "pongas", ("poner", "SBJV", "3S"): "ponga", ("poner", "SBJV", "1P"): "pongamos", ("poner", "SBJV", "2P"): "pongáis", ("poner", "SBJV", "3P"): "pongan", # DAR ("dar", "PRS", "1S"): "doy", ("dar", "PRS", "2S"): "das", ("dar", "PRS", "3S"): "da", ("dar", "PRS", "1P"): "damos", ("dar", "PRS", "2P"): "dais", ("dar", "PRS", "3P"): "dan", ("dar", "SBJV", "1S"): "dé", ("dar", "SBJV", "2S"): "des", ("dar", "SBJV", "3S"): "dé", ("dar", "SBJV", "1P"): "demos", ("dar", "SBJV", "2P"): "deis", ("dar", "SBJV", "3P"): "den", # HABER ("haber", "PRS", "1S"): "he", ("haber", "PRS", "2S"): "has", ("haber", "PRS", "3S"): "ha", ("haber", "PRS", "1P"): "hemos", ("haber", "PRS", "2P"): "habéis", ("haber", "PRS", "3P"): "han", ("haber", "SBJV", "1S"): "haya", ("haber", "SBJV", "2S"): "hayas", ("haber", "SBJV", "3S"): "haya", ("haber", "SBJV", "1P"): "hayamos", ("haber", "SBJV", "2P"): "hayáis", ("haber", "SBJV", "3P"): "hayan", # PODER ("poder", "PRS", "1S"): "puedo", ("poder", "PRS", "2S"): "puedes", ("poder", "PRS", "3S"): "puede", ("poder", "PRS", "1P"): "podemos", ("poder", "PRS", "2P"): "podéis", ("poder", "PRS", "3P"): "pueden", ("poder", "SBJV", "1S"): "pueda", ("poder", "SBJV", "2S"): "puedas", ("poder", "SBJV", "3S"): "pueda", ("poder", "SBJV", "1P"): "podamos", ("poder", "SBJV", "2P"): "podáis", ("poder", "SBJV", "3P"): "puedan", # DECIR ("decir", "PRS", "1S"): "digo", ("decir", "PRS", "2S"): "dices", ("decir", "PRS", "3S"): "dice", ("decir", "PRS", "1P"): "decimos", ("decir", "PRS", "2P"): "decís", ("decir", "PRS", "3P"): "dicen", # SABER ("saber", "PRS", "1S"): "sé", ("saber", "PRS", "2S"): "sabes", ("saber", "PRS", "3S"): "sabe", # VER ("ver", "PRS", "1S"): "veo", ("ver", "PRS", "2S"): "ves", ("ver", "PRS", "3S"): "ve", ("ver", "PRS", "1P"): "vemos", ("ver", "PRS", "2P"): "veis", ("ver", "PRS", "3P"): "ven", } # Buscar forma irregular completa if (lemma, tag, person) in IRREG_FULL: return IRREG_FULL[(lemma, tag, person)] # Tallos irregulares FUT/COND irr_stems = { "salir":"saldr","venir":"vendr","tener":"tendr","poner":"pondr","valer":"valdr","poder":"podr", "haber":"habr","saber":"sabr","caber":"cabr","querer":"querr","decir":"dir","hacer":"har" } # Conjugación regular root = lemma[:-2] verb_class = lemma[-2:] # ar, er, ir # PRESENTE if tag == "PRS": endings_ar = {"1S":"o","2S":"as","3S":"a","1P":"amos","2P":"áis","3P":"an"} endings_er = {"1S":"o","2S":"es","3S":"e","1P":"emos","2P":"éis","3P":"en"} endings_ir = {"1S":"o","2S":"es","3S":"e","1P":"imos","2P":"ís","3P":"en"} endings = endings_ar if verb_class == "ar" else (endings_ir if verb_class == "ir" else endings_er) return root + endings.get(person, "a") # PRETÉRITO if tag == "PST": endings_ar = {"1S":"é","2S":"aste","3S":"ó","1P":"amos","2P":"asteis","3P":"aron"} endings_er = {"1S":"í","2S":"iste","3S":"ió","1P":"imos","2P":"isteis","3P":"ieron"} endings = endings_ar if verb_class == "ar" else endings_er return root + endings.get(person, "ó") # FUTURO if tag == "FUT": stem = irr_stems.get(lemma, lemma) endings = {"1S":"é","2S":"ás","3S":"á","1P":"emos","2P":"éis","3P":"án"} return stem + endings.get(person, "á") # CONDICIONAL if tag == "COND": stem = irr_stems.get(lemma, lemma) endings = {"1S":"ía","2S":"ías","3S":"ía","1P":"íamos","2P":"íais","3P":"ían"} return stem + endings.get(person, "ía") # SUBJUNTIVO PRESENTE if tag == "SBJV": if verb_class == "ar": endings = {"1S":"e","2S":"es","3S":"e","1P":"emos","2P":"éis","3P":"en"} else: endings = {"1S":"a","2S":"as","3S":"a","1P":"amos","2P":"áis","3P":"an"} return root + endings.get(person, "e" if verb_class == "ar" else "a") # IMPERFECTO if tag == "IPFV": if verb_class == "ar": endings = {"1S":"aba","2S":"abas","3S":"aba","1P":"ábamos","2P":"abais","3P":"aban"} else: endings = {"1S":"ía","2S":"ías","3S":"ía","1P":"íamos","2P":"íais","3P":"ían"} return root + endings.get(person, "aba" if verb_class == "ar" else "ía") # IMPERATIVO if tag == "IMP": if person == "2S": return root + ("a" if verb_class == "ar" else "e") return lemma # otras personas usan subjuntivo # INFINITIVO/GERUNDIO/PARTICIPIO if tag in {"INF","UNK"}: return lemma # Default return lemma # Mantener compatibilidad con código antiguo def _conj_es_3sg(lemma:str, tag:str) -> str: """Wrapper para compatibilidad - llama a _conj_es_from_lemma con 3S""" return _conj_es_from_lemma(lemma, tag, "3S") def _strip_nominal_suffix(base: str): """Si no hay match directo, intenta quitar sufijos nominales comunes.""" for suf in sorted(NI_NOMINAL_SUFFIXES, key=len, reverse=True): if base.endswith(suf): return base[:-len(suf)], suf return base, "" def _cleanup_es_spaces(s: str) -> str: s = re.sub(r"\s+([,.;:!?])", r"\1", s) s = re.sub(r"\(\s+", "(", s) s = re.sub(r"\s+\)", ")", s) s = re.sub(r"\s{2,}", " ", s).strip() # micro-limpiezas s = s.replace("a a ", " a ") return s def translate_ni_to_es(sent: str): toks = tokenize_ni(normalize_ni(sent)) out=[] for i, t in enumerate(toks): # Preservar puntuación if t in VISIBLE_PUNCT or t in {"?", "!", "¿", "¡"}: out.append(t) continue # ✅ FIX: Preservar nombres propios (primera letra mayúscula) if t and t[0].isupper() and not t.isupper() and len(t) > 1: out.append(t) continue low=t.lower() # Quita enclíticos -na / -ba (interrog./exclam.) SOLO para lookup lookup_form = low[:-3] if (low.endswith("-na") or low.endswith("-ba")) else low # 1) Forma directa (superficie o raíz) if lookup_form in NI_TO_ES_FORM: out.append(NI_TO_ES_FORM[lookup_form]) continue # 2) TAM por sufijo + PERSONA ✅ NUEVO root, tam_tag, tam_suffix = detect_ni_tam(lookup_form) person, root_clean = _detect_ni_person(root) # ✅ DETECTAR PERSONA # 2a) Superficie exacta (con TAM) es_direct = NI_TO_ES_SURF.get((lookup_form, tam_tag)) if es_direct: out.append(es_direct) continue # 2b) CONJUGAR con persona ✅ MEJORADO if tam_tag not in {"INF","UNK"} and root_clean in NI_TO_ES_LEMMA: es_lemma = NI_TO_ES_LEMMA[root_clean] out.append(_conj_es_from_lemma(es_lemma, tam_tag, person)) continue # 2c) Raíz conocida → forma/lema ES if root_clean in NI_TO_ES_FORM: out.append(NI_TO_ES_FORM[root_clean]) continue if root_clean in NI_TO_ES_LEMMA: es_lemma = NI_TO_ES_LEMMA[root_clean] out.append(_conj_es_from_lemma(es_lemma, tam_tag, person)) continue # 3) Fallback suave para nominales: quita -ar/-en/-ka/-la/-si/-ŕa y reintenta base2, suf2 = _strip_nominal_suffix(root_clean if root_clean else lookup_form) if base2 != (root_clean if root_clean else lookup_form): if base2 in NI_TO_ES_FORM: out.append(NI_TO_ES_FORM[base2]) continue if base2 in NI_TO_ES_LEMMA: # ruta nominal: NO conjugar aunque sea verbo; devolvemos el lema limpio out.append(NI_TO_ES_LEMMA[base2]) continue # Si aún no, último recurso: presentar el núcleo "limpio" out.append(base2) continue # 4) Desconocido → marcador suave out.append(f"[?:{t}]") return _cleanup_es_spaces(" ".join(out)) # ========================= # CARGA DE LÉXICO # ========================= def load_lexicon(): loaded=False total_rich=total_simple=0 for p in CSV_CANDIDATES: if not os.path.exists(p): continue try: with open(p, encoding="utf-8") as f: rd=csv.DictReader(f); flds=set(rd.fieldnames or []) # v4.4: formato nuevo con ni_surface if {"source_es","es_morph","ni_surface"}.issubset(flds): for r in rd: es=(r.get("source_es") or "").strip().lower() tag=(r.get("es_morph") or "").strip().upper() surf=(r.get("ni_surface") or "").strip() if not surf: root=(r.get("ni_root") or "").strip(); suf=(r.get("ni_suffix") or "").strip() if root or suf: surf=f"{root}{suf}" if es and tag and surf: SURF_RICH[(es,tag)] = surf; total_rich+=1 ni=(r.get("target_ni") or "").strip() es_lem=(r.get("es_lemma") or "").strip().lower() pos = _canon_pos(r.get("pos") or r.get("es_pos") or r.get("target_pos") or r.get("pos_es") or r.get("ni_pos") or "") tam_ok = _boolish(r.get("tam_ok")) if es: _meta_set(es, pos=pos, tam_ok=(tam_ok if tam_ok is not None else (pos=="V" if pos else None))) if es_lem: _meta_set(es_lem, pos=("V" if es_lem.endswith(("ar","er","ir")) else (pos or "")), tam_ok=(tam_ok if tam_ok is not None else (pos=="V" if pos else None))) if es and ni!="": LEX_FORM.setdefault(es,ni) if es_lem and ni!="": LEX_LEMMA.setdefault(es_lem,ni) loaded=True; continue if {"source_es","target_ni"}.issubset(flds): for r in rd: es=(r.get("source_es") or "").strip().lower() ni=(r.get("target_ni") or "").strip() if not es: continue LEX_FORM.setdefault(es,ni); total_simple+=1 _meta_set(es, pos="", tam_ok=None) if looks_like_verb_form_strict(es): lem=guess_infinitive_es(es) if lem: LEX_LEMMA.setdefault(lem,ni); _meta_set(lem, pos="V", tam_ok=True) loaded=True; continue if {"es","ni_lemma"}.issubset(flds): for r in rd: es=(r.get("es") or "").strip().lower() ni=(r.get("ni_lemma") or "").strip() if not es: continue LEX_FORM.setdefault(es,ni); total_simple+=1 _meta_set(es, pos="", tam_ok=None) if looks_like_verb_form_strict(es): lem=guess_infinitive_es(es) if lem: LEX_LEMMA.setdefault(lem,ni); _meta_set(lem, pos="V", tam_ok=True) loaded=True; continue except Exception as e: print(f"[WARN] No se pudo leer {p}: {e}") if total_rich or total_simple: print(f"✓ ES→NI: {total_rich} superficies ricas, {total_simple} pares simples") global FOLD_FORM FOLD_FORM={} for k,v in LEX_FORM.items(): fk=fold(k) if fk!=k and len(k)>=5 and not looks_like_verb_form_strict(k): FOLD_FORM.setdefault(fk,v) # Cobertura mínima KEEP_MIN={ "y":"ne","o":"o","no":"eś","a":"ka","para":"kara","eso":"kok","tarta":"gatel", "el":"do", "la":"da", "los":"don", "las":"dan", "un":"banu","una":"bana","uno":"ban", "este":"aŕe","esta":"aŕa","estos":"aŕen","estas":"aŕan", # Números básicos "dos":"bi","tres":"irur","cuatro":"laur","cinco":"borste","seis":"śei", "siete":"sisbi","ocho":"sorse","nueve":"lauŕbi","diez":"abaŕ","veinte":"oŕkei", # Números 1-100 (dígitos) "1":"ban","2":"bi","3":"irur","4":"laur","5":"borste", "6":"śei","7":"sisbi","8":"sorse","9":"bedar","10":"abaŕ", "11":"abaŕ-ke-ban","12":"abaŕ-ke-bi","13":"abaŕ-ke-irur","14":"abaŕ-ke-laur","15":"abaŕ-ke-borste", "16":"abaŕ-ke-śei","17":"abaŕ-ke-sisbi","18":"abaŕ-ke-sorse","19":"abaŕ-ke-bedar","20":"oŕkei", "21":"oŕkei-ke-ban","22":"oŕkei-ke-bi","23":"oŕkei-ke-irur","24":"oŕkei-ke-laur","25":"oŕkei-ke-borste", "26":"oŕkei-ke-śei","27":"oŕkei-ke-sisbi","28":"oŕkei-ke-sorse","29":"oŕkei-ke-bedar","30":"oŕkei-abaŕ", "31":"oŕkei-abaŕ-ke-ban","32":"oŕkei-abaŕ-ke-bi","33":"oŕkei-abaŕ-ke-irur","34":"oŕkei-abaŕ-ke-laur","35":"oŕkei-abaŕ-ke-borste", "36":"oŕkei-abaŕ-ke-śei","37":"oŕkei-abaŕ-ke-sisbi","38":"oŕkei-abaŕ-ke-sorse","39":"oŕkei-abaŕ-ke-bedar","40":"binoŕkei", "41":"binoŕkei-abaŕ-ke-ban","42":"binoŕkei-abaŕ-ke-bi","43":"binoŕkei-abaŕ-ke-irur","44":"binoŕkei-abaŕ-ke-laur","45":"binoŕkei-abaŕ-ke-borste", "46":"binoŕkei-abaŕ-ke-śei","47":"binoŕkei-abaŕ-ke-sisbi","48":"binoŕkei-abaŕ-ke-sorse","49":"binoŕkei-abaŕ-ke-bedar","50":"binoŕkei-abaŕ", "51":"binoŕkei-abaŕ-ke-ban","52":"binoŕkei-abaŕ-ke-bi","53":"binoŕkei-abaŕ-ke-irur","54":"binoŕkei-abaŕ-ke-laur","55":"binoŕkei-abaŕ-ke-borste", "56":"binoŕkei-abaŕ-ke-śei","57":"binoŕkei-abaŕ-ke-sisbi","58":"binoŕkei-abaŕ-ke-sorse","59":"binoŕkei-abaŕ-ke-bedar","60":"iruŕokei", "61":"iruŕokei-abaŕ-ke-ban","62":"iruŕokei-abaŕ-ke-bi","63":"iruŕokei-abaŕ-ke-irur","64":"iruŕokei-abaŕ-ke-laur","65":"iruŕokei-abaŕ-ke-borste", "66":"iruŕokei-abaŕ-ke-śei","67":"iruŕokei-abaŕ-ke-sisbi","68":"iruŕokei-abaŕ-ke-sorse","69":"iruŕokei-abaŕ-ke-bedar","70":"iruŕokei-abaŕ", "71":"iruŕokei-abaŕ-ke-ban","72":"iruŕokei-abaŕ-ke-bi","73":"iruŕokei-abaŕ-ke-irur","74":"iruŕokei-abaŕ-ke-laur","75":"iruŕokei-abaŕ-ke-borste", "76":"iruŕokei-abaŕ-ke-śei","77":"iruŕokei-abaŕ-ke-sisbi","78":"iruŕokei-abaŕ-ke-sorse","79":"iruŕokei-abaŕ-ke-bedar","80":"lauŕokei", "81":"lauŕokei-abaŕ-ke-ban","82":"lauŕokei-abaŕ-ke-bi","83":"lauŕokei-abaŕ-ke-irur","84":"lauŕokei-abaŕ-ke-laur","85":"lauŕokei-abaŕ-ke-borste", "86":"lauŕokei-abaŕ-ke-śei","87":"lauŕokei-abaŕ-ke-sisbi","88":"lauŕokei-abaŕ-ke-sorse","89":"lauŕokei-abaŕ-ke-bedar","90":"lauŕokei-abaŕ", "91":"lauŕokei-abaŕ-ke-ban","92":"lauŕokei-abaŕ-ke-bi","93":"lauŕokei-abaŕ-ke-irur","94":"lauŕokei-abaŕ-ke-laur","95":"lauŕokei-abaŕ-ke-borste", "96":"lauŕokei-abaŕ-ke-śei","97":"lauŕokei-abaŕ-ke-sisbi","98":"lauŕokei-abaŕ-ke-sorse","99":"lauŕokei-abaŕ-ke-bedar","100":"atun", # Números en letras "once":"abaŕ-ke-ban","doce":"abaŕ-ke-bi","trece":"abaŕ-ke-irur","catorce":"abaŕ-ke-laur","quince":"abaŕ-ke-borste", "dieciséis":"abaŕ-ke-śei","dieciseis":"abaŕ-ke-śei","diecisiete":"abaŕ-ke-sisbi","dieciocho":"abaŕ-ke-sorse","diecinueve":"abaŕ-ke-bedar", "veintiuno":"oŕkei-ke-ban","veintidós":"oŕkei-ke-bi","veintidos":"oŕkei-ke-bi","veintitrés":"oŕkei-ke-irur","veintitres":"oŕkei-ke-irur", "veinticuatro":"oŕkei-ke-laur","veinticinco":"oŕkei-ke-borste","veintiséis":"oŕkei-ke-śei","veintiseis":"oŕkei-ke-śei", "veintisiete":"oŕkei-ke-sisbi","veintiocho":"oŕkei-ke-sorse","veintinueve":"oŕkei-ke-bedar", "treinta":"oŕkei-abaŕ","cuarenta":"binoŕkei","cincuenta":"binoŕkei-abaŕ","sesenta":"iruŕokei", "setenta":"iruŕokei-abaŕ","ochenta":"lauŕokei","noventa":"lauŕokei-abaŕ","cien":"atun", # Pronombres y partículas "yo":"ni","tú":"zu","él":"nar","ella":"nar", "nosotros":"gu","nosotras":"gu","vosotros":"zuek","vosotras":"zuek", "ellos":"narek","ellas":"narek", "que":"ze","si":"baldin","cuando":"noiz","donde":"non", "como":"nola","porque":"zeren","mientras":"bitarte", "versión":"bertsi","test":"froga","prueba":"froga", "ejemplo":"adibid","texto":"testu","palabra":"hitz" } for k,v in KEEP_MIN.items(): LEX_FORM.setdefault(k,v) if k in {"yo","tú","él","ella","nosotros","nosotras","vosotros","vosotras","ellos","ellas"}: _meta_set(k, pos="PRON", tam_ok=False) elif k in {"que","si","cuando","donde","como","porque","mientras"}: _meta_set(k, pos="PART", tam_ok=False) elif k.isdigit() or k in {"uno","dos","tres","cuatro","cinco","seis","siete","ocho","nueve","diez","once","doce","trece","catorce","quince","dieciséis","dieciseis","diecisiete","dieciocho","diecinueve","veinte","veintiuno","veintidós","veintidos","veintitrés","veintitres","veinticuatro","veinticinco","veintiséis","veintiseis","veintisiete","veintiocho","veintinueve","treinta","cuarenta","cincuenta","sesenta","setenta","ochenta","noventa","cien"}: _meta_set(k, pos="NUM", tam_ok=False) else: _meta_set(k, pos=_canon_pos("PART" if k in {"y","o","no","a","para"} else "DET"), tam_ok=False) BUILTIN_LEMMA={ # Solo por seguridad si faltara en CSV "llover":"euŕak","llamar":"deitu","venir":"nuker","ir":"nitus", "hacer":"giotael","tener":"giokk","poder":"binbel","poner":"pusen", "ser":"izan","estar":"egon" } for k,v in BUILTIN_LEMMA.items(): LEX_LEMMA.setdefault(k,v); _meta_set(k, pos="V", tam_ok=True) FORCE_FORMS = { "voy":"nitus-ke","vas":"nitus-ke","va":"nitus-ke","vamos":"nitus-ke","vais":"nitus-ke","van":"nitus-ke", "vengo":"nuker-ke","vienes":"nuker-ke","viene":"nuker-ke","venimos":"nuker-ke","venís":"nuker-ke","vienen":"nuker-ke", "ven":"nuker-tu","haz":"giotael-tu","pon":"pusen-tu","di":"siśnesir-tu","sal":"salku-tu","ten":"giokk-tu","sé":"suber-tu" } for form, ni in FORCE_FORMS.items(): LEX_FORM[form] = ni; _meta_set(form, pos="V", tam_ok=True) global FORCE_KEYS FORCE_KEYS = set(FORCE_FORMS.keys()) return loaded def load_lexicon_ni_es(): loaded=False total=0 # 1) Intento DictReader con cabecera for p in CSV_NI_ES: if not os.path.exists(p): debug_print(f"CSV NI→ES no encontrado: {p}") continue try: with open(p, encoding="utf-8") as f: sniffer = csv.Sniffer() sample = f.read(4096) f.seek(0) has_header = sniffer.has_header(sample) if has_header: dr = csv.DictReader(f) fieldnames = [x.lower() for x in (dr.fieldnames or [])] # nombres plausibles fn_source = next((c for c in fieldnames if "source" in c and ("ni" in c or "neo" in c)), None) fn_target = next((c for c in fieldnames if "target" in c and ("es" in c or "spa" in c)), None) fn_eslem = next((c for c in fieldnames if "es_lem" in c or c=="es_lemma" or "lemma_es" in c), None) # v4.4: el CSV usa 'ni_tam' fn_morph = next((c for c in fieldnames if c in {"ni_tam","ni_morph","ni_tag"} or "morph" in c), None) fn_root = next((c for c in fieldnames if "ni_root" in c or c=="root" or "ni_lemma" in c), None) if fn_source and fn_target: debug_print(f"Cargando {p} con cabecera: source={fn_source}, target={fn_target}") for r in dr: # ✅ FIX 1: NO convertir a minúsculas source_ni = (r.get(fn_source) or "").strip() # ← SIN .lower() target_es = (r.get(fn_target) or "").strip() es_lemma = (r.get(fn_eslem) or "").strip().lower() if fn_eslem else "" ni_morph = (r.get(fn_morph) or "").strip().upper() if fn_morph else "" ni_root = (r.get(fn_root) or "").strip().lower() if fn_root else "" if source_ni and target_es: # ✅ FIX 2: Sobrescribir en vez de setdefault NI_TO_ES_FORM[source_ni] = target_es if ni_morph: NI_TO_ES_SURF[(source_ni, ni_morph)] = target_es if ni_root and es_lemma: NI_TO_ES_LEMMA.setdefault(ni_root, es_lemma) if ni_root and target_es: NI_TO_ES_FORM.setdefault(ni_root, target_es) total+=1 print(f"✓ Cargadas {total} filas NI→ES (cabecera) desde {p}") loaded=True continue # pasa al siguiente fichero si hay # 2) Fallback por posiciones f.seek(0) reader=csv.reader(f) count=0 for row in reader: if not row: continue if count==0 and any("source" in (c or "").lower() or "ni_" in (c or "").lower() or "target" in (c or "").lower() for c in row): count+=1 continue # ✅ FIX 3: Índices correctos según estructura del CSV v4.4 # source_ni, target_es, ni_tam, ni_pn, es_morph, es_pn, ni_root, ni_suffix, es_lemma, pos_es, evidencia # 0 1 2 3 4 5 6 7 8 9 10 source_ni = (row[0] if len(row)>0 else "").strip() # ← SIN .lower() target_es = (row[1] if len(row)>1 else "").strip() ni_tam = (row[2] if len(row)>2 else "").strip().upper() # ← CORRECTO: posición 2 ni_root = (row[6] if len(row)>6 else "").strip().lower() # ← Ya estaba bien es_lemma = (row[8] if len(row)>8 else "").strip().lower() # ← CORRECTO: posición 8 if source_ni and target_es: NI_TO_ES_FORM[source_ni] = target_es # ← Sobrescribir if ni_tam: NI_TO_ES_SURF[(source_ni, ni_tam)] = target_es if ni_root and es_lemma: NI_TO_ES_LEMMA.setdefault(ni_root, es_lemma) if ni_root and target_es: NI_TO_ES_FORM.setdefault(ni_root, target_es) count+=1 total+=1 if count>0: print(f"✓ Cargadas {count} filas NI→ES (posicional) desde {p}") loaded=True except Exception as e: print(f"[WARN] Error leyendo {p}: {e}") import traceback traceback.print_exc() # ✅ FIX 4: Vocabulario mínimo ampliado KEEP_MIN_NI = { # Partículas "ne":"y","o":"o","eś":"no","ka":"a","mi":"a","te":"a", "kin":"con","tan":"en","ta":"de","kara":"para", # Pronombres "ni":"yo","zu":"tú","nar":"él","gu":"nosotros", "ban":"un","banu":"un","bana":"una", # Artículos "do":"el","da":"la","don":"los","dan":"las", # Demostrativos "aŕe":"este","aŕa":"esta","aŕen":"estos","aŕan":"estas", # Verbos base "nuker":"venir","siśnesir":"decir","giotael":"hacer", "izan":"ser","egon":"estar","giokk":"tener", "pusen":"poner","binbel":"poder","nitus":"ir", "deitu":"llamar","euŕak":"llover", # Interjecciones "batsornel":"hola","sabernel":"adiós", # Sustantivos comunes "domśaldum":"pan","śesilmen":"café","kuknomtok":"restaurante", "sikliskoŕ":"casa","śaldalbam":"mercado","bekmil":"cine", "seŕtuŕgok":"año","kordo":"pueblo","tokbatkir":"ciudad", "eskom":"amigo","nintos":"madre","śimnas":"padre", } for k,v in KEEP_MIN_NI.items(): NI_TO_ES_FORM.setdefault(k,v) if total: print(f"✓ NI→ES: {total} pares cargados (incluyendo {len(KEEP_MIN_NI)} mínimos)") else: print(f"⚠ NI→ES: No se cargaron pares desde CSV, usando {len(KEEP_MIN_NI)} mínimos") # ✅ FIX 5: DEBUG - mostrar muestras cargadas if DEBUG_MODE and total > 0: print("\n[DEBUG] Muestra de NI_TO_ES_FORM:") samples = list(NI_TO_ES_FORM.items())[:30] for k, v in samples: print(f" {k} → {v}") return loaded print("Cargando léxico ES→NI..."); load_lexicon() print("Cargando léxico NI→ES..."); load_lexicon_ni_es() # ========================= # UI CLÁSICA (con dirección) # ========================= LABELS={ "ES":{ "title":"Traductor Español ↔ Neoíbero v4.4", "subtitle":"Explora el renacimiento ibérico con tecnología moderna — ULTRA-DEFINITIVO", "in_label_es":"✏️ Entrada (Español)", "in_label_ni":"✏️ Entrada (Neoíbero)", "in_ph_es":"Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.", "in_ph_ni":"Idatzi hemen. Adib.: nitus-ke ni etxe-ka.", "out_lat_esni":"📜 Salida: Neoíbero (latín)", "out_lat_nies":"📜 Salida: Español", "out_ib":"🗿 Línea ibérica", "out_audio":"🔊 Locución (Audio)", "btn":"🔄 Traducir", "combo":"🌍 Idioma (UI + explicación)", "dir":"🔁 Dirección", "dir_opts":["ES → NI","NI → ES"], "doc_header":"📚 Documentación y Referencia", "acc_titles":[ "🎓 Marco académico y decisiones del neoíbero", "🏛️ Herencia posible del íbero histórico", "🎨 Diseño de la conlang (neoíbero)", "⚙️ Pipeline del traductor (paso a paso)", "🔤 Ortografía, línea ibérica y claves", "❓/❗ Modalidad presunto vascoide (-na / -ba)", "📖 Gramática de referencia (v1.2)", "📚 Bibliografía de base", "🧾 Siglas y glosario" ] }, "EN":{ "title":"Spanish ↔ Neo-Iberian Translator v4.4", "subtitle":"Explore the revival of Neo-Iberian with modern tech — ULTRA-DEFINITIVE", "in_label_es":"✏️ Input (Spanish)", "in_label_ni":"✏️ Input (Neo-Iberian)", "in_ph_es":"Type here. E.g., Veo a Ana y doy pan a Marta.", "in_ph_ni":"Type here. E.g., nitus-ke ni etxe-ka.", "out_lat_esni":"📜 Output: Neo-Iberian (Latin)", "out_lat_nies":"📜 Output: Spanish", "out_ib":"🗿 Iberian line", "out_audio":"🔊 Speech (Audio)", "btn":"🔄 Translate", "combo":"🌍 Language (UI + docs)", "dir":"🔁 Direction", "dir_opts":["ES → NI","NI → ES"], "doc_header":"📚 Documentation & Reference", "acc_titles":[ "🎓 Background & design choices", "🏛️ Possible inheritance from ancient Iberian", "🎨 Conlang design (Neo-Iberian)", "⚙️ Translator pipeline (step by step)", "🔤 Orthography, Iberian line & keys", "❓/❗ 'Vascoid' mood (-na / -ba)", "📖 Reference grammar (v1.2)", "📚 Core references", "🧾 Acronyms & glossary" ] } } # Documentación completa del appOld.py DOC_ES_0 = """**Escritura y datos.** El *neoíbero* se diseña como una **lengua conjetural** que toma como base el corpus ibérico (ss. V–I a.C.) conocido, más una morfología y un léxico especulativos construidos con plausibilidad histórica y tipológica. """ DOC_ES_1 = """**Herencia antigua posible.** - Raíces documentadas en inscripciones ibéricas reales: *ban*, *bi*, *irur*, *laur*, *borste*, *śei*, *sisbi*, *sorse* (numerales); *belai* (cuervo), *ebee* (perdiz), etc. - **CV(C)** phonotactics; no **/p/** fonémico; *r/ŕ* desaconsejado en inicio de palabra. - Postposiciones/sufijos nominales: **-k** (pl), **-te** (agente), **-ar/-en** (genitivo/origen), **-ka** (dat./loc./dist.), **-i** (ac. con PN). - Partículas: **ne** 'y', **o** 'o', **eś** 'no'. - Numerales: *ban, bi, irur, laur, borste, śei, sisbi, sorse, lauŕbi, abaŕ (10), oŕkei (20).* """ DOC_ES_2 = """**Diseño de la conlang:** - **TAM (v3.2-LTS):** PRS **-ke**, PST **-bo**, FUT **-ta**, IPFV **-ri**, IMP **-tu**, **SBJV -ni**, **COND -ne**. - Derivación: verbos (-ke/-ta/-bo/-ri/-ni/-ne), adjetivos (-si), sustantivos (-ar/-en/-tu/-la/-ŕa/-si). - Orden preferido **SOV**. """ DOC_ES_3 = """**Pipeline (resumen):** 1) Tokenizar; partir **al→ka do**, **del→ta do**. 2) `a` → `ka`/`mi`/`te`. 3) CSV rico da **superficie** NI; si no, CSV simple → **lema** NI. 4) **Puerta POS/TAM**: solo verbos obtienen TAM; otros se normalizan a lema/raíz. 5) Negación **eś** antes del primer verbo finito. 6) ?/! → enclíticos **-na/-ba** en el último verbo finito (o último constituyente). 7) WH desnudo añade **-na** e inserta `?`. 8) Línea ibérica: solo puntuación visible; separador de palabras = **"/"** (tridots). """ DOC_ES_4 = """**Ortografía y claves:** - Modo de claves **explicit** (BA/BE/BI/BO/BU). - Separador de palabras = "/". - Atajos: `ka`→**K**, `mi`→**MI**, `te`→**TE**, `ne`→**N**, `o`→**O**, `eś`→**X**. """ DOC_ES_5 = """**Modalidad (-na/-ba):** - **-na** interrogativa; **-ba** exclamativa, se une al último verbo finito (o último constituyente). """ DOC_ES_6 = """**Gramática mínima (NI):** - Verbo: raíz + **TAM**; negación preverbal **eś**. - Casos productivos: -k (pl), -te (agente), -ka (dat/loc), -ar/-en (genitivo/origen). """ DOC_ES_7 = """**Referencias principales:** Untermann; de Hoz; Ferrer i Jané; Correa; gramáticas/corpora bascoide seleccionados.""" DOC_ES_8 = """**Acrónimos (v3.2-LTS):** - **TAM** (PRS, PST, FUT, IPFV, SBJV, COND, IMP, FUT_SBJV); **PN**; **POS**; **LEMMa/SURFACE**; **RT**; **LTS**; **SOV**; **CV(C)**; **CSV**; **Enclítico**. """ DOC_EN_0 = """**Writing & data.** *Neo-Iberian* is designed as a **conlang** that takes the known Iberian corpus (5th–1st c. BCE) as a base, plus a speculative morphology and lexicon built with historical and typological plausibility. """ DOC_EN_1 = """**Possible ancient heritage.** - Roots documented in real Iberian inscriptions: *ban*, *bi*, *irur*, *laur*, *borste*, *śei*, *sisbi*, *sorse* (numerals); *belai* (raven), *ebee* (partridge), etc. - **CV(C)** phonotactics; no phonemic **/p/**; *r/ŕ* disallowed word-initially. - Postpositions/nominal suffixes: **-k** (pl), **-te** (agent), **-ar/-en** (genitive/origin), **-ka** (dat./loc./dist.), **-i** (acc. with PN). - Particles: **ne** 'and', **o** 'or', **eś** 'not'. - Numerals: *ban, bi, irur, laur, borste, śei, sisbi, sorse, lauŕbi, abaŕ (10), oŕkei (20).* """ DOC_EN_2 = """**Conlang design:** - **TAM (v3.2-LTS):** PRS **-ke**, PST **-bo**, FUT **-ta**, IPFV **-ri**, IMP **-tu**, **SBJV -ni**, **COND -ne**. - Derivation: verbs (-ke/-ta/-bo/-ri/-ni/-ne), adjectives (-si), nouns (-ar/-en/-tu/-la/-ŕa/-si). - Preferred order **SOV**. """ DOC_EN_3 = """**Pipeline (summary):** 1) Tokenize; split **al→ka do**, **del→ta do**. 2) `a` → `ka`/`mi`/`te`. 3) Rich CSV gives NI **surface**; else simple CSV → NI **lemma**. 4) **POS/TAM gating**: only verbs get TAM; others normalize to lemma/root. 5) Negation **eś** before the first finite verb. 6) ?/! → enclitics **-na/-ba** on the last finite verb (or last constituent). 7) Bare WH adds **-na** and inserts `?`. 8) Iberian line: visible punctuation only; word separator is **"/"** (tridots). """ DOC_EN_4 = """**Orthography & keys:** - Keys mode **explicit** (BA/BE/BI/BO/BU). - Word separator = "/". - Shortcuts: `ka`→**K**, `mi`→**MI**, `te`→**TE**, `ne`→**N**, `o`→**O**, `eś`→**X**. """ DOC_EN_5 = """**Modality (-na/-ba):** - **-na** interrogative; **-ba** exclamative, attached to the last finite verb (or last constituent). """ DOC_EN_6 = """**Minimal grammar (NI):** - Verb: root + **TAM**; preverbal negation **eś**. - Productive cases: -k (pl), -te (agent), -ka (dat/loc), -ar/-en (genitive/origin). """ DOC_EN_7 = """**Core references:** Untermann; de Hoz; Ferrer i Jané; Correa; selected Bascoid grammars/corpora.""" DOC_EN_8 = """**Acronyms (v3.2-LTS):** - **TAM** (PRS, PST, FUT, IPFV, SBJV, COND, IMP, FUT_SBJV); **PN**; **POS**; **LEMMa/SURFACE**; **RT**; **LTS**; **SOV**; **CV(C)**; **CSV**; **Enclitic**. """ DOC={ "ES":[DOC_ES_0, DOC_ES_1, DOC_ES_2, DOC_ES_3, DOC_ES_4, DOC_ES_5, DOC_ES_6, DOC_ES_7, DOC_ES_8], "EN":[DOC_EN_0, DOC_EN_1, DOC_EN_2, DOC_EN_3, DOC_EN_4, DOC_EN_5, DOC_EN_6, DOC_EN_7, DOC_EN_8] } # CSS del diseño original (appOld.py) def build_css(): b64=None if os.path.exists("Iberia-Georgeos.ttf"): with open("Iberia-Georgeos.ttf","rb") as f: b64=base64.b64encode(f.read()).decode("ascii") font_src = f"url(data:font/ttf;base64,{b64}) format('truetype')" if b64 else "local('sans-serif')" return f""" @font-face {{ font-family: 'IberiaGeorgeos'; src: {font_src}; font-weight: normal; font-style: normal; }} :root {{ --iberian-clay:#8B4513; --iberian-ochre:#CC7722; --iberian-stone:#5C5C5C; --iberian-sand:#D2B48C; --iberian-rust:#A0522D; --iberian-bronze:#CD7F32; }} .gradio-container {{ background:linear-gradient(135deg,#f4e8d8 0%,#e8d5c4 50%,#d4c4b0 100%)!important; font-family:'Georgia','Times New Roman',serif!important; }} .gradio-container h1,.gradio-container h2,.gradio-container h3 {{ color:var(--iberian-clay)!important; text-shadow:2px 2px 4px rgba(139,69,19,.15)!important; border-bottom:3px solid var(--iberian-bronze)!important; padding-bottom:.5rem!important; letter-spacing:.5px!important; }} .gradio-container .gr-group {{ background:linear-gradient(to bottom,#f9f6f0,#ede6dc)!important; border:2px solid var(--iberian-sand)!important; border-radius:8px!important; box-shadow:0 4px 12px rgba(139,69,19,.2), inset 0 1px 0 rgba(255,255,255,.5)!important; padding:1.5rem!important; margin-bottom:1.5rem!important; }} .gradio-container .gr-accordion {{ background:linear-gradient(145deg,#ebe3d5,#d9cec0)!important; border:2px solid var(--iberian-rust)!important; border-radius:6px!important; margin-bottom:.8rem!important; box-shadow:2px 2px 6px rgba(0,0,0,.15)!important; }} .gradio-container .gr-accordion .label-wrap {{ background:linear-gradient(to right,var(--iberian-ochre),var(--iberian-rust))!important; color:#fff!important; font-weight:600!important; padding:.8rem 1rem!important; border-radius:4px!important; text-shadow:1px 1px 2px rgba(0,0,0,.3)!important; }} .gradio-container .gr-textbox textarea,.gradio-container .gr-textbox input {{ background:linear-gradient(to bottom,#faf8f3,#f5f0e8)!important; border:2px solid var(--iberian-sand)!important; border-radius:6px!important; color:var(--iberian-stone)!important; font-family:'Georgia',serif!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1)!important; }} .gradio-container .gr-textbox textarea:focus,.gradio-container .gr-textbox input:focus {{ border-color:var(--iberian-bronze)!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1), 0 0 8px rgba(205,127,50,.3)!important; }} .gradio-container .gr-button.gr-button-primary {{ background:linear-gradient(145deg,var(--iberian-bronze),var(--iberian-rust))!important; border:2px solid var(--iberian-clay)!important; color:#fff!important; font-weight:bold!important; text-shadow:1px 1px 2px rgba(0,0,0,.4)!important; box-shadow:0 4px 8px rgba(139,69,19,.3), inset 0 1px 0 rgba(255,255,255,.2)!important; border-radius:8px!important; padding:.8rem 1.5rem!important; transition:all .3s ease!important; }} .gradio-container .gr-button.gr-button-primary:hover {{ background:linear-gradient(145deg,var(--iberian-rust),var(--iberian-bronze))!important; transform:translateY(-2px)!important; box-shadow:0 6px 12px rgba(139,69,19,.4)!important; }} .ib-line {{ font-family:'IberiaGeorgeos',monospace,sans-serif!important; font-size:1.9rem!important; line-height:2.4rem!important; white-space:pre-wrap!important; background:linear-gradient(135deg,#e8dcc8 0%,#d4c4a8 50%,#c4b098 100%)!important; padding:24px!important; border-radius:10px!important; border:3px solid var(--iberian-rust)!important; border-left:6px solid var(--iberian-bronze)!important; box-shadow:0 4px 15px rgba(139,69,19,.25), inset 0 2px 4px rgba(0,0,0,.1)!important; color:var(--iberian-clay)!important; position:relative!important; }} .ib-line::before {{ content:''!important; position:absolute!important; inset:0!important; background-image:repeating-linear-gradient(0deg,transparent,transparent 2px, rgba(139,69,19,.03) 2px, rgba(139,69,19,.03) 4px)!important; pointer-events:none!important; border-radius:10px!important; }} @media (max-width:768px) {{ .ib-line {{ font-size:1.5rem!important; line-height:2rem!important; padding:16px!important; }} .gradio-container .gr-group {{ padding:1rem!important; }} .gradio-container h1 {{ font-size:1.8rem!important; }} }} @media (max-width:480px) {{ .ib-line {{ font-size:1.3rem!important; line-height:1.8rem!important; padding:12px!important; }} .gradio-container h1 {{ font-size:1.5rem!important; }} }} """ CSS = build_css() # ========================= # INTERFAZ GRADIO # ========================= with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")) as demo: with gr.Group(): title = gr.Markdown(f"# {LABELS['ES']['title']}") subtitle = gr.Markdown(f"*{LABELS['ES']['subtitle']}*") with gr.Row(): combo = gr.Dropdown(choices=["ES","EN"], value="ES", label=LABELS["ES"]["combo"]) direction = gr.Radio(choices=LABELS["ES"]["dir_opts"], value="ES → NI", label=LABELS["ES"]["dir"]) with gr.Group(): doc_header = gr.Markdown(f"## {LABELS['ES']['doc_header']}") acc_titles = LABELS["ES"]["acc_titles"] with gr.Accordion(acc_titles[0], open=False) as acc1: md1 = gr.Markdown(DOC["ES"][0]) with gr.Accordion(acc_titles[1], open=False) as acc2: md2 = gr.Markdown(DOC["ES"][1]) with gr.Accordion(acc_titles[2], open=False) as acc3: md3 = gr.Markdown(DOC["ES"][2]) with gr.Accordion(acc_titles[3], open=False) as acc4: md4 = gr.Markdown(DOC["ES"][3]) with gr.Accordion(acc_titles[4], open=False) as acc5: md5 = gr.Markdown(DOC["ES"][4]) with gr.Accordion(acc_titles[5], open=False) as acc6: md6 = gr.Markdown(DOC["ES"][5]) with gr.Accordion(acc_titles[6], open=False) as acc7: md7 = gr.Markdown(DOC["ES"][6]) with gr.Accordion(acc_titles[7], open=False) as acc8: md8 = gr.Markdown(DOC["ES"][7]) with gr.Accordion(acc_titles[8], open=False) as acc9: md9 = gr.Markdown(DOC["ES"][8]) with gr.Group(): es_in = gr.Textbox(label=LABELS["ES"]["in_label_es"], placeholder=LABELS["ES"]["in_ph_es"], lines=5) btn_tr = gr.Button(LABELS["ES"]["btn"], variant="primary") with gr.Row(): with gr.Column(scale=2): ni_out = gr.Textbox(label=LABELS["ES"]["out_lat_esni"], lines=5, interactive=False) loc_btn = gr.Button("🔊 Locutar", variant="secondary", visible=False) audio_out = gr.Audio(label=LABELS["ES"]["out_audio"], type="numpy") with gr.Column(scale=1): ib_out = gr.HTML(label=LABELS["ES"]["out_ib"]) def do_translate(text, dir_label): if not text or not text.strip(): return (gr.update(value=""), gr.update(value="
"), gr.update(visible=False), gr.update(value=None)) if dir_label.startswith("ES"): latin, ib = translate(text) ib_html = "
" + escape(ib) + "
" return (gr.update(label=LABELS["ES"]["out_lat_esni"], value=latin), gr.update(value=ib_html), gr.update(visible=True), gr.update(value=None)) else: es_text = translate_ni_to_es(text) return (gr.update(label=LABELS["ES"]["out_lat_nies"], value=es_text), gr.update(value="
"), gr.update(visible=False), gr.update(value=None)) btn_tr.click(do_translate, [es_in, direction], [ni_out, ib_out, loc_btn, audio_out]) def run_locution(latin_text, dir_label): if dir_label.startswith("ES"): return synthesize_speech(latin_text) return None loc_btn.click(run_locution, [ni_out, direction], audio_out) def switch_lang(sel_lang, dir_label): L=LABELS[sel_lang]; T=L["acc_titles"]; D=DOC[sel_lang] # Input/Output labels dependen de la dirección in_label = L["in_label_es"] if dir_label.startswith("ES") else L["in_label_ni"] in_ph = L["in_ph_es"] if dir_label.startswith("ES") else L["in_ph_ni"] out_lab = L["out_lat_esni"] if dir_label.startswith("ES") else L["out_lat_nies"] return ( gr.update(value=f"# {L['title']}"), gr.update(value=f"*{L['subtitle']}*"), gr.update(label=L["combo"], value=sel_lang), gr.update(label=L["dir"], choices=L["dir_opts"], value=dir_label), gr.update(value=f"## {L['doc_header']}"), gr.update(label=T[0]), gr.update(value=D[0]), gr.update(label=T[1]), gr.update(value=D[1]), gr.update(label=T[2]), gr.update(value=D[2]), gr.update(label=T[3]), gr.update(value=D[3]), gr.update(label=T[4]), gr.update(value=D[4]), gr.update(label=T[5]), gr.update(value=D[5]), gr.update(label=T[6]), gr.update(value=D[6]), gr.update(label=T[7]), gr.update(value=D[7]), gr.update(label=T[8]), gr.update(value=D[8]), gr.update(label=in_label, placeholder=in_ph), gr.update(label=out_lab), gr.update(label=L["out_ib"]), gr.update(label=L["out_audio"]), gr.update(value=L["btn"]) ) combo.change( switch_lang, [combo, direction], [title, subtitle, combo, direction, doc_header, acc1, md1, acc2, md2, acc3, md3, acc4, md4, acc5, md5, acc6, md6, acc7, md7, acc8, md8, acc9, md9, es_in, ni_out, ib_out, audio_out, btn_tr] ) def switch_direction(dir_label, sel_lang): # Solo cambia etiquetas y visibilidad de Locutar/Línea ibérica L=LABELS[sel_lang] in_label = L["in_label_es"] if dir_label.startswith("ES") else L["in_label_ni"] in_ph = L["in_ph_es"] if dir_label.startswith("ES") else L["in_ph_ni"] out_lab = L["out_lat_esni"] if dir_label.startswith("ES") else L["out_lat_nies"] # Locución solo para ES → NI loc_vis = True if dir_label.startswith("ES") else False # Línea ibérica visible solo para ES → NI (tras traducir) return (gr.update(label=in_label, placeholder=in_ph), gr.update(label=out_lab, value=""), gr.update(value="
"), gr.update(visible=loc_vis), gr.update(value=None)) direction.change( switch_direction, [direction, combo], [es_in, ni_out, ib_out, loc_btn, audio_out] ) if __name__ == "__main__": demo.queue().launch()