Spanish_NeoIberianNewDesign

Sleeping

App Files Files Community

LoloSemper committed on Nov 8, 2025

Commit

f9cd37e

1 Parent(s): 6dc49f4

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -96

app.py CHANGED Viewed

@@ -69,14 +69,26 @@ VISIBLE_PUNCT = set(list(",.;:!?¡¿…()[]{}\"'«»—–“”‘’"))
 _num_re = re.compile(r"^\d+([.,]\d+)?$")
 def is_number(tok:str)->bool: return bool(_num_re.fullmatch(tok or ""))
-# --- NUEVO: separadores fuertes de cláusula + placeholders atómicos ---
-CLAUSE_BREAKS = {",", ";", "—", "–", ":"}  # cortes fuertes de cláusula (no fin de oración)
 PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")
 def is_placeholder(tok: str) -> bool:
     return bool(PLACEHOLDER_RE.match(tok or ""))
 def simple_tokenize(text:str):
-    """Tokenización mínima, pero sin romper secuencias [ ... ] (placeholders)."""
     if not text:
         return []
     protected = []
@@ -85,16 +97,17 @@ def simple_tokenize(text:str):
         protected.append(m.group(0))
         return key
     t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
     t = re.sub(r"\s+"," ", t)
     t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
     toks = [tok for tok in t.split() if tok]
-    # restaura bloques protegidos
     for i, tok in enumerate(toks):
-        if tok.startswith("__BRK") and tok.endswith("__"):
-            idx = int(tok[5:-2])
-            toks[i] = protected[idx]
     return toks
 def detokenize(tokens):
@@ -111,17 +124,16 @@ def detokenize(tokens):
 # ====== Modalidad vascoide (-na / -ba) ======
 # Configuración
-MODAL_SUFFIX_ENABLE = True             # activar el sistema de modalidad
-MODAL_ONLY_ON_FINITE = True            # sólo añade -na/-ba a verbos finitos NI (si no, al último constituyente)
-MODAL_STRIP_QE_IN_NI = True            # ES→NI: tras añadir -na/-ba, quita ¿ ? ¡ ! en la salida NI
 # Conjuntos y ayudas
 SENT_END = {".", "!", "?", "…"}
 OPEN_FOR = {"?": "¿", "!": "¡"}
 WRAP_PREFIX = set(list("«“‘([{\"'"))
-# Sufijos de persona habituales en tu dataset (2P puede aparecer como -zuk o -zuek)
 PERS_ENDINGS = ("-n","-zu","-gu","-zuk","-zuek","-k")
-TAM_FINITE   = ("-ke","-bo","-ta","-ni","-tu")    # finitos: PRS/PST/FUT/SBJV/IMP
 def looks_like_finite_ni(tok:str)->bool:
     t = (tok or "").lower()
@@ -139,11 +151,25 @@ def last_content_index(tokens, start, end_exclusive):
     return i if i >= start else -1
 def strip_qe_punct(tokens):
-    """Quita ¿ ? ¡ ! (para NI minimalista)."""
     return [t for t in tokens if t not in ("¿","?","¡","!")]
 def add_modal_suffixes_es2ni(tokens):
-    """Añade -na (¿?) o -ba (¡!) al último verbo finito (o último constituyente) por oración."""
     if not MODAL_SUFFIX_ENABLE:
         return tokens
     out = tokens[:]
@@ -153,7 +179,6 @@ def add_modal_suffixes_es2ni(tokens):
     while i < n:
         if out[i] in ("?", "!"):
             closer = out[i]
-            # objetivo dentro [sent_start, i)
             target = -1
             j = i - 1
             while j >= sent_start:
@@ -166,7 +191,6 @@ def add_modal_suffixes_es2ni(tokens):
                 suf = "na" if closer == "?" else "ba"
                 if not re.search(rf"-(?:{suf})$", out[target].lower()):
                     out[target] = out[target] + "-" + suf
-            # siguiente oración
             sent_start = i + 1
         elif out[i] in SENT_END:
             sent_start = i + 1
@@ -177,22 +201,20 @@ def add_modal_suffixes_es2ni(tokens):
 def strip_modal_suffixes_ni(tokens):
     """
-    Interpreta -na / -ba como modalidad de la oración.
-    Mejoras:
-      - Si hay modo activo ('?'/'!') y aparece un separador fuerte de cláusula (coma, punto y coma, dos puntos, raya),
-        se cierra la modalidad ANTES del separador.
-      - Maneja abridores ¿/¡ iniciando tramo nuevo.
     """
     if not MODAL_SUFFIX_ENABLE:
         return tokens
-    out, buf = [], []
     pending_end = None
     mode = None  # "?" / "!"
     def _emit(end_override=None, also_append=None):
         nonlocal buf, mode, pending_end, out
-        local = [t for t in buf if t not in ("¿", "?", "¡", "!")]
         if local:
             end_tok = end_override or ("?" if mode == "?" else "!" if mode == "!" else pending_end or ".")
             out.extend(local)
@@ -201,64 +223,62 @@ def strip_modal_suffixes_ni(tokens):
         if also_append:
             out.append(also_append)
-    for t in tokens + ["."]:  # centinela
-        # abridores explícitos: cierran tramo previo y abren modo
         if t in ("¿", "¡"):
-            _emit()
-            mode = "?" if t == "¿" else "!"
             continue
-        # cierres explícitos: cierran ya
         if t in ("?", "!"):
-            pending_end = t
-            _emit()
-            continue
-        # finales de oración
         if t in SENT_END:
-            pending_end = t
-            _emit()
-            continue
-        # cortes fuertes de cláusula con modo activo -> cerrar antes del separador
-        if t in CLAUSE_BREAKS and mode in ("?", "!"):
-            _emit(end_override=None, also_append=t)
-            continue
-        # sufijos -na / -ba activan modo
         m = re.search(r"-(na|ba)$", (t or "").lower())
         if m:
-            if mode and buf:
-                _emit()  # cierra tramo anterior antes de cambiar de modo
             mode = "?" if m.group(1) == "na" else "!"
             t = t[:-len(m.group(0))]
         if t:
             buf.append(t)
-    # evitar doble punto por centinela
-    if len(out) >= 2 and out[-1] == "." and out[-2] == ".":
-        out.pop()
     return out
 def add_inverted_openers(tokens):
-    """Inserta ¿/¡ al inicio del tramo que termina con ?/!; respeta comillas y separadores fuertes."""
     out = tokens[:]
     START_BREAKS = SENT_END | CLAUSE_BREAKS
     i = 0
     while i < len(out):
         if out[i] in ("?", "!"):
-            closer = out[i]
-            opener = OPEN_FOR[closer]
-            # busca el inicio del tramo: tras el último fin de oración o separador fuerte
             j = i - 1
-            while j >= 0 and out[j] not in START_BREAKS:
                 j -= 1
             start = j + 1
-            # saltar abridores tipo «(“[
             k = start
             while k < i and out[k] in WRAP_PREFIX:
                 k += 1
             if not (k < len(out) and out[k] == opener):
-                out.insert(k, opener)
-                i += 1  # avanzamos por el insert
         i += 1
     return out
@@ -328,7 +348,7 @@ SYL_FOR = {
     "b":["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"],
     "d":["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"],
     "t":["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"],
-    "g":["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"],
     "k":["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]
 }
 ALPHA_FOR={"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›",
@@ -430,24 +450,20 @@ def load_bi_strict_and_diagnose():
                 es = lower(es_orig)
                 ni = lower(ni_orig)
-                # Frases (claves con espacios)
                 if " " in es:
                     ESPHRASE2NI[es] = (ni_orig, pid)
                 if " " in ni:
                     NIPHRASE2ES[ni] = (es_orig, pid)
-                # ES -> NI (primera ocurrencia gana)
                 if es in ES2NI: dup_es += 1
                 else: ES2NI[es] = (ni_orig, pid)
-                # NI -> ES (primera ocurrencia gana)
                 if ni in NI2ES: dup_ni += 1
                 else: NI2ES[ni] = (es_orig, pid)
                 base_rows.append((es_orig, ni_orig, pid, flags))
                 rows += 1
-        # Expansiones controladas
         if EXPANSION_ENABLE:
             for es_orig, ni_orig, pid, flags in base_rows:
                 if not flags: continue
@@ -456,15 +472,12 @@ def load_bi_strict_and_diagnose():
                     pl_key = lower(pl)
                     if pl_key not in ES2NI:
                         ES2NI[pl_key] = (ni_orig, pid)
-                        exp_plurals += 1
                 if _has_flag(flags, FLAG_3PL):
                     p3 = _present_3pl_from_3sg(es_orig)
                     p3_key = lower(p3)
                     if p3_key not in ES2NI:
                         ES2NI[p3_key] = (ni_orig, pid)
-                        exp_3pl += 1
-        # back-map check
         for es_low, (ni_surf, _) in ES2NI.items():
             ni_low = lower(ni_surf)
             back = NI2ES.get(ni_low)
@@ -488,10 +501,7 @@ def load_bi_strict_and_diagnose():
     if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
     if mismatch_backmap:
         print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
-    if EXPANSION_ENABLE:
-        print(f"[INFO] Expansiones: +{exp_plurals:,} plurales; +{exp_3pl:,} 3ª pl. (desde flags).")
-    # HTML de diagnóstico
     sam_html = ""
     if mismatch_samples:
         sam_rows = "".join(
@@ -508,8 +518,6 @@ def load_bi_strict_and_diagnose():
       ES únicas (tras expansiones): <b>{es_unique:,}</b> &nbsp;|&nbsp; NI únicas: <b>{ni_unique:,}</b> &nbsp;|&nbsp; pair_id únicos: <b>{pid_unique:,}</b><br>
       Duplicados ES: <b>{dup_es:,}</b> &nbsp;|&nbsp; Duplicados NI: <b>{dup_ni:,}</b> &nbsp;|&nbsp; Sin pair_id: <b>{empty_pid:,}</b><br>
       Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b><br>
-      Expansiones por flags: <b>+{exp_plurals:,}</b> plurales, <b>+{exp_3pl:,}</b> 3ª pl. presente.
-      {sam_html}
       <hr style="border:0;border-top:1px solid #caa">
       <small>Regla: el motor usa <b>sólo</b> tablas 1:1 y expansiones <b>explícitas por bandera</b> (flags) en el CSV.
       Nada “adivina”.</small>
@@ -546,14 +554,13 @@ def sentence_case_spanish(s: str) -> str:
         if not in_br and start:
             if ch.isspace():
-                out.append(ch)  # seguimos buscando primera letra
             elif ch in WRAPS:
-                out.append(ch)   # abridores no rompen el "start"
             elif ch.isalpha():
                 out.append(ch.upper()); start = False
             else:
                 out.append(ch)
-                # un no-letra puede seguir siendo inicio si también es ¿/¡
                 start = ch in "¿¡"
         else:
             out.append(ch)
@@ -568,26 +575,20 @@ def sentence_case_spanish(s: str) -> str:
     return "".join(out)
 def postprocess_spanish(s: str) -> str:
-    # colapsar espacios en horas 12:30 y decimales 1.234,56 / 3,50
-    s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)          # horas
-    s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)    # decimales y miles+decimales
-    # limpiar dobles espacios
     s = re.sub(r"\s{2,}", " ", s)
-    # quitar espacio ANTES de .,;:!? (refuerzo)
     s = re.sub(r"\s+([,.;:!?])", r"\1", s)
-    # añadir espacio DESPUÉS de .,;:!? cuando viene letra/dígito
     s = re.sub(r"([?!.:,;])([^\s])", r"\1 \2", s)
-    # no meter espacio después de signo de apertura invertido
     s = re.sub(r"([¿¡])\s+", r"\1", s)
-    # capitalización por oración
-    s = sentence_case_spanish(s)
-    return s.strip()
 # ====== Traducción BI estricta ======
 def translate_es_to_ni_bi(text:str):
     toks = simple_tokenize(text)
-    # NGRAM ES→NI
     out=[]; ib_toks=[]
     i=0
     while i < len(toks):
@@ -600,8 +601,7 @@ def translate_es_to_ni_bi(text:str):
         if span > 1:
             out.append(ni_surface)
             ib_toks.append(georgeos_keys(tokens_from_latin(ni_surface), ni_surface))
-            i += span
-            continue
         key = lower(t)
         if key in ES2NI:
             ni = ES2NI[key][0]
@@ -614,7 +614,6 @@ def translate_es_to_ni_bi(text:str):
             out.append(ph); ib_toks.append(ph)
         i += 1
-    # Modalidad -na/-ba + NI sin ¿?¡!
     if MODAL_SUFFIX_ENABLE:
         out = add_modal_suffixes_es2ni(out)
         ib_toks = []
@@ -631,7 +630,6 @@ def translate_es_to_ni_bi(text:str):
 def translate_ni_to_es_bi(text:str):
     toks = simple_tokenize(text)
-    # Modalidad: acepta -na/-ba; segmenta y cierra antes de separadores fuertes
     if MODAL_SUFFIX_ENABLE:
         toks = strip_modal_suffixes_ni(toks)
@@ -643,12 +641,9 @@ def translate_ni_to_es_bi(text:str):
             out.append(t); i+=1; continue
         if is_placeholder(t):
             out.append(t); i+=1; continue
-        # NGRAM NI→ES
         span, es_surface = _longest_match(toks, i, NIPHRASE2ES)
         if span > 1:
-            out.append(es_surface)
-            i += span
-            continue
         key = lower(t)
         if key in NI2ES:
@@ -664,7 +659,7 @@ def translate_ni_to_es_bi(text:str):
         out = add_inverted_openers(out)
     es_text = detokenize(out)
-    es_text = postprocess_spanish(es_text)  # limpieza + mayúsculas / horas / decimales
     return es_text
 # ====== Diagnóstico ======
@@ -1005,12 +1000,19 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue
 # ====== smoke opcional ======
 def _symmetry_smoketest():
     print("\n[SMOKE] Prueba ES↔NI (BI-estricto)…")
-    probe = "nuker-ke ni etxe-ka ?"  # con ? explícita
-    es_from_ni = translate_ni_to_es_bi(probe)
-    ni_round, _ = translate_es_to_ni_bi(es_from_ni)
-    unk_inv = len(re.findall(r"\[\?:", es_from_ni))
-    unk_esni = len(re.findall(r"\[SIN-LEX:", ni_round))
-    print(f"  NI→ES unknowns: {unk_inv}; ES→NI unknowns (roundtrip): {unk_esni}")
 if DEBUG_MODE:
     _symmetry_smoketest()
@@ -1021,4 +1023,3 @@ if __name__ == "__main__":

 _num_re = re.compile(r"^\d+([.,]\d+)?$")
 def is_number(tok: str) -> bool:
     """Return True when `tok` is digits, optionally followed by one '.' or ',' and more digits."""
     candidate = tok if tok else ""
     return _num_re.fullmatch(candidate) is not None
+# --- separadores de cláusula + placeholders atómicos ---
+CLAUSE_BREAKS = {",", ";", "—", "–", ":"}  # cortes fuertes (no fin de oración)
 PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")
 def is_placeholder(tok: str) -> bool:
     """Return True when `tok` is a single bracketed placeholder such as '[SIN-LEX:x]'."""
     text = tok if tok else ""
     return PLACEHOLDER_RE.match(text) is not None
+def _restore_brk(tok, protected):
+    """
+    Restaura __BRKn__ y también __BRKn__-na / __BRKn__-ba a su forma original,
+    manteniendo el sufijo modal si existe (p.ej. '[SIN-LEX:Tomás]-na').
+    """
+    m = re.fullmatch(r"__BRK(\d+)__(?:-(na|ba))?", tok or "")
+    if not m: return tok
+    idx = int(m.group(1))
+    suf = m.group(2)
+    base = protected[idx] if 0 <= idx < len(protected) else tok
+    return base + (f"-{suf}" if suf else "")
 def simple_tokenize(text:str):
+    """Tokenización mínima, sin romper [ ... ] ni [ ... ]-na/-ba."""
     if not text:
         return []
     protected = []
         protected.append(m.group(0))
         return key
+    # protegemos bloques [ ... ]
     t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
     t = re.sub(r"\s+"," ", t)
     t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
     toks = [tok for tok in t.split() if tok]
+    # restaura bloques protegidos (con soporte -na/-ba adheridos)
     for i, tok in enumerate(toks):
+        # si viene pegado el sufijo modal, no se habrá restaurado; hacemos la restauración robusta
+        if tok.startswith("__BRK") and "__" in tok:
+            toks[i] = _restore_brk(tok, protected)
     return toks
 def detokenize(tokens):
 # ====== Modalidad vascoide (-na / -ba) ======
 # Configuración
+MODAL_SUFFIX_ENABLE = True
+MODAL_ONLY_ON_FINITE = True
+MODAL_STRIP_QE_IN_NI = True
 # Conjuntos y ayudas
 SENT_END = {".", "!", "?", "…"}
 OPEN_FOR = {"?": "¿", "!": "¡"}
 WRAP_PREFIX = set(list("«“‘([{\"'"))
 PERS_ENDINGS = ("-n","-zu","-gu","-zuk","-zuek","-k")
+TAM_FINITE   = ("-ke","-bo","-ta","-ni","-tu")
 def looks_like_finite_ni(tok:str)->bool:
     t = (tok or "").lower()
     return i if i >= start else -1
 def strip_qe_punct(tokens):
     """Return a new list with every standalone ¿ ? ¡ ! token removed, order preserved."""
     marks = ("¿", "?", "¡", "!")
     kept = []
     for token in tokens:
         if token in marks:
             continue
         kept.append(token)
     return kept
+# --- helpers numéricos para no cortar decimales/horas ---
def _is_numeric_comma(tokens, i):
    """Return True when tokens[i] is a ',' whose two neighbours both satisfy is_number."""
    # An interior position is required so that both neighbours exist.
    if i <= 0 or i >= len(tokens) - 1:
        return False
    if tokens[i] != ",":
        return False
    return is_number(tokens[i - 1]) and is_number(tokens[i + 1])
def _is_time_colon(tokens, i):
    """Return True when tokens[i] is a ':' whose two neighbours both satisfy is_number."""
    # An interior position is required so that both neighbours exist.
    if i <= 0 or i >= len(tokens) - 1:
        return False
    if tokens[i] != ":":
        return False
    return is_number(tokens[i - 1]) and is_number(tokens[i + 1])
def _is_true_clause_break(tokens, i):
    """
    Return True when tokens[i] is a CLAUSE_BREAKS separator that is not part of a number.

    A comma accepted by _is_numeric_comma or a colon accepted by _is_time_colon
    is rejected; any token outside CLAUSE_BREAKS is rejected as well.
    """
    if tokens[i] in CLAUSE_BREAKS:
        return not (_is_numeric_comma(tokens, i) or _is_time_colon(tokens, i))
    return False
 def add_modal_suffixes_es2ni(tokens):
+    """Añade -na/-ba al último verbo finito (o último constituyente) por oración."""
     if not MODAL_SUFFIX_ENABLE:
         return tokens
     out = tokens[:]
     while i < n:
         if out[i] in ("?", "!"):
             closer = out[i]
             target = -1
             j = i - 1
             while j >= sent_start:
                 suf = "na" if closer == "?" else "ba"
                 if not re.search(rf"-(?:{suf})$", out[target].lower()):
                     out[target] = out[target] + "-" + suf
             sent_start = i + 1
         elif out[i] in SENT_END:
             sent_start = i + 1
 def strip_modal_suffixes_ni(tokens):
     """
+    Interpreta -na/-ba como modalidad; cierra antes de separadores fuertes,
+    excepto cuando la coma/“:” son numéricos (12,75 / 18:30).
     """
     if not MODAL_SUFFIX_ENABLE:
         return tokens
+    out = []
+    buf = []
     pending_end = None
     mode = None  # "?" / "!"
     def _emit(end_override=None, also_append=None):
         nonlocal buf, mode, pending_end, out
+        local = [t for t in buf if t not in ("¿","?","¡","!")]
         if local:
             end_tok = end_override or ("?" if mode == "?" else "!" if mode == "!" else pending_end or ".")
             out.extend(local)
         if also_append:
             out.append(also_append)
+    toks = tokens + ["."]
+    for i, t in enumerate(toks):
+        # Abridores explícitos
         if t in ("¿", "¡"):
+            _emit(); mode = "?" if t == "¿" else "!"
             continue
+        # Cierres explícitos
         if t in ("?", "!"):
+            pending_end = t; _emit(); continue
+        # Final de oración
         if t in SENT_END:
+            pending_end = t; _emit(); continue
+        # Separadores fuertes (no numéricos)
+        if t in CLAUSE_BREAKS and mode in ("?","!"):
+            if not _is_true_clause_break(toks, i):
+                # es decimal/hora -> no cerrar
+                pass
+            else:
+                _emit(also_append=t); continue
+        # Sufijos -na/-ba (en cualquier token, incl. placeholders)
         m = re.search(r"-(na|ba)$", (t or "").lower())
         if m:
+            if mode and buf: _emit()
             mode = "?" if m.group(1) == "na" else "!"
             t = t[:-len(m.group(0))]
         if t:
             buf.append(t)
+    if len(out) >= 2 and out[-1] == "." and out[-2] == ".": out.pop()
     return out
 def add_inverted_openers(tokens):
     """
     Insert the inverted opener (¿ / ¡) for every ? / ! closer that lacks one.

     For each closer, the start of its segment is found by scanning left to
     the nearest sentence end (SENT_END) or genuine clause break. Commas and
     colons rejected by _is_true_clause_break (number , number and
     number : number) are not treated as boundaries. Leading WRAP_PREFIX
     tokens (opening quotes/brackets) are skipped, so the opener is placed
     after them.

     Args:
         tokens: List of tokens; the list itself is not modified.

     Returns:
         A new list with the missing openers inserted.
     """
     out = tokens[:]

     def _is_segment_boundary(idx):
         # Sentence ends always delimit a segment; clause separators only when
         # they are not the comma/colon inside a decimal or a clock time.
         if out[idx] in SENT_END:
             return True
         return out[idx] in CLAUSE_BREAKS and _is_true_clause_break(out, idx)

     i = 0
     while i < len(out):
         if out[i] in ("?", "!"):
             opener = OPEN_FOR[out[i]]
             # Walk left to the token just after the previous boundary.
             j = i - 1
             while j >= 0 and not _is_segment_boundary(j):
                 j -= 1
             k = j + 1
             # Keep opening quotes/brackets outside the opener.
             while k < i and out[k] in WRAP_PREFIX:
                 k += 1
             # k never exceeds i, so out[k] always exists.
             if out[k] != opener:
                 out.insert(k, opener)
                 i += 1  # the closer shifted one slot to the right
         i += 1
     return out
     "b":["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"],
     "d":["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"],
     "t":["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"],
+    "g":["‹GA›","‹GE›","‹GI›","‹DO›","‹GU›"] if False else ["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"],
     "k":["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]
 }
 ALPHA_FOR={"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›",
                 es = lower(es_orig)
                 ni = lower(ni_orig)
                 if " " in es:
                     ESPHRASE2NI[es] = (ni_orig, pid)
                 if " " in ni:
                     NIPHRASE2ES[ni] = (es_orig, pid)
                 if es in ES2NI: dup_es += 1
                 else: ES2NI[es] = (ni_orig, pid)
                 if ni in NI2ES: dup_ni += 1
                 else: NI2ES[ni] = (es_orig, pid)
                 base_rows.append((es_orig, ni_orig, pid, flags))
                 rows += 1
         if EXPANSION_ENABLE:
             for es_orig, ni_orig, pid, flags in base_rows:
                 if not flags: continue
                     pl_key = lower(pl)
                     if pl_key not in ES2NI:
                         ES2NI[pl_key] = (ni_orig, pid)
                 if _has_flag(flags, FLAG_3PL):
                     p3 = _present_3pl_from_3sg(es_orig)
                     p3_key = lower(p3)
                     if p3_key not in ES2NI:
                         ES2NI[p3_key] = (ni_orig, pid)
         for es_low, (ni_surf, _) in ES2NI.items():
             ni_low = lower(ni_surf)
             back = NI2ES.get(ni_low)
     if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
     if mismatch_backmap:
         print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
     sam_html = ""
     if mismatch_samples:
         sam_rows = "".join(
       ES únicas (tras expansiones): <b>{es_unique:,}</b> &nbsp;|&nbsp; NI únicas: <b>{ni_unique:,}</b> &nbsp;|&nbsp; pair_id únicos: <b>{pid_unique:,}</b><br>
       Duplicados ES: <b>{dup_es:,}</b> &nbsp;|&nbsp; Duplicados NI: <b>{dup_ni:,}</b> &nbsp;|&nbsp; Sin pair_id: <b>{empty_pid:,}</b><br>
       Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b><br>
       <hr style="border:0;border-top:1px solid #caa">
       <small>Regla: el motor usa <b>sólo</b> tablas 1:1 y expansiones <b>explícitas por bandera</b> (flags) en el CSV.
       Nada “adivina”.</small>
         if not in_br and start:
             if ch.isspace():
+                out.append(ch)
             elif ch in WRAPS:
+                out.append(ch)
             elif ch.isalpha():
                 out.append(ch.upper()); start = False
             else:
                 out.append(ch)
                 start = ch in "¿¡"
         else:
             out.append(ch)
     return "".join(out)
 def postprocess_spanish(s: str) -> str:
     """
     Normalize spacing and capitalization of detokenized Spanish text.

     Steps, in order:
       1. collapse runs of whitespace into one space;
       2. drop whitespace before , . ; : ! ?;
       3. add a space after those marks when glued to the next character;
       4. drop whitespace after ¿ / ¡;
       5. re-compact digit-separator-digit sequences (clock times such as
          18:30, decimals/thousands such as 12,75 or 1.234,56);
       6. apply sentence_case_spanish and strip the result.

     Any ':' '.' or ',' sitting between two digits is treated as numeric, even
     if spaces surrounded it in the input.

     Args:
         s: Text to clean up.

     Returns:
         The cleaned, sentence-cased string.
     """
     # spaces and punctuation
     s = re.sub(r"\s{2,}", " ", s)
     s = re.sub(r"\s+([,.;:!?])", r"\1", s)
     s = re.sub(r"([?!.:,;])([^\s])", r"\1 \2", s)
     s = re.sub(r"([¿¡])\s+", r"\1", s)
     # Numeric compaction must run AFTER the "space after punctuation" rule;
     # done first, that rule re-splits 12,75 into "12, 75" and 18:30 into
     # "18: 30", undoing the compaction.
     s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)
     s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)
     return sentence_case_spanish(s).strip()
 # ====== Traducción BI estricta ======
 def translate_es_to_ni_bi(text:str):
     toks = simple_tokenize(text)
     out=[]; ib_toks=[]
     i=0
     while i < len(toks):
         if span > 1:
             out.append(ni_surface)
             ib_toks.append(georgeos_keys(tokens_from_latin(ni_surface), ni_surface))
+            i += span; continue
         key = lower(t)
         if key in ES2NI:
             ni = ES2NI[key][0]
             out.append(ph); ib_toks.append(ph)
         i += 1
     if MODAL_SUFFIX_ENABLE:
         out = add_modal_suffixes_es2ni(out)
         ib_toks = []
 def translate_ni_to_es_bi(text:str):
     toks = simple_tokenize(text)
     if MODAL_SUFFIX_ENABLE:
         toks = strip_modal_suffixes_ni(toks)
             out.append(t); i+=1; continue
         if is_placeholder(t):
             out.append(t); i+=1; continue
         span, es_surface = _longest_match(toks, i, NIPHRASE2ES)
         if span > 1:
+            out.append(es_surface); i += span; continue
         key = lower(t)
         if key in NI2ES:
         out = add_inverted_openers(out)
     es_text = detokenize(out)
+    es_text = postprocess_spanish(es_text)
     return es_text
 # ====== Diagnóstico ======
 # ====== smoke opcional ======
 def _symmetry_smoketest():
     """
     Print a round trip for a handful of probe sentences.

     Each probe is passed through translate_ni_to_es_bi, that result is fed to
     translate_es_to_ni_bi, and the input plus both outputs are printed.
     Nothing is asserted or returned.
     """
     print("\n[SMOKE] Prueba ES↔NI (BI-estricto)…")
     samples = (
         "nuker-ke ni etxe-ka ?",                  # plain modal
         "¿Pagaste 12,75 en la cafetería?",        # decimal
         "Marta llega a las 18:30.",               # clock time
         "[SIN-LEX:Tomás]-na euŕak-ke !",          # placeholder + -na + explicit closer
     )
     for sample in samples:
         spanish = translate_ni_to_es_bi(sample)
         neo_iberian, _ignored = translate_es_to_ni_bi(spanish)
         for label, value in (("  IN:", sample), ("  ES:", spanish), ("  NI:", neo_iberian)):
             print(label, value)
         print("---")
 if DEBUG_MODE:
     _symmetry_smoketest()