Commit ·
f9cd37e
1
Parent(s): 6dc49f4
Update app.py
Browse files
app.py
CHANGED
|
@@ -69,14 +69,26 @@ VISIBLE_PUNCT = set(list(",.;:!?¡¿…()[]{}\"'«»—–“”‘’"))
|
|
| 69 |
_num_re = re.compile(r"^\d+([.,]\d+)?$")
|
| 70 |
def is_number(tok:str)->bool: return bool(_num_re.fullmatch(tok or ""))
|
| 71 |
|
| 72 |
-
# ---
|
| 73 |
-
CLAUSE_BREAKS = {",", ";", "—", "–", ":"} # cortes fuertes
|
| 74 |
PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")
|
| 75 |
def is_placeholder(tok: str) -> bool:
|
| 76 |
return bool(PLACEHOLDER_RE.match(tok or ""))
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
def simple_tokenize(text:str):
|
| 79 |
-
"""Tokenización mínima,
|
| 80 |
if not text:
|
| 81 |
return []
|
| 82 |
protected = []
|
|
@@ -85,16 +97,17 @@ def simple_tokenize(text:str):
|
|
| 85 |
protected.append(m.group(0))
|
| 86 |
return key
|
| 87 |
|
|
|
|
| 88 |
t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
|
| 89 |
t = re.sub(r"\s+"," ", t)
|
| 90 |
t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
|
| 91 |
toks = [tok for tok in t.split() if tok]
|
| 92 |
|
| 93 |
-
# restaura bloques protegidos
|
| 94 |
for i, tok in enumerate(toks):
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
toks[i] = protected
|
| 98 |
return toks
|
| 99 |
|
| 100 |
def detokenize(tokens):
|
|
@@ -111,17 +124,16 @@ def detokenize(tokens):
|
|
| 111 |
|
| 112 |
# ====== Modalidad vascoide (-na / -ba) ======
|
| 113 |
# Configuración
|
| 114 |
-
MODAL_SUFFIX_ENABLE = True
|
| 115 |
-
MODAL_ONLY_ON_FINITE = True
|
| 116 |
-
MODAL_STRIP_QE_IN_NI = True
|
| 117 |
|
| 118 |
# Conjuntos y ayudas
|
| 119 |
SENT_END = {".", "!", "?", "…"}
|
| 120 |
OPEN_FOR = {"?": "¿", "!": "¡"}
|
| 121 |
WRAP_PREFIX = set(list("«“‘([{\"'"))
|
| 122 |
-
# Sufijos de persona habituales en tu dataset (2P puede aparecer como -zuk o -zuek)
|
| 123 |
PERS_ENDINGS = ("-n","-zu","-gu","-zuk","-zuek","-k")
|
| 124 |
-
TAM_FINITE = ("-ke","-bo","-ta","-ni","-tu")
|
| 125 |
|
| 126 |
def looks_like_finite_ni(tok:str)->bool:
|
| 127 |
t = (tok or "").lower()
|
|
@@ -139,11 +151,25 @@ def last_content_index(tokens, start, end_exclusive):
|
|
| 139 |
return i if i >= start else -1
|
| 140 |
|
| 141 |
def strip_qe_punct(tokens):
|
| 142 |
-
"""Quita ¿ ? ¡ ! (para NI minimalista)."""
|
| 143 |
return [t for t in tokens if t not in ("¿","?","¡","!")]
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
def add_modal_suffixes_es2ni(tokens):
|
| 146 |
-
"""Añade -na
|
| 147 |
if not MODAL_SUFFIX_ENABLE:
|
| 148 |
return tokens
|
| 149 |
out = tokens[:]
|
|
@@ -153,7 +179,6 @@ def add_modal_suffixes_es2ni(tokens):
|
|
| 153 |
while i < n:
|
| 154 |
if out[i] in ("?", "!"):
|
| 155 |
closer = out[i]
|
| 156 |
-
# objetivo dentro [sent_start, i)
|
| 157 |
target = -1
|
| 158 |
j = i - 1
|
| 159 |
while j >= sent_start:
|
|
@@ -166,7 +191,6 @@ def add_modal_suffixes_es2ni(tokens):
|
|
| 166 |
suf = "na" if closer == "?" else "ba"
|
| 167 |
if not re.search(rf"-(?:{suf})$", out[target].lower()):
|
| 168 |
out[target] = out[target] + "-" + suf
|
| 169 |
-
# siguiente oración
|
| 170 |
sent_start = i + 1
|
| 171 |
elif out[i] in SENT_END:
|
| 172 |
sent_start = i + 1
|
|
@@ -177,22 +201,20 @@ def add_modal_suffixes_es2ni(tokens):
|
|
| 177 |
|
| 178 |
def strip_modal_suffixes_ni(tokens):
|
| 179 |
"""
|
| 180 |
-
Interpreta -na
|
| 181 |
-
|
| 182 |
-
- Si hay modo activo ('?'/'!') y aparece un separador fuerte de cláusula (coma, punto y coma, dos puntos, raya),
|
| 183 |
-
se cierra la modalidad ANTES del separador.
|
| 184 |
-
- Maneja abridores ¿/¡ iniciando tramo nuevo.
|
| 185 |
"""
|
| 186 |
if not MODAL_SUFFIX_ENABLE:
|
| 187 |
return tokens
|
| 188 |
|
| 189 |
-
out
|
|
|
|
| 190 |
pending_end = None
|
| 191 |
mode = None # "?" / "!"
|
| 192 |
|
| 193 |
def _emit(end_override=None, also_append=None):
|
| 194 |
nonlocal buf, mode, pending_end, out
|
| 195 |
-
local = [t for t in buf if t not in ("¿",
|
| 196 |
if local:
|
| 197 |
end_tok = end_override or ("?" if mode == "?" else "!" if mode == "!" else pending_end or ".")
|
| 198 |
out.extend(local)
|
|
@@ -201,64 +223,62 @@ def strip_modal_suffixes_ni(tokens):
|
|
| 201 |
if also_append:
|
| 202 |
out.append(also_append)
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
|
|
|
| 206 |
if t in ("¿", "¡"):
|
| 207 |
-
_emit()
|
| 208 |
-
mode = "?" if t == "¿" else "!"
|
| 209 |
continue
|
| 210 |
-
#
|
| 211 |
if t in ("?", "!"):
|
| 212 |
-
pending_end = t
|
| 213 |
-
|
| 214 |
-
continue
|
| 215 |
-
# finales de oración
|
| 216 |
if t in SENT_END:
|
| 217 |
-
pending_end = t
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
| 224 |
|
| 225 |
-
#
|
| 226 |
m = re.search(r"-(na|ba)$", (t or "").lower())
|
| 227 |
if m:
|
| 228 |
-
if mode and buf:
|
| 229 |
-
_emit() # cierra tramo anterior antes de cambiar de modo
|
| 230 |
mode = "?" if m.group(1) == "na" else "!"
|
| 231 |
t = t[:-len(m.group(0))]
|
| 232 |
|
| 233 |
if t:
|
| 234 |
buf.append(t)
|
| 235 |
|
| 236 |
-
|
| 237 |
-
if len(out) >= 2 and out[-1] == "." and out[-2] == ".":
|
| 238 |
-
out.pop()
|
| 239 |
return out
|
| 240 |
|
| 241 |
def add_inverted_openers(tokens):
|
| 242 |
-
"""Inserta ¿/¡ al inicio
|
| 243 |
out = tokens[:]
|
| 244 |
START_BREAKS = SENT_END | CLAUSE_BREAKS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
i = 0
|
| 246 |
while i < len(out):
|
| 247 |
if out[i] in ("?", "!"):
|
| 248 |
-
closer = out[i]
|
| 249 |
-
|
| 250 |
-
# busca el inicio del tramo: tras el último fin de oración o separador fuerte
|
| 251 |
j = i - 1
|
| 252 |
-
while j >= 0 and
|
| 253 |
j -= 1
|
| 254 |
start = j + 1
|
| 255 |
-
# saltar abridores tipo «(“[
|
| 256 |
k = start
|
| 257 |
while k < i and out[k] in WRAP_PREFIX:
|
| 258 |
k += 1
|
| 259 |
if not (k < len(out) and out[k] == opener):
|
| 260 |
-
out.insert(k, opener)
|
| 261 |
-
i += 1 # avanzamos por el insert
|
| 262 |
i += 1
|
| 263 |
return out
|
| 264 |
|
|
@@ -328,7 +348,7 @@ SYL_FOR = {
|
|
| 328 |
"b":["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"],
|
| 329 |
"d":["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"],
|
| 330 |
"t":["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"],
|
| 331 |
-
"g":["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"],
|
| 332 |
"k":["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]
|
| 333 |
}
|
| 334 |
ALPHA_FOR={"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›",
|
|
@@ -430,24 +450,20 @@ def load_bi_strict_and_diagnose():
|
|
| 430 |
es = lower(es_orig)
|
| 431 |
ni = lower(ni_orig)
|
| 432 |
|
| 433 |
-
# Frases (claves con espacios)
|
| 434 |
if " " in es:
|
| 435 |
ESPHRASE2NI[es] = (ni_orig, pid)
|
| 436 |
if " " in ni:
|
| 437 |
NIPHRASE2ES[ni] = (es_orig, pid)
|
| 438 |
|
| 439 |
-
# ES -> NI (primera ocurrencia gana)
|
| 440 |
if es in ES2NI: dup_es += 1
|
| 441 |
else: ES2NI[es] = (ni_orig, pid)
|
| 442 |
|
| 443 |
-
# NI -> ES (primera ocurrencia gana)
|
| 444 |
if ni in NI2ES: dup_ni += 1
|
| 445 |
else: NI2ES[ni] = (es_orig, pid)
|
| 446 |
|
| 447 |
base_rows.append((es_orig, ni_orig, pid, flags))
|
| 448 |
rows += 1
|
| 449 |
|
| 450 |
-
# Expansiones controladas
|
| 451 |
if EXPANSION_ENABLE:
|
| 452 |
for es_orig, ni_orig, pid, flags in base_rows:
|
| 453 |
if not flags: continue
|
|
@@ -456,15 +472,12 @@ def load_bi_strict_and_diagnose():
|
|
| 456 |
pl_key = lower(pl)
|
| 457 |
if pl_key not in ES2NI:
|
| 458 |
ES2NI[pl_key] = (ni_orig, pid)
|
| 459 |
-
exp_plurals += 1
|
| 460 |
if _has_flag(flags, FLAG_3PL):
|
| 461 |
p3 = _present_3pl_from_3sg(es_orig)
|
| 462 |
p3_key = lower(p3)
|
| 463 |
if p3_key not in ES2NI:
|
| 464 |
ES2NI[p3_key] = (ni_orig, pid)
|
| 465 |
-
exp_3pl += 1
|
| 466 |
|
| 467 |
-
# back-map check
|
| 468 |
for es_low, (ni_surf, _) in ES2NI.items():
|
| 469 |
ni_low = lower(ni_surf)
|
| 470 |
back = NI2ES.get(ni_low)
|
|
@@ -488,10 +501,7 @@ def load_bi_strict_and_diagnose():
|
|
| 488 |
if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
|
| 489 |
if mismatch_backmap:
|
| 490 |
print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
|
| 491 |
-
if EXPANSION_ENABLE:
|
| 492 |
-
print(f"[INFO] Expansiones: +{exp_plurals:,} plurales; +{exp_3pl:,} 3ª pl. (desde flags).")
|
| 493 |
|
| 494 |
-
# HTML de diagnóstico
|
| 495 |
sam_html = ""
|
| 496 |
if mismatch_samples:
|
| 497 |
sam_rows = "".join(
|
|
@@ -508,8 +518,6 @@ def load_bi_strict_and_diagnose():
|
|
| 508 |
ES únicas (tras expansiones): <b>{es_unique:,}</b> | NI únicas: <b>{ni_unique:,}</b> | pair_id únicos: <b>{pid_unique:,}</b><br>
|
| 509 |
Duplicados ES: <b>{dup_es:,}</b> | Duplicados NI: <b>{dup_ni:,}</b> | Sin pair_id: <b>{empty_pid:,}</b><br>
|
| 510 |
Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b><br>
|
| 511 |
-
Expansiones por flags: <b>+{exp_plurals:,}</b> plurales, <b>+{exp_3pl:,}</b> 3ª pl. presente.
|
| 512 |
-
{sam_html}
|
| 513 |
<hr style="border:0;border-top:1px solid #caa">
|
| 514 |
<small>Regla: el motor usa <b>sólo</b> tablas 1:1 y expansiones <b>explícitas por bandera</b> (flags) en el CSV.
|
| 515 |
Nada “adivina”.</small>
|
|
@@ -546,14 +554,13 @@ def sentence_case_spanish(s: str) -> str:
|
|
| 546 |
|
| 547 |
if not in_br and start:
|
| 548 |
if ch.isspace():
|
| 549 |
-
out.append(ch)
|
| 550 |
elif ch in WRAPS:
|
| 551 |
-
out.append(ch)
|
| 552 |
elif ch.isalpha():
|
| 553 |
out.append(ch.upper()); start = False
|
| 554 |
else:
|
| 555 |
out.append(ch)
|
| 556 |
-
# un no-letra puede seguir siendo inicio si también es ¿/¡
|
| 557 |
start = ch in "¿¡"
|
| 558 |
else:
|
| 559 |
out.append(ch)
|
|
@@ -568,26 +575,20 @@ def sentence_case_spanish(s: str) -> str:
|
|
| 568 |
return "".join(out)
|
| 569 |
|
| 570 |
def postprocess_spanish(s: str) -> str:
|
| 571 |
-
#
|
| 572 |
-
s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)
|
| 573 |
-
s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)
|
| 574 |
-
#
|
| 575 |
s = re.sub(r"\s{2,}", " ", s)
|
| 576 |
-
# quitar espacio ANTES de .,;:!? (refuerzo)
|
| 577 |
s = re.sub(r"\s+([,.;:!?])", r"\1", s)
|
| 578 |
-
# añadir espacio DESPUÉS de .,;:!? cuando viene letra/dígito
|
| 579 |
s = re.sub(r"([?!.:,;])([^\s])", r"\1 \2", s)
|
| 580 |
-
# no meter espacio después de signo de apertura invertido
|
| 581 |
s = re.sub(r"([¿¡])\s+", r"\1", s)
|
| 582 |
-
|
| 583 |
-
s = sentence_case_spanish(s)
|
| 584 |
-
return s.strip()
|
| 585 |
|
| 586 |
# ====== Traducción BI estricta ======
|
| 587 |
def translate_es_to_ni_bi(text:str):
|
| 588 |
toks = simple_tokenize(text)
|
| 589 |
|
| 590 |
-
# NGRAM ES→NI
|
| 591 |
out=[]; ib_toks=[]
|
| 592 |
i=0
|
| 593 |
while i < len(toks):
|
|
@@ -600,8 +601,7 @@ def translate_es_to_ni_bi(text:str):
|
|
| 600 |
if span > 1:
|
| 601 |
out.append(ni_surface)
|
| 602 |
ib_toks.append(georgeos_keys(tokens_from_latin(ni_surface), ni_surface))
|
| 603 |
-
i += span
|
| 604 |
-
continue
|
| 605 |
key = lower(t)
|
| 606 |
if key in ES2NI:
|
| 607 |
ni = ES2NI[key][0]
|
|
@@ -614,7 +614,6 @@ def translate_es_to_ni_bi(text:str):
|
|
| 614 |
out.append(ph); ib_toks.append(ph)
|
| 615 |
i += 1
|
| 616 |
|
| 617 |
-
# Modalidad -na/-ba + NI sin ¿?¡!
|
| 618 |
if MODAL_SUFFIX_ENABLE:
|
| 619 |
out = add_modal_suffixes_es2ni(out)
|
| 620 |
ib_toks = []
|
|
@@ -631,7 +630,6 @@ def translate_es_to_ni_bi(text:str):
|
|
| 631 |
def translate_ni_to_es_bi(text:str):
|
| 632 |
toks = simple_tokenize(text)
|
| 633 |
|
| 634 |
-
# Modalidad: acepta -na/-ba; segmenta y cierra antes de separadores fuertes
|
| 635 |
if MODAL_SUFFIX_ENABLE:
|
| 636 |
toks = strip_modal_suffixes_ni(toks)
|
| 637 |
|
|
@@ -643,12 +641,9 @@ def translate_ni_to_es_bi(text:str):
|
|
| 643 |
out.append(t); i+=1; continue
|
| 644 |
if is_placeholder(t):
|
| 645 |
out.append(t); i+=1; continue
|
| 646 |
-
# NGRAM NI→ES
|
| 647 |
span, es_surface = _longest_match(toks, i, NIPHRASE2ES)
|
| 648 |
if span > 1:
|
| 649 |
-
out.append(es_surface)
|
| 650 |
-
i += span
|
| 651 |
-
continue
|
| 652 |
|
| 653 |
key = lower(t)
|
| 654 |
if key in NI2ES:
|
|
@@ -664,7 +659,7 @@ def translate_ni_to_es_bi(text:str):
|
|
| 664 |
out = add_inverted_openers(out)
|
| 665 |
|
| 666 |
es_text = detokenize(out)
|
| 667 |
-
es_text = postprocess_spanish(es_text)
|
| 668 |
return es_text
|
| 669 |
|
| 670 |
# ====== Diagnóstico ======
|
|
@@ -1005,12 +1000,19 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue
|
|
| 1005 |
# ====== smoke opcional ======
|
| 1006 |
def _symmetry_smoketest():
|
| 1007 |
print("\n[SMOKE] Prueba ES↔NI (BI-estricto)…")
|
| 1008 |
-
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1014 |
|
| 1015 |
if DEBUG_MODE:
|
| 1016 |
_symmetry_smoketest()
|
|
@@ -1021,4 +1023,3 @@ if __name__ == "__main__":
|
|
| 1021 |
|
| 1022 |
|
| 1023 |
|
| 1024 |
-
|
|
|
|
| 69 |
# Integer or decimal literal; the fractional part may be introduced by "." or ",".
_num_re = re.compile(r"^\d+([.,]\d+)?$")


def is_number(tok: str) -> bool:
    """Tell whether *tok* is a plain number such as ``12``, ``12.5`` or ``12,75``.

    ``None`` and the empty string count as non-numeric.
    """
    candidate = tok if tok else ""
    return _num_re.fullmatch(candidate) is not None
|
| 71 |
|
| 72 |
+
# --- clause separators + atomic placeholders ---
# Strong cuts inside a sentence (they do not end the sentence).
CLAUSE_BREAKS = set(",;—–:")
# One whole bracketed token with non-empty content, e.g. "[SIN-LEX:Tomás]".
PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")


def is_placeholder(tok: str) -> bool:
    """Tell whether *tok* is a single ``[...]`` placeholder token.

    ``None`` and the empty string are never placeholders.
    """
    found = PLACEHOLDER_RE.match(tok if tok else "")
    return found is not None
|
| 77 |
|
| 78 |
+
def _restore_brk(tok, protected):
    """Map a ``__BRKn__`` sentinel back to the bracketed text it stands for.

    Accepts the bare sentinel as well as one carrying a modal suffix
    (``__BRKn__-na`` / ``__BRKn__-ba``); the suffix is re-attached to the
    restored text, e.g. ``'[SIN-LEX:Tomás]-na'``.

    Args:
        tok: Token to inspect; ``None`` is matched as an empty string.
        protected: Sequence of the original bracketed blocks, indexed by ``n``.

    Returns:
        The restored token, or *tok* unchanged when it is not a sentinel or
        when ``n`` has no entry in *protected*.
    """
    m = re.fullmatch(r"__BRK(\d+)__(?:-(na|ba))?", tok or "")
    if not m:
        return tok
    idx = int(m.group(1))
    if idx >= len(protected):
        # Unknown index: hand the token back untouched. Re-attaching the
        # suffix here would duplicate it, because *tok* already carries it.
        return tok
    suffix = m.group(2)
    return protected[idx] + (f"-{suffix}" if suffix else "")
|
| 89 |
+
|
| 90 |
def simple_tokenize(text:str):
|
| 91 |
+
"""Tokenización mínima, sin romper [ ... ] ni [ ... ]-na/-ba."""
|
| 92 |
if not text:
|
| 93 |
return []
|
| 94 |
protected = []
|
|
|
|
| 97 |
protected.append(m.group(0))
|
| 98 |
return key
|
| 99 |
|
| 100 |
+
# protegemos bloques [ ... ]
|
| 101 |
t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
|
| 102 |
t = re.sub(r"\s+"," ", t)
|
| 103 |
t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
|
| 104 |
toks = [tok for tok in t.split() if tok]
|
| 105 |
|
| 106 |
+
# restaura bloques protegidos (con soporte -na/-ba adheridos)
|
| 107 |
for i, tok in enumerate(toks):
|
| 108 |
+
# si viene pegado el sufijo modal, no se habrá restaurado; hacemos la restauración robusta
|
| 109 |
+
if tok.startswith("__BRK") and "__" in tok:
|
| 110 |
+
toks[i] = _restore_brk(tok, protected)
|
| 111 |
return toks
|
| 112 |
|
| 113 |
def detokenize(tokens):
|
|
|
|
| 124 |
|
| 125 |
# ====== Basque-like modality (-na / -ba) ======
# Configuration switches
MODAL_SUFFIX_ENABLE = True
MODAL_ONLY_ON_FINITE = True
MODAL_STRIP_QE_IN_NI = True

# Sets and lookup helpers
# Tokens that end a sentence.
SENT_END = set(".!?…")
# Closing mark -> matching Spanish opening mark.
OPEN_FOR = {closer: opener for closer, opener in (("?", "¿"), ("!", "¡"))}
# Wrapper tokens (quotes / brackets) that may precede the first word of a span.
WRAP_PREFIX = {"«", "“", "‘", "(", "[", "{", "\"", "'"}
# Person endings (2nd person may show up as -zuk or -zuek).
PERS_ENDINGS = (
    "-n",
    "-zu",
    "-gu",
    "-zuk",
    "-zuek",
    "-k",
)
# NOTE(review): presumably the markers that identify a finite verb form — confirm.
TAM_FINITE = (
    "-ke",
    "-bo",
    "-ta",
    "-ni",
    "-tu",
)
|
| 137 |
|
| 138 |
def looks_like_finite_ni(tok:str)->bool:
|
| 139 |
t = (tok or "").lower()
|
|
|
|
| 151 |
return i if i >= start else -1
|
| 152 |
|
| 153 |
def strip_qe_punct(tokens):
    """Return a new list of *tokens* without the marks ¿ ? ¡ !."""
    marks = ("¿", "?", "¡", "!")
    kept = []
    for tok in tokens:
        if tok in marks:
            continue
        kept.append(tok)
    return kept
|
| 155 |
|
| 156 |
+
# --- numeric helpers so that decimals/times are not split ---
def _is_numeric_comma(tokens, i):
    """Tell whether ``tokens[i]`` is a comma sitting between two numbers.

    Both neighbours must satisfy ``is_number``; a comma in the first or last
    position never qualifies.
    """
    if not 0 < i < len(tokens) - 1:
        return False
    if tokens[i] != ",":
        return False
    return is_number(tokens[i - 1]) and is_number(tokens[i + 1])
|
| 160 |
+
|
| 161 |
+
def _is_time_colon(tokens, i):
    """Tell whether ``tokens[i]`` is a colon sitting between two numbers (18:30).

    Both neighbours must satisfy ``is_number``; a colon in the first or last
    position never qualifies.
    """
    if not 0 < i < len(tokens) - 1:
        return False
    if tokens[i] != ":":
        return False
    return is_number(tokens[i - 1]) and is_number(tokens[i + 1])
|
| 164 |
+
|
| 165 |
+
def _is_true_clause_break(tokens, i):
    """Tell whether ``tokens[i]`` is a clause separator that really splits clauses.

    A token from ``CLAUSE_BREAKS`` qualifies unless it is a comma or colon
    between two numbers (decimal or time of day).
    """
    if tokens[i] not in CLAUSE_BREAKS:
        return False
    return not (_is_numeric_comma(tokens, i) or _is_time_colon(tokens, i))
|
| 170 |
+
|
| 171 |
def add_modal_suffixes_es2ni(tokens):
|
| 172 |
+
"""Añade -na/-ba al último verbo finito (o último constituyente) por oración."""
|
| 173 |
if not MODAL_SUFFIX_ENABLE:
|
| 174 |
return tokens
|
| 175 |
out = tokens[:]
|
|
|
|
| 179 |
while i < n:
|
| 180 |
if out[i] in ("?", "!"):
|
| 181 |
closer = out[i]
|
|
|
|
| 182 |
target = -1
|
| 183 |
j = i - 1
|
| 184 |
while j >= sent_start:
|
|
|
|
| 191 |
suf = "na" if closer == "?" else "ba"
|
| 192 |
if not re.search(rf"-(?:{suf})$", out[target].lower()):
|
| 193 |
out[target] = out[target] + "-" + suf
|
|
|
|
| 194 |
sent_start = i + 1
|
| 195 |
elif out[i] in SENT_END:
|
| 196 |
sent_start = i + 1
|
|
|
|
| 201 |
|
| 202 |
def strip_modal_suffixes_ni(tokens):
|
| 203 |
"""
|
| 204 |
+
Interpreta -na/-ba como modalidad; cierra antes de separadores fuertes,
|
| 205 |
+
excepto cuando la coma/“:” son numéricos (12,75 / 18:30).
|
|
|
|
|
|
|
|
|
|
| 206 |
"""
|
| 207 |
if not MODAL_SUFFIX_ENABLE:
|
| 208 |
return tokens
|
| 209 |
|
| 210 |
+
out = []
|
| 211 |
+
buf = []
|
| 212 |
pending_end = None
|
| 213 |
mode = None # "?" / "!"
|
| 214 |
|
| 215 |
def _emit(end_override=None, also_append=None):
|
| 216 |
nonlocal buf, mode, pending_end, out
|
| 217 |
+
local = [t for t in buf if t not in ("¿","?","¡","!")]
|
| 218 |
if local:
|
| 219 |
end_tok = end_override or ("?" if mode == "?" else "!" if mode == "!" else pending_end or ".")
|
| 220 |
out.extend(local)
|
|
|
|
| 223 |
if also_append:
|
| 224 |
out.append(also_append)
|
| 225 |
|
| 226 |
+
toks = tokens + ["."]
|
| 227 |
+
for i, t in enumerate(toks):
|
| 228 |
+
# Abridores explícitos
|
| 229 |
if t in ("¿", "¡"):
|
| 230 |
+
_emit(); mode = "?" if t == "¿" else "!"
|
|
|
|
| 231 |
continue
|
| 232 |
+
# Cierres explícitos
|
| 233 |
if t in ("?", "!"):
|
| 234 |
+
pending_end = t; _emit(); continue
|
| 235 |
+
# Final de oración
|
|
|
|
|
|
|
| 236 |
if t in SENT_END:
|
| 237 |
+
pending_end = t; _emit(); continue
|
| 238 |
+
# Separadores fuertes (no numéricos)
|
| 239 |
+
if t in CLAUSE_BREAKS and mode in ("?","!"):
|
| 240 |
+
if not _is_true_clause_break(toks, i):
|
| 241 |
+
# es decimal/hora -> no cerrar
|
| 242 |
+
pass
|
| 243 |
+
else:
|
| 244 |
+
_emit(also_append=t); continue
|
| 245 |
|
| 246 |
+
# Sufijos -na/-ba (en cualquier token, incl. placeholders)
|
| 247 |
m = re.search(r"-(na|ba)$", (t or "").lower())
|
| 248 |
if m:
|
| 249 |
+
if mode and buf: _emit()
|
|
|
|
| 250 |
mode = "?" if m.group(1) == "na" else "!"
|
| 251 |
t = t[:-len(m.group(0))]
|
| 252 |
|
| 253 |
if t:
|
| 254 |
buf.append(t)
|
| 255 |
|
| 256 |
+
if len(out) >= 2 and out[-1] == "." and out[-2] == ".": out.pop()
|
|
|
|
|
|
|
| 257 |
return out
|
| 258 |
|
| 259 |
def add_inverted_openers(tokens):
    """Insert the Spanish opening mark (¿/¡) at the start of every span closed by ?/!.

    A span starts right after the nearest preceding sentence end or genuine
    clause separator; commas and colons that ``_is_true_clause_break`` rejects
    as numeric (12,75 / 18:30) do not start a span. Leading wrapper tokens
    (quotes, brackets) are skipped so the opener is placed after them. Nothing
    is inserted when the matching opener is already in that position.

    Args:
        tokens: Token list; it is copied, not modified.

    Returns:
        A new token list with the missing openers added.
    """
    out = tokens[:]

    def _is_true_start_break(idx):
        # Sentence ends always delimit; clause separators only when genuine.
        if out[idx] in SENT_END:
            return True
        if out[idx] in CLAUSE_BREAKS:
            return _is_true_clause_break(out, idx)
        return False

    i = 0
    while i < len(out):
        if out[i] in ("?", "!"):
            opener = OPEN_FOR[out[i]]
            # Walk back to the nearest sentence end or genuine clause separator.
            j = i - 1
            while j >= 0 and not _is_true_start_break(j):
                j -= 1
            k = j + 1
            while k < i and out[k] in WRAP_PREFIX:
                k += 1
            if not (k < len(out) and out[k] == opener):
                out.insert(k, opener)
                i += 1  # the closer moved one slot to the right
        i += 1
    return out
|
| 284 |
|
|
|
|
| 348 |
"b":["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"],
|
| 349 |
"d":["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"],
|
| 350 |
"t":["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"],
|
| 351 |
+
"g":["‹GA›","‹GE›","‹GI›","‹DO›","‹GU›"] if False else ["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"],
|
| 352 |
"k":["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]
|
| 353 |
}
|
| 354 |
ALPHA_FOR={"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›",
|
|
|
|
| 450 |
es = lower(es_orig)
|
| 451 |
ni = lower(ni_orig)
|
| 452 |
|
|
|
|
| 453 |
if " " in es:
|
| 454 |
ESPHRASE2NI[es] = (ni_orig, pid)
|
| 455 |
if " " in ni:
|
| 456 |
NIPHRASE2ES[ni] = (es_orig, pid)
|
| 457 |
|
|
|
|
| 458 |
if es in ES2NI: dup_es += 1
|
| 459 |
else: ES2NI[es] = (ni_orig, pid)
|
| 460 |
|
|
|
|
| 461 |
if ni in NI2ES: dup_ni += 1
|
| 462 |
else: NI2ES[ni] = (es_orig, pid)
|
| 463 |
|
| 464 |
base_rows.append((es_orig, ni_orig, pid, flags))
|
| 465 |
rows += 1
|
| 466 |
|
|
|
|
| 467 |
if EXPANSION_ENABLE:
|
| 468 |
for es_orig, ni_orig, pid, flags in base_rows:
|
| 469 |
if not flags: continue
|
|
|
|
| 472 |
pl_key = lower(pl)
|
| 473 |
if pl_key not in ES2NI:
|
| 474 |
ES2NI[pl_key] = (ni_orig, pid)
|
|
|
|
| 475 |
if _has_flag(flags, FLAG_3PL):
|
| 476 |
p3 = _present_3pl_from_3sg(es_orig)
|
| 477 |
p3_key = lower(p3)
|
| 478 |
if p3_key not in ES2NI:
|
| 479 |
ES2NI[p3_key] = (ni_orig, pid)
|
|
|
|
| 480 |
|
|
|
|
| 481 |
for es_low, (ni_surf, _) in ES2NI.items():
|
| 482 |
ni_low = lower(ni_surf)
|
| 483 |
back = NI2ES.get(ni_low)
|
|
|
|
| 501 |
if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
|
| 502 |
if mismatch_backmap:
|
| 503 |
print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
|
|
|
|
|
|
|
| 504 |
|
|
|
|
| 505 |
sam_html = ""
|
| 506 |
if mismatch_samples:
|
| 507 |
sam_rows = "".join(
|
|
|
|
| 518 |
ES únicas (tras expansiones): <b>{es_unique:,}</b> | NI únicas: <b>{ni_unique:,}</b> | pair_id únicos: <b>{pid_unique:,}</b><br>
|
| 519 |
Duplicados ES: <b>{dup_es:,}</b> | Duplicados NI: <b>{dup_ni:,}</b> | Sin pair_id: <b>{empty_pid:,}</b><br>
|
| 520 |
Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b><br>
|
|
|
|
|
|
|
| 521 |
<hr style="border:0;border-top:1px solid #caa">
|
| 522 |
<small>Regla: el motor usa <b>sólo</b> tablas 1:1 y expansiones <b>explícitas por bandera</b> (flags) en el CSV.
|
| 523 |
Nada “adivina”.</small>
|
|
|
|
| 554 |
|
| 555 |
if not in_br and start:
|
| 556 |
if ch.isspace():
|
| 557 |
+
out.append(ch)
|
| 558 |
elif ch in WRAPS:
|
| 559 |
+
out.append(ch)
|
| 560 |
elif ch.isalpha():
|
| 561 |
out.append(ch.upper()); start = False
|
| 562 |
else:
|
| 563 |
out.append(ch)
|
|
|
|
| 564 |
start = ch in "¿¡"
|
| 565 |
else:
|
| 566 |
out.append(ch)
|
|
|
|
| 575 |
return "".join(out)
|
| 576 |
|
| 577 |
def postprocess_spanish(s: str) -> str:
    """Tidy spacing around punctuation in detokenized Spanish text.

    Steps, in order: glue times and decimals back together (``18 : 30`` ->
    ``18:30``, ``12 , 75`` -> ``12,75``), collapse whitespace runs, drop
    whitespace before ``, . ; : ! ?``, add a space after those marks when a
    non-space character follows, drop whitespace after ``¿``/``¡``, then apply
    ``sentence_case_spanish`` and strip both ends.

    Args:
        s: Text to clean up.

    Returns:
        The cleaned-up text.
    """
    # Compact times and decimals.
    s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)
    s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)
    # Whitespace and punctuation marks.
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    # Add the missing space after a mark, but leave digit-mark-digit alone:
    # spacing it would undo the compaction above ("12,75" -> "12, 75").
    s = re.sub(
        r"(?<=\d)([.:,])(?=\d)|([?!.:,;])(\S)",
        lambda m: m.group(0) if m.group(1) else f"{m.group(2)} {m.group(3)}",
        s,
    )
    s = re.sub(r"([¿¡])\s+", r"\1", s)
    return sentence_case_spanish(s).strip()
|
|
|
|
|
|
|
| 587 |
|
| 588 |
# ====== Traducción BI estricta ======
|
| 589 |
def translate_es_to_ni_bi(text:str):
|
| 590 |
toks = simple_tokenize(text)
|
| 591 |
|
|
|
|
| 592 |
out=[]; ib_toks=[]
|
| 593 |
i=0
|
| 594 |
while i < len(toks):
|
|
|
|
| 601 |
if span > 1:
|
| 602 |
out.append(ni_surface)
|
| 603 |
ib_toks.append(georgeos_keys(tokens_from_latin(ni_surface), ni_surface))
|
| 604 |
+
i += span; continue
|
|
|
|
| 605 |
key = lower(t)
|
| 606 |
if key in ES2NI:
|
| 607 |
ni = ES2NI[key][0]
|
|
|
|
| 614 |
out.append(ph); ib_toks.append(ph)
|
| 615 |
i += 1
|
| 616 |
|
|
|
|
| 617 |
if MODAL_SUFFIX_ENABLE:
|
| 618 |
out = add_modal_suffixes_es2ni(out)
|
| 619 |
ib_toks = []
|
|
|
|
| 630 |
def translate_ni_to_es_bi(text:str):
|
| 631 |
toks = simple_tokenize(text)
|
| 632 |
|
|
|
|
| 633 |
if MODAL_SUFFIX_ENABLE:
|
| 634 |
toks = strip_modal_suffixes_ni(toks)
|
| 635 |
|
|
|
|
| 641 |
out.append(t); i+=1; continue
|
| 642 |
if is_placeholder(t):
|
| 643 |
out.append(t); i+=1; continue
|
|
|
|
| 644 |
span, es_surface = _longest_match(toks, i, NIPHRASE2ES)
|
| 645 |
if span > 1:
|
| 646 |
+
out.append(es_surface); i += span; continue
|
|
|
|
|
|
|
| 647 |
|
| 648 |
key = lower(t)
|
| 649 |
if key in NI2ES:
|
|
|
|
| 659 |
out = add_inverted_openers(out)
|
| 660 |
|
| 661 |
es_text = detokenize(out)
|
| 662 |
+
es_text = postprocess_spanish(es_text)
|
| 663 |
return es_text
|
| 664 |
|
| 665 |
# ====== Diagnóstico ======
|
|
|
|
| 1000 |
# ====== smoke opcional ======
|
| 1001 |
def _symmetry_smoketest():
    """Print a NI -> ES -> NI round trip for a few fixed probe sentences.

    Output goes to stdout only; nothing is asserted or returned.
    """
    print("\n[SMOKE] Prueba ES↔NI (BI-estricto)…")
    probes = (
        "nuker-ke ni etxe-ka ?",             # simple modal
        "¿Pagaste 12,75 en la cafetería?",   # decimal
        "Marta llega a las 18:30.",          # time of day
        "[SIN-LEX:Tomás]-na euŕak-ke !",     # placeholder + -na + explicit closer
    )
    for probe in probes:
        spanish = translate_ni_to_es_bi(probe)
        # Only the first element of the returned pair is displayed.
        round_trip, _ignored = translate_es_to_ni_bi(spanish)
        print(" IN:", probe)
        print(" ES:", spanish)
        print(" NI:", round_trip)
        print("---")
|
| 1016 |
|
| 1017 |
if DEBUG_MODE:
|
| 1018 |
_symmetry_smoketest()
|
|
|
|
| 1023 |
|
| 1024 |
|
| 1025 |
|
|
|