Spanish_NeoIberianNewDesign2

Sleeping

App Files Files Community

LoloSemper committed on May 19

Commit

7d48925

verified ·

1 Parent(s): 62f55fb

Upload 2 files

Browse files

Files changed (2) hide show

153_toponimos_atestiguados.csv.gz +3 -0
app.py +11 -12

153_toponimos_atestiguados.csv.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a70a00347f78a55c73dff59597fbe2777f48e908c0d6c93301abf37afd040b5
+size 1189

app.py CHANGED Viewed

@@ -51,7 +51,6 @@ def _cand(*names):
 # Prioriza los "master/surface-ready"; luego retrocompatibles
 CSV_BI = _cand(
-    "LEXICON_v152_IBERIAN.csv.gz",
     "LEXICON_v86_IBERIAN.csv.gz",
     "LEXICON_v85_IBERIAN.csv.gz",
     "LEXICON_v84_IBERIAN.csv.gz",
@@ -1961,7 +1960,7 @@ _register_ipfv_3s_reverse()
 # Coste medido: ~6 s al arranque (3 ejecuciones: 5.5, 5.5, 7.4 s). Una
 # sola pasada al final, no afecta el bucle interno de override. Corrige
 # ~328K entradas NI2ES (verificado en lex actual con 129 parches).
-VERSION_MARKER = "v137_2026_05_19_lex_consolidado"
 try:
     print(f"[Neoíbero translator] versión cargada: {VERSION_MARKER}", flush=True)
     print(f"[Neoíbero translator] léxico activo: {CSV_BI}", flush=True)
@@ -2326,24 +2325,24 @@ def translate_es_to_ni_bi(text:str):
                 break
         key = lower(t)
-        # v103: detectar NOMBRE PROPIO. Cubre dos casos:
-        #   (a) en mitad de oración (sent_start=False) y capitalizado → propio
-        #       (evita traducir "Marco" como sust. común "marco")
-        #   (b) al inicio o donde sea, capitalizado y NO está en el lex → propio
-        #       (cubre "Aitor", "Bilbilis" tras salto, etc.)
-        # Excluye: inicio de oración con palabra capitalizada que SÍ está en
-        # el lex (típico inicio normal "Te escribo..." donde "Te" debe traducirse).
         key_in_lex = key in ES2NI or fold(key) in ES_FOLD
         is_proper_noun = (
             len(t) >= 2
             and t[0].isupper()
             and not t.isupper()
             and t.isalpha()
-            and (not sent_start or not key_in_lex)
         )
         if is_proper_noun:
-            ph = f"[{t}]"
-            out.append(ph); ib_toks.append(ph)
             prev_key = key
             left_context.append(t)
             i += 1

 # Prioriza los "master/surface-ready"; luego retrocompatibles
 CSV_BI = _cand(
     "LEXICON_v86_IBERIAN.csv.gz",
     "LEXICON_v85_IBERIAN.csv.gz",
     "LEXICON_v84_IBERIAN.csv.gz",
 # Coste medido: ~6 s al arranque (3 ejecuciones: 5.5, 5.5, 7.4 s). Una
 # sola pasada al final, no afecta el bucle interno de override. Corrige
 # ~328K entradas NI2ES (verificado en lex actual con 129 parches).
+VERSION_MARKER = "v138_2026_05_19_propios_sin_corchete"
 try:
     print(f"[Neoíbero translator] versión cargada: {VERSION_MARKER}", flush=True)
     print(f"[Neoíbero translator] léxico activo: {CSV_BI}", flush=True)
                 break
         key = lower(t)
+        # v104: detectar NOMBRE PROPIO / TOPÓNIMO NO ATESTIGUADO.
+        # Si la palabra capitalizada NO está en el lex, devolverla TAL CUAL
+        # (sin corchete), porque puede ser un nombre propio (Ana, Aitor) o
+        # un topónimo no atestiguado (Madrid, Valencia). El lex contiene
+        # solo topónimos atestiguados, así que si está en el lex, se traduce.
+        # Para casos de inicio de oración con palabra en lex (típico inicio
+        # normal: "Te escribo..."), siempre traducir.
         key_in_lex = key in ES2NI or fold(key) in ES_FOLD
         is_proper_noun = (
             len(t) >= 2
             and t[0].isupper()
             and not t.isupper()
             and t.isalpha()
+            and not key_in_lex          # SOLO si NO está en el lex
         )
         if is_proper_noun:
+            # v104: devolver tal cual SIN corchete (era "[t]" antes)
+            out.append(t); ib_toks.append(t)
             prev_key = key
             left_context.append(t)
             i += 1