Spaces:

impresso-project
/

multilingual-static-word-embeddings-demo

Sleeping

App Files Files Community

Maslionok committed on May 26

Commit

a69e0b2

1 Parent(s): 09a837f

cleaned up

Browse files

Files changed (3) hide show

README.md +9 -6
app.py +181 -500
requirements.txt +0 -1

README.md CHANGED Viewed

@@ -1,14 +1,16 @@
 ---
-title: Multilingual Static Word Embeddings Demo
 sdk: gradio
 app_file: app.py
 pinned: false
 ---
-# Multilingual Static Word Embeddings Demo
-This Space loads a saved aligned multilingual embedding space and lets users
-search translations and nearest neighbors with adjustable retrieval parameters.
 Required artifact files:
@@ -20,9 +22,9 @@ The app does not use `aligned_all.vec`.
 ## Runtime configuration
-By default, the app lists the newest artifact folder under:
-`s3://131-component-staging/multilingual-static-word-embeddings/stage-6/`
 Set these Hugging Face Space secrets for S3-compatible storage:
@@ -35,6 +37,7 @@ Optional environment overrides:
 - `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
   `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
 - `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_space_*.json`
 - `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
 Defaults for `top_k`, `min_score`, `csls_k`, `candidate_retrieval_k`,

 ---
+title: Multilingual Dictionary Explorer
 sdk: gradio
 app_file: app.py
 pinned: false
 ---
+# Multilingual Dictionary Explorer
+This Space is a Gradio UI for the same lookup logic as
+`query_multilingual_space.py`: enter a source language and query word, then get
+translations to all other languages using FAISS, CSLS, and optional
+bidirectional consistency.
 Required artifact files:
 ## Runtime configuration
+By default, the app downloads this artifact folder:
+`s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
 Set these Hugging Face Space secrets for S3-compatible storage:
 - `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
   `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
 - `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_space_*.json`
+- `SPACE_DIR`: local artifact folder, useful for local testing
 - `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
 Defaults for `top_k`, `min_score`, `csls_k`, `candidate_retrieval_k`,

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from __future__ import annotations
-import difflib
 import gc
 import json
 import os
@@ -16,7 +15,6 @@ from urllib.parse import urlparse
 import boto3
 import gradio as gr
 import numpy as np
-import pandas as pd
 from botocore.config import Config
@@ -24,33 +22,13 @@ DEFAULT_ARTIFACT_PREFIX = (
     "s3://131-component-staging/"
     "multilingual-static-word-embeddings/stage-6/"
 )
-ARTIFACT_URI_ENV = "SPACE_ARTIFACT_S3_URI"
-ARTIFACT_PREFIX_ENV = "SPACE_ARTIFACT_S3_PREFIX"
-CACHE_ROOT = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
 REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
-DEFAULT_LANGUAGES = ["de", "en", "fr", "lb"]
-TRANSLATION_COLUMNS = [
-    "target_lang",
-    "translation",
-    "token",
-    "score",
-    "cosine",
-    "rank",
-    "bidirectional",
-    "id",
-    "source_vec_file",
-]
-NEIGHBOR_COLUMNS = [
-    "lang",
-    "word",
-    "token",
-    "score",
-    "cosine",
-    "rank",
-    "id",
-]
-VOCAB_COLUMNS = ["id", "lang", "surface", "token", "source_vec_file"]
 @dataclass
@@ -89,7 +67,7 @@ class Space:
 def parse_s3_uri(uri: str) -> tuple[str, str]:
     parsed = urlparse(uri)
     if parsed.scheme != "s3" or not parsed.netloc:
-        raise ValueError(f"Expected s3://bucket/key URI, got: {uri}")
     return parsed.netloc, parsed.path.lstrip("/")
@@ -98,6 +76,7 @@ def make_s3_client():
     secret_key = os.getenv("SE_SECRET_KEY") or os.getenv("AWS_SECRET_ACCESS_KEY")
     endpoint_url = os.getenv("SE_HOST_URL") or os.getenv("AWS_ENDPOINT_URL")
     region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
     if endpoint_url and not endpoint_url.startswith(("http://", "https://")):
         endpoint_url = f"https://{endpoint_url}"
@@ -107,7 +86,7 @@ def make_s3_client():
         "config": Config(
             signature_version="s3v4",
             s3={"addressing_style": "path"},
-            retries={"max_attempts": 5, "mode": "standard"},
         ),
     }
     if endpoint_url:
@@ -119,66 +98,76 @@ def make_s3_client():
     return boto3.client(**kwargs)
-def find_latest_artifact_uri(client) -> str:
-    explicit_uri = os.getenv(ARTIFACT_URI_ENV, "").strip()
-    if explicit_uri:
-        explicit_uri = explicit_uri.rstrip("/")
-        if "multilingual_space_" in explicit_uri:
-            return explicit_uri
-        bucket, prefix = parse_s3_uri(explicit_uri)
-        return find_latest_artifact_uri_under_prefix(client, bucket, prefix)
-    prefix_uri = os.getenv(ARTIFACT_PREFIX_ENV, DEFAULT_ARTIFACT_PREFIX).strip()
-    bucket, prefix = parse_s3_uri(prefix_uri)
-    return find_latest_artifact_uri_under_prefix(client, bucket, prefix)
-def find_latest_artifact_uri_under_prefix(client, bucket: str, prefix: str) -> str:
     prefix = prefix.rstrip("/") + "/"
     pattern = re.compile(r"(.*multilingual_space_(\d{8}_\d{6})\.json)/config\.json$")
     candidates: list[tuple[str, str]] = []
     paginator = client.get_paginator("list_objects_v2")
     for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
         for obj in page.get("Contents", []):
-            key = obj["Key"]
-            match = pattern.match(key)
             if match:
                 candidates.append((match.group(2), match.group(1)))
     if not candidates:
         raise FileNotFoundError(
-            f"No multilingual_space_*.json/config.json found under s3://{bucket}/{prefix}"
         )
-    _, latest_key = sorted(candidates)[-1]
-    return f"s3://{bucket}/{latest_key}"
-def artifact_cache_dir(artifact_uri: str) -> Path:
-    _, key = parse_s3_uri(artifact_uri)
-    name = Path(key.rstrip("/")).name
-    return CACHE_ROOT / name
-def download_artifact() -> tuple[Path, str]:
     client = make_s3_client()
-    artifact_uri = find_latest_artifact_uri(client)
-    local_dir = artifact_cache_dir(artifact_uri)
     local_dir.mkdir(parents=True, exist_ok=True)
-    bucket, key_prefix = parse_s3_uri(artifact_uri)
-    key_prefix = key_prefix.rstrip("/")
     for filename in REQUIRED_FILES:
-        local_path = local_dir / filename
-        if local_path.exists() and local_path.stat().st_size > 0:
             continue
-        key = f"{key_prefix}/{filename}"
-        print(f"Downloading s3://{bucket}/{key} -> {local_path}", file=sys.stderr)
-        client.download_file(bucket, key, str(local_path))
-    return local_dir, artifact_uri
 def strip_diacritics(text: str) -> str:
@@ -202,18 +191,24 @@ def is_good_token(token: str, min_len: int = 4) -> bool:
 def read_config(space_dir: Path) -> dict[str, Any]:
-    with (space_dir / "config.json").open("r", encoding="utf-8") as f:
         return json.load(f)
 def read_metadata(space_dir: Path) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
-    metadata_path = space_dir / "all_metadata.jsonl"
     metadata: list[dict[str, Any] | None] = []
     ids_by_lang: dict[str, list[int]] = {}
-    with metadata_path.open("r", encoding="utf-8") as f:
         for line in f:
-            if not line.strip():
                 continue
             meta = json.loads(line)
             row_id = int(meta["id"])
@@ -261,17 +256,17 @@ def normalize_rows(vecs: np.ndarray) -> np.ndarray:
 def load_vectors_from_faiss(space_dir: Path, ids_by_lang: dict[str, list[int]]) -> dict[str, np.ndarray]:
-    faiss_path = space_dir / "aligned_all.faiss"
     try:
         import faiss  # type: ignore
     except ImportError as exc:
-        raise RuntimeError(
-            "faiss-cpu is required. The Space must install faiss-cpu from requirements.txt."
-        ) from exc
     print(f"Loading FAISS index: {faiss_path}", file=sys.stderr)
     index = faiss.read_index(str(faiss_path))
     vectors_by_lang: dict[str, np.ndarray] = {}
     for lang, ids in sorted(ids_by_lang.items()):
         print(f"Reconstructing {lang}: {len(ids)} vectors", file=sys.stderr)
@@ -288,16 +283,15 @@ def build_lookup(languages: dict[str, LangVectors]) -> dict[str, dict[str, list[
         lang_lookup: dict[str, list[int]] = {}
         for global_id, meta in zip(data.ids.tolist(), data.metas):
             for value in (meta.get("token"), meta.get("surface")):
-                if not value:
-                    continue
-                lang_lookup.setdefault(lookup_key(str(value)), []).append(int(global_id))
         lookup[lang] = lang_lookup
     return lookup
 @lru_cache(maxsize=1)
 def load_space() -> Space:
-    space_dir, artifact_uri = download_artifact()
     config = read_config(space_dir)
     metadata, ids_by_lang = read_metadata(space_dir)
     vectors_by_lang = load_vectors_from_faiss(space_dir, ids_by_lang)
@@ -335,11 +329,12 @@ def load_space() -> Space:
 def default_options(config: dict[str, Any]) -> RuntimeOptions:
     bidi_config = config.get("bidirectional_consistency") or {}
     return RuntimeOptions(
-        top_k=int(config.get("top_k", 3)),
         min_score=float(config.get("min_score", 0.15)),
         csls_k=int(config.get("csls_k", 10)),
-        candidate_retrieval_k=int(config.get("candidate_retrieval_k", 9)),
         csls_prefetch_k=int(config.get("csls_prefetch_k", 50)),
         bidirectional=bool(bidi_config.get("enabled", True)),
         score_method="csls",
@@ -483,63 +478,31 @@ def format_word(meta: dict[str, Any], opts: RuntimeOptions) -> str:
     return str(meta.get("token") or meta.get("surface") or "")
-def suggestions(space: Space, lang: str, query: str, limit: int = 8) -> list[str]:
-    lang_lookup = space.lookup.get(lang, {})
-    key = lookup_key(query)
-    close_keys = difflib.get_close_matches(key, lang_lookup.keys(), n=limit, cutoff=0.72)
-    labels = []
-    for close_key in close_keys:
-        row_id = lang_lookup[close_key][0]
-        meta = get_meta(space, row_id)
-        label = str(meta.get("surface") or meta.get("token") or "")
-        if label and label not in labels:
-            labels.append(label)
-    return labels
 def resolve_query(space: Space, lang: str, query: str) -> tuple[int, dict[str, Any], str]:
     if lang not in space.by_lang:
         raise ValueError(f"Unknown language {lang!r}. Available: {', '.join(space.languages)}")
-    query = query.strip()
-    if not query:
         raise ValueError("Enter a query word.")
     matches = space.lookup.get(lang, {}).get(lookup_key(query), [])
     if not matches:
-        hint = suggestions(space, lang, query)
-        if hint:
-            raise LookupError(f"No exact match. Close matches: {', '.join(hint)}")
         raise LookupError(f"No exact token/surface match for {lang}:{query!r}")
-    row_id = int(matches[0])
     message = ""
     if len(matches) > 1:
-        shown = []
-        for match_id in matches[:5]:
-            meta = get_meta(space, match_id)
-            shown.append(f"{meta.get('surface') or meta.get('token')} (id {match_id})")
-        message = f"Matched {len(matches)} entries; using {shown[0]}."
     return row_id, get_meta(space, row_id), message
-def translation_dataframe() -> pd.DataFrame:
-    return pd.DataFrame(columns=TRANSLATION_COLUMNS)
-def neighbor_dataframe() -> pd.DataFrame:
-    return pd.DataFrame(columns=NEIGHBOR_COLUMNS)
-def vocabulary_dataframe() -> pd.DataFrame:
-    return pd.DataFrame(columns=VOCAB_COLUMNS)
-def translate_ui(
     query: str,
     source_lang: str,
-    target_langs: list[str] | None,
     top_k: int,
     min_score: float,
     csls_k: int,
@@ -550,10 +513,9 @@ def translate_ui(
     filter_stopwords: bool,
     filter_bad_tokens: bool,
     use_surface: bool,
-) -> tuple[pd.DataFrame, str]:
     try:
         space = load_space()
-        targets = target_langs or [lang for lang in space.languages if lang != source_lang]
         opts = make_options(
             top_k,
             min_score,
@@ -568,19 +530,25 @@ def translate_ui(
         )
         source_id, source_meta, match_message = resolve_query(space, source_lang, query)
         source_vec = get_vec(space, source_id)
-        rows: list[dict[str, Any]] = []
-        grouped: list[str] = [
-            f"Source: `{source_lang}:{format_word(source_meta, opts)}` "
-            f"(token `{source_meta.get('token')}`, id `{source_id}`)"
         ]
         if match_message:
-            grouped.append(match_message)
-        for target_lang in targets:
-            if target_lang == source_lang or target_lang not in space.by_lang:
-                continue
             candidates = rank_candidates(space, source_vec, source_lang, target_lang, opts)
             kept: list[dict[str, Any]] = []
             for cand in candidates:
                 if opts.bidirectional:
                     reverse = rank_candidates(
@@ -596,348 +564,129 @@ def translate_ui(
                         continue
                 else:
                     cand["bidirectional"] = False
                 kept.append(cand)
                 if len(kept) >= opts.top_k:
                     break
-            if kept:
-                grouped.append(f"\n{target_lang}:")
-                for i, cand in enumerate(kept, 1):
-                    meta = cand["meta"]
-                    word = format_word(meta, opts)
-                    grouped.append(f"{i}. {word} ({cand['score']:.4f})")
-                    rows.append(
-                        {
-                            "target_lang": target_lang,
-                            "translation": word,
-                            "token": meta.get("token"),
-                            "score": round(float(cand["score"]), 6),
-                            "cosine": round(float(cand["cosine"]), 6),
-                            "rank": int(cand["rank"]),
-                            "bidirectional": bool(cand["bidirectional"]),
-                            "id": int(cand["global_id"]),
-                            "source_vec_file": meta.get("source_vec_file"),
-                        }
-                    )
-            else:
-                grouped.append(f"\n{target_lang}: no candidates after filters")
-        return pd.DataFrame(rows, columns=TRANSLATION_COLUMNS), "\n".join(grouped)
-    except Exception as exc:
-        return translation_dataframe(), f"Error: {exc}"
-def nearest_ui(
-    query: str,
-    source_lang: str,
-    neighbor_langs: list[str] | None,
-    top_n: int,
-    min_score: float,
-    csls_k: int,
-    score_method: str,
-    include_source_language: bool,
-    use_surface: bool,
-) -> tuple[pd.DataFrame, str]:
-    try:
-        space = load_space()
-        opts = make_options(
-            top_n,
-            min_score,
-            csls_k,
-            max(top_n + 5, 20),
-            max(top_n + 5, 50),
-            False,
-            score_method,
-            False,
-            False,
-            use_surface,
-        )
-        source_id, source_meta, match_message = resolve_query(space, source_lang, query)
-        source_vec = get_vec(space, source_id)
-        targets = neighbor_langs or space.languages
-        if not include_source_language:
-            targets = [lang for lang in targets if lang != source_lang]
-        rows: list[dict[str, Any]] = []
-        for target_lang in targets:
-            if target_lang not in space.by_lang:
                 continue
-            candidates = rank_candidates(
-                space,
-                source_vec,
-                source_lang,
-                target_lang,
-                opts,
-                apply_filters=False,
-            )
-            for cand in candidates:
-                if int(cand["global_id"]) == source_id:
-                    continue
                 meta = cand["meta"]
                 rows.append(
-                    {
-                        "lang": target_lang,
-                        "word": format_word(meta, opts),
-                        "token": meta.get("token"),
-                        "score": round(float(cand["score"]), 6),
-                        "cosine": round(float(cand["cosine"]), 6),
-                        "rank": int(cand["rank"]),
-                        "id": int(cand["global_id"]),
-                    }
                 )
-                if len([row for row in rows if row["lang"] == target_lang]) >= top_n:
-                    break
-        rows = sorted(rows, key=lambda row: row["score"], reverse=True)
-        status = (
-            f"Source: `{source_lang}:{format_word(source_meta, opts)}` "
-            f"(token `{source_meta.get('token')}`, id `{source_id}`)"
-        )
-        if match_message:
-            status += f"\n\n{match_message}"
-        return pd.DataFrame(rows, columns=NEIGHBOR_COLUMNS), status
     except Exception as exc:
-        return neighbor_dataframe(), f"Error: {exc}"
-def browse_ui(lang: str, filter_text: str, limit: int) -> pd.DataFrame:
-    try:
-        space = load_space()
-        if lang not in space.by_lang:
-            return vocabulary_dataframe()
-        needle = lookup_key(filter_text or "")
-        rows = []
-        for row_id, meta in zip(space.by_lang[lang].ids.tolist(), space.by_lang[lang].metas):
-            surface = str(meta.get("surface") or "")
-            token = str(meta.get("token") or "")
-            if needle and needle not in lookup_key(surface) and needle not in lookup_key(token):
-                continue
-            rows.append(
-                {
-                    "id": int(row_id),
-                    "lang": lang,
-                    "surface": surface,
-                    "token": token,
-                    "source_vec_file": meta.get("source_vec_file"),
-                }
-            )
-            if len(rows) >= int(limit):
-                break
-        return pd.DataFrame(rows, columns=VOCAB_COLUMNS)
-    except Exception:
-        return vocabulary_dataframe()
-def config_markdown(space: Space) -> str:
-    config = space.config
-    vocab_sizes = config.get("vocab_sizes") or {
-        lang: len(space.by_lang[lang].metas) for lang in space.languages
-    }
-    bidi = config.get("bidirectional_consistency") or {}
-    lines = [
-        f"Artifact: `{space.artifact_uri}`",
-        f"Created: `{config.get('created_at', 'unknown')}`",
-        f"Languages: `{', '.join(space.languages)}`",
-        f"Pivot language: `{config.get('pivot_lang', 'unknown')}`",
-        f"Vector dim: `{config.get('vector_dim', 'unknown')}`",
-        f"Top N vocab: `{config.get('top_n_vocab', 'unknown')}`",
-        f"Output top: `{config.get('out_top', 'unknown')}`",
-        f"Default top_k: `{config.get('top_k', 3)}`",
-        f"Default min_score: `{config.get('min_score', 0.15)}`",
-        f"Default csls_k: `{config.get('csls_k', 10)}`",
-        f"Bidirectional consistency: `{bool(bidi.get('enabled', True))}`",
-        "",
-        "Vocabulary sizes:",
-    ]
-    for lang, size in sorted(vocab_sizes.items()):
-        lines.append(f"- `{lang}`: `{size}`")
-    return "\n".join(lines)
-def initialize_ui():
     try:
         space = load_space()
         opts = default_options(space.config)
-        source = space.config.get("pivot_lang", "de")
-        if source not in space.languages:
-            source = space.languages[0]
-        targets = [lang for lang in space.languages if lang != source]
-        status = f"Loaded `{space.artifact_uri}` with `{sum(len(v.metas) for v in space.by_lang.values())}` vectors."
         return (
             status,
-            gr.update(choices=space.languages, value=source),
-            gr.update(choices=space.languages, value=targets),
             opts.top_k,
             opts.min_score,
             opts.csls_k,
             opts.candidate_retrieval_k,
             opts.csls_prefetch_k,
             opts.bidirectional,
-            gr.update(choices=space.languages, value=source),
-            gr.update(choices=space.languages, value=space.languages),
-            opts.csls_k,
-            gr.update(choices=space.languages, value=source),
-            config_markdown(space),
         )
     except Exception as exc:
-        status = f"Load error: {exc}"
         return (
-            status,
-            gr.update(choices=DEFAULT_LANGUAGES, value="de"),
-            gr.update(choices=DEFAULT_LANGUAGES, value=["en", "fr", "lb"]),
             3,
             0.15,
             10,
             9,
             50,
             True,
-            gr.update(choices=DEFAULT_LANGUAGES, value="de"),
-            gr.update(choices=DEFAULT_LANGUAGES, value=DEFAULT_LANGUAGES),
-            10,
-            gr.update(choices=DEFAULT_LANGUAGES, value="de"),
-            status,
-        )
-def update_targets(source_lang: str) -> gr.CheckboxGroup:
-    try:
-        space = load_space()
-        return gr.update(
-            choices=space.languages,
-            value=[lang for lang in space.languages if lang != source_lang],
         )
-    except Exception:
-        return gr.update(
-            choices=DEFAULT_LANGUAGES,
-            value=[lang for lang in DEFAULT_LANGUAGES if lang != source_lang],
-        )
-def update_neighbor_langs(source_lang: str, include_source: bool) -> gr.CheckboxGroup:
-    try:
-        space = load_space()
-        choices = space.languages
-    except Exception:
-        choices = DEFAULT_LANGUAGES
-    values = choices if include_source else [lang for lang in choices if lang != source_lang]
-    return gr.update(choices=choices, value=values)
-css = """
 .app-title h1 { margin-bottom: 0.15rem; }
-.status-line { font-size: 0.9rem; color: #475569; }
 """
-with gr.Blocks(title="Multilingual Static Word Embeddings", css=css) as demo:
     gr.Markdown(
-        "# Multilingual Static Word Embeddings\n"
-        "Search the aligned FAISS space for cross-lingual word neighbors."
     )
-    status_md = gr.Markdown("Loading artifacts...", elem_classes=["status-line"])
-    with gr.Tab("Translate"):
-        with gr.Row():
-            with gr.Column(scale=1, min_width=320):
-                query = gr.Textbox(label="Query word", value="haus")
-                source_lang = gr.Dropdown(
-                    label="Source language",
-                    choices=DEFAULT_LANGUAGES,
-                    value="de",
-                )
-                target_langs = gr.CheckboxGroup(
-                    label="Target languages",
-                    choices=DEFAULT_LANGUAGES,
-                    value=["en", "fr", "lb"],
-                )
-                translate_btn = gr.Button("Search", variant="primary")
-                with gr.Accordion("Retrieval parameters", open=True):
-                    top_k = gr.Slider(1, 20, value=3, step=1, label="Top K")
-                    min_score = gr.Slider(-2.0, 2.0, value=0.15, step=0.01, label="Min score")
-                    score_method = gr.Radio(
-                        ["csls", "cosine"],
-                        value="csls",
-                        label="Score method",
-                    )
-                    csls_k = gr.Slider(1, 50, value=10, step=1, label="CSLS K")
-                    candidate_retrieval_k = gr.Slider(
-                        1,
-                        100,
-                        value=9,
-                        step=1,
-                        label="Candidate retrieval K",
-                    )
-                    csls_prefetch_k = gr.Slider(
-                        10,
-                        500,
-                        value=50,
-                        step=1,
-                        label="CSLS prefetch K",
-                    )
-                    bidirectional = gr.Checkbox(value=True, label="Bidirectional consistency")
-                    filter_stopwords = gr.Checkbox(value=True, label="Filter stopwords")
-                    filter_bad_tokens = gr.Checkbox(value=True, label="Filter noisy tokens")
-                    use_surface = gr.Checkbox(value=True, label="Show surface forms")
-            with gr.Column(scale=2):
-                translate_summary = gr.Markdown()
-                translation_results = gr.Dataframe(
-                    headers=TRANSLATION_COLUMNS,
-                    datatype=["str", "str", "str", "number", "number", "number", "bool", "number", "str"],
-                    interactive=False,
-                    wrap=True,
                 )
-    with gr.Tab("Nearest Neighbors"):
-        with gr.Row():
-            with gr.Column(scale=1, min_width=320):
-                nn_query = gr.Textbox(label="Query word", value="haus")
-                nn_source_lang = gr.Dropdown(
-                    label="Source language",
-                    choices=DEFAULT_LANGUAGES,
-                    value="de",
                 )
-                nn_langs = gr.CheckboxGroup(
-                    label="Neighbor languages",
-                    choices=DEFAULT_LANGUAGES,
-                    value=DEFAULT_LANGUAGES,
-                )
-                nn_top_n = gr.Slider(1, 50, value=20, step=1, label="Top N per language")
-                nn_min_score = gr.Slider(-2.0, 2.0, value=-2.0, step=0.01, label="Min score")
-                nn_score_method = gr.Radio(["csls", "cosine"], value="cosine", label="Score method")
-                nn_csls_k = gr.Slider(1, 50, value=10, step=1, label="CSLS K")
-                nn_include_source = gr.Checkbox(value=True, label="Include source language")
-                nn_use_surface = gr.Checkbox(value=True, label="Show surface forms")
-                nn_btn = gr.Button("Find neighbors", variant="primary")
-            with gr.Column(scale=2):
-                nn_summary = gr.Markdown()
-                nn_results = gr.Dataframe(
-                    headers=NEIGHBOR_COLUMNS,
-                    datatype=["str", "str", "str", "number", "number", "number", "number"],
-                    interactive=False,
-                    wrap=True,
-                )
-    with gr.Tab("Browse Vocabulary"):
-        with gr.Row():
-            vocab_lang = gr.Dropdown(label="Language", choices=DEFAULT_LANGUAGES, value="de")
-            vocab_filter = gr.Textbox(label="Filter", placeholder="Type part of a token or surface form")
-            vocab_limit = gr.Slider(10, 500, value=100, step=10, label="Limit")
-        vocab_results = gr.Dataframe(
-            headers=VOCAB_COLUMNS,
-            datatype=["number", "str", "str", "str", "str"],
-            interactive=False,
-            wrap=True,
-        )
-    with gr.Tab("Artifact Info"):
-        artifact_info = gr.Markdown("Loading config...")
-    translate_inputs = [
         query,
         source_lang,
-        target_langs,
         top_k,
         min_score,
         csls_k,
@@ -949,90 +698,22 @@ with gr.Blocks(title="Multilingual Static Word Embeddings", css=css) as demo:
         filter_bad_tokens,
         use_surface,
     ]
-    translate_btn.click(
-        translate_ui,
-        inputs=translate_inputs,
-        outputs=[translation_results, translate_summary],
-    )
-    query.submit(
-        translate_ui,
-        inputs=translate_inputs,
-        outputs=[translation_results, translate_summary],
-    )
-    source_lang.change(update_targets, inputs=source_lang, outputs=target_langs)
-    nn_btn.click(
-        nearest_ui,
-        inputs=[
-            nn_query,
-            nn_source_lang,
-            nn_langs,
-            nn_top_n,
-            nn_min_score,
-            nn_csls_k,
-            nn_score_method,
-            nn_include_source,
-            nn_use_surface,
-        ],
-        outputs=[nn_results, nn_summary],
-    )
-    nn_query.submit(
-        nearest_ui,
-        inputs=[
-            nn_query,
-            nn_source_lang,
-            nn_langs,
-            nn_top_n,
-            nn_min_score,
-            nn_csls_k,
-            nn_score_method,
-            nn_include_source,
-            nn_use_surface,
-        ],
-        outputs=[nn_results, nn_summary],
-    )
-    nn_source_lang.change(
-        update_neighbor_langs,
-        inputs=[nn_source_lang, nn_include_source],
-        outputs=nn_langs,
-    )
-    nn_include_source.change(
-        update_neighbor_langs,
-        inputs=[nn_source_lang, nn_include_source],
-        outputs=nn_langs,
-    )
-    vocab_lang.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
-    vocab_filter.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
-    vocab_limit.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
     demo.load(
-        initialize_ui,
         outputs=[
-            status_md,
             source_lang,
-            target_langs,
             top_k,
             min_score,
             csls_k,
             candidate_retrieval_k,
             csls_prefetch_k,
             bidirectional,
-            nn_source_lang,
-            nn_langs,
-            nn_csls_k,
-            vocab_lang,
-            artifact_info,
         ],
-    ).then(
-        translate_ui,
-        inputs=translate_inputs,
-        outputs=[translation_results, translate_summary],
-    ).then(
-        browse_ui,
-        inputs=[vocab_lang, vocab_filter, vocab_limit],
-        outputs=vocab_results,
-    )
 if __name__ == "__main__":

 from __future__ import annotations
 import gc
 import json
 import os
 import boto3
 import gradio as gr
 import numpy as np
 from botocore.config import Config
     "s3://131-component-staging/"
     "multilingual-static-word-embeddings/stage-6/"
 )
+DEFAULT_ARTIFACT_URI = (
+    DEFAULT_ARTIFACT_PREFIX + "multilingual_space_20260521_133953.json"
+)
+DEFAULT_LOCAL_SPACE = Path("multilingual_space_20260521_133953.json")
+DEFAULT_LANGS = ["de", "en", "fr", "lb"]
 REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
+CACHE_DIR = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
 @dataclass
 def parse_s3_uri(uri: str) -> tuple[str, str]:
     parsed = urlparse(uri)
     if parsed.scheme != "s3" or not parsed.netloc:
+        raise ValueError(f"Expected s3://bucket/key URI, got {uri!r}")
     return parsed.netloc, parsed.path.lstrip("/")
     secret_key = os.getenv("SE_SECRET_KEY") or os.getenv("AWS_SECRET_ACCESS_KEY")
     endpoint_url = os.getenv("SE_HOST_URL") or os.getenv("AWS_ENDPOINT_URL")
     region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
     if endpoint_url and not endpoint_url.startswith(("http://", "https://")):
         endpoint_url = f"https://{endpoint_url}"
         "config": Config(
             signature_version="s3v4",
             s3={"addressing_style": "path"},
+            retries={"max_attempts": 3, "mode": "standard"},
         ),
     }
     if endpoint_url:
     return boto3.client(**kwargs)
+def latest_artifact_uri(client) -> str:
+    explicit = os.getenv("SPACE_ARTIFACT_S3_URI", "").strip().rstrip("/")
+    if explicit:
+        return explicit
+    prefix_override = os.getenv("SPACE_ARTIFACT_S3_PREFIX", "").strip()
+    if not prefix_override:
+        return DEFAULT_ARTIFACT_URI
+    prefix_uri = prefix_override
+    bucket, prefix = parse_s3_uri(prefix_uri)
     prefix = prefix.rstrip("/") + "/"
     pattern = re.compile(r"(.*multilingual_space_(\d{8}_\d{6})\.json)/config\.json$")
     candidates: list[tuple[str, str]] = []
     paginator = client.get_paginator("list_objects_v2")
     for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
         for obj in page.get("Contents", []):
+            match = pattern.match(obj["Key"])
             if match:
                 candidates.append((match.group(2), match.group(1)))
     if not candidates:
         raise FileNotFoundError(
+            f"No multilingual_space_*.json/config.json found under {prefix_uri}"
         )
+    _, key = sorted(candidates)[-1]
+    return f"s3://{bucket}/{key}"
+def local_cache_for_uri(uri: str) -> Path:
+    _, key = parse_s3_uri(uri)
+    return CACHE_DIR / Path(key.rstrip("/")).name
+def download_space_from_s3() -> tuple[Path, str]:
     client = make_s3_client()
+    uri = latest_artifact_uri(client)
+    local_dir = local_cache_for_uri(uri)
     local_dir.mkdir(parents=True, exist_ok=True)
+    bucket, prefix = parse_s3_uri(uri)
+    prefix = prefix.rstrip("/")
     for filename in REQUIRED_FILES:
+        dst = local_dir / filename
+        if dst.exists() and dst.stat().st_size > 0:
             continue
+        key = f"{prefix}/{filename}"
+        print(f"Downloading s3://{bucket}/{key}", file=sys.stderr)
+        client.download_file(bucket, key, str(dst))
+    return local_dir, uri
+def find_space_dir() -> tuple[Path, str]:
+    local_override = os.getenv("SPACE_DIR", "").strip()
+    if local_override:
+        path = Path(local_override)
+        if path.exists():
+            return path, str(path)
+    if DEFAULT_LOCAL_SPACE.exists():
+        return DEFAULT_LOCAL_SPACE, str(DEFAULT_LOCAL_SPACE)
+    local_candidates = sorted(Path(".").glob("multilingual_space_*.json"))
+    if local_candidates:
+        return local_candidates[-1], str(local_candidates[-1])
+    return download_space_from_s3()
 def strip_diacritics(text: str) -> str:
 def read_config(space_dir: Path) -> dict[str, Any]:
+    path = space_dir / "config.json"
+    if not path.exists():
+        raise FileNotFoundError(f"Missing config.json in {space_dir}")
+    with path.open("r", encoding="utf-8") as f:
         return json.load(f)
 def read_metadata(space_dir: Path) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
+    path = space_dir / "all_metadata.jsonl"
+    if not path.exists():
+        raise FileNotFoundError(f"Missing all_metadata.jsonl in {space_dir}")
     metadata: list[dict[str, Any] | None] = []
     ids_by_lang: dict[str, list[int]] = {}
+    with path.open("r", encoding="utf-8") as f:
         for line in f:
+            line = line.strip()
+            if not line:
                 continue
             meta = json.loads(line)
             row_id = int(meta["id"])
 def load_vectors_from_faiss(space_dir: Path, ids_by_lang: dict[str, list[int]]) -> dict[str, np.ndarray]:
     try:
         import faiss  # type: ignore
     except ImportError as exc:
+        raise RuntimeError("faiss-cpu is required to read aligned_all.faiss") from exc
+    faiss_path = space_dir / "aligned_all.faiss"
+    if not faiss_path.exists():
+        raise FileNotFoundError(f"Missing aligned_all.faiss in {space_dir}")
     print(f"Loading FAISS index: {faiss_path}", file=sys.stderr)
     index = faiss.read_index(str(faiss_path))
     vectors_by_lang: dict[str, np.ndarray] = {}
     for lang, ids in sorted(ids_by_lang.items()):
         print(f"Reconstructing {lang}: {len(ids)} vectors", file=sys.stderr)
         lang_lookup: dict[str, list[int]] = {}
         for global_id, meta in zip(data.ids.tolist(), data.metas):
             for value in (meta.get("token"), meta.get("surface")):
+                if value:
+                    lang_lookup.setdefault(lookup_key(str(value)), []).append(int(global_id))
         lookup[lang] = lang_lookup
     return lookup
 @lru_cache(maxsize=1)
 def load_space() -> Space:
+    space_dir, artifact_uri = find_space_dir()
     config = read_config(space_dir)
     metadata, ids_by_lang = read_metadata(space_dir)
     vectors_by_lang = load_vectors_from_faiss(space_dir, ids_by_lang)
 def default_options(config: dict[str, Any]) -> RuntimeOptions:
     bidi_config = config.get("bidirectional_consistency") or {}
+    top_k = int(config.get("top_k", 3))
     return RuntimeOptions(
+        top_k=top_k,
         min_score=float(config.get("min_score", 0.15)),
         csls_k=int(config.get("csls_k", 10)),
+        candidate_retrieval_k=int(config.get("candidate_retrieval_k", top_k * 3)),
         csls_prefetch_k=int(config.get("csls_prefetch_k", 50)),
         bidirectional=bool(bidi_config.get("enabled", True)),
         score_method="csls",
     return str(meta.get("token") or meta.get("surface") or "")
 def resolve_query(space: Space, lang: str, query: str) -> tuple[int, dict[str, Any], str]:
     if lang not in space.by_lang:
         raise ValueError(f"Unknown language {lang!r}. Available: {', '.join(space.languages)}")
+    if not query.strip():
         raise ValueError("Enter a query word.")
     matches = space.lookup.get(lang, {}).get(lookup_key(query), [])
     if not matches:
         raise LookupError(f"No exact token/surface match for {lang}:{query!r}")
     message = ""
     if len(matches) > 1:
+        preview = []
+        for row_id in matches[:5]:
+            meta = get_meta(space, int(row_id))
+            preview.append(f"{meta.get('surface') or meta.get('token')} (id {row_id})")
+        message = f"Matched {len(matches)} entries; using the first: {preview[0]}"
+    row_id = int(matches[0])
     return row_id, get_meta(space, row_id), message
+def translate_like_terminal(
     query: str,
     source_lang: str,
     top_k: int,
     min_score: float,
     csls_k: int,
     filter_stopwords: bool,
     filter_bad_tokens: bool,
     use_surface: bool,
+) -> tuple[str, list[list[Any]]]:
     try:
         space = load_space()
         opts = make_options(
             top_k,
             min_score,
         )
         source_id, source_meta, match_message = resolve_query(space, source_lang, query)
         source_vec = get_vec(space, source_id)
+        source_word = format_word(source_meta, opts)
+        target_langs = [lang for lang in space.languages if lang != source_lang]
+        lines = [
+            f"Query: {source_lang}:{source_word} "
+            f"(token={source_meta.get('token')}, id={source_id})",
+            f"Settings: score={opts.score_method}, top_k={opts.top_k}, "
+            f"min_score={opts.min_score}, csls_k={opts.csls_k}, "
+            f"candidate_retrieval_k={opts.candidate_retrieval_k}, "
+            f"bidirectional={opts.bidirectional}",
         ]
         if match_message:
+            lines.append(match_message)
+        rows: list[list[Any]] = []
+        for target_lang in target_langs:
             candidates = rank_candidates(space, source_vec, source_lang, target_lang, opts)
             kept: list[dict[str, Any]] = []
             for cand in candidates:
                 if opts.bidirectional:
                     reverse = rank_candidates(
                         continue
                 else:
                     cand["bidirectional"] = False
                 kept.append(cand)
                 if len(kept) >= opts.top_k:
                     break
+            lines.append("")
+            lines.append(f"{target_lang}:")
+            if not kept:
+                lines.append("  no candidates after filters")
                 continue
+            for i, cand in enumerate(kept, 1):
                 meta = cand["meta"]
+                word = format_word(meta, opts)
+                token = meta.get("token")
+                bidi = "yes" if cand["bidirectional"] else "no"
+                lines.append(
+                    f"  {i}. {word} "
+                    f"(token={token}, score={cand['score']:.4f}, "
+                    f"cosine={cand['cosine']:.4f}, bidi={bidi})"
+                )
                 rows.append(
+                    [
+                        target_lang,
+                        i,
+                        word,
+                        token,
+                        round(float(cand["score"]), 6),
+                        round(float(cand["cosine"]), 6),
+                        bidi,
+                    ]
                 )
+        return "\n".join(lines), rows
     except Exception as exc:
+        return f"Error: {exc}", []
+def initialize() -> tuple[Any, ...]:
     try:
         space = load_space()
         opts = default_options(space.config)
+        source_lang = space.config.get("pivot_lang", "de")
+        if source_lang not in space.languages:
+            source_lang = space.languages[0]
+        status = (
+            f"Loaded {space.artifact_uri} with "
+            f"{sum(len(item.metas) for item in space.by_lang.values()):,} vectors."
+        )
         return (
             status,
+            gr.update(choices=space.languages, value=source_lang),
             opts.top_k,
             opts.min_score,
             opts.csls_k,
             opts.candidate_retrieval_k,
             opts.csls_prefetch_k,
             opts.bidirectional,
         )
     except Exception as exc:
         return (
+            f"Load error: {exc}",
+            gr.update(choices=DEFAULT_LANGS, value="de"),
             3,
             0.15,
             10,
             9,
             50,
             True,
         )
+CSS = """
+body { background: #f7f5ef; }
+.gradio-container { max-width: 1120px !important; }
 .app-title h1 { margin-bottom: 0.15rem; }
+.status { color: #5f6b7a; font-size: 0.92rem; }
+textarea { font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; }
 """
+with gr.Blocks(title="Multilingual Dictionary Explorer", css=CSS) as demo:
     gr.Markdown(
+        "# Multilingual Dictionary Explorer\n"
+        "FAISS + CSLS translation lookup from the aligned multilingual space.",
+        elem_classes=["app-title"],
     )
+    status = gr.Markdown("Loading artifacts...", elem_classes=["status"])
+    with gr.Row():
+        with gr.Column(scale=1, min_width=320):
+            query = gr.Textbox(label="Query word", value="haus")
+            source_lang = gr.Dropdown(label="Language", choices=DEFAULT_LANGS, value="de")
+            search = gr.Button("Search", variant="primary")
+            with gr.Accordion("Parameters", open=False):
+                top_k = gr.Slider(1, 20, value=3, step=1, label="top_k")
+                min_score = gr.Slider(-2.0, 2.0, value=0.15, step=0.01, label="min_score")
+                csls_k = gr.Slider(1, 50, value=10, step=1, label="csls_k")
+                candidate_retrieval_k = gr.Slider(
+                    1, 100, value=9, step=1, label="candidate_retrieval_k"
                 )
+                csls_prefetch_k = gr.Slider(
+                    10, 500, value=50, step=1, label="csls_prefetch_k"
                 )
+                score_method = gr.Radio(["csls", "cosine"], value="csls", label="score")
+                bidirectional = gr.Checkbox(value=True, label="bidirectional_consistency")
+                filter_stopwords = gr.Checkbox(value=True, label="filter stopwords")
+                filter_bad_tokens = gr.Checkbox(value=True, label="filter bad tokens")
+                use_surface = gr.Checkbox(value=True, label="show surface forms")
+        with gr.Column(scale=2):
+            output_text = gr.Textbox(label="Terminal-style output", lines=18)
+            output_table = gr.Dataframe(
+                headers=["target_lang", "rank", "word", "token", "score", "cosine", "bidi"],
+                datatype=["str", "number", "str", "str", "number", "number", "str"],
+                interactive=False,
+                wrap=True,
+            )
+    inputs = [
         query,
         source_lang,
         top_k,
         min_score,
         csls_k,
         filter_bad_tokens,
         use_surface,
     ]
+    search.click(translate_like_terminal, inputs=inputs, outputs=[output_text, output_table])
+    query.submit(translate_like_terminal, inputs=inputs, outputs=[output_text, output_table])
     demo.load(
+        initialize,
         outputs=[
+            status,
             source_lang,
             top_k,
             min_score,
             csls_k,
             candidate_retrieval_k,
             csls_prefetch_k,
             bidirectional,
         ],
+    ).then(translate_like_terminal, inputs=inputs, outputs=[output_text, output_table])
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
 gradio
 faiss-cpu
 numpy
-pandas
 boto3
 botocore

 gradio
 faiss-cpu
 numpy
 boto3
 botocore