Spaces:

impresso-project
/

multilingual-static-word-embeddings-demo

Sleeping

App Files Files Community

Maslionok commited on May 26

Commit

09a837f

1 Parent(s): 245b51c

new app

Browse files

Files changed (4) hide show

.gitignore +8 -0
README.md +41 -0
app.py +1039 -0
requirements.txt +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+__pycache__/
+*.py[cod]
+.env
+.venv/
+multilingual_space_*.json/
+*.faiss
+*.vec
+*.jsonl

README.md ADDED Viewed

	@@ -0,0 +1,41 @@

+---
+title: Multilingual Static Word Embeddings Demo
+sdk: gradio
+app_file: app.py
+pinned: false
+---
+# Multilingual Static Word Embeddings Demo
+This Space loads a saved aligned multilingual embedding space and lets users
+search translations and nearest neighbors with adjustable retrieval parameters.
+Required artifact files:
+- `aligned_all.faiss`
+- `all_metadata.jsonl`
+- `config.json`
+The app does not use `aligned_all.vec`.
+## Runtime configuration
+By default, the app lists the newest artifact folder under:
+`s3://131-component-staging/multilingual-static-word-embeddings/stage-6/`
+Set these Hugging Face Space secrets for S3-compatible storage:
+- `SE_ACCESS_KEY`
+- `SE_SECRET_KEY`
+- `SE_HOST_URL`
+Optional environment overrides:
+- `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
+  `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
+- `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_space_*.json`
+- `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
+Defaults for `top_k`, `min_score`, `csls_k`, `candidate_retrieval_k`,
+`csls_prefetch_k`, and bidirectional consistency are read from `config.json`.

app.py ADDED Viewed

	@@ -0,0 +1,1039 @@

+from __future__ import annotations
+import difflib
+import gc
+import json
+import os
+import re
+import sys
+import unicodedata
+from dataclasses import dataclass
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse
+import boto3
+import gradio as gr
+import numpy as np
+import pandas as pd
+from botocore.config import Config
+DEFAULT_ARTIFACT_PREFIX = (
+    "s3://131-component-staging/"
+    "multilingual-static-word-embeddings/stage-6/"
+)
+ARTIFACT_URI_ENV = "SPACE_ARTIFACT_S3_URI"
+ARTIFACT_PREFIX_ENV = "SPACE_ARTIFACT_S3_PREFIX"
+CACHE_ROOT = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
+REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
+DEFAULT_LANGUAGES = ["de", "en", "fr", "lb"]
+TRANSLATION_COLUMNS = [
+    "target_lang",
+    "translation",
+    "token",
+    "score",
+    "cosine",
+    "rank",
+    "bidirectional",
+    "id",
+    "source_vec_file",
+]
+NEIGHBOR_COLUMNS = [
+    "lang",
+    "word",
+    "token",
+    "score",
+    "cosine",
+    "rank",
+    "id",
+]
+VOCAB_COLUMNS = ["id", "lang", "surface", "token", "source_vec_file"]
+@dataclass
+class LangVectors:
+    lang: str
+    ids: np.ndarray
+    metas: list[dict[str, Any]]
+    vecs: np.ndarray
+@dataclass
+class RuntimeOptions:
+    top_k: int
+    min_score: float
+    csls_k: int
+    candidate_retrieval_k: int
+    csls_prefetch_k: int
+    bidirectional: bool
+    score_method: str
+    filter_stopwords: bool
+    filter_bad_tokens: bool
+    use_surface: bool
+@dataclass
+class Space:
+    root: Path
+    artifact_uri: str
+    config: dict[str, Any]
+    languages: list[str]
+    by_lang: dict[str, LangVectors]
+    lookup: dict[str, dict[str, list[int]]]
+    id_to_location: dict[int, tuple[str, int]]
+def parse_s3_uri(uri: str) -> tuple[str, str]:
+    parsed = urlparse(uri)
+    if parsed.scheme != "s3" or not parsed.netloc:
+        raise ValueError(f"Expected s3://bucket/key URI, got: {uri}")
+    return parsed.netloc, parsed.path.lstrip("/")
+def make_s3_client():
+    access_key = os.getenv("SE_ACCESS_KEY") or os.getenv("AWS_ACCESS_KEY_ID")
+    secret_key = os.getenv("SE_SECRET_KEY") or os.getenv("AWS_SECRET_ACCESS_KEY")
+    endpoint_url = os.getenv("SE_HOST_URL") or os.getenv("AWS_ENDPOINT_URL")
+    region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
+    if endpoint_url and not endpoint_url.startswith(("http://", "https://")):
+        endpoint_url = f"https://{endpoint_url}"
+    kwargs: dict[str, Any] = {
+        "service_name": "s3",
+        "region_name": region,
+        "config": Config(
+            signature_version="s3v4",
+            s3={"addressing_style": "path"},
+            retries={"max_attempts": 5, "mode": "standard"},
+        ),
+    }
+    if endpoint_url:
+        kwargs["endpoint_url"] = endpoint_url
+    if access_key and secret_key:
+        kwargs["aws_access_key_id"] = access_key
+        kwargs["aws_secret_access_key"] = secret_key
+    return boto3.client(**kwargs)
+def find_latest_artifact_uri(client) -> str:
+    explicit_uri = os.getenv(ARTIFACT_URI_ENV, "").strip()
+    if explicit_uri:
+        explicit_uri = explicit_uri.rstrip("/")
+        if "multilingual_space_" in explicit_uri:
+            return explicit_uri
+        bucket, prefix = parse_s3_uri(explicit_uri)
+        return find_latest_artifact_uri_under_prefix(client, bucket, prefix)
+    prefix_uri = os.getenv(ARTIFACT_PREFIX_ENV, DEFAULT_ARTIFACT_PREFIX).strip()
+    bucket, prefix = parse_s3_uri(prefix_uri)
+    return find_latest_artifact_uri_under_prefix(client, bucket, prefix)
+def find_latest_artifact_uri_under_prefix(client, bucket: str, prefix: str) -> str:
+    prefix = prefix.rstrip("/") + "/"
+    pattern = re.compile(r"(.*multilingual_space_(\d{8}_\d{6})\.json)/config\.json$")
+    candidates: list[tuple[str, str]] = []
+    paginator = client.get_paginator("list_objects_v2")
+    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
+        for obj in page.get("Contents", []):
+            key = obj["Key"]
+            match = pattern.match(key)
+            if match:
+                candidates.append((match.group(2), match.group(1)))
+    if not candidates:
+        raise FileNotFoundError(
+            f"No multilingual_space_*.json/config.json found under s3://{bucket}/{prefix}"
+        )
+    _, latest_key = sorted(candidates)[-1]
+    return f"s3://{bucket}/{latest_key}"
+def artifact_cache_dir(artifact_uri: str) -> Path:
+    _, key = parse_s3_uri(artifact_uri)
+    name = Path(key.rstrip("/")).name
+    return CACHE_ROOT / name
+def download_artifact() -> tuple[Path, str]:
+    client = make_s3_client()
+    artifact_uri = find_latest_artifact_uri(client)
+    local_dir = artifact_cache_dir(artifact_uri)
+    local_dir.mkdir(parents=True, exist_ok=True)
+    bucket, key_prefix = parse_s3_uri(artifact_uri)
+    key_prefix = key_prefix.rstrip("/")
+    for filename in REQUIRED_FILES:
+        local_path = local_dir / filename
+        if local_path.exists() and local_path.stat().st_size > 0:
+            continue
+        key = f"{key_prefix}/{filename}"
+        print(f"Downloading s3://{bucket}/{key} -> {local_path}", file=sys.stderr)
+        client.download_file(bucket, key, str(local_path))
+    return local_dir, artifact_uri
+def strip_diacritics(text: str) -> str:
+    return "".join(
+        ch for ch in unicodedata.normalize("NFKD", text) if not unicodedata.combining(ch)
+    )
+def lookup_key(text: str) -> str:
+    text = " ".join(text.strip().casefold().split())
+    return strip_diacritics(text)
+def is_good_token(token: str, min_len: int = 4) -> bool:
+    if not token or len(token) < min_len or token.isdigit():
+        return False
+    alpha = sum(ch.isalpha() for ch in token)
+    if alpha < 2:
+        return False
+    return all(ch.isalnum() or ch in "-'_" for ch in token)
+def read_config(space_dir: Path) -> dict[str, Any]:
+    with (space_dir / "config.json").open("r", encoding="utf-8") as f:
+        return json.load(f)
+def read_metadata(space_dir: Path) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
+    metadata_path = space_dir / "all_metadata.jsonl"
+    metadata: list[dict[str, Any] | None] = []
+    ids_by_lang: dict[str, list[int]] = {}
+    with metadata_path.open("r", encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            meta = json.loads(line)
+            row_id = int(meta["id"])
+            while len(metadata) <= row_id:
+                metadata.append(None)
+            metadata[row_id] = meta
+            ids_by_lang.setdefault(str(meta["lang"]), []).append(row_id)
+    missing = [i for i, meta in enumerate(metadata) if meta is None]
+    if missing:
+        raise ValueError(f"Metadata ids are not contiguous; first missing id is {missing[0]}")
+    return [m for m in metadata if m is not None], ids_by_lang
+def reconstruct_range(index: Any, start: int, count: int) -> np.ndarray:
+    try:
+        vecs = index.reconstruct_n(start, count)
+    except TypeError:
+        vecs = np.empty((count, index.d), dtype=np.float32)
+        index.reconstruct_n(start, count, vecs)
+    return np.ascontiguousarray(vecs, dtype=np.float32)
+def reconstruct_ids(index: Any, ids: list[int]) -> np.ndarray:
+    if not ids:
+        return np.empty((0, index.d), dtype=np.float32)
+    start = ids[0]
+    if ids == list(range(start, start + len(ids))):
+        return reconstruct_range(index, start, len(ids))
+    vecs = np.empty((len(ids), index.d), dtype=np.float32)
+    for local_i, row_id in enumerate(ids):
+        try:
+            vecs[local_i] = index.reconstruct(int(row_id))
+        except TypeError:
+            index.reconstruct(int(row_id), vecs[local_i])
+    return np.ascontiguousarray(vecs, dtype=np.float32)
+def normalize_rows(vecs: np.ndarray) -> np.ndarray:
+    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
+    return (vecs / (norms + 1e-12)).astype(np.float32, copy=False)
+def load_vectors_from_faiss(space_dir: Path, ids_by_lang: dict[str, list[int]]) -> dict[str, np.ndarray]:
+    faiss_path = space_dir / "aligned_all.faiss"
+    try:
+        import faiss  # type: ignore
+    except ImportError as exc:
+        raise RuntimeError(
+            "faiss-cpu is required. The Space must install faiss-cpu from requirements.txt."
+        ) from exc
+    print(f"Loading FAISS index: {faiss_path}", file=sys.stderr)
+    index = faiss.read_index(str(faiss_path))
+    vectors_by_lang: dict[str, np.ndarray] = {}
+    for lang, ids in sorted(ids_by_lang.items()):
+        print(f"Reconstructing {lang}: {len(ids)} vectors", file=sys.stderr)
+        vectors_by_lang[lang] = normalize_rows(reconstruct_ids(index, ids))
+    del index
+    gc.collect()
+    return vectors_by_lang
+def build_lookup(languages: dict[str, LangVectors]) -> dict[str, dict[str, list[int]]]:
+    lookup: dict[str, dict[str, list[int]]] = {}
+    for lang, data in languages.items():
+        lang_lookup: dict[str, list[int]] = {}
+        for global_id, meta in zip(data.ids.tolist(), data.metas):
+            for value in (meta.get("token"), meta.get("surface")):
+                if not value:
+                    continue
+                lang_lookup.setdefault(lookup_key(str(value)), []).append(int(global_id))
+        lookup[lang] = lang_lookup
+    return lookup
+@lru_cache(maxsize=1)
+def load_space() -> Space:
+    space_dir, artifact_uri = download_artifact()
+    config = read_config(space_dir)
+    metadata, ids_by_lang = read_metadata(space_dir)
+    vectors_by_lang = load_vectors_from_faiss(space_dir, ids_by_lang)
+    by_lang: dict[str, LangVectors] = {}
+    id_to_location: dict[int, tuple[str, int]] = {}
+    languages = list(config.get("languages") or sorted(ids_by_lang))
+    for lang in languages:
+        ids = ids_by_lang.get(lang)
+        if not ids:
+            continue
+        metas = [metadata[row_id] for row_id in ids]
+        vecs = vectors_by_lang[lang]
+        by_lang[lang] = LangVectors(
+            lang=lang,
+            ids=np.asarray(ids, dtype=np.int64),
+            metas=metas,
+            vecs=vecs,
+        )
+        for local_i, row_id in enumerate(ids):
+            id_to_location[int(row_id)] = (lang, local_i)
+    languages = [lang for lang in languages if lang in by_lang]
+    return Space(
+        root=space_dir,
+        artifact_uri=artifact_uri,
+        config=config,
+        languages=languages,
+        by_lang=by_lang,
+        lookup=build_lookup(by_lang),
+        id_to_location=id_to_location,
+    )
+def default_options(config: dict[str, Any]) -> RuntimeOptions:
+    bidi_config = config.get("bidirectional_consistency") or {}
+    return RuntimeOptions(
+        top_k=int(config.get("top_k", 3)),
+        min_score=float(config.get("min_score", 0.15)),
+        csls_k=int(config.get("csls_k", 10)),
+        candidate_retrieval_k=int(config.get("candidate_retrieval_k", 9)),
+        csls_prefetch_k=int(config.get("csls_prefetch_k", 50)),
+        bidirectional=bool(bidi_config.get("enabled", True)),
+        score_method="csls",
+        filter_stopwords=True,
+        filter_bad_tokens=True,
+        use_surface=True,
+    )
+def make_options(
+    top_k: int,
+    min_score: float,
+    csls_k: int,
+    candidate_retrieval_k: int,
+    csls_prefetch_k: int,
+    bidirectional: bool,
+    score_method: str,
+    filter_stopwords: bool,
+    filter_bad_tokens: bool,
+    use_surface: bool,
+) -> RuntimeOptions:
+    return RuntimeOptions(
+        top_k=int(top_k),
+        min_score=float(min_score),
+        csls_k=int(csls_k),
+        candidate_retrieval_k=int(candidate_retrieval_k),
+        csls_prefetch_k=int(csls_prefetch_k),
+        bidirectional=bool(bidirectional),
+        score_method=str(score_method).lower(),
+        filter_stopwords=bool(filter_stopwords),
+        filter_bad_tokens=bool(filter_bad_tokens),
+        use_surface=bool(use_surface),
+    )
+def top_indices(values: np.ndarray, k: int) -> np.ndarray:
+    k = min(max(0, k), values.shape[0])
+    if k == 0:
+        return np.empty((0,), dtype=np.int64)
+    if k >= values.shape[0]:
+        return np.argsort(-values)
+    idx = np.argpartition(-values, k - 1)[:k]
+    return idx[np.argsort(-values[idx])]
+def top_mean(values: np.ndarray, k: int) -> float:
+    k = min(max(1, k), values.shape[0])
+    idx = top_indices(values, k)
+    return float(values[idx].mean())
+def candidate_allowed(meta: dict[str, Any], lang: str, space: Space, opts: RuntimeOptions) -> bool:
+    token = str(meta.get("token") or "")
+    if opts.filter_bad_tokens:
+        min_len = int((space.config.get("filters") or {}).get("target_is_good_token_min_len", 4))
+        if not is_good_token(token, min_len):
+            return False
+    if opts.filter_stopwords:
+        stopwords = set((space.config.get("stopwords") or {}).get(lang, []))
+        if token.lower() in stopwords:
+            return False
+    return True
+def rank_candidates(
+    space: Space,
+    query_vec: np.ndarray,
+    source_lang: str,
+    target_lang: str,
+    opts: RuntimeOptions,
+    *,
+    apply_filters: bool = True,
+) -> list[dict[str, Any]]:
+    source = space.by_lang[source_lang]
+    target = space.by_lang[target_lang]
+    cosine_all = target.vecs @ query_vec
+    prefetch_k = max(opts.candidate_retrieval_k, opts.csls_prefetch_k, opts.top_k)
+    prefetch_ids = top_indices(cosine_all, min(prefetch_k, len(target.metas)))
+    candidate_cosines = cosine_all[prefetch_ids]
+    if opts.score_method == "csls":
+        r_query = top_mean(cosine_all, opts.csls_k)
+        candidate_vecs = target.vecs[prefetch_ids]
+        reverse_sims = candidate_vecs @ source.vecs.T
+        r_targets = np.asarray(
+            [top_mean(reverse_sims[i], opts.csls_k) for i in range(reverse_sims.shape[0])],
+            dtype=np.float32,
+        )
+        scores = (2.0 * candidate_cosines - r_query - r_targets).astype(np.float32)
+    else:
+        scores = candidate_cosines.astype(np.float32)
+    order = np.argsort(-scores)[: opts.candidate_retrieval_k]
+    results: list[dict[str, Any]] = []
+    seen_surfaces: set[str] = set()
+    dedupe_surfaces = bool(
+        (space.config.get("filters") or {}).get("duplicate_target_surfaces_removed", True)
+    )
+    for rank, pos in enumerate(order, 1):
+        local_id = int(prefetch_ids[pos])
+        meta = target.metas[local_id]
+        score = float(scores[pos])
+        if score < opts.min_score:
+            continue
+        if apply_filters and not candidate_allowed(meta, target_lang, space, opts):
+            continue
+        surface = str(meta.get("surface") or meta.get("token") or "")
+        if dedupe_surfaces and surface in seen_surfaces:
+            continue
+        seen_surfaces.add(surface)
+        results.append(
+            {
+                "rank": rank,
+                "global_id": int(target.ids[local_id]),
+                "local_id": local_id,
+                "meta": meta,
+                "score": score,
+                "cosine": float(candidate_cosines[pos]),
+                "bidirectional": None,
+            }
+        )
+    return results
+def get_meta(space: Space, global_id: int) -> dict[str, Any]:
+    lang, local_id = space.id_to_location[int(global_id)]
+    return space.by_lang[lang].metas[local_id]
+def get_vec(space: Space, global_id: int) -> np.ndarray:
+    lang, local_id = space.id_to_location[int(global_id)]
+    return space.by_lang[lang].vecs[local_id]
+def format_word(meta: dict[str, Any], opts: RuntimeOptions) -> str:
+    if opts.use_surface:
+        return str(meta.get("surface") or meta.get("token") or "")
+    return str(meta.get("token") or meta.get("surface") or "")
+def suggestions(space: Space, lang: str, query: str, limit: int = 8) -> list[str]:
+    lang_lookup = space.lookup.get(lang, {})
+    key = lookup_key(query)
+    close_keys = difflib.get_close_matches(key, lang_lookup.keys(), n=limit, cutoff=0.72)
+    labels = []
+    for close_key in close_keys:
+        row_id = lang_lookup[close_key][0]
+        meta = get_meta(space, row_id)
+        label = str(meta.get("surface") or meta.get("token") or "")
+        if label and label not in labels:
+            labels.append(label)
+    return labels
+def resolve_query(space: Space, lang: str, query: str) -> tuple[int, dict[str, Any], str]:
+    if lang not in space.by_lang:
+        raise ValueError(f"Unknown language {lang!r}. Available: {', '.join(space.languages)}")
+    query = query.strip()
+    if not query:
+        raise ValueError("Enter a query word.")
+    matches = space.lookup.get(lang, {}).get(lookup_key(query), [])
+    if not matches:
+        hint = suggestions(space, lang, query)
+        if hint:
+            raise LookupError(f"No exact match. Close matches: {', '.join(hint)}")
+        raise LookupError(f"No exact token/surface match for {lang}:{query!r}")
+    row_id = int(matches[0])
+    message = ""
+    if len(matches) > 1:
+        shown = []
+        for match_id in matches[:5]:
+            meta = get_meta(space, match_id)
+            shown.append(f"{meta.get('surface') or meta.get('token')} (id {match_id})")
+        message = f"Matched {len(matches)} entries; using {shown[0]}."
+    return row_id, get_meta(space, row_id), message
+def translation_dataframe() -> pd.DataFrame:
+    return pd.DataFrame(columns=TRANSLATION_COLUMNS)
+def neighbor_dataframe() -> pd.DataFrame:
+    return pd.DataFrame(columns=NEIGHBOR_COLUMNS)
+def vocabulary_dataframe() -> pd.DataFrame:
+    return pd.DataFrame(columns=VOCAB_COLUMNS)
+def translate_ui(
+    query: str,
+    source_lang: str,
+    target_langs: list[str] | None,
+    top_k: int,
+    min_score: float,
+    csls_k: int,
+    candidate_retrieval_k: int,
+    csls_prefetch_k: int,
+    bidirectional: bool,
+    score_method: str,
+    filter_stopwords: bool,
+    filter_bad_tokens: bool,
+    use_surface: bool,
+) -> tuple[pd.DataFrame, str]:
+    try:
+        space = load_space()
+        targets = target_langs or [lang for lang in space.languages if lang != source_lang]
+        opts = make_options(
+            top_k,
+            min_score,
+            csls_k,
+            candidate_retrieval_k,
+            csls_prefetch_k,
+            bidirectional,
+            score_method,
+            filter_stopwords,
+            filter_bad_tokens,
+            use_surface,
+        )
+        source_id, source_meta, match_message = resolve_query(space, source_lang, query)
+        source_vec = get_vec(space, source_id)
+        rows: list[dict[str, Any]] = []
+        grouped: list[str] = [
+            f"Source: `{source_lang}:{format_word(source_meta, opts)}` "
+            f"(token `{source_meta.get('token')}`, id `{source_id}`)"
+        ]
+        if match_message:
+            grouped.append(match_message)
+        for target_lang in targets:
+            if target_lang == source_lang or target_lang not in space.by_lang:
+                continue
+            candidates = rank_candidates(space, source_vec, source_lang, target_lang, opts)
+            kept: list[dict[str, Any]] = []
+            for cand in candidates:
+                if opts.bidirectional:
+                    reverse = rank_candidates(
+                        space,
+                        get_vec(space, int(cand["global_id"])),
+                        target_lang,
+                        source_lang,
+                        opts,
+                    )
+                    reverse_ids = {int(item["global_id"]) for item in reverse}
+                    cand["bidirectional"] = source_id in reverse_ids
+                    if not cand["bidirectional"]:
+                        continue
+                else:
+                    cand["bidirectional"] = False
+                kept.append(cand)
+                if len(kept) >= opts.top_k:
+                    break
+            if kept:
+                grouped.append(f"\n{target_lang}:")
+                for i, cand in enumerate(kept, 1):
+                    meta = cand["meta"]
+                    word = format_word(meta, opts)
+                    grouped.append(f"{i}. {word} ({cand['score']:.4f})")
+                    rows.append(
+                        {
+                            "target_lang": target_lang,
+                            "translation": word,
+                            "token": meta.get("token"),
+                            "score": round(float(cand["score"]), 6),
+                            "cosine": round(float(cand["cosine"]), 6),
+                            "rank": int(cand["rank"]),
+                            "bidirectional": bool(cand["bidirectional"]),
+                            "id": int(cand["global_id"]),
+                            "source_vec_file": meta.get("source_vec_file"),
+                        }
+                    )
+            else:
+                grouped.append(f"\n{target_lang}: no candidates after filters")
+        return pd.DataFrame(rows, columns=TRANSLATION_COLUMNS), "\n".join(grouped)
+    except Exception as exc:
+        return translation_dataframe(), f"Error: {exc}"
+def nearest_ui(
+    query: str,
+    source_lang: str,
+    neighbor_langs: list[str] | None,
+    top_n: int,
+    min_score: float,
+    csls_k: int,
+    score_method: str,
+    include_source_language: bool,
+    use_surface: bool,
+) -> tuple[pd.DataFrame, str]:
+    try:
+        space = load_space()
+        opts = make_options(
+            top_n,
+            min_score,
+            csls_k,
+            max(top_n + 5, 20),
+            max(top_n + 5, 50),
+            False,
+            score_method,
+            False,
+            False,
+            use_surface,
+        )
+        source_id, source_meta, match_message = resolve_query(space, source_lang, query)
+        source_vec = get_vec(space, source_id)
+        targets = neighbor_langs or space.languages
+        if not include_source_language:
+            targets = [lang for lang in targets if lang != source_lang]
+        rows: list[dict[str, Any]] = []
+        for target_lang in targets:
+            if target_lang not in space.by_lang:
+                continue
+            candidates = rank_candidates(
+                space,
+                source_vec,
+                source_lang,
+                target_lang,
+                opts,
+                apply_filters=False,
+            )
+            for cand in candidates:
+                if int(cand["global_id"]) == source_id:
+                    continue
+                meta = cand["meta"]
+                rows.append(
+                    {
+                        "lang": target_lang,
+                        "word": format_word(meta, opts),
+                        "token": meta.get("token"),
+                        "score": round(float(cand["score"]), 6),
+                        "cosine": round(float(cand["cosine"]), 6),
+                        "rank": int(cand["rank"]),
+                        "id": int(cand["global_id"]),
+                    }
+                )
+                if len([row for row in rows if row["lang"] == target_lang]) >= top_n:
+                    break
+        rows = sorted(rows, key=lambda row: row["score"], reverse=True)
+        status = (
+            f"Source: `{source_lang}:{format_word(source_meta, opts)}` "
+            f"(token `{source_meta.get('token')}`, id `{source_id}`)"
+        )
+        if match_message:
+            status += f"\n\n{match_message}"
+        return pd.DataFrame(rows, columns=NEIGHBOR_COLUMNS), status
+    except Exception as exc:
+        return neighbor_dataframe(), f"Error: {exc}"
+def browse_ui(lang: str, filter_text: str, limit: int) -> pd.DataFrame:
+    try:
+        space = load_space()
+        if lang not in space.by_lang:
+            return vocabulary_dataframe()
+        needle = lookup_key(filter_text or "")
+        rows = []
+        for row_id, meta in zip(space.by_lang[lang].ids.tolist(), space.by_lang[lang].metas):
+            surface = str(meta.get("surface") or "")
+            token = str(meta.get("token") or "")
+            if needle and needle not in lookup_key(surface) and needle not in lookup_key(token):
+                continue
+            rows.append(
+                {
+                    "id": int(row_id),
+                    "lang": lang,
+                    "surface": surface,
+                    "token": token,
+                    "source_vec_file": meta.get("source_vec_file"),
+                }
+            )
+            if len(rows) >= int(limit):
+                break
+        return pd.DataFrame(rows, columns=VOCAB_COLUMNS)
+    except Exception:
+        return vocabulary_dataframe()
+def config_markdown(space: Space) -> str:
+    config = space.config
+    vocab_sizes = config.get("vocab_sizes") or {
+        lang: len(space.by_lang[lang].metas) for lang in space.languages
+    }
+    bidi = config.get("bidirectional_consistency") or {}
+    lines = [
+        f"Artifact: `{space.artifact_uri}`",
+        f"Created: `{config.get('created_at', 'unknown')}`",
+        f"Languages: `{', '.join(space.languages)}`",
+        f"Pivot language: `{config.get('pivot_lang', 'unknown')}`",
+        f"Vector dim: `{config.get('vector_dim', 'unknown')}`",
+        f"Top N vocab: `{config.get('top_n_vocab', 'unknown')}`",
+        f"Output top: `{config.get('out_top', 'unknown')}`",
+        f"Default top_k: `{config.get('top_k', 3)}`",
+        f"Default min_score: `{config.get('min_score', 0.15)}`",
+        f"Default csls_k: `{config.get('csls_k', 10)}`",
+        f"Bidirectional consistency: `{bool(bidi.get('enabled', True))}`",
+        "",
+        "Vocabulary sizes:",
+    ]
+    for lang, size in sorted(vocab_sizes.items()):
+        lines.append(f"- `{lang}`: `{size}`")
+    return "\n".join(lines)
+def initialize_ui():
+    try:
+        space = load_space()
+        opts = default_options(space.config)
+        source = space.config.get("pivot_lang", "de")
+        if source not in space.languages:
+            source = space.languages[0]
+        targets = [lang for lang in space.languages if lang != source]
+        status = f"Loaded `{space.artifact_uri}` with `{sum(len(v.metas) for v in space.by_lang.values())}` vectors."
+        return (
+            status,
+            gr.update(choices=space.languages, value=source),
+            gr.update(choices=space.languages, value=targets),
+            opts.top_k,
+            opts.min_score,
+            opts.csls_k,
+            opts.candidate_retrieval_k,
+            opts.csls_prefetch_k,
+            opts.bidirectional,
+            gr.update(choices=space.languages, value=source),
+            gr.update(choices=space.languages, value=space.languages),
+            opts.csls_k,
+            gr.update(choices=space.languages, value=source),
+            config_markdown(space),
+        )
+    except Exception as exc:
+        status = f"Load error: {exc}"
+        return (
+            status,
+            gr.update(choices=DEFAULT_LANGUAGES, value="de"),
+            gr.update(choices=DEFAULT_LANGUAGES, value=["en", "fr", "lb"]),
+            3,
+            0.15,
+            10,
+            9,
+            50,
+            True,
+            gr.update(choices=DEFAULT_LANGUAGES, value="de"),
+            gr.update(choices=DEFAULT_LANGUAGES, value=DEFAULT_LANGUAGES),
+            10,
+            gr.update(choices=DEFAULT_LANGUAGES, value="de"),
+            status,
+        )
+def update_targets(source_lang: str) -> gr.CheckboxGroup:
+    try:
+        space = load_space()
+        return gr.update(
+            choices=space.languages,
+            value=[lang for lang in space.languages if lang != source_lang],
+        )
+    except Exception:
+        return gr.update(
+            choices=DEFAULT_LANGUAGES,
+            value=[lang for lang in DEFAULT_LANGUAGES if lang != source_lang],
+        )
+def update_neighbor_langs(source_lang: str, include_source: bool) -> gr.CheckboxGroup:
+    try:
+        space = load_space()
+        choices = space.languages
+    except Exception:
+        choices = DEFAULT_LANGUAGES
+    values = choices if include_source else [lang for lang in choices if lang != source_lang]
+    return gr.update(choices=choices, value=values)
+css = """
+.app-title h1 { margin-bottom: 0.15rem; }
+.status-line { font-size: 0.9rem; color: #475569; }
+"""
+with gr.Blocks(title="Multilingual Static Word Embeddings", css=css) as demo:
+    gr.Markdown(
+        "# Multilingual Static Word Embeddings\n"
+        "Search the aligned FAISS space for cross-lingual word neighbors."
+    )
+    status_md = gr.Markdown("Loading artifacts...", elem_classes=["status-line"])
+    with gr.Tab("Translate"):
+        with gr.Row():
+            with gr.Column(scale=1, min_width=320):
+                query = gr.Textbox(label="Query word", value="haus")
+                source_lang = gr.Dropdown(
+                    label="Source language",
+                    choices=DEFAULT_LANGUAGES,
+                    value="de",
+                )
+                target_langs = gr.CheckboxGroup(
+                    label="Target languages",
+                    choices=DEFAULT_LANGUAGES,
+                    value=["en", "fr", "lb"],
+                )
+                translate_btn = gr.Button("Search", variant="primary")
+                with gr.Accordion("Retrieval parameters", open=True):
+                    top_k = gr.Slider(1, 20, value=3, step=1, label="Top K")
+                    min_score = gr.Slider(-2.0, 2.0, value=0.15, step=0.01, label="Min score")
+                    score_method = gr.Radio(
+                        ["csls", "cosine"],
+                        value="csls",
+                        label="Score method",
+                    )
+                    csls_k = gr.Slider(1, 50, value=10, step=1, label="CSLS K")
+                    candidate_retrieval_k = gr.Slider(
+                        1,
+                        100,
+                        value=9,
+                        step=1,
+                        label="Candidate retrieval K",
+                    )
+                    csls_prefetch_k = gr.Slider(
+                        10,
+                        500,
+                        value=50,
+                        step=1,
+                        label="CSLS prefetch K",
+                    )
+                    bidirectional = gr.Checkbox(value=True, label="Bidirectional consistency")
+                    filter_stopwords = gr.Checkbox(value=True, label="Filter stopwords")
+                    filter_bad_tokens = gr.Checkbox(value=True, label="Filter noisy tokens")
+                    use_surface = gr.Checkbox(value=True, label="Show surface forms")
+            with gr.Column(scale=2):
+                translate_summary = gr.Markdown()
+                translation_results = gr.Dataframe(
+                    headers=TRANSLATION_COLUMNS,
+                    datatype=["str", "str", "str", "number", "number", "number", "bool", "number", "str"],
+                    interactive=False,
+                    wrap=True,
+                )
+    with gr.Tab("Nearest Neighbors"):
+        with gr.Row():
+            with gr.Column(scale=1, min_width=320):
+                nn_query = gr.Textbox(label="Query word", value="haus")
+                nn_source_lang = gr.Dropdown(
+                    label="Source language",
+                    choices=DEFAULT_LANGUAGES,
+                    value="de",
+                )
+                nn_langs = gr.CheckboxGroup(
+                    label="Neighbor languages",
+                    choices=DEFAULT_LANGUAGES,
+                    value=DEFAULT_LANGUAGES,
+                )
+                nn_top_n = gr.Slider(1, 50, value=20, step=1, label="Top N per language")
+                nn_min_score = gr.Slider(-2.0, 2.0, value=-2.0, step=0.01, label="Min score")
+                nn_score_method = gr.Radio(["csls", "cosine"], value="cosine", label="Score method")
+                nn_csls_k = gr.Slider(1, 50, value=10, step=1, label="CSLS K")
+                nn_include_source = gr.Checkbox(value=True, label="Include source language")
+                nn_use_surface = gr.Checkbox(value=True, label="Show surface forms")
+                nn_btn = gr.Button("Find neighbors", variant="primary")
+            with gr.Column(scale=2):
+                nn_summary = gr.Markdown()
+                nn_results = gr.Dataframe(
+                    headers=NEIGHBOR_COLUMNS,
+                    datatype=["str", "str", "str", "number", "number", "number", "number"],
+                    interactive=False,
+                    wrap=True,
+                )
+    with gr.Tab("Browse Vocabulary"):
+        with gr.Row():
+            vocab_lang = gr.Dropdown(label="Language", choices=DEFAULT_LANGUAGES, value="de")
+            vocab_filter = gr.Textbox(label="Filter", placeholder="Type part of a token or surface form")
+            vocab_limit = gr.Slider(10, 500, value=100, step=10, label="Limit")
+        vocab_results = gr.Dataframe(
+            headers=VOCAB_COLUMNS,
+            datatype=["number", "str", "str", "str", "str"],
+            interactive=False,
+            wrap=True,
+        )
+    with gr.Tab("Artifact Info"):
+        artifact_info = gr.Markdown("Loading config...")
+    translate_inputs = [
+        query,
+        source_lang,
+        target_langs,
+        top_k,
+        min_score,
+        csls_k,
+        candidate_retrieval_k,
+        csls_prefetch_k,
+        bidirectional,
+        score_method,
+        filter_stopwords,
+        filter_bad_tokens,
+        use_surface,
+    ]
+    translate_btn.click(
+        translate_ui,
+        inputs=translate_inputs,
+        outputs=[translation_results, translate_summary],
+    )
+    query.submit(
+        translate_ui,
+        inputs=translate_inputs,
+        outputs=[translation_results, translate_summary],
+    )
+    source_lang.change(update_targets, inputs=source_lang, outputs=target_langs)
+    nn_btn.click(
+        nearest_ui,
+        inputs=[
+            nn_query,
+            nn_source_lang,
+            nn_langs,
+            nn_top_n,
+            nn_min_score,
+            nn_csls_k,
+            nn_score_method,
+            nn_include_source,
+            nn_use_surface,
+        ],
+        outputs=[nn_results, nn_summary],
+    )
+    nn_query.submit(
+        nearest_ui,
+        inputs=[
+            nn_query,
+            nn_source_lang,
+            nn_langs,
+            nn_top_n,
+            nn_min_score,
+            nn_csls_k,
+            nn_score_method,
+            nn_include_source,
+            nn_use_surface,
+        ],
+        outputs=[nn_results, nn_summary],
+    )
+    nn_source_lang.change(
+        update_neighbor_langs,
+        inputs=[nn_source_lang, nn_include_source],
+        outputs=nn_langs,
+    )
+    nn_include_source.change(
+        update_neighbor_langs,
+        inputs=[nn_source_lang, nn_include_source],
+        outputs=nn_langs,
+    )
+    vocab_lang.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
+    vocab_filter.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
+    vocab_limit.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
+    demo.load(
+        initialize_ui,
+        outputs=[
+            status_md,
+            source_lang,
+            target_langs,
+            top_k,
+            min_score,
+            csls_k,
+            candidate_retrieval_k,
+            csls_prefetch_k,
+            bidirectional,
+            nn_source_lang,
+            nn_langs,
+            nn_csls_k,
+            vocab_lang,
+            artifact_info,
+        ],
+    ).then(
+        translate_ui,
+        inputs=translate_inputs,
+        outputs=[translation_results, translate_summary],
+    ).then(
+        browse_ui,
+        inputs=[vocab_lang, vocab_filter, vocab_limit],
+        outputs=vocab_results,
+    )
+if __name__ == "__main__":
+    demo.queue().launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+faiss-cpu
+numpy
+pandas
+boto3
+botocore