Spaces:

impresso-project
/

multilingual-static-word-embeddings-demo

Sleeping

App Files Files Community

Maslionok committed on May 26

Commit

245b51c

1 Parent(s): e746bd7

cleaned

Browse files

Files changed (4) hide show

.gitattributes +0 -35
README.md +0 -55
app.py +0 -1697
requirements.txt +0 -8

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md DELETED Viewed

@@ -1,55 +0,0 @@
----
-title: Multilingual Static Word Embeddings Demo
-emoji: 🧭
-colorFrom: green
-colorTo: indigo
-sdk: gradio
-sdk_version: 6.14.0
-python_version: "3.11"
-app_file: app.py
-pinned: false
----
-# Multilingual Static Word Embeddings Demo
-Gradio Space for exploring an aligned multilingual static word embedding artifact produced by Stage 6 of `build_multilingual_dictionary.py` when `SAVE_ALLIGNED_SPACE=true`.
-The app loads the newest S3 folder matching:
-```text
-s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_*.json/
-```
-Required files inside the artifact folder:
-- `aligned_all.faiss`
-- `all_metadata.jsonl`
-- `config.json`
-`aligned_all.vec` is downloaded only if vectors cannot be reconstructed from the FAISS index.
-The selected artifact's S3 `config.json` is used for live UI defaults, stopwords, and metadata display. Changing the artifact dropdown reloads that folder's corresponding config.
-## Space Secrets
-Set these Hugging Face Space secrets if the S3 bucket is private:
-- `SE_ACCESS_KEY`
-- `SE_SECRET_KEY`
-- `SE_HOST`
-`SE_HOST` may be either a hostname or a full `https://...` endpoint URL. The app also still supports `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` as fallback names.
-The app lists existing aligned-space folders in a dropdown using the timestamp in names like `multilingual_space_20260521_133953.json/`. The newest timestamp is selected by default.
-To preselect a specific artifact instead of the newest folder, set:
-```text
-SPACE_ARTIFACT_S3_URI=s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_<TIMESTAMP>.json/
-```
-## What Can Be Changed Live
-The Translate tab allows live changes to retrieval and filtering parameters such as `top_k`, `min_score`, `csls_k`, candidate multiplier, FAISS prefetch, score method, stopword filtering, minimum token length, fuzzy lookup, and bidirectional consistency.
-Alignment/build parameters such as `pivot_lang`, `top_n_vocab`, `out_top`, `align_iters`, `init_pairs`, and `max_pairs` are shown as read-only artifact metadata because changing them requires rebuilding the aligned vector space.

app.py DELETED Viewed

@@ -1,1697 +0,0 @@
-from __future__ import annotations
-import fnmatch
-import hashlib
-import json
-import math
-import os
-import random
-import re
-import sys
-import threading
-import unicodedata
-from collections import defaultdict
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any
-from urllib.parse import urlparse
-os.environ.setdefault("MPLCONFIGDIR", "/tmp/matplotlib")
-os.environ.setdefault("XDG_CACHE_HOME", "/tmp/.cache")
-os.environ.setdefault("GRADIO_ANALYTICS_ENABLED", "False")
-os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
-_ORIGINAL_UNRAISABLEHOOK = sys.unraisablehook
-def _quiet_asyncio_invalid_fd(unraisable):
-    """Unraisable-exception hook that drops one specific asyncio error and forwards the rest.
-    The suppressed case is a ValueError whose text contains "Invalid file descriptor: -1"
-    and whose originating object repr mentions BaseEventLoop.__del__; everything else is
-    passed to the hook that was installed before this one.
-    """
-    error = unraisable.exc_value
-    is_known_noise = (
-        isinstance(error, ValueError)
-        and "Invalid file descriptor: -1" in str(error)
-        and "BaseEventLoop.__del__" in repr(unraisable.object)
-    )
-    if not is_known_noise:
-        _ORIGINAL_UNRAISABLEHOOK(unraisable)
-sys.unraisablehook = _quiet_asyncio_invalid_fd
-import boto3
-import gradio as gr
-import numpy as np
-import pandas as pd
-from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
-from dotenv import load_dotenv
-try:
-    import faiss
-except Exception as exc:  # pragma: no cover - shown as a startup error in the UI.
-    faiss = None
-    FAISS_IMPORT_ERROR = exc
-else:
-    FAISS_IMPORT_ERROR = None
-try:
-    from rapidfuzz import fuzz, process
-except Exception:  # pragma: no cover - rapidfuzz is optional at runtime.
-    fuzz = None
-    process = None
-load_dotenv()
-# S3 prefix that is scanned for multilingual_space_*.json/ artifact folders.
-BASE_ARTIFACT_S3_URI = "s3://131-component-staging/multilingual-static-word-embeddings/stage-6/"
-# Name of the environment variable that preselects a specific artifact folder URI.
-ARTIFACT_ENV_VAR = "SPACE_ARTIFACT_S3_URI"
-# Names of the environment variables holding the S3 access key, secret key, and endpoint host.
-SE_ACCESS_KEY_ENV = "SE_ACCESS_KEY"
-SE_SECRET_KEY_ENV = "SE_SECRET_KEY"
-SE_HOST_ENV = "SE_HOST"
-# Local directory under which each artifact folder gets its own download cache.
-CACHE_ROOT = Path("/tmp/multilingual_space_artifacts")
-# Files fetched for every artifact before it can be loaded.
-REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
-# Text vector file fetched only when vectors cannot be reconstructed from the FAISS index.
-OPTIONAL_VEC_FILE = "aligned_all.vec"
-def _config_get_raw(config: dict[str, Any], keys: tuple[str, ...], default: Any = "") -> Any:
-    """Return the value of the first of `keys` found in `config`, searching nested sections too.
-    Top-level keys win. Otherwise a fixed list of well-known section names is searched
-    recursively, in order, and the first non-None hit is returned. `default` is returned
-    when nothing is found.
-    """
-    present = [key for key in keys if key in config]
-    if present:
-        return config[present[0]]
-    nested_sections = ("config", "params", "args", "stage_6", "alignment", "dictionary", "preprocessing", "filters")
-    for name in nested_sections:
-        nested = config.get(name)
-        if not isinstance(nested, dict):
-            continue
-        # None doubles as the "not found" marker for the recursive search.
-        hit = _config_get_raw(nested, keys, None)
-        if hit is not None:
-            return hit
-    return default
-def _as_bool(value: Any, default: bool) -> bool:
-    """Coerce a config value to bool.
-    A dict with an "enabled" key is unwrapped first. None yields `default`. Recognized
-    string spellings (case-insensitive, surrounding whitespace ignored) map to True/False;
-    anything else falls back to Python truthiness.
-    """
-    if isinstance(value, dict) and "enabled" in value:
-        return _as_bool(value["enabled"], default)
-    if value is None:
-        return default
-    if isinstance(value, str):
-        spellings = {
-            "1": True, "true": True, "yes": True, "y": True, "on": True,
-            "0": False, "false": False, "no": False, "n": False, "off": False,
-        }
-        token = value.strip().casefold()
-        if token in spellings:
-            return spellings[token]
-    return bool(value)
-def _as_int(value: Any, default: int) -> int:
-    """Coerce `value` to int, returning `default` when it cannot be converted.
-    Besides TypeError/ValueError (None, non-numeric strings, NaN), OverflowError is
-    handled as well: json.load accepts `Infinity`, and int(float("inf")) raises it.
-    """
-    try:
-        return int(value)
-    except (TypeError, ValueError, OverflowError):
-        return default
-def _as_float(value: Any, default: float) -> float:
-    """Coerce `value` to float, returning `default` when it cannot be converted.
-    OverflowError is handled alongside TypeError/ValueError because float() raises it
-    for integers too large to represent (JSON integers are unbounded in Python).
-    """
-    try:
-        return float(value)
-    except (TypeError, ValueError, OverflowError):
-        return default
-# Fallback parameter values; _defaults_from_config starts from these when no other
-# fallback is supplied and overrides them with values found in an artifact's config.
-BASE_DEFAULTS = {
-    "pivot_lang": "de",
-    "top_n_vocab": 150000,
-    "out_top": 50000,
-    "top_k": 3,
-    "min_score": 0.15,
-    "align_iters": 5,
-    "init_pairs": 5000,
-    "max_pairs": 15000,
-    "csls_k": 10,
-    "candidate_retrieval_k_multiplier": 3,
-    "csls_prefetch_k": 50,
-    "bidirectional_consistency": True,
-    "use_surface_forms": True,
-    "hide_stopwords": True,
-    "min_token_length": 4,
-}
-# NOTE(review): not referenced in the visible part of the file; presumably lighter
-# settings applied to the live UI controls - confirm against the code that reads it.
-INTERACTIVE_DEFAULTS = {
-    "candidate_retrieval_k_multiplier": 3,
-    "csls_prefetch_k": 20,
-    "bidirectional_consistency": False,
-    "fuzzy_fallback": False,
-}
-def _defaults_from_config(config: dict[str, Any], fallback: dict[str, Any] | None = None) -> dict[str, Any]:
-    """Build the parameter-defaults dict for one artifact config.
-    Starts from a copy of `fallback` (BASE_DEFAULTS when `fallback` is None or empty) and
-    overrides each known parameter with the value found via _config_get_raw, coerced with
-    _as_int/_as_float/_as_bool. Missing or unparsable values keep the fallback value.
-    Keys present in `fallback` but not listed below are carried over unchanged.
-    """
-    defaults = dict(fallback or BASE_DEFAULTS)
-    top_k = _as_int(_config_get_raw(config, ("top_k",), defaults["top_k"]), defaults["top_k"])
-    candidate_retrieval_k = _config_get_raw(config, ("candidate_retrieval_k",), None)
-    if candidate_retrieval_k is not None and top_k > 0:
-        # The config stores an absolute candidate count; convert it to a multiplier of
-        # top_k, rounded up and never below 1.
-        candidate_multiplier = max(1, math.ceil(_as_int(candidate_retrieval_k, top_k * 3) / top_k))
-    else:
-        candidate_multiplier = _as_int(
-            _config_get_raw(config, ("candidate_retrieval_k_multiplier",), defaults["candidate_retrieval_k_multiplier"]),
-            defaults["candidate_retrieval_k_multiplier"],
-        )
-    defaults.update(
-        {
-            "pivot_lang": str(_config_get_raw(config, ("pivot_lang", "pivot_language"), defaults["pivot_lang"])),
-            "top_n_vocab": _as_int(_config_get_raw(config, ("top_n_vocab",), defaults["top_n_vocab"]), defaults["top_n_vocab"]),
-            "out_top": _as_int(_config_get_raw(config, ("out_top",), defaults["out_top"]), defaults["out_top"]),
-            "top_k": top_k,
-            "min_score": _as_float(_config_get_raw(config, ("min_score",), defaults["min_score"]), defaults["min_score"]),
-            "align_iters": _as_int(_config_get_raw(config, ("align_iters",), defaults["align_iters"]), defaults["align_iters"]),
-            "init_pairs": _as_int(_config_get_raw(config, ("init_pairs",), defaults["init_pairs"]), defaults["init_pairs"]),
-            "max_pairs": _as_int(_config_get_raw(config, ("max_pairs",), defaults["max_pairs"]), defaults["max_pairs"]),
-            "csls_k": _as_int(_config_get_raw(config, ("csls_k",), defaults["csls_k"]), defaults["csls_k"]),
-            "candidate_retrieval_k_multiplier": candidate_multiplier,
-            "csls_prefetch_k": _as_int(
-                _config_get_raw(config, ("csls_prefetch_k",), defaults["csls_prefetch_k"]),
-                defaults["csls_prefetch_k"],
-            ),
-            "bidirectional_consistency": _as_bool(
-                _config_get_raw(config, ("bidirectional_consistency", "bidirectional"), defaults["bidirectional_consistency"]),
-                defaults["bidirectional_consistency"],
-            ),
-            "use_surface_forms": _as_bool(
-                _config_get_raw(config, ("surface_forms_enabled", "use_surface_forms"), defaults["use_surface_forms"]),
-                defaults["use_surface_forms"],
-            ),
-            "hide_stopwords": _as_bool(
-                _config_get_raw(
-                    config,
-                    ("target_stopwords_filtered_in_translation_candidates", "hide_stopwords"),
-                    defaults["hide_stopwords"],
-                ),
-                defaults["hide_stopwords"],
-            ),
-            "min_token_length": _as_int(
-                _config_get_raw(config, ("target_is_good_token_min_len", "min_token_length"), defaults["min_token_length"]),
-                defaults["min_token_length"],
-            ),
-        }
-    )
-    return defaults
-DEFAULTS = dict(BASE_DEFAULTS)
-STOPWORDS = {
-    "a",
-    "an",
-    "and",
-    "are",
-    "as",
-    "at",
-    "be",
-    "by",
-    "das",
-    "de",
-    "del",
-    "der",
-    "des",
-    "die",
-    "du",
-    "e",
-    "el",
-    "en",
-    "es",
-    "et",
-    "for",
-    "from",
-    "he",
-    "het",
-    "i",
-    "ich",
-    "il",
-    "in",
-    "is",
-    "it",
-    "la",
-    "las",
-    "le",
-    "les",
-    "lo",
-    "los",
-    "of",
-    "on",
-    "or",
-    "que",
-    "she",
-    "the",
-    "to",
-    "un",
-    "una",
-    "und",
-    "une",
-    "von",
-    "was",
-    "we",
-    "you",
-}
-class ArtifactError(RuntimeError):
-    """Raised when the Space cannot resolve, download, or load artifacts.
-    S3 credential/client failures and malformed artifact files are re-raised as this type
-    with a user-readable message.
-    """
-@dataclass(frozen=True)
-class ArtifactPaths:
-    """Immutable record of one artifact folder: its S3 URI plus local cache paths of its required files."""
-    s3_uri: str  # artifact folder URI, normalized to end with "/"
-    local_dir: Path  # local cache directory for this artifact
-    faiss_path: Path  # local path of aligned_all.faiss
-    metadata_path: Path  # local path of all_metadata.jsonl
-    config_path: Path  # local path of config.json
-@dataclass
-class SpaceData:
-    """In-memory view of one loaded aligned embedding artifact, as assembled by _build_space."""
-    artifact_uri: str  # S3 URI of the artifact folder this data came from
-    local_dir: Path  # local cache directory holding the downloaded files
-    config: dict[str, Any]  # parsed config.json
-    id_to_meta: dict[int, dict[str, Any]]  # vector id -> metadata row
-    languages: list[str]  # language codes that have at least one vector, in display order
-    lang_to_ids: dict[str, np.ndarray]  # language -> int64 array of its vector ids
-    lang_to_matrix: dict[str, np.ndarray]  # language -> float32 matrix of L2-normalized vectors, rows aligned with lang_to_ids
-    lang_to_index: dict[str, Any]  # language -> faiss.IndexFlatIP built over that language's matrix
-    id_to_lang_local: dict[int, tuple[str, int]]  # vector id -> (language, row in that language's matrix)
-    lookup: dict[str, dict[str, dict[str, list[int]]]]  # language -> "token"/"surface" -> normalized text -> sorted vector ids
-    fuzzy_choices: dict[str, list[str]]  # language -> sorted union of normalized token and surface strings
-    vector_dim: int  # number of columns in the per-language matrices
-    vector_source: str  # "faiss" or "aligned_all.vec", depending on where vectors were read from
-    vocab_sizes: dict[str, int]  # language -> number of vectors
-    stopwords: dict[str, set[str]]  # language -> stopword set (config entries plus STOPWORDS)
-    csls_avg_cache: dict[tuple[int, str, int], float]  # starts empty; NOTE(review): key layout is not shown in this part of the file - confirm
-    def vector_for_id(self, vector_id: int) -> np.ndarray:
-        """Return the stored vector row for a vector id (KeyError if the id is unknown)."""
-        lang, local_idx = self.id_to_lang_local[int(vector_id)]
-        return self.lang_to_matrix[lang][local_idx]
-# Loaded spaces keyed by normalized artifact URI; filled by get_space.
-_SPACE_CACHE: dict[str, SpaceData] = {}
-# Held by get_space while an artifact is being downloaded and built.
-_LOAD_LOCK = threading.Lock()
-def _progress(progress: gr.Progress | None, value: float, message: str) -> None:
-    """Report `value` with `message` as description to `progress`; do nothing when it is None."""
-    if progress is None:
-        return
-    progress(value, desc=message)
-def _normalize_text(text: Any) -> str:
-    """Return a lookup key for `text`: NFKC-normalized, trimmed, case-folded, whitespace runs collapsed.
-    None becomes the empty string; other values are converted with str() first.
-    """
-    if text is None:
-        return ""
-    folded = unicodedata.normalize("NFKC", str(text)).strip().casefold()
-    return " ".join(folded.split())
-def _display_value(value: Any) -> str:
-    """Render a metadata value for display: empty string for None or float NaN, str(value) otherwise."""
-    is_missing = value is None or (isinstance(value, float) and math.isnan(value))
-    return "" if is_missing else str(value)
-def _parse_s3_uri(uri: str) -> tuple[str, str]:
-    """Split an s3://bucket/key URI into (bucket, key-or-prefix without leading slash).
-    Raises ArtifactError when the scheme is not "s3" or the bucket part is empty.
-    """
-    pieces = urlparse(uri)
-    looks_like_s3 = pieces.scheme == "s3" and bool(pieces.netloc)
-    if not looks_like_s3:
-        raise ArtifactError(f"Expected an S3 URI like s3://bucket/prefix/, got: {uri}")
-    return pieces.netloc, pieces.path.lstrip("/")
-def _join_s3(base_uri: str, filename: str) -> str:
-    """Append `filename` to `base_uri` with exactly one "/" between them."""
-    folder = base_uri.rstrip("/")
-    return "{}/{}".format(folder, filename)
-def _normalize_endpoint_url(host: str) -> str | None:
-    """Turn a host setting into an endpoint URL, or None when it is blank.
-    A value that already carries an http:// or https:// scheme is returned as-is (after
-    trimming); a bare hostname gets "https://" prepended. The scheme check ignores case,
-    since URL schemes are case-insensitive and "HTTPS://host" must not become
-    "https://HTTPS://host".
-    """
-    host = host.strip()
-    if not host:
-        return None
-    if host.lower().startswith(("http://", "https://")):
-        return host
-    return f"https://{host}"
-def _s3_client():
-    """Create a boto3 S3 client from environment settings.
-    Region comes from AWS_DEFAULT_REGION (default "us-east-1"). Explicit credentials are
-    passed only when both an access key and a secret key are set (SE_* names first, AWS_*
-    names as fallback). A custom endpoint is used when the host variable is non-blank.
-    """
-    region = os.getenv("AWS_DEFAULT_REGION") or "us-east-1"
-    client_options: dict[str, Any] = {"region_name": region}
-    access_key = os.getenv(SE_ACCESS_KEY_ENV) or os.getenv("AWS_ACCESS_KEY_ID")
-    secret_key = os.getenv(SE_SECRET_KEY_ENV) or os.getenv("AWS_SECRET_ACCESS_KEY")
-    if access_key and secret_key:
-        client_options["aws_access_key_id"] = access_key
-        client_options["aws_secret_access_key"] = secret_key
-    endpoint_url = _normalize_endpoint_url(os.getenv(SE_HOST_ENV, ""))
-    if endpoint_url:
-        client_options["endpoint_url"] = endpoint_url
-    session = boto3.session.Session(region_name=region)
-    return session.client("s3", **client_options)
-def _credential_hint() -> str:
-    """Return the fixed sentence telling the operator which credential secrets to set."""
-    hint = (
-        "Set SE_ACCESS_KEY, SE_SECRET_KEY, and SE_HOST as Hugging Face Space secrets. "
-        "AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY are also supported as a fallback."
-    )
-    return hint
-def _is_multilingual_space_prefix(prefix: str) -> bool:
-    """Tell whether the last path segment of `prefix` matches multilingual_space_*.json."""
-    last_segment = prefix.rstrip("/").rsplit("/", 1)[-1]
-    return fnmatch.fnmatch(last_segment, "multilingual_space_*.json")
-def _timestamp_key_from_prefix(prefix: str) -> str:
-    """Extract the part between "multilingual_space_" and ".json" from the last path segment.
-    Falls back to the whole last segment when it does not follow that naming scheme.
-    """
-    last_segment = prefix.rstrip("/").rsplit("/", 1)[-1]
-    found = re.search(r"multilingual_space_(.+)\.json$", last_segment)
-    if found is None:
-        return last_segment
-    return found.group(1)
-def _normalize_artifact_uri(uri: str) -> str:
-    """Trim `uri` and make a non-empty result end with "/"; blank input stays empty."""
-    cleaned = uri.strip()
-    if not cleaned or cleaned.endswith("/"):
-        return cleaned
-    return cleaned + "/"
-def _artifact_timestamp(uri: str) -> str:
-    """Return the timestamp key embedded in the last path segment of an artifact URI."""
-    url_path = urlparse(uri).path
-    folder_name = url_path.rstrip("/").split("/")[-1]
-    return _timestamp_key_from_prefix(folder_name)
-def _artifact_label(uri: str) -> str:
-    """Return a human-readable label for an artifact URI.
-    A timestamp of the form YYYYMMDD_HHMMSS is shown as "YYYY-MM-DD HH:MM:SS"; otherwise
-    the last path segment of the URI is returned unchanged.
-    """
-    stamp = _artifact_timestamp(uri)
-    parsed = re.fullmatch(r"(\d{8})_(\d{6})", stamp)
-    if parsed is None:
-        return uri.rstrip("/").split("/")[-1]
-    day, clock = parsed.groups()
-    date_text = f"{day[:4]}-{day[4:6]}-{day[6:8]}"
-    time_text = f"{clock[:2]}:{clock[2:4]}:{clock[4:6]}"
-    return f"{date_text} {time_text}"
-def _artifact_dropdown_choices(uris: list[str]) -> list[tuple[str, str]]:
-    """Pair each artifact URI with a "label  |  folder-name" display string, as (display, uri)."""
-    choices: list[tuple[str, str]] = []
-    for uri in uris:
-        folder_name = uri.rstrip("/").split("/")[-1]
-        choices.append((f"{_artifact_label(uri)}  |  {folder_name}", uri))
-    return choices
-def _list_artifact_uris(progress: gr.Progress | None = None) -> list[str]:
-    """List artifact folder URIs under BASE_ARTIFACT_S3_URI, newest first.
-    First asks S3 for the immediate "sub-folders" (delimiter listing). If that yields no
-    multilingual_space_*.json folder, every object under the prefix is scanned and the
-    folder is derived from the first matching path segment of each key, remembering the
-    newest LastModified seen per folder. Results are sorted by the timestamp in the
-    folder name, then by that LastModified value, both descending.
-    Raises ArtifactError when credentials are missing, the listing fails, or no folder
-    is found.
-    """
-    bucket, base_prefix = _parse_s3_uri(BASE_ARTIFACT_S3_URI)
-    if base_prefix and not base_prefix.endswith("/"):
-        base_prefix += "/"
-    _progress(progress, 0.05, "Listing multilingual_space_*.json artifacts")
-    client = _s3_client()
-    # folder prefix -> newest LastModified seen (None when unknown)
-    candidates: dict[str, Any] = {}
-    try:
-        paginator = client.get_paginator("list_objects_v2")
-        for page in paginator.paginate(Bucket=bucket, Prefix=base_prefix, Delimiter="/"):
-            for item in page.get("CommonPrefixes", []):
-                prefix = item.get("Prefix", "")
-                if _is_multilingual_space_prefix(prefix):
-                    candidates[prefix] = None
-        if not candidates:
-            # Fallback: no common prefixes were reported, so walk every object key.
-            for page in paginator.paginate(Bucket=bucket, Prefix=base_prefix):
-                for obj in page.get("Contents", []):
-                    parts = obj.get("Key", "").split("/")
-                    for idx, part in enumerate(parts):
-                        if not fnmatch.fnmatch(part, "multilingual_space_*.json"):
-                            continue
-                        prefix = "/".join(parts[: idx + 1]) + "/"
-                        last_modified = obj.get("LastModified")
-                        known = candidates.get(prefix)
-                        # Record a new folder, or a newer timestamp than the stored one. A
-                        # stored None is replaced by the first real timestamp.
-                        if prefix not in candidates or (
-                            last_modified is not None and (known is None or last_modified > known)
-                        ):
-                            candidates[prefix] = last_modified
-                        break
-    except (NoCredentialsError, PartialCredentialsError) as exc:
-        raise ArtifactError(f"S3 credentials are missing or incomplete. {_credential_hint()}") from exc
-    except ClientError as exc:
-        code = exc.response.get("Error", {}).get("Code", "unknown")
-        raise ArtifactError(f"Could not list {BASE_ARTIFACT_S3_URI} ({code}). {_credential_hint()}") from exc
-    if not candidates:
-        raise ArtifactError(f"No artifact folder matching multilingual_space_*.json/ was found under {BASE_ARTIFACT_S3_URI}")
-    def sort_key(item: tuple[str, Any]) -> tuple[str, str]:
-        prefix, last_modified = item
-        modified_key = last_modified.isoformat() if last_modified else ""
-        return (_timestamp_key_from_prefix(prefix), modified_key)
-    prefixes = [prefix for prefix, _ in sorted(candidates.items(), key=sort_key, reverse=True)]
-    return [f"s3://{bucket}/{prefix}" for prefix in prefixes]
-def _find_newest_artifact_uri(progress: gr.Progress | None = None) -> str:
-    """Return the first (newest) URI reported by _list_artifact_uris."""
-    newest_first = _list_artifact_uris(progress)
-    return newest_first[0]
-def _resolve_artifact_options(progress: gr.Progress | None = None) -> tuple[list[str], str]:
-    """Return (selectable artifact URIs, URI to select by default).
-    The environment override, when set, is always selectable and selected; it is put first
-    if the listing does not contain it. If listing fails, the override alone is offered;
-    without an override the ArtifactError propagates.
-    """
-    override_uri = _normalize_artifact_uri(os.getenv(ARTIFACT_ENV_VAR, "").strip())
-    try:
-        uris = _list_artifact_uris(progress)
-    except ArtifactError:
-        if override_uri:
-            return [override_uri], override_uri
-        raise
-    if override_uri and override_uri not in uris:
-        uris.insert(0, override_uri)
-    selected_uri = override_uri if override_uri else uris[0]
-    return uris, selected_uri
-def _local_cache_dir_for_uri(s3_uri: str) -> Path:
-    """Map an artifact URI to its cache directory under CACHE_ROOT.
-    The directory name is the sanitized last URI segment plus the first 16 hex digits of
-    the URI's SHA-256, so different URIs with the same folder name do not collide.
-    """
-    folder_name = s3_uri.rstrip("/").rsplit("/", 1)[-1]
-    slug = re.sub(r"[^A-Za-z0-9_.-]+", "_", folder_name)
-    fingerprint = hashlib.sha256(s3_uri.encode("utf-8")).hexdigest()[:16]
-    return CACHE_ROOT / f"{slug}_{fingerprint}"
-def _download_file_if_missing(s3_uri: str, local_path: Path) -> None:
-    """Download `s3_uri` to `local_path` unless a non-empty file already exists there.
-    Raises ArtifactError when credentials are missing or the S3 client reports an error.
-    """
-    already_cached = local_path.exists() and local_path.stat().st_size > 0
-    if already_cached:
-        return
-    bucket, key = _parse_s3_uri(s3_uri)
-    local_path.parent.mkdir(parents=True, exist_ok=True)
-    client = _s3_client()
-    try:
-        client.download_file(bucket, key, str(local_path))
-    except (NoCredentialsError, PartialCredentialsError) as exc:
-        message = f"S3 credentials are missing or incomplete while downloading {s3_uri}. {_credential_hint()}"
-        raise ArtifactError(message) from exc
-    except ClientError as exc:
-        code = exc.response.get("Error", {}).get("Code", "unknown")
-        message = f"Could not download {s3_uri} ({code}). {_credential_hint()}"
-        raise ArtifactError(message) from exc
-def _prepare_artifacts(artifact_uri: str | None = None, progress: gr.Progress | None = None) -> ArtifactPaths:
-    """Resolve which artifact to use and make sure its required files are cached locally.
-    Resolution order: the explicit `artifact_uri`, then the environment override, then the
-    newest folder found on S3. Returns the URI together with the local file paths.
-    """
-    resolved = _normalize_artifact_uri(artifact_uri or "")
-    if resolved:
-        _progress(progress, 0.03, f"Using selected artifact {_artifact_label(resolved)}")
-    else:
-        resolved = _normalize_artifact_uri(os.getenv(ARTIFACT_ENV_VAR, "").strip())
-        if not resolved:
-            resolved = _find_newest_artifact_uri(progress)
-    resolved = _normalize_artifact_uri(resolved)
-    cache_dir = _local_cache_dir_for_uri(resolved)
-    cache_dir.mkdir(parents=True, exist_ok=True)
-    for position, filename in enumerate(REQUIRED_FILES):
-        _progress(progress, 0.10 + position * 0.07, f"Checking {filename}")
-        remote_uri = _join_s3(resolved, filename)
-        _download_file_if_missing(remote_uri, cache_dir / filename)
-    return ArtifactPaths(
-        s3_uri=resolved,
-        local_dir=cache_dir,
-        faiss_path=cache_dir / "aligned_all.faiss",
-        metadata_path=cache_dir / "all_metadata.jsonl",
-        config_path=cache_dir / "config.json",
-    )
-def _load_json(path: Path) -> dict[str, Any]:
-    """Read the UTF-8 file at `path` and return its parsed JSON content."""
-    raw_text = path.read_text(encoding="utf-8")
-    return json.loads(raw_text)
-def _load_metadata(path: Path) -> list[dict[str, Any]]:
-    """Read a JSON-lines metadata file into a list of row dicts.
-    Blank lines are skipped. Every row must be a JSON object with "id" and "lang"; "id" is
-    converted to int and "lang" to str in place.
-    Raises ArtifactError for invalid JSON, rows that are not objects or lack id/lang,
-    ids that are not integers, and files without any rows.
-    """
-    rows: list[dict[str, Any]] = []
-    with path.open("r", encoding="utf-8") as handle:
-        for line_no, line in enumerate(handle, start=1):
-            if not line.strip():
-                continue
-            try:
-                row = json.loads(line)
-            except json.JSONDecodeError as exc:
-                raise ArtifactError(f"Invalid JSON in {path.name} at line {line_no}: {exc}") from exc
-            # A JSON array/string/number would otherwise fail later with a raw TypeError.
-            if not isinstance(row, dict) or "id" not in row or "lang" not in row:
-                raise ArtifactError(f"Metadata line {line_no} is missing required fields: id/lang")
-            try:
-                row["id"] = int(row["id"])
-            except (TypeError, ValueError, OverflowError) as exc:
-                raise ArtifactError(f"Metadata line {line_no} has a non-integer id: {row['id']!r}") from exc
-            row["lang"] = str(row["lang"])
-            rows.append(row)
-    if not rows:
-        raise ArtifactError(f"{path.name} is empty")
-    return rows
-def _reconstruct_vectors(index: Any) -> np.ndarray:
-    """Recover all stored vectors from a FAISS index as an (ntotal, d) float32 array.
-    Three strategies are tried in order and the first that succeeds wins; a failure in one
-    is deliberately ignored so the next can run.
-    Raises ArtifactError when the index is empty or every strategy fails.
-    """
-    n_vectors = int(index.ntotal)
-    dim = int(index.d)
-    if n_vectors <= 0:
-        raise ArtifactError("FAISS index contains no vectors")
-    # Attempt 1: reconstruct_n returns the array; accepted only if it has the expected shape.
-    try:
-        vectors = index.reconstruct_n(0, n_vectors)
-        vectors = np.asarray(vectors, dtype=np.float32)
-        if vectors.shape == (n_vectors, dim):
-            return vectors
-    except Exception:
-        pass
-    # Attempt 2: reconstruct_n fills a preallocated output buffer.
-    # NOTE(review): presumably covers faiss bindings with a different call signature - confirm.
-    try:
-        vectors = np.empty((n_vectors, dim), dtype=np.float32)
-        index.reconstruct_n(0, n_vectors, vectors)
-        return vectors
-    except Exception:
-        pass
-    # Attempt 3: reconstruct one vector at a time and stack the rows.
-    try:
-        rows = [np.asarray(index.reconstruct(i), dtype=np.float32) for i in range(n_vectors)]
-        return np.vstack(rows).astype(np.float32, copy=False)
-    except Exception as exc:
-        raise ArtifactError("FAISS vectors could not be reconstructed") from exc
-def _load_vec_fallback(paths: ArtifactPaths, expected_count: int) -> np.ndarray:
-    """Load vectors from the optional text .vec file, downloading it first if needed.
-    An optional "<count> <dim>" header line fixes the dimension; otherwise it is inferred
-    from the first data line as (number of fields - 1). For every line the last `dim`
-    fields are parsed as float32, so a leading token may itself contain spaces.
-    Raises ArtifactError when a line has too few fields, the dimension is not positive,
-    values cannot be parsed, or the number of vectors differs from `expected_count`.
-    """
-    vec_path = paths.local_dir / OPTIONAL_VEC_FILE
-    _download_file_if_missing(_join_s3(paths.s3_uri, OPTIONAL_VEC_FILE), vec_path)
-    vectors: list[np.ndarray] = []
-    expected_dim: int | None = None
-    with vec_path.open("r", encoding="utf-8", errors="replace") as handle:
-        first = handle.readline()
-        parts = first.strip().split()
-        has_header = len(parts) == 2 and all(part.isdigit() for part in parts)
-        if has_header:
-            expected_dim = int(parts[1])
-        else:
-            # No header: the first line is data, so rewind and parse it in the loop.
-            handle.seek(0)
-        for line_no, line in enumerate(handle, start=2 if has_header else 1):
-            parts = line.rstrip("\n").split()
-            if not parts:
-                continue
-            if expected_dim is None:
-                expected_dim = len(parts) - 1
-            # Guard the slice below: parts[-0:] would take the whole line, and a short
-            # row would only fail later inside np.vstack with an unrelated message.
-            if expected_dim <= 0 or len(parts) < expected_dim:
-                raise ArtifactError(f"Unexpected number of vector values in {vec_path.name} at line {line_no}")
-            values = parts[-expected_dim:]
-            try:
-                vectors.append(np.asarray(values, dtype=np.float32))
-            except ValueError as exc:
-                raise ArtifactError(f"Could not parse vector values in {vec_path.name} at line {line_no}") from exc
-    if len(vectors) != expected_count:
-        raise ArtifactError(
-            f"{vec_path.name} contains {len(vectors):,} vectors, but metadata/FAISS expects {expected_count:,}"
-        )
-    return np.vstack(vectors).astype(np.float32, copy=False)
-def _l2_normalize(matrix: np.ndarray) -> np.ndarray:
-    """Scale every row of `matrix` to unit L2 length and return it as float32.
-    All-zero rows are left as zeros. A float32 input array is modified in place and
-    returned; other inputs are converted to a new float32 array first.
-    """
-    matrix = np.asarray(matrix, dtype=np.float32)
-    lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
-    # Divide zero-length rows by 1 instead of 0 so they stay zero.
-    divisors = np.where(lengths == 0.0, np.float32(1.0), lengths)
-    np.divide(matrix, divisors, out=matrix)
-    return matrix
-def _language_order(metadata_rows: list[dict[str, Any]], config: dict[str, Any]) -> list[str]:
-    """Return language codes in display order.
-    Languages declared in the config (a dict's keys or a list's items) come first, in
-    their declared order; languages that occur only in the metadata follow alphabetically.
-    """
-    declared = _config_get(config, ("languages", "langs", "language_codes"), None)
-    if isinstance(declared, dict):
-        ordered = [str(code) for code in declared.keys()]
-    elif isinstance(declared, list):
-        ordered = [str(code) for code in declared]
-    else:
-        ordered = []
-    for lang in sorted({row["lang"] for row in metadata_rows}):
-        if lang not in ordered:
-            ordered.append(lang)
-    return ordered
-def _build_lookup(metadata_rows: list[dict[str, Any]]) -> dict[str, dict[str, dict[str, list[int]]]]:
-    """Index metadata rows as language -> "token"/"surface" -> normalized text -> sorted vector ids.
-    Rows whose token and surface both normalize to the empty string contribute nothing.
-    """
-    grouped: dict[str, dict[str, dict[str, list[int]]]] = {}
-    for row in metadata_rows:
-        lang = row["lang"]
-        vector_id = int(row["id"])
-        for field in ("token", "surface"):
-            text_key = _normalize_text(row.get(field, ""))
-            if not text_key:
-                continue
-            per_language = grouped.setdefault(lang, {"token": {}, "surface": {}})
-            per_language[field].setdefault(text_key, []).append(vector_id)
-    result: dict[str, dict[str, dict[str, list[int]]]] = {}
-    for lang, per_language in grouped.items():
-        result[lang] = {
-            field: {text_key: sorted(ids) for text_key, ids in by_text.items()}
-            for field, by_text in per_language.items()
-        }
-    return result
-def _stopwords_from_config(config: dict[str, Any], languages: list[str]) -> dict[str, set[str]]:
-    """Build per-language stopword sets.
-    List-valued entries of the config's "stopwords" mapping are normalized and collected
-    per language; every language in `languages` additionally receives the built-in
-    STOPWORDS.
-    """
-    per_language: dict[str, set[str]] = {}
-    configured = _config_get(config, ("stopwords",), None)
-    if isinstance(configured, dict):
-        for lang, words in configured.items():
-            if not isinstance(words, list):
-                continue
-            normalized_words = (_normalize_text(word) for word in words)
-            per_language.setdefault(str(lang), set()).update(word for word in normalized_words if word)
-    for lang in languages:
-        per_language.setdefault(lang, set()).update(STOPWORDS)
-    return per_language
-def _build_space(paths: ArtifactPaths, progress: gr.Progress | None = None) -> SpaceData:
-    """Load one downloaded artifact into memory and build the per-language search structures.
-    Reads config and metadata, loads the FAISS index, recovers the vectors (from the index,
-    or from the .vec file if reconstruction fails), L2-normalizes them, and builds one
-    inner-product index per language plus text lookup tables.
-    Raises ArtifactError when faiss is unavailable or the files are inconsistent with
-    each other.
-    """
-    if faiss is None:
-        raise ArtifactError(f"faiss-cpu could not be imported: {FAISS_IMPORT_ERROR}")
-    _progress(progress, 0.34, "Loading config and metadata")
-    config = _load_json(paths.config_path)
-    metadata_rows = _load_metadata(paths.metadata_path)
-    metadata_rows.sort(key=lambda row: int(row["id"]))
-    id_to_meta = {int(row["id"]): row for row in metadata_rows}
-    # The dict collapses repeated ids, so a size difference means duplicates.
-    if len(id_to_meta) != len(metadata_rows):
-        raise ArtifactError("Metadata contains duplicate vector ids")
-    expected_ids = sorted(id_to_meta)
-    expected_count = len(expected_ids)
-    _progress(progress, 0.45, "Loading FAISS index")
-    index = faiss.read_index(str(paths.faiss_path))
-    if int(index.ntotal) != expected_count:
-        raise ArtifactError(
-            f"FAISS index has {int(index.ntotal):,} vectors but metadata has {expected_count:,} rows"
-        )
-    _progress(progress, 0.56, "Reconstructing aligned vectors from FAISS")
-    vector_source = "faiss"
-    try:
-        vectors = _reconstruct_vectors(index)
-    except ArtifactError:
-        _progress(progress, 0.56, "FAISS reconstruction failed; downloading aligned_all.vec fallback")
-        vector_source = "aligned_all.vec"
-        vectors = _load_vec_fallback(paths, expected_count)
-    if vectors.shape[0] != expected_count:
-        raise ArtifactError(f"Vector matrix has {vectors.shape[0]:,} rows but metadata has {expected_count:,}")
-    # Ids are unique and as many as there are rows, so "starts at 0 and stays below the
-    # row count" implies they are exactly 0..n-1 and can be used as row indices.
-    if expected_ids[0] != 0 or expected_ids[-1] >= vectors.shape[0]:
-        raise ArtifactError("Metadata ids must be contiguous FAISS vector ids starting at 0")
-    _progress(progress, 0.70, "Normalizing vectors and building language indexes")
-    vectors = _l2_normalize(vectors)
-    languages = _language_order(metadata_rows, config)
-    lang_to_ids: dict[str, np.ndarray] = {}
-    lang_to_matrix: dict[str, np.ndarray] = {}
-    lang_to_index: dict[str, Any] = {}
-    id_to_lang_local: dict[int, tuple[str, int]] = {}
-    for lang in languages:
-        ids = np.asarray([int(row["id"]) for row in metadata_rows if row["lang"] == lang], dtype=np.int64)
-        # A language named in the config may have no vectors in this artifact.
-        if ids.size == 0:
-            continue
-        lang_matrix = np.ascontiguousarray(vectors[ids], dtype=np.float32)
-        lang_index = faiss.IndexFlatIP(lang_matrix.shape[1])
-        lang_index.add(lang_matrix)
-        lang_to_ids[lang] = ids
-        lang_to_matrix[lang] = lang_matrix
-        lang_to_index[lang] = lang_index
-        for local_idx, vector_id in enumerate(ids.tolist()):
-            id_to_lang_local[int(vector_id)] = (lang, local_idx)
-    # Keep only languages that actually received an index.
-    languages = [lang for lang in languages if lang in lang_to_ids]
-    lookup = _build_lookup(metadata_rows)
-    fuzzy_choices = {
-        lang: sorted(set(lookup.get(lang, {}).get("token", {})) | set(lookup.get(lang, {}).get("surface", {})))
-        for lang in languages
-    }
-    vocab_sizes = {lang: int(lang_to_ids[lang].size) for lang in languages}
-    stopwords = _stopwords_from_config(config, languages)
-    vector_dim = int(next(iter(lang_to_matrix.values())).shape[1])
-    # The per-language matrices hold their own copies; drop the full matrix reference.
-    del vectors
-    _progress(progress, 0.92, "Ready")
-    return SpaceData(
-        artifact_uri=paths.s3_uri,
-        local_dir=paths.local_dir,
-        config=config,
-        id_to_meta=id_to_meta,
-        languages=languages,
-        lang_to_ids=lang_to_ids,
-        lang_to_matrix=lang_to_matrix,
-        lang_to_index=lang_to_index,
-        id_to_lang_local=id_to_lang_local,
-        lookup=lookup,
-        fuzzy_choices=fuzzy_choices,
-        vector_dim=vector_dim,
-        vector_source=vector_source,
-        vocab_sizes=vocab_sizes,
-        stopwords=stopwords,
-        csls_avg_cache={},
-    )
-def get_space(artifact_uri: str | None = None, progress: gr.Progress | None = None) -> SpaceData:
-    """Return the loaded SpaceData for an artifact, building and caching it on first use.
-    The artifact is the explicit `artifact_uri`, else the environment override, else the
-    newest folder on S3. Results are cached in _SPACE_CACHE by normalized URI.
-    """
-    artifact_uri = _normalize_artifact_uri(artifact_uri or os.getenv(ARTIFACT_ENV_VAR, "").strip())
-    if not artifact_uri:
-        artifact_uri = _find_newest_artifact_uri(progress)
-    artifact_uri = _normalize_artifact_uri(artifact_uri)
-    # Cache hit: return without taking the lock.
-    if artifact_uri in _SPACE_CACHE:
-        return _SPACE_CACHE[artifact_uri]
-    with _LOAD_LOCK:
-        # Check again: another caller may have finished loading while this one waited.
-        if artifact_uri in _SPACE_CACHE:
-            return _SPACE_CACHE[artifact_uri]
-        _progress(progress, 0.01, "Preparing multilingual embedding artifacts")
-        paths = _prepare_artifacts(artifact_uri, progress)
-        _SPACE_CACHE[artifact_uri] = _build_space(paths, progress)
-        return _SPACE_CACHE[artifact_uri]
-def _meta_display(meta: dict[str, Any]) -> dict[str, Any]:
-    """Reduce a metadata row to the displayed fields: int id (-1 when absent) plus string fields."""
-    shown: dict[str, Any] = {"id": int(meta.get("id", -1))}
-    for field in ("lang", "token", "surface", "source_vec_file"):
-        shown[field] = _display_value(meta.get(field))
-    return shown
def _candidate_dataframe(space: SpaceData, ids: list[int], match_type: str) -> pd.DataFrame:
    """Build a table describing at most the first 25 candidate ids.

    Each row is the display form of the candidate's metadata plus a
    ``match_type`` column carrying the label supplied by the caller.
    """
    column_order = ["match_type", "id", "lang", "token", "surface", "source_vec_file"]
    records = [
        {**_meta_display(space.id_to_meta[int(vector_id)]), "match_type": match_type}
        for vector_id in ids[:25]
    ]
    return pd.DataFrame(records, columns=column_order)
def _resolve_query(
    space: SpaceData,
    word: str,
    source_lang: str,
    use_surface_forms: bool,
    fuzzy_fallback: bool,
) -> tuple[int | None, pd.DataFrame, pd.DataFrame, str]:
    """Map a typed query word to a vector id in ``source_lang``.

    The normalized word is looked up in the surface table first (only when
    ``use_surface_forms`` is set) and then in the token table. The first id of
    the first non-empty hit list is chosen.

    Returns:
        ``(chosen_id, exact_candidates, fuzzy_suggestions, message)``.
        ``chosen_id`` is ``None`` when nothing matched; the suggestions table
        is only populated in that case and only when ``fuzzy_fallback`` is set.
    """
    query = _normalize_text(word)
    if not query:
        return None, pd.DataFrame(), pd.DataFrame(), "Enter a query word."
    if source_lang not in space.languages:
        return None, pd.DataFrame(), pd.DataFrame(), f"Source language '{source_lang}' is not available."

    per_language = space.lookup.get(source_lang, {})
    lookup_order = ("surface", "token") if use_surface_forms else ("token",)
    matched_ids: list[int] = []
    matched_field = ""
    for field in lookup_order:
        matched_field = field
        matched_ids = list(per_language.get(field, {}).get(query, []))
        if matched_ids:
            break

    if not matched_ids:
        if not fuzzy_fallback:
            return (
                None,
                pd.DataFrame(),
                pd.DataFrame(),
                "No exact match found. Enable fuzzy fallback to see suggestions.",
            )
        suggestions = _fuzzy_suggestions(space, query, source_lang)
        if suggestions.empty:
            message = "No exact match found, and no fuzzy suggestions were available."
        else:
            message = "No exact match found. Pick or type one of the fuzzy suggestions."
        return None, pd.DataFrame(), suggestions, message

    candidates = _candidate_dataframe(space, matched_ids, f"exact_{matched_field}")
    chosen_id = int(matched_ids[0])
    chosen_meta = space.id_to_meta[chosen_id]
    # Show the matched field, falling back to the token when that field is absent.
    shown = _display_value(chosen_meta.get(matched_field, chosen_meta.get("token")))
    if len(matched_ids) > 1:
        message = (
            f"Using exact {matched_field} match `{shown}` "
            f"(id {chosen_id}); {len(matched_ids)} exact candidates found."
        )
    else:
        message = f"Using exact {matched_field} match `{shown}`."
    return chosen_id, candidates, pd.DataFrame(), message
def _fuzzy_suggestions(space: SpaceData, normalized_word: str, lang: str, limit: int = 10) -> pd.DataFrame:
    """Return fuzzy-match suggestions for a word that had no exact match.

    Args:
        space: Loaded space providing ``fuzzy_choices``, ``lookup`` and
            ``id_to_meta``.
        normalized_word: Already-normalized query string.
        lang: Language whose choices are searched.
        limit: Maximum number of suggestions requested from the matcher.

    Returns:
        A table with columns ``suggestion``, ``score``, ``token``, ``surface``
        and ``id``. It is an empty frame when the language has no choices, and
        a single explanatory row when the fuzzy matcher is unavailable.
    """
    if process is None or fuzz is None:
        return pd.DataFrame([{"suggestion": "rapidfuzz is not installed", "score": "", "token": "", "surface": "", "id": ""}])
    choices = space.fuzzy_choices.get(lang, [])
    if not choices:
        return pd.DataFrame()

    # The per-language tables do not change inside the loop, so fetch them once.
    lang_lookup = space.lookup.get(lang, {})
    surface_lookup = lang_lookup.get("surface", {})
    token_lookup = lang_lookup.get("token", {})

    rows = []
    for suggestion, score, _ in process.extract(normalized_word, choices, scorer=fuzz.WRatio, limit=limit):
        ids = surface_lookup.get(suggestion) or token_lookup.get(suggestion) or []
        if ids:
            vector_id = ids[0]
            meta = space.id_to_meta[vector_id]
            # Metadata records are not guaranteed to carry an "id" key; fall
            # back to the lookup id instead of raising KeyError.
            shown_id: Any = int(meta.get("id", vector_id))
        else:
            meta = {}
            shown_id = ""
        rows.append(
            {
                "suggestion": suggestion,
                "score": round(float(score), 2),
                "token": _display_value(meta.get("token")),
                "surface": _display_value(meta.get("surface")),
                "id": shown_id,
            }
        )
    return pd.DataFrame(rows, columns=["suggestion", "score", "token", "surface", "id"])
def _avg_topk(space: SpaceData, vectors: np.ndarray, lang: str, k: int) -> np.ndarray:
    """Average the top-``k`` search scores of each row of ``vectors`` in ``lang``.

    ``k`` is clamped to ``[1, index.ntotal]``. When the language has no index,
    or the index is empty, a zero vector with one entry per input row is
    returned instead.
    """
    index = space.lang_to_index.get(lang)
    index_size = 0 if index is None else int(index.ntotal)
    if index_size == 0:
        return np.zeros((vectors.shape[0],), dtype=np.float32)

    neighbours = max(1, min(int(k), index_size))
    queries = np.ascontiguousarray(vectors, dtype=np.float32)
    scores, _ = index.search(queries, neighbours)
    return scores.mean(axis=1).astype(np.float32)
def _avg_topk_for_ids(space: SpaceData, vector_ids: list[int], search_lang: str, k: int) -> np.ndarray:
    """Return the average top-``k`` score in ``search_lang`` for each vector id.

    Values are memoized in ``space.csls_avg_cache`` under the key
    ``(vector_id, search_lang, k)``; only ids missing from the cache are
    searched, in a single batched call to ``_avg_topk``.
    """
    k = int(k)
    cache = space.csls_avg_cache
    averages = np.empty((len(vector_ids),), dtype=np.float32)

    # (output position, vector id) for every id without a cached value.
    pending: list[tuple[int, int]] = []
    for position, raw_id in enumerate(vector_ids):
        hit = cache.get((int(raw_id), search_lang, k))
        if hit is None:
            pending.append((position, int(raw_id)))
        else:
            averages[position] = hit

    if not pending:
        return averages

    stacked = np.vstack([space.vector_for_id(vid) for _, vid in pending]).astype(np.float32, copy=False)
    fresh = _avg_topk(space, stacked, search_lang, k)
    for (position, vid), value in zip(pending, fresh):
        as_float = float(value)
        cache[(vid, search_lang, k)] = as_float
        averages[position] = as_float
    return averages
def _raw_candidates(
    space: SpaceData,
    query_vector: np.ndarray,
    source_lang: str,
    target_lang: str,
    retrieval_k: int,
    csls_k: int,
    score_method: str,
    query_id: int | None = None,
) -> list[dict[str, Any]]:
    """Retrieve and score nearest candidates for one query vector.

    Args:
        space: Loaded space providing ``lang_to_index`` and ``lang_to_ids``.
        query_vector: Query embedding; it is reshaped to a single row.
        source_lang: Language of the query, used for the candidate-side CSLS
            term.
        target_lang: Language whose index is searched.
        retrieval_k: Number of neighbours requested, clamped to
            ``[1, index.ntotal]``.
        csls_k: Neighbourhood size for the CSLS averages.
        score_method: ``"csls"`` (case-insensitive) selects CSLS scoring; any
            other value uses the search score unchanged.
        query_id: Vector id of the query. When given, the query-side CSLS
            average goes through the per-id cache.

    Returns:
        Dicts with keys ``id``, ``local_idx``, ``score`` and ``cosine``,
        sorted by descending ``score``. Empty when the target language has no
        index or the search yields no valid hit.
    """
    if target_lang not in space.lang_to_index:
        return []
    index = space.lang_to_index[target_lang]
    retrieval_k = max(1, min(int(retrieval_k), int(index.ntotal)))
    query_matrix = np.ascontiguousarray(query_vector.reshape(1, -1), dtype=np.float32)
    distances, local_indices = index.search(query_matrix, retrieval_k)
    local_indices = local_indices[0]
    # The search scores are treated as cosine similarities by this module.
    cosines = distances[0].astype(np.float32)

    # Negative indices mark slots the search could not fill; drop them.
    valid = local_indices >= 0
    if not np.any(valid):
        return []
    local_indices = local_indices[valid].astype(np.int64)
    cosines = cosines[valid]
    global_ids = space.lang_to_ids[target_lang][local_indices]

    if score_method.casefold() == "csls":
        if query_id is None:
            r_q = float(_avg_topk(space, query_matrix, target_lang, csls_k)[0])
        else:
            r_q = float(_avg_topk_for_ids(space, [int(query_id)], target_lang, csls_k)[0])
        r_x = _avg_topk_for_ids(space, global_ids.astype(int).tolist(), source_lang, csls_k)
        # CSLS: twice the similarity minus both neighbourhood averages.
        scores = (2.0 * cosines) - r_q - r_x
    else:
        scores = cosines

    rows = [
        {
            "id": int(vector_id),
            "local_idx": int(local_idx),
            "score": float(score),
            "cosine": float(cosine),
        }
        for local_idx, vector_id, score, cosine in zip(local_indices, global_ids, scores, cosines)
    ]
    rows.sort(key=lambda row: row["score"], reverse=True)
    return rows
def _is_filtered_word(space: SpaceData, meta: dict[str, Any], hide_stopwords: bool, min_token_length: int) -> bool:
    """Tell whether a candidate should be hidden from the results.

    A candidate is hidden when its surface form (or token, if there is no
    surface) is shorter than ``min_token_length`` once spaces are removed, or
    when ``hide_stopwords`` is set and any of its forms is a stopword for its
    language. Languages without their own list fall back to ``STOPWORDS``.
    """
    token = _normalize_text(meta.get("token", ""))
    surface = _normalize_text(meta.get("surface", ""))
    candidate = surface or token

    if min_token_length:
        visible_length = len(candidate.replace(" ", ""))
        if visible_length < int(min_token_length):
            return True

    lang_stopwords = space.stopwords.get(str(meta.get("lang")), STOPWORDS)
    return bool(hide_stopwords and any(form in lang_stopwords for form in (token, surface, candidate)))
def _reverse_contains_source(
    space: SpaceData,
    target_id: int,
    source_id: int,
    source_lang: str,
    target_lang: str,
    top_k: int,
    candidate_multiplier: int,
    prefetch_k: int,
    min_score: float,
    csls_k: int,
    score_method: str,
) -> bool:
    """Check that translating a target candidate back reaches the source word.

    The target vector is searched in ``source_lang`` with a budget of
    ``max(1, top_k * candidate_multiplier)`` neighbours. The check passes when
    a hit scoring at least ``min_score`` is the source id itself, or shares
    the source's normalized token or surface form.

    Args:
        space: Loaded space.
        target_id: Vector id of the candidate translation.
        source_id: Vector id of the original query word.
        source_lang: Language of the original query word.
        target_lang: Language of the candidate translation.
        top_k: Requested number of translations.
        candidate_multiplier: Factor applied to ``top_k`` for the budget.
        prefetch_k: Accepted for signature parity with the forward search;
            not used by the reverse check.
        min_score: Minimum score a reverse hit must reach.
        csls_k: Neighbourhood size for CSLS scoring.
        score_method: Scoring method forwarded to ``_raw_candidates``.

    Returns:
        ``True`` when the reverse search contains the source word.
    """
    target_vector = space.vector_for_id(target_id)
    reverse_budget = max(1, int(top_k) * int(candidate_multiplier))
    score_floor = float(min_score)

    # Roles are swapped on purpose: the target language is now the query side.
    reverse_rows = _raw_candidates(
        space=space,
        query_vector=target_vector,
        source_lang=target_lang,
        target_lang=source_lang,
        retrieval_k=reverse_budget,
        csls_k=csls_k,
        score_method=score_method,
        query_id=target_id,
    )
    reverse_rows = [row for row in reverse_rows if row["score"] >= score_floor][:reverse_budget]

    wanted_id = int(source_id)
    if any(row["id"] == wanted_id for row in reverse_rows):
        return True

    source_meta = space.id_to_meta[wanted_id]
    source_token = _normalize_text(source_meta.get("token"))
    source_surface = _normalize_text(source_meta.get("surface"))
    for row in reverse_rows:
        meta = space.id_to_meta[row["id"]]
        # An empty source form must never count as a match; otherwise any
        # candidate that also lacks that form would pass the check.
        if source_token and _normalize_text(meta.get("token")) == source_token:
            return True
        if source_surface and _normalize_text(meta.get("surface")) == source_surface:
            return True
    return False
def _translate_one_target(
    space: SpaceData,
    source_id: int,
    target_lang: str,
    top_k: int,
    min_score: float,
    csls_k: int,
    candidate_multiplier: int,
    prefetch_k: int,
    score_method: str,
    bidirectional_consistency: bool,
    hide_stopwords: bool,
    min_token_length: int,
) -> list[dict[str, Any]]:
    """Produce up to ``top_k`` translation rows for one target language.

    ``max(prefetch_k, top_k * candidate_multiplier)`` neighbours are
    retrieved, but only the first ``max(top_k, top_k * candidate_multiplier)``
    ranked rows are examined. A row is kept when it reaches ``min_score``, is
    not removed by the word filter and, if requested, passes the reverse
    consistency check. ``rank`` numbers the kept rows starting at 1.
    """
    source_meta = space.id_to_meta[int(source_id)]
    source_lang = source_meta["lang"]
    wanted = int(top_k)
    scaled = wanted * int(candidate_multiplier)
    score_floor = float(min_score)

    ranked = _raw_candidates(
        space=space,
        query_vector=space.vector_for_id(source_id),
        source_lang=source_lang,
        target_lang=target_lang,
        retrieval_k=max(int(prefetch_k), scaled),
        csls_k=csls_k,
        score_method=score_method,
        query_id=source_id,
    )

    accepted: list[dict[str, Any]] = []
    for candidate in ranked[: max(wanted, scaled)]:
        if candidate["score"] < score_floor:
            continue
        target_meta = space.id_to_meta[candidate["id"]]
        if _is_filtered_word(space, target_meta, hide_stopwords, min_token_length):
            continue

        consistency: Any = "not_checked"
        if bidirectional_consistency:
            round_trip_ok = _reverse_contains_source(
                space=space,
                target_id=candidate["id"],
                source_id=source_id,
                source_lang=source_lang,
                target_lang=target_lang,
                top_k=top_k,
                candidate_multiplier=candidate_multiplier,
                prefetch_k=prefetch_k,
                min_score=min_score,
                csls_k=csls_k,
                score_method=score_method,
            )
            if not round_trip_ok:
                continue
            consistency = True

        accepted.append(
            {
                "source word": _display_value(source_meta.get("surface") or source_meta.get("token")),
                "source language": source_lang,
                "target language": target_lang,
                "translated token": _display_value(target_meta.get("token")),
                "translated surface": _display_value(target_meta.get("surface")),
                "score": round(float(candidate["score"]), 6),
                "cosine": round(float(candidate["cosine"]), 6),
                "rank": len(accepted) + 1,
                "bidirectional_passed": consistency,
                "target source_vec_file": _display_value(target_meta.get("source_vec_file")),
            }
        )
        if len(accepted) >= wanted:
            break
    return accepted
def translate(
    artifact_uri: str,
    word: str,
    source_lang: str,
    target_langs: list[str] | None,
    top_k: int,
    min_score: float,
    csls_k: int,
    candidate_multiplier: int,
    prefetch_k: int,
    score_method: str,
    bidirectional_consistency: bool,
    use_surface_forms: bool,
    hide_stopwords: bool,
    min_token_length: int,
    fuzzy_fallback: bool,
    progress: gr.Progress = gr.Progress(),
):
    """Translate one query word into the selected target languages.

    Returns a 5-tuple: the translation table, a grouped markdown summary, the
    exact-match candidates, the fuzzy suggestions and a status message. Any
    exception is reported through the status message instead of propagating.
    When no valid target language is selected, every language other than the
    source is used.
    """
    try:
        space = get_space(artifact_uri, progress)
        source_id, candidates, suggestions, message = _resolve_query(
            space, word, source_lang, use_surface_forms, fuzzy_fallback
        )
        if source_id is None:
            return (
                pd.DataFrame(columns=_translation_columns()),
                "No translation run because the query word was not found.",
                candidates,
                suggestions,
                message,
            )

        known_languages = space.languages
        selected_targets = [lang for lang in (target_langs or []) if lang in known_languages]
        if not selected_targets:
            selected_targets = [lang for lang in known_languages if lang != source_lang]

        rows: list[dict[str, Any]] = [
            row
            for target_lang in selected_targets
            for row in _translate_one_target(
                space=space,
                source_id=source_id,
                target_lang=target_lang,
                top_k=top_k,
                min_score=min_score,
                csls_k=csls_k,
                candidate_multiplier=candidate_multiplier,
                prefetch_k=prefetch_k,
                score_method=score_method,
                bidirectional_consistency=bidirectional_consistency,
                hide_stopwords=hide_stopwords,
                min_token_length=min_token_length,
            )
        ]
        table = pd.DataFrame(rows, columns=_translation_columns())
        return table, _group_translation_markdown(table), candidates, suggestions, message
    except Exception as exc:
        # UI boundary: surface the failure in the message slot.
        return (
            pd.DataFrame(columns=_translation_columns()),
            "Translation failed.",
            pd.DataFrame(),
            pd.DataFrame(),
            f"Error: {exc}",
        )
def _translation_columns() -> list[str]:
    """Return the column order of the translation table as a fresh list."""
    ordered = (
        "source word",
        "source language",
        "target language",
        "translated token",
        "translated surface",
        "score",
        "cosine",
        "rank",
        "bidirectional_passed",
        "target source_vec_file",
    )
    return list(ordered)
def _group_translation_markdown(table: pd.DataFrame) -> str:
    """Render the translation table as one markdown line per target language.

    Languages appear in the order they first occur in ``table``. Each entry
    shows the rank, the surface form (or the token when the surface is falsy)
    and the score with three decimals.
    """
    if table.empty:
        return "No candidates passed the current filters."

    def _entry(row) -> str:
        label = row["translated surface"] or row["translated token"]
        return f"{row['rank']}. `{label}` ({row['score']:.3f})"

    sections = [
        f"**{lang}**: " + "  |  ".join(_entry(row) for _, row in group.iterrows())
        for lang, group in table.groupby("target language", sort=False)
    ]
    return "\n\n".join(sections)
def nearest_neighbors(
    artifact_uri: str,
    word: str,
    language: str,
    neighbor_mode: str,
    selected_languages: list[str] | None,
    top_n: int,
    score_method: str,
    min_score: float,
    include_same_language: bool,
    use_surface_forms: bool,
    fuzzy_fallback: bool,
    progress: gr.Progress = gr.Progress(),
):
    """List the nearest words to a query word across the chosen languages.

    Returns a 3-tuple: the neighbour table, a table of source matches (or
    fuzzy suggestions when the word was not found) and a status message.
    Hits are pooled over all searched languages, sorted by score, truncated
    to ``top_n`` and ranked from 1. Exceptions are reported through the
    status message.
    """
    columns = ["language", "token", "surface", "score", "cosine", "rank", "id", "source_vec_file"]
    try:
        space = get_space(artifact_uri, progress)
        runtime_defaults = _defaults_from_config(space.config)
        source_id, candidates, suggestions, message = _resolve_query(space, word, language, use_surface_forms, fuzzy_fallback)
        if source_id is None:
            hint = candidates if suggestions.empty else suggestions
            return pd.DataFrame(columns=columns), hint, message

        same_language_only = neighbor_mode == "same language"
        if same_language_only:
            target_languages = [language]
        elif neighbor_mode == "selected languages":
            target_languages = [lang for lang in (selected_languages or []) if lang in space.languages]
        else:
            target_languages = list(space.languages)
        # The query language is dropped unless explicitly kept or it is the
        # only language being searched.
        if not (include_same_language or same_language_only):
            target_languages = [lang for lang in target_languages if lang != language]
        if not target_languages:
            return pd.DataFrame(columns=columns), pd.DataFrame(), "No neighbor languages selected."

        query_vector = space.vector_for_id(source_id)
        retrieval_k = max(50, int(top_n) * 3)
        query_id = int(source_id)
        score_floor = float(min_score)

        pooled = []
        for target_lang in target_languages:
            hits = _raw_candidates(
                space=space,
                query_vector=query_vector,
                source_lang=language,
                target_lang=target_lang,
                retrieval_k=retrieval_k,
                csls_k=runtime_defaults["csls_k"],
                score_method=score_method,
                query_id=source_id,
            )
            for hit in hits:
                # Skip the query word itself and anything below the threshold.
                if hit["id"] == query_id or hit["score"] < score_floor:
                    continue
                meta = space.id_to_meta[hit["id"]]
                pooled.append(
                    {
                        "language": target_lang,
                        "token": _display_value(meta.get("token")),
                        "surface": _display_value(meta.get("surface")),
                        "score": round(float(hit["score"]), 6),
                        "cosine": round(float(hit["cosine"]), 6),
                        "rank": 0,
                        "id": int(hit["id"]),
                        "source_vec_file": _display_value(meta.get("source_vec_file")),
                    }
                )

        best = sorted(pooled, key=lambda row: row["score"], reverse=True)[: int(top_n)]
        for position, row in enumerate(best, start=1):
            row["rank"] = position
        return pd.DataFrame(best, columns=columns), candidates, message
    except Exception as exc:
        return pd.DataFrame(columns=columns), pd.DataFrame(), f"Error: {exc}"
def browse_vocab(artifact_uri: str, language: str, filter_text: str, limit: int, progress: gr.Progress = gr.Progress()):
    """Return the filtered vocabulary table twice (display value and state).

    On failure the table holds a single row whose ``token`` carries the error
    text.
    """
    try:
        space = get_space(artifact_uri, progress)
        listing = pd.DataFrame(
            _browse_rows(space, language, filter_text, int(limit), randomize=False),
            columns=_browse_columns(),
        )
    except Exception as exc:
        listing = pd.DataFrame([{"token": f"Error: {exc}", "surface": "", "id": "", "source_vec_file": ""}])
    return listing, listing
def random_vocab(artifact_uri: str, language: str, limit: int, progress: gr.Progress = gr.Progress()):
    """Return a random vocabulary sample twice (display value and state).

    On failure the table holds a single row whose ``token`` carries the error
    text.
    """
    try:
        space = get_space(artifact_uri, progress)
        sample = pd.DataFrame(
            _browse_rows(space, language, "", int(limit), randomize=True),
            columns=_browse_columns(),
        )
    except Exception as exc:
        sample = pd.DataFrame([{"token": f"Error: {exc}", "surface": "", "id": "", "source_vec_file": ""}])
    return sample, sample
def _browse_columns() -> list[str]:
    """Return the column order of the vocabulary browser as a fresh list."""
    ordered = ("token", "surface", "id", "source_vec_file")
    return list(ordered)
def _browse_rows(space: SpaceData, language: str, filter_text: str, limit: int, randomize: bool) -> list[dict[str, Any]]:
    """Collect up to ``limit`` vocabulary rows for one language.

    Args:
        space: Loaded space providing ``lang_to_ids`` and ``id_to_meta``.
        language: Language to browse; unknown languages yield no rows.
        filter_text: Substring that must occur in the normalized token or
            surface form; an empty filter keeps everything.
        limit: Maximum number of rows. Non-positive values yield no rows.
        randomize: When set, draw a random sample of ``limit`` ids instead of
            walking the vocabulary in stored order.

    Returns:
        Row dicts with keys ``token``, ``surface``, ``id`` and
        ``source_vec_file``.
    """
    # A non-positive limit used to return one row (the limit check ran after
    # the first append) or crash in random.sample; treat it as "nothing".
    if language not in space.lang_to_ids or limit <= 0:
        return []

    ids = space.lang_to_ids[language].tolist()
    normalized_filter = _normalize_text(filter_text)
    if randomize and len(ids) > limit:
        ids = random.sample(ids, limit)

    rows: list[dict[str, Any]] = []
    for vector_id in ids:
        meta = space.id_to_meta[int(vector_id)]
        token = _display_value(meta.get("token"))
        surface = _display_value(meta.get("surface"))
        if normalized_filter:
            haystack = f"{_normalize_text(token)} {_normalize_text(surface)}"
            if normalized_filter not in haystack:
                continue
        rows.append(
            {
                "token": token,
                "surface": surface,
                "id": int(vector_id),
                "source_vec_file": _display_value(meta.get("source_vec_file")),
            }
        )
        if len(rows) >= limit:
            break
    return rows
def use_selected_vocab(table_data: Any, browse_language: str, evt: gr.SelectData):
    """Turn a clicked vocabulary row into a query word and language update.

    Returns the row's surface form (or token when the surface is falsy) as a
    string, plus an update setting the language to ``browse_language``. If
    anything goes wrong, two no-op updates are returned.
    """
    try:
        frame = (
            table_data
            if isinstance(table_data, pd.DataFrame)
            else pd.DataFrame(table_data, columns=_browse_columns())
        )
        position = evt.index
        # The event index may be a (row, column) pair; only the row is needed.
        if isinstance(position, (list, tuple)):
            position = position[0]
        selected = frame.iloc[int(position)]
        word = selected.get("surface") or selected.get("token") or ""
        return str(word), gr.update(value=browse_language)
    except Exception:
        return gr.update(), gr.update()
def _config_get(config: dict[str, Any], keys: tuple[str, ...], default: Any = "") -> Any:
    """Delegate to ``_config_get_raw`` with an empty-string default."""
    value = _config_get_raw(config, keys, default)
    return value
def _format_config_value(value: Any) -> str:
    """Render a config value as text.

    ``None`` becomes an empty string, dicts and lists become key-sorted JSON
    with non-ASCII characters kept as-is, and anything else goes through
    ``str``.
    """
    if value is None:
        return ""
    if not isinstance(value, (dict, list)):
        return str(value)
    return json.dumps(value, ensure_ascii=False, sort_keys=True)
def artifact_info_markdown(space: SpaceData) -> str:
    """Render the artifact's metadata as a two-column markdown table.

    Values come from the runtime defaults derived from ``space.config``, from
    direct config lookups, and from attributes of ``space`` itself.
    """
    defaults = _defaults_from_config(space.config)
    config = space.config
    # Fallback shown when the config does not carry its own value.
    derived_retrieval_k = int(defaults["top_k"]) * int(defaults["candidate_retrieval_k_multiplier"])

    fields = [
        ("artifact S3 URI", space.artifact_uri),
        ("created_at", _config_get(config, ("created_at", "created", "timestamp"), "")),
        ("languages", ", ".join(space.languages)),
        ("pivot_lang", _config_get(config, ("pivot_lang", "pivot_language"), defaults["pivot_lang"])),
        ("vector_dim", _config_get(config, ("vector_dim", "dim", "dimension"), space.vector_dim)),
        ("vocab sizes", space.vocab_sizes),
        ("top_n_vocab", defaults["top_n_vocab"]),
        ("out_top", defaults["out_top"]),
        ("top_k", defaults["top_k"]),
        ("min_score", defaults["min_score"]),
        ("csls_k", defaults["csls_k"]),
        ("candidate_retrieval_k", _config_get(config, ("candidate_retrieval_k",), derived_retrieval_k)),
        ("candidate multiplier", defaults["candidate_retrieval_k_multiplier"]),
        ("csls_prefetch_k", defaults["csls_prefetch_k"]),
        ("align_iters", defaults["align_iters"]),
        ("init_pairs", defaults["init_pairs"]),
        ("max_pairs", defaults["max_pairs"]),
        ("bidirectional consistency", defaults["bidirectional_consistency"]),
        ("surface forms enabled", defaults["use_surface_forms"]),
        ("hide stopwords default", defaults["hide_stopwords"]),
        ("min token length default", defaults["min_token_length"]),
        ("vector preprocessing", _config_get(config, ("vector_preprocessing", "preprocessing"), "")),
        ("source vec files", _config_get(config, ("source_vec_files", "vec_files"), "")),
        ("surface files", _config_get(config, ("surface_files",), "")),
        ("local cache", str(space.local_dir)),
        ("vector source", space.vector_source),
    ]

    header = ["| Field | Value |", "| --- | --- |"]
    body = [f"| {field} | {_format_config_value(value)} |" for field, value in fields]
    return "\n".join(header + body)
def _empty_artifact_updates(message: str):
    """Build the 18-slot output tuple used when no artifact could be loaded.

    The layout mirrors ``load_selected_artifact``: the message, five cleared
    selector updates, the message again, then eleven no-op updates.
    """
    def _cleared_single():
        return gr.update(choices=[], value=None)

    def _cleared_multi():
        return gr.update(choices=[], value=[])

    selectors = (
        _cleared_single(),
        _cleared_multi(),
        _cleared_single(),
        _cleared_multi(),
        _cleared_single(),
    )
    untouched = tuple(gr.update() for _ in range(11))
    return (message, *selectors, message, *untouched)
def load_selected_artifact(artifact_uri: str, progress: gr.Progress = gr.Progress()):
    """Load an artifact and return the 18 UI updates that reflect it.

    The source language defaults to the configured pivot when it is one of
    the artifact's languages, otherwise to the first language. Target
    languages default to every other language, or to the source language
    alone when there is no other. Failures produce the empty update tuple
    with a ``Startup error`` message.
    """
    try:
        space = get_space(artifact_uri, progress)
        defaults = _defaults_from_config(space.config)
        languages = space.languages

        pivot = str(defaults["pivot_lang"])
        source_default = pivot if pivot in languages else languages[0]
        targets_default = [lang for lang in languages if lang != source_default] or [source_default]

        status = (
            f"Loaded {sum(space.vocab_sizes.values()):,} vectors across {len(languages)} languages "
            f"from `{space.artifact_uri}`."
        )

        def _single():
            return gr.update(choices=languages, value=source_default)

        def _multi():
            return gr.update(choices=languages, value=targets_default)

        def _value(setting):
            return gr.update(value=setting)

        return (
            status,
            _single(),
            _multi(),
            _single(),
            _multi(),
            _single(),
            artifact_info_markdown(space),
            _value(defaults["top_k"]),
            _value(defaults["min_score"]),
            _value(defaults["csls_k"]),
            # Heavy retrieval settings are capped at the interactive defaults.
            _value(min(defaults["candidate_retrieval_k_multiplier"], INTERACTIVE_DEFAULTS["candidate_retrieval_k_multiplier"])),
            _value(min(defaults["csls_prefetch_k"], INTERACTIVE_DEFAULTS["csls_prefetch_k"])),
            _value(INTERACTIVE_DEFAULTS["bidirectional_consistency"]),
            _value(defaults["use_surface_forms"]),
            _value(defaults["hide_stopwords"]),
            _value(defaults["min_token_length"]),
            _value(defaults["min_score"]),
            _value(defaults["use_surface_forms"]),
        )
    except Exception as exc:
        return _empty_artifact_updates(f"Startup error: {exc}")
def initialize_app(progress: gr.Progress = gr.Progress()):
    """Resolve the available artifacts and load the selected one.

    Returns the outputs of ``load_selected_artifact`` with an update for the
    artifact selector inserted in second position. On failure the selector is
    cleared and made non-interactive, and the remaining slots carry the empty
    updates with a ``Startup error`` message.
    """
    try:
        artifact_uris, selected_uri = _resolve_artifact_options(progress)
        selector_update = gr.update(
            choices=_artifact_dropdown_choices(artifact_uris),
            value=selected_uri,
            interactive=True,
        )
        status, *remaining = load_selected_artifact(selected_uri, progress)
        return (status, selector_update, *remaining)
    except Exception as exc:
        status, *remaining = _empty_artifact_updates(f"Startup error: {exc}")
        return (status, gr.update(choices=[], value=None, interactive=False), *remaining)
def update_default_targets(artifact_uri: str, source_language: str):
    """Reset the target-language selector after the source language changes.

    Every language except the source is selected; if none remains, the source
    language itself is selected. Any failure yields a no-op update.
    """
    try:
        languages = get_space(artifact_uri, None).languages
        others = [lang for lang in languages if lang != source_language]
        if not others:
            others = [source_language]
        return gr.update(choices=languages, value=others)
    except Exception:
        return gr.update()
-CSS = """
-.compact-result table { font-size: 0.92rem; }
-"""
-with gr.Blocks(title="Multilingual Static Word Embeddings") as demo:
-    gr.Markdown("## Multilingual Static Word Embeddings Explorer")
-    load_status = gr.Markdown("Loading artifacts from S3...")
-    artifact_selector = gr.Dropdown(
-        label="Aligned space artifact",
-        choices=[],
-        interactive=True,
-    )
-    with gr.Tabs():
-        with gr.Tab("Translate"):
-            with gr.Row():
-                with gr.Column(scale=1, min_width=280):
-                    query_word = gr.Textbox(label="Query word", placeholder="Enter a word")
-                    source_lang = gr.Dropdown(label="Source language", choices=[], interactive=True)
-                    target_langs = gr.Dropdown(
-                        label="Target languages",
-                        choices=[],
-                        multiselect=True,
-                        interactive=True,
-                    )
-                    translate_button = gr.Button("Translate", variant="primary")
-                    with gr.Accordion("Retrieval and filters", open=False):
-                        top_k = gr.Slider(1, 20, value=DEFAULTS["top_k"], step=1, label="top_k")
-                        min_score = gr.Slider(-2.0, 2.0, value=DEFAULTS["min_score"], step=0.01, label="min_score")
-                        csls_k = gr.Slider(1, 50, value=DEFAULTS["csls_k"], step=1, label="csls_k")
-                        candidate_multiplier = gr.Slider(
-                            1,
-                            10,
-                            value=DEFAULTS["candidate_retrieval_k_multiplier"],
-                            step=1,
-                            label="candidate multiplier",
-                        )
-                        prefetch_k = gr.Slider(
-                            10,
-                            500,
-                            value=INTERACTIVE_DEFAULTS["csls_prefetch_k"],
-                            step=10,
-                            label="FAISS prefetch",
-                        )
-                        score_method = gr.Radio(["cosine", "CSLS"], value="cosine", label="score method")
-                        bidirectional = gr.Checkbox(
-                            value=INTERACTIVE_DEFAULTS["bidirectional_consistency"],
-                            label="bidirectional consistency",
-                        )
-                        use_surface_forms = gr.Checkbox(value=DEFAULTS["use_surface_forms"], label="use surface forms")
-                        hide_stopwords = gr.Checkbox(value=DEFAULTS["hide_stopwords"], label="hide stopwords")
-                        min_token_length = gr.Slider(
-                            1,
-                            20,
-                            value=DEFAULTS["min_token_length"],
-                            step=1,
-                            label="min token length",
-                        )
-                        fuzzy_fallback = gr.Checkbox(value=INTERACTIVE_DEFAULTS["fuzzy_fallback"], label="fuzzy match fallback")
-                with gr.Column(scale=2, min_width=520):
-                    translate_message = gr.Markdown()
-                    grouped_results = gr.Markdown()
-                    translation_table = gr.Dataframe(
-                        label="Translations",
-                        headers=_translation_columns(),
-                        datatype=["str", "str", "str", "str", "str", "number", "number", "number", "str", "str"],
-                        wrap=True,
-                        elem_classes=["compact-result"],
-                    )
-                    with gr.Accordion("Source matches and suggestions", open=False):
-                        match_candidates = gr.Dataframe(label="Exact candidates", wrap=True)
-                        fuzzy_suggestions = gr.Dataframe(label="Fuzzy suggestions", wrap=True)
-        # "Nearest Neighbors" tab: query controls in the narrow left column
-        # (scale=1), result components in the wider right column (scale=2).
-        with gr.Tab("Nearest Neighbors"):
-            with gr.Row():
-                with gr.Column(scale=1, min_width=280):
-                    nn_word = gr.Textbox(label="Word", placeholder="Enter a word")
-                    # Created with no choices; this dropdown is listed among the
-                    # outputs of demo.load and artifact_selector.change.
-                    nn_language = gr.Dropdown(label="Language", choices=[], interactive=True)
-                    neighbor_mode = gr.Radio(
-                        ["same language", "all languages", "selected languages"],
-                        value="all languages",
-                        label="Neighbor languages",
-                    )
-                    # Also created empty and listed as an output of the same
-                    # two load/change events.
-                    nn_selected_languages = gr.Dropdown(
-                        label="Selected languages",
-                        choices=[],
-                        multiselect=True,
-                        interactive=True,
-                    )
-                    nn_top_n = gr.Slider(1, 100, value=20, step=1, label="top_n")
-                    nn_score_method = gr.Radio(["cosine", "CSLS"], value="cosine", label="score method")
-                    nn_min_score = gr.Slider(-2.0, 2.0, value=DEFAULTS["min_score"], step=0.01, label="min score")
-                    nn_include_same = gr.Checkbox(value=False, label="include same language")
-                    nn_surface = gr.Checkbox(value=DEFAULTS["use_surface_forms"], label="use surface forms")
-                    nn_fuzzy = gr.Checkbox(value=INTERACTIVE_DEFAULTS["fuzzy_fallback"], label="fuzzy match fallback")
-                    nn_button = gr.Button("Find Neighbors", variant="primary")
-                with gr.Column(scale=2, min_width=520):
-                    # These three components are the outputs of nn_button.click.
-                    nn_message = gr.Markdown()
-                    nn_table = gr.Dataframe(label="Nearest words", wrap=True)
-                    nn_matches = gr.Dataframe(label="Source match / suggestions", wrap=True)
-        # "Browse Vocabulary" tab: language/filter/limit controls on the left,
-        # the vocabulary table on the right.
-        with gr.Tab("Browse Vocabulary"):
-            with gr.Row():
-                with gr.Column(scale=1, min_width=280):
-                    # Created with no choices; listed among the outputs of
-                    # demo.load and artifact_selector.change.
-                    browse_language = gr.Dropdown(label="Language", choices=[], interactive=True)
-                    browse_filter = gr.Textbox(label="Search/filter", placeholder="token or surface substring")
-                    browse_limit = gr.Slider(10, 1000, value=100, step=10, label="limit")
-                    with gr.Row():
-                        browse_button = gr.Button("Browse", variant="primary")
-                        random_button = gr.Button("Random Sample")
-                with gr.Column(scale=2, min_width=520):
-                    browse_table = gr.Dataframe(label="Vocabulary", wrap=True)
-                    # State seeded with an empty frame whose columns come from
-                    # _browse_columns(); written by the browse/random handlers
-                    # and passed as an input to the browse_table.select handler.
-                    browse_state = gr.State(pd.DataFrame(columns=_browse_columns()))
-        # "Artifact Info" tab: one Markdown panel that starts with placeholder
-        # text; artifact_info is an output of demo.load and
-        # artifact_selector.change.
-        with gr.Tab("Artifact Info"):
-            artifact_info = gr.Markdown("Artifact metadata will appear after loading.")
-    # On page load, run initialize_app (no inputs) and send its return values to
-    # the components below.
-    # NOTE(review): this list must stay in step with initialize_app's return
-    # order, which is defined outside this section - confirm when editing.
-    demo.load(
-        initialize_app,
-        outputs=[
-            load_status,
-            artifact_selector,
-            source_lang,
-            target_langs,
-            nn_language,
-            nn_selected_languages,
-            browse_language,
-            artifact_info,
-            top_k,
-            min_score,
-            csls_k,
-            candidate_multiplier,
-            prefetch_k,
-            bidirectional,
-            use_surface_forms,
-            hide_stopwords,
-            min_token_length,
-            nn_min_score,
-            nn_surface,
-        ],
-    )
-    # When a different artifact is picked, run load_selected_artifact with the
-    # selector's value and send its return values to the components below.
-    # The list matches the demo.load outputs except that artifact_selector
-    # itself is not an output here.
-    # NOTE(review): must stay in step with load_selected_artifact's return
-    # order, which is defined outside this section - confirm when editing.
-    artifact_selector.change(
-        load_selected_artifact,
-        inputs=[artifact_selector],
-        outputs=[
-            load_status,
-            source_lang,
-            target_langs,
-            nn_language,
-            nn_selected_languages,
-            browse_language,
-            artifact_info,
-            top_k,
-            min_score,
-            csls_k,
-            candidate_multiplier,
-            prefetch_k,
-            bidirectional,
-            use_surface_forms,
-            hide_stopwords,
-            min_token_length,
-            nn_min_score,
-            nn_surface,
-        ],
-    )
-    source_lang.change(update_default_targets, inputs=[artifact_selector, source_lang], outputs=[target_langs])
-    # Clicking the translate button calls translate with the fifteen control
-    # values below and fills the five result components.
-    # NOTE(review): input order presumably mirrors translate's positional
-    # parameters (defined outside this section) - confirm when editing.
-    translate_button.click(
-        translate,
-        inputs=[
-            artifact_selector,
-            query_word,
-            source_lang,
-            target_langs,
-            top_k,
-            min_score,
-            csls_k,
-            candidate_multiplier,
-            prefetch_k,
-            score_method,
-            bidirectional,
-            use_surface_forms,
-            hide_stopwords,
-            min_token_length,
-            fuzzy_fallback,
-        ],
-        outputs=[translation_table, grouped_results, match_candidates, fuzzy_suggestions, translate_message],
-    )
-    # Clicking "Find Neighbors" calls nearest_neighbors with the selected
-    # artifact plus the Nearest Neighbors tab controls, and fills the result
-    # table, the match table and the message panel.
-    nn_button.click(
-        nearest_neighbors,
-        inputs=[
-            artifact_selector,
-            nn_word,
-            nn_language,
-            neighbor_mode,
-            nn_selected_languages,
-            nn_top_n,
-            nn_score_method,
-            nn_min_score,
-            nn_include_same,
-            nn_surface,
-            nn_fuzzy,
-        ],
-        outputs=[nn_table, nn_matches, nn_message],
-    )
-    # Clicking "Browse" calls browse_vocab; the result goes to both the visible
-    # table and browse_state.
-    browse_button.click(
-        browse_vocab,
-        inputs=[artifact_selector, browse_language, browse_filter, browse_limit],
-        outputs=[browse_table, browse_state],
-    )
-    # Submitting the filter textbox runs the same browse_vocab call, with the
-    # same inputs and outputs, as the "Browse" button.
-    browse_filter.submit(
-        browse_vocab,
-        inputs=[artifact_selector, browse_language, browse_filter, browse_limit],
-        outputs=[browse_table, browse_state],
-    )
-    # Clicking "Random Sample" calls random_vocab; unlike browse_vocab it is not
-    # given the filter text. Outputs are the same table and state.
-    random_button.click(
-        random_vocab,
-        inputs=[artifact_selector, browse_language, browse_limit],
-        outputs=[browse_table, browse_state],
-    )
-    # Selecting in the vocabulary table calls use_selected_vocab with the stored
-    # browse_state and the browse language; its result is written to the
-    # query_word and source_lang components.
-    browse_table.select(
-        use_selected_vocab,
-        inputs=[browse_state, browse_language],
-        outputs=[query_word, source_lang],
-    )
-# Script entry point: enable the event queue with a default concurrency limit
-# of 4, then launch the app with the CSS stylesheet and ssr_mode=False.
-# NOTE(review): passing css= to launch() depends on the installed Gradio
-# version accepting it there; requirements.txt leaves gradio unpinned - confirm.
-if __name__ == "__main__":
-    demo.queue(default_concurrency_limit=4).launch(css=CSS, ssr_mode=False)

requirements.txt DELETED Viewed

@@ -1,8 +0,0 @@
-gradio
-faiss-cpu
-numpy
-pandas
-boto3
-smart_open
-python-dotenv
-rapidfuzz