Maslionok committed on
Commit
09a837f
·
1 Parent(s): 245b51c
Files changed (4) hide show
  1. .gitignore +8 -0
  2. README.md +41 -0
  3. app.py +1039 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ .env
4
+ .venv/
5
+ multilingual_space_*.json/
6
+ *.faiss
7
+ *.vec
8
+ *.jsonl
README.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Multilingual Static Word Embeddings Demo
3
+ sdk: gradio
4
+ app_file: app.py
5
+ pinned: false
6
+ ---
7
+
8
+ # Multilingual Static Word Embeddings Demo
9
+
10
+ This Space loads a saved aligned multilingual embedding space and lets users
11
+ search translations and nearest neighbors with adjustable retrieval parameters.
12
+
13
+ Required artifact files:
14
+
15
+ - `aligned_all.faiss`
16
+ - `all_metadata.jsonl`
17
+ - `config.json`
18
+
19
+ The app does not use `aligned_all.vec`.
20
+
21
+ ## Runtime configuration
22
+
23
+ By default, the app lists the newest artifact folder under:
24
+
25
+ `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/`
26
+
27
+ Set these Hugging Face Space secrets for S3-compatible storage:
28
+
29
+ - `SE_ACCESS_KEY`
30
+ - `SE_SECRET_KEY`
31
+ - `SE_HOST_URL`
32
+
33
+ Optional environment overrides:
34
+
35
+ - `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
36
+ `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
37
+ - `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_space_*.json`
38
+ - `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
39
+
40
+ Defaults for `top_k`, `min_score`, `csls_k`, `candidate_retrieval_k`,
41
+ `csls_prefetch_k`, and bidirectional consistency are read from `config.json`.
app.py ADDED
@@ -0,0 +1,1039 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import difflib
4
+ import gc
5
+ import json
6
+ import os
7
+ import re
8
+ import sys
9
+ import unicodedata
10
+ from dataclasses import dataclass
11
+ from functools import lru_cache
12
+ from pathlib import Path
13
+ from typing import Any
14
+ from urllib.parse import urlparse
15
+
16
+ import boto3
17
+ import gradio as gr
18
+ import numpy as np
19
+ import pandas as pd
20
+ from botocore.config import Config
21
+
22
+
23
# S3 prefix that is scanned for the newest artifact folder when no override is set.
DEFAULT_ARTIFACT_PREFIX = "s3://131-component-staging/multilingual-static-word-embeddings/stage-6/"

# Names of the environment variables that override artifact discovery.
ARTIFACT_URI_ENV = "SPACE_ARTIFACT_S3_URI"
ARTIFACT_PREFIX_ENV = "SPACE_ARTIFACT_S3_PREFIX"

# Local directory under which each downloaded artifact folder is cached.
CACHE_ROOT = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))

# Files fetched from every artifact folder.
REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")

# Language choices offered by the UI when the space cannot be loaded.
DEFAULT_LANGUAGES = ["de", "en", "fr", "lb"]

# Column order of the translation result table.
TRANSLATION_COLUMNS = [
    "target_lang", "translation", "token",
    "score", "cosine", "rank",
    "bidirectional", "id", "source_vec_file",
]

# Column order of the nearest-neighbor result table.
NEIGHBOR_COLUMNS = ["lang", "word", "token", "score", "cosine", "rank", "id"]

# Column order of the vocabulary browser table.
VOCAB_COLUMNS = ["id", "lang", "surface", "token", "source_vec_file"]
54
+
55
+
56
@dataclass
class LangVectors:
    """Vectors and metadata for the vocabulary of a single language."""

    # Language code this vocabulary belongs to.
    lang: str
    # Global row ids, one per vocabulary entry.
    ids: np.ndarray
    # Metadata records, parallel to ``ids``.
    metas: list[dict[str, Any]]
    # One vector per entry, parallel to ``ids``.
    vecs: np.ndarray
62
+
63
+
64
@dataclass
class RuntimeOptions:
    """Retrieval settings chosen for one UI request."""

    # Maximum number of results kept per target language.
    top_k: int
    # Candidates scoring below this value are dropped.
    min_score: float
    # Number of nearest neighbors averaged for the CSLS density terms.
    csls_k: int
    # Number of best-scoring candidates examined after scoring.
    candidate_retrieval_k: int
    # Number of cosine-nearest targets fetched before scoring.
    csls_prefetch_k: int
    # Whether a translation must also retrieve the source word in reverse.
    bidirectional: bool
    # "csls" selects CSLS scoring; any other value uses plain cosine.
    score_method: str
    # Drop candidates listed as stopwords in the artifact config.
    filter_stopwords: bool
    # Drop candidates rejected by the token quality check.
    filter_bad_tokens: bool
    # Prefer the surface form over the token when displaying a word.
    use_surface: bool
76
+
77
+
78
@dataclass
class Space:
    """A loaded multilingual embedding space and its lookup structures."""

    # Local directory holding the downloaded artifact files.
    root: Path
    # S3 URI of the artifact folder that was loaded.
    artifact_uri: str
    # Parsed contents of the artifact's config.json.
    config: dict[str, Any]
    # Language codes that actually have vectors, in display order.
    languages: list[str]
    # Per-language vectors and metadata.
    by_lang: dict[str, LangVectors]
    # language -> normalized lookup key -> global ids.
    lookup: dict[str, dict[str, list[int]]]
    # global id -> (language, position inside that language).
    id_to_location: dict[int, tuple[str, int]]
87
+
88
+
89
def parse_s3_uri(uri: str) -> tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into ``(bucket, key)``.

    The returned key has no leading slash.

    Raises:
        ValueError: if the scheme is not ``s3`` or the bucket is missing.
    """
    parts = urlparse(uri)
    bucket = parts.netloc
    if not (parts.scheme == "s3" and bucket):
        raise ValueError(f"Expected s3://bucket/key URI, got: {uri}")
    key = parts.path.lstrip("/")
    return bucket, key
94
+
95
+
96
def make_s3_client():
    """Create a boto3 S3 client configured from environment variables.

    ``SE_*`` variables take precedence over their ``AWS_*`` counterparts.
    An endpoint given without a scheme is assumed to be HTTPS. Explicit
    credentials are passed only when both the key and the secret are set.
    """
    env = os.getenv

    endpoint = env("SE_HOST_URL") or env("AWS_ENDPOINT_URL")
    if endpoint and not endpoint.startswith(("http://", "https://")):
        endpoint = f"https://{endpoint}"

    key_id = env("SE_ACCESS_KEY") or env("AWS_ACCESS_KEY_ID")
    secret = env("SE_SECRET_KEY") or env("AWS_SECRET_ACCESS_KEY")

    client_config = Config(
        signature_version="s3v4",
        s3={"addressing_style": "path"},
        retries={"max_attempts": 5, "mode": "standard"},
    )
    client_args: dict[str, Any] = {
        "service_name": "s3",
        "region_name": env("AWS_DEFAULT_REGION", "us-east-1"),
        "config": client_config,
    }
    if endpoint:
        client_args["endpoint_url"] = endpoint
    if key_id and secret:
        client_args.update(aws_access_key_id=key_id, aws_secret_access_key=secret)

    return boto3.client(**client_args)
120
+
121
+
122
def find_latest_artifact_uri(client) -> str:
    """Resolve the S3 URI of the artifact folder to load.

    Resolution order:

    1. ``SPACE_ARTIFACT_S3_URI`` when set. A value containing
       ``multilingual_space_`` is returned as-is (minus trailing slashes);
       any other value is treated as a prefix to scan.
    2. ``SPACE_ARTIFACT_S3_PREFIX`` when set to a non-blank value.
    3. ``DEFAULT_ARTIFACT_PREFIX``.

    Raises:
        ValueError: if the chosen URI is not a valid ``s3://`` URI.
        FileNotFoundError: if a scanned prefix contains no artifact folder.
    """
    explicit_uri = os.getenv(ARTIFACT_URI_ENV, "").strip().rstrip("/")
    if explicit_uri:
        if "multilingual_space_" in explicit_uri:
            return explicit_uri
        prefix_uri = explicit_uri
    else:
        # A variable that is set but blank must not suppress the default;
        # otherwise parse_s3_uri("") fails with a confusing error.
        prefix_uri = os.getenv(ARTIFACT_PREFIX_ENV, "").strip() or DEFAULT_ARTIFACT_PREFIX

    bucket, prefix = parse_s3_uri(prefix_uri)
    return find_latest_artifact_uri_under_prefix(client, bucket, prefix)
134
+
135
+
136
def find_latest_artifact_uri_under_prefix(client, bucket: str, prefix: str) -> str:
    """Return the URI of the newest artifact folder below ``prefix``.

    An artifact folder is any key prefix ending in
    ``multilingual_space_<YYYYMMDD>_<HHMMSS>.json`` that contains a
    ``config.json`` object. "Newest" means the greatest timestamp string.

    Args:
        client: S3 client providing a ``list_objects_v2`` paginator.
        bucket: Bucket to list.
        prefix: Key prefix to scan; an empty prefix scans the whole bucket.

    Raises:
        FileNotFoundError: if no artifact folder is found.
    """
    # Normalize to exactly one trailing slash, but keep an empty prefix empty:
    # a bare "/" prefix would match no keys at the bucket root.
    prefix = prefix.rstrip("/")
    if prefix:
        prefix += "/"

    pattern = re.compile(r"(.*multilingual_space_(\d{8}_\d{6})\.json)/config\.json$")
    candidates: list[tuple[str, str]] = []
    paginator = client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            match = pattern.match(obj["Key"])
            if match:
                # (timestamp, folder key) so tuples order by timestamp first.
                candidates.append((match.group(2), match.group(1)))

    if not candidates:
        raise FileNotFoundError(
            f"No multilingual_space_*.json/config.json found under s3://{bucket}/{prefix}"
        )

    _, latest_key = max(candidates)
    return f"s3://{bucket}/{latest_key}"
156
+
157
+
158
def artifact_cache_dir(artifact_uri: str) -> Path:
    """Return the local cache directory for an artifact URI.

    The directory is named after the last path component of the S3 key and
    lives directly under ``CACHE_ROOT``.
    """
    key = parse_s3_uri(artifact_uri)[1]
    folder_name = Path(key.rstrip("/")).name
    return CACHE_ROOT / folder_name
162
+
163
+
164
def download_artifact() -> tuple[Path, str]:
    """Ensure the required artifact files are cached locally.

    Files that already exist locally with a non-zero size are not downloaded
    again.

    Returns:
        ``(local_directory, artifact_uri)``.
    """
    s3 = make_s3_client()
    artifact_uri = find_latest_artifact_uri(s3)

    target_dir = artifact_cache_dir(artifact_uri)
    target_dir.mkdir(parents=True, exist_ok=True)

    bucket, folder_key = parse_s3_uri(artifact_uri)
    folder_key = folder_key.rstrip("/")

    for name in REQUIRED_FILES:
        destination = target_dir / name
        already_cached = destination.exists() and destination.stat().st_size > 0
        if not already_cached:
            object_key = f"{folder_key}/{name}"
            print(f"Downloading s3://{bucket}/{object_key} -> {destination}", file=sys.stderr)
            s3.download_file(bucket, object_key, str(destination))

    return target_dir, artifact_uri
182
+
183
+
184
def strip_diacritics(text: str) -> str:
    """Return ``text`` NFKD-decomposed with all combining marks removed."""
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
188
+
189
+
190
def lookup_key(text: str) -> str:
    """Normalize ``text`` into the key used for vocabulary lookups.

    The text is trimmed, case-folded, has whitespace runs collapsed to a
    single space, and is then stripped of diacritics.
    """
    words = text.strip().casefold().split()
    collapsed = " ".join(words)
    return strip_diacritics(collapsed)
193
+
194
+
195
def is_good_token(token: str, min_len: int = 4) -> bool:
    """Return True when ``token`` looks like a usable word.

    A token is rejected when it is empty, shorter than ``min_len``, made of
    digits only, contains fewer than two alphabetic characters, or contains
    any character other than alphanumerics, ``-``, ``'`` and ``_``.
    """
    too_short = not token or len(token) < min_len
    if too_short or token.isdigit():
        return False

    letter_count = 0
    for ch in token:
        if ch.isalpha():
            letter_count += 1
    if letter_count < 2:
        return False

    for ch in token:
        if not (ch.isalnum() or ch in "-'_"):
            return False
    return True
202
+
203
+
204
def read_config(space_dir: Path) -> dict[str, Any]:
    """Load and parse ``config.json`` from ``space_dir`` as UTF-8 JSON."""
    config_path = space_dir / "config.json"
    raw = config_path.read_text(encoding="utf-8")
    return json.loads(raw)
207
+
208
+
209
def read_metadata(space_dir: Path) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
    """Read ``all_metadata.jsonl`` and index its records by id and language.

    Blank lines are skipped. Every record must carry an ``id`` and a ``lang``
    field; ids must be unique and form the contiguous range ``0..n-1``.

    Returns:
        ``(metadata, ids_by_lang)`` where ``metadata[i]`` is the record with
        id ``i`` and ``ids_by_lang`` maps each language to its ids in file
        order.

    Raises:
        ValueError: on a negative id, a duplicate id, or a gap in the ids.
    """
    metadata_path = space_dir / "all_metadata.jsonl"
    by_id: dict[int, dict[str, Any]] = {}
    ids_by_lang: dict[str, list[int]] = {}

    with metadata_path.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            meta = json.loads(line)
            row_id = int(meta["id"])
            # A negative id would otherwise index from the end of the table
            # and silently land in the wrong slot.
            if row_id < 0:
                raise ValueError(f"Metadata id must be non-negative, got {row_id}")
            # A repeated id would be listed twice for its language while only
            # one record survives.
            if row_id in by_id:
                raise ValueError(f"Duplicate metadata id {row_id}")
            by_id[row_id] = meta
            ids_by_lang.setdefault(str(meta["lang"]), []).append(row_id)

    # With n distinct non-negative ids, contiguity means exactly 0..n-1.
    total = len(by_id)
    for expected in range(total):
        if expected not in by_id:
            raise ValueError(f"Metadata ids are not contiguous; first missing id is {expected}")

    return [by_id[i] for i in range(total)], ids_by_lang
230
+
231
+
232
def reconstruct_range(index: Any, start: int, count: int) -> np.ndarray:
    """Reconstruct ``count`` consecutive vectors starting at row ``start``.

    The two-argument ``reconstruct_n`` call is tried first; if it raises
    ``TypeError`` the call is repeated with a preallocated output buffer.
    The result is a C-contiguous float32 array.
    """
    try:
        block = index.reconstruct_n(start, count)
    except TypeError:
        # Fallback for a reconstruct_n that requires the output buffer.
        block = np.empty((count, index.d), dtype=np.float32)
        index.reconstruct_n(start, count, block)
    return np.ascontiguousarray(block, dtype=np.float32)
239
+
240
+
241
def reconstruct_ids(index: Any, ids: list[int]) -> np.ndarray:
    """Reconstruct the vectors for ``ids`` in the given order.

    A run of consecutive ids is fetched in one ranged call; otherwise each
    vector is reconstructed individually. The result is a C-contiguous
    float32 array with one row per id (zero rows for an empty list).
    """
    total = len(ids)
    if total == 0:
        return np.empty((0, index.d), dtype=np.float32)

    first = ids[0]
    is_consecutive = all(later - earlier == 1 for earlier, later in zip(ids, ids[1:]))
    if is_consecutive:
        return reconstruct_range(index, first, total)

    out = np.empty((total, index.d), dtype=np.float32)
    for row, global_id in enumerate(ids):
        global_id = int(global_id)
        try:
            out[row] = index.reconstruct(global_id)
        except TypeError:
            # Fallback for a reconstruct that writes into a provided buffer.
            index.reconstruct(global_id, out[row])
    return np.ascontiguousarray(out, dtype=np.float32)
256
+
257
+
258
def normalize_rows(vecs: np.ndarray) -> np.ndarray:
    """Scale every row of ``vecs`` to unit L2 length and return float32.

    A small epsilon is added to each norm so all-zero rows stay zero instead
    of dividing by zero.
    """
    epsilon = 1e-12
    row_norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    unit = vecs / (row_norms + epsilon)
    return unit.astype(np.float32, copy=False)
261
+
262
+
263
def load_vectors_from_faiss(space_dir: Path, ids_by_lang: dict[str, list[int]]) -> dict[str, np.ndarray]:
    """Load ``aligned_all.faiss`` and extract unit-length vectors per language.

    Languages are processed in sorted order. The FAISS index is released
    before returning.

    Raises:
        RuntimeError: if the ``faiss`` module cannot be imported.
    """
    index_path = space_dir / "aligned_all.faiss"
    try:
        import faiss  # type: ignore
    except ImportError as exc:
        raise RuntimeError(
            "faiss-cpu is required. The Space must install faiss-cpu from requirements.txt."
        ) from exc

    print(f"Loading FAISS index: {index_path}", file=sys.stderr)
    index = faiss.read_index(str(index_path))

    result: dict[str, np.ndarray] = {}
    for lang in sorted(ids_by_lang):
        lang_ids = ids_by_lang[lang]
        print(f"Reconstructing {lang}: {len(lang_ids)} vectors", file=sys.stderr)
        raw = reconstruct_ids(index, lang_ids)
        result[lang] = normalize_rows(raw)

    # Drop the index explicitly and collect right away.
    del index
    gc.collect()
    return result
283
+
284
+
285
def build_lookup(languages: dict[str, LangVectors]) -> dict[str, dict[str, list[int]]]:
    """Build the per-language map from normalized lookup key to global ids.

    Each entry is indexed under the normalized form of its ``token`` and of
    its ``surface``; missing or empty values are ignored. When both values
    normalize to the same key the id is registered once, so an exact query
    does not report a single entry as several matches.

    Returns:
        ``{lang: {lookup_key: [global_id, ...]}}`` with ids in entry order.
    """
    lookup: dict[str, dict[str, list[int]]] = {}
    for lang, data in languages.items():
        lang_lookup: dict[str, list[int]] = {}
        for global_id, meta in zip(data.ids.tolist(), data.metas):
            raw_values = (meta.get("token"), meta.get("surface"))
            # dict.fromkeys removes a repeated key while keeping token-first order.
            keys = dict.fromkeys(lookup_key(str(value)) for value in raw_values if value)
            for key in keys:
                lang_lookup.setdefault(key, []).append(int(global_id))
        lookup[lang] = lang_lookup
    return lookup
296
+
297
+
298
@lru_cache(maxsize=1)
def load_space() -> Space:
    """Download (if needed) and load the embedding space; cached after success.

    Languages come from the config's ``languages`` list when present,
    otherwise from the sorted languages found in the metadata. Languages
    without any metadata rows are left out.
    """
    root, artifact_uri = download_artifact()
    config = read_config(root)
    metadata, ids_by_lang = read_metadata(root)
    vectors = load_vectors_from_faiss(root, ids_by_lang)

    declared = list(config.get("languages") or sorted(ids_by_lang))
    by_lang: dict[str, LangVectors] = {}
    id_to_location: dict[int, tuple[str, int]] = {}

    for lang in declared:
        lang_ids = ids_by_lang.get(lang)
        if not lang_ids:
            continue
        by_lang[lang] = LangVectors(
            lang=lang,
            ids=np.asarray(lang_ids, dtype=np.int64),
            metas=[metadata[row_id] for row_id in lang_ids],
            vecs=vectors[lang],
        )
        # Remember where each global id lives inside its language block.
        id_to_location.update(
            (int(row_id), (lang, position)) for position, row_id in enumerate(lang_ids)
        )

    available = [lang for lang in declared if lang in by_lang]
    return Space(
        root=root,
        artifact_uri=artifact_uri,
        config=config,
        languages=available,
        by_lang=by_lang,
        lookup=build_lookup(by_lang),
        id_to_location=id_to_location,
    )
334
+
335
+
336
def default_options(config: dict[str, Any]) -> RuntimeOptions:
    """Build the default retrieval options from an artifact config.

    Missing numeric keys fall back to the built-in defaults; CSLS scoring,
    both filters, and surface-form display are always enabled.
    """
    read = config.get
    bidi_settings = read("bidirectional_consistency") or {}
    return RuntimeOptions(
        top_k=int(read("top_k", 3)),
        min_score=float(read("min_score", 0.15)),
        csls_k=int(read("csls_k", 10)),
        candidate_retrieval_k=int(read("candidate_retrieval_k", 9)),
        csls_prefetch_k=int(read("csls_prefetch_k", 50)),
        bidirectional=bool(bidi_settings.get("enabled", True)),
        score_method="csls",
        filter_stopwords=True,
        filter_bad_tokens=True,
        use_surface=True,
    )
350
+
351
+
352
def make_options(
    top_k: int,
    min_score: float,
    csls_k: int,
    candidate_retrieval_k: int,
    csls_prefetch_k: int,
    bidirectional: bool,
    score_method: str,
    filter_stopwords: bool,
    filter_bad_tokens: bool,
    use_surface: bool,
) -> RuntimeOptions:
    """Coerce raw UI values into a ``RuntimeOptions`` instance.

    Counts are converted with ``int``, the score threshold with ``float``,
    flags with ``bool``, and the score method is lower-cased.
    """
    counts = {
        "top_k": int(top_k),
        "csls_k": int(csls_k),
        "candidate_retrieval_k": int(candidate_retrieval_k),
        "csls_prefetch_k": int(csls_prefetch_k),
    }
    flags = {
        "bidirectional": bool(bidirectional),
        "filter_stopwords": bool(filter_stopwords),
        "filter_bad_tokens": bool(filter_bad_tokens),
        "use_surface": bool(use_surface),
    }
    return RuntimeOptions(
        min_score=float(min_score),
        score_method=str(score_method).lower(),
        **counts,
        **flags,
    )
376
+
377
+
378
def top_indices(values: np.ndarray, k: int) -> np.ndarray:
    """Return the indices of the ``k`` largest entries, largest first.

    ``k`` is clamped to ``[0, len(values)]``. A full sort is used when every
    element is requested; otherwise a partial partition followed by a sort
    of the selected entries.
    """
    size = values.shape[0]
    k = max(0, min(k, size))
    if k == 0:
        return np.empty((0,), dtype=np.int64)

    negated = -values
    if k >= size:
        return np.argsort(negated)

    selected = np.argpartition(negated, k - 1)[:k]
    ordering = np.argsort(negated[selected])
    return selected[ordering]
386
+
387
+
388
def top_mean(values: np.ndarray, k: int) -> float:
    """Return the mean of the ``k`` largest entries of ``values``.

    ``k`` is raised to at least 1 and capped at the number of entries.
    """
    available = values.shape[0]
    k = min(max(1, k), available)
    largest = values[top_indices(values, k)]
    return float(largest.mean())
392
+
393
+
394
def candidate_allowed(meta: dict[str, Any], lang: str, space: Space, opts: RuntimeOptions) -> bool:
    """Return True when a candidate entry passes the enabled filters.

    With ``filter_bad_tokens`` the token must pass ``is_good_token`` using
    the minimum length from the config's ``filters`` section (default 4).
    With ``filter_stopwords`` the lower-cased token must not be in the
    config's stopword list for ``lang``.
    """
    token = str(meta.get("token") or "")
    config = space.config

    if opts.filter_bad_tokens:
        filter_settings = config.get("filters") or {}
        required_len = int(filter_settings.get("target_is_good_token_min_len", 4))
        if not is_good_token(token, required_len):
            return False

    if not opts.filter_stopwords:
        return True

    blocked = set((config.get("stopwords") or {}).get(lang, []))
    return token.lower() not in blocked
405
+
406
+
407
def rank_candidates(
    space: Space,
    query_vec: np.ndarray,
    source_lang: str,
    target_lang: str,
    opts: RuntimeOptions,
    *,
    apply_filters: bool = True,
) -> list[dict[str, Any]]:
    """Score and rank target-language entries for a query vector.

    The cosine-nearest targets are prefetched, scored either by CSLS
    (``2 * cosine - query density - target density``) or by plain cosine,
    and the best ``candidate_retrieval_k`` are then filtered by minimum
    score, by the optional token filters, and by duplicate surface form.

    Returns:
        Result dicts with keys ``rank``, ``global_id``, ``local_id``,
        ``meta``, ``score``, ``cosine`` and ``bidirectional`` (always
        ``None`` here). ``rank`` is the position before filtering.
    """
    src = space.by_lang[source_lang]
    tgt = space.by_lang[target_lang]

    all_cosines = tgt.vecs @ query_vec
    pool_size = max(opts.candidate_retrieval_k, opts.csls_prefetch_k, opts.top_k)
    pool = top_indices(all_cosines, min(pool_size, len(tgt.metas)))
    pool_cosines = all_cosines[pool]

    if opts.score_method == "csls":
        # Density of the query among targets, and of each pooled target
        # among the source vocabulary.
        query_density = top_mean(all_cosines, opts.csls_k)
        back_sims = tgt.vecs[pool] @ src.vecs.T
        target_density = np.asarray(
            [top_mean(row, opts.csls_k) for row in back_sims],
            dtype=np.float32,
        )
        scores = (2.0 * pool_cosines - query_density - target_density).astype(np.float32)
    else:
        scores = pool_cosines.astype(np.float32)

    filter_settings = space.config.get("filters") or {}
    skip_repeated_surfaces = bool(filter_settings.get("duplicate_target_surfaces_removed", True))
    best_positions = np.argsort(-scores)[: opts.candidate_retrieval_k]

    ranked: list[dict[str, Any]] = []
    emitted_surfaces: set[str] = set()
    for rank, pos in enumerate(best_positions, start=1):
        score = float(scores[pos])
        if score < opts.min_score:
            continue

        local_id = int(pool[pos])
        meta = tgt.metas[local_id]
        if apply_filters and not candidate_allowed(meta, target_lang, space, opts):
            continue

        surface = str(meta.get("surface") or meta.get("token") or "")
        if skip_repeated_surfaces and surface in emitted_surfaces:
            continue
        emitted_surfaces.add(surface)

        ranked.append(
            {
                "rank": rank,
                "global_id": int(tgt.ids[local_id]),
                "local_id": local_id,
                "meta": meta,
                "score": score,
                "cosine": float(pool_cosines[pos]),
                "bidirectional": None,
            }
        )

    return ranked
468
+
469
+
470
def get_meta(space: Space, global_id: int) -> dict[str, Any]:
    """Return the metadata record for a global id.

    Raises:
        KeyError: if the id is not present in the space.
    """
    location = space.id_to_location[int(global_id)]
    lang, position = location[0], location[1]
    return space.by_lang[lang].metas[position]
473
+
474
+
475
def get_vec(space: Space, global_id: int) -> np.ndarray:
    """Return the stored vector for a global id.

    Raises:
        KeyError: if the id is not present in the space.
    """
    location = space.id_to_location[int(global_id)]
    lang, position = location[0], location[1]
    return space.by_lang[lang].vecs[position]
478
+
479
+
480
def format_word(meta: dict[str, Any], opts: RuntimeOptions) -> str:
    """Return the display string for an entry.

    The surface form is preferred when ``opts.use_surface`` is set, the
    token otherwise; the other field is the fallback, then ``""``.
    """
    if opts.use_surface:
        preferred, fallback = "surface", "token"
    else:
        preferred, fallback = "token", "surface"
    return str(meta.get(preferred) or meta.get(fallback) or "")
484
+
485
+
486
def suggestions(space: Space, lang: str, query: str, limit: int = 8) -> list[str]:
    """Return display labels of vocabulary entries that closely match ``query``.

    Close matches are found on normalized lookup keys with ``difflib``
    (similarity cutoff 0.72, at most ``limit`` keys). Each key contributes
    the label of its first entry; empty and repeated labels are skipped.
    """
    vocabulary = space.lookup.get(lang, {})
    near_keys = difflib.get_close_matches(
        lookup_key(query), vocabulary.keys(), n=limit, cutoff=0.72
    )

    # A dict keeps first-seen order while dropping repeated labels.
    unique_labels: dict[str, None] = {}
    for near_key in near_keys:
        meta = get_meta(space, vocabulary[near_key][0])
        label = str(meta.get("surface") or meta.get("token") or "")
        if label:
            unique_labels.setdefault(label, None)
    return list(unique_labels)
498
+
499
+
500
def resolve_query(space: Space, lang: str, query: str) -> tuple[int, dict[str, Any], str]:
    """Resolve a user query to a vocabulary entry of ``lang``.

    The query is matched exactly on its normalized lookup key against both
    tokens and surface forms. When several distinct entries match, the first
    one is used and the returned message says so.

    Returns:
        ``(global_id, meta, message)``; ``message`` is empty unless more
        than one distinct entry matched.

    Raises:
        ValueError: if ``lang`` is unknown or the query is blank.
        LookupError: if nothing matches; the message lists close matches
            when any exist.
    """
    if lang not in space.by_lang:
        raise ValueError(f"Unknown language {lang!r}. Available: {', '.join(space.languages)}")

    query = query.strip()
    if not query:
        raise ValueError("Enter a query word.")

    raw_matches = space.lookup.get(lang, {}).get(lookup_key(query), [])
    # The same id can be listed twice (token and surface sharing one key);
    # count every entry once, keeping the original order.
    matches = list(dict.fromkeys(int(match_id) for match_id in raw_matches))
    if not matches:
        hint = suggestions(space, lang, query)
        if hint:
            raise LookupError(f"No exact match. Close matches: {', '.join(hint)}")
        raise LookupError(f"No exact token/surface match for {lang}:{query!r}")

    row_id = matches[0]
    meta = get_meta(space, row_id)
    message = ""
    if len(matches) > 1:
        chosen = f"{meta.get('surface') or meta.get('token')} (id {row_id})"
        message = f"Matched {len(matches)} entries; using {chosen}."

    return row_id, meta, message
525
+
526
+
527
def translation_dataframe() -> pd.DataFrame:
    """Return an empty translation table with the standard columns."""
    empty_table = pd.DataFrame(columns=TRANSLATION_COLUMNS)
    return empty_table
529
+
530
+
531
def neighbor_dataframe() -> pd.DataFrame:
    """Return an empty nearest-neighbor table with the standard columns."""
    empty_table = pd.DataFrame(columns=NEIGHBOR_COLUMNS)
    return empty_table
533
+
534
+
535
def vocabulary_dataframe() -> pd.DataFrame:
    """Return an empty vocabulary table with the standard columns."""
    empty_table = pd.DataFrame(columns=VOCAB_COLUMNS)
    return empty_table
537
+
538
+
539
def _round_trip_hits_source(
    space: Space,
    cand: dict[str, Any],
    source_id: int,
    source_lang: str,
    target_lang: str,
    opts: RuntimeOptions,
) -> bool:
    """Return True when ranking ``cand`` back into the source language retrieves ``source_id``."""
    # NOTE(review): the reverse pass applies the same target filters, so a
    # query that is itself a stopword or shorter than the minimum token
    # length can never pass this check - confirm that this is intended.
    reverse = rank_candidates(
        space,
        get_vec(space, int(cand["global_id"])),
        target_lang,
        source_lang,
        opts,
    )
    return source_id in {int(item["global_id"]) for item in reverse}


def _select_translations(
    space: Space,
    source_id: int,
    source_vec: np.ndarray,
    source_lang: str,
    target_lang: str,
    opts: RuntimeOptions,
) -> list[dict[str, Any]]:
    """Return up to ``opts.top_k`` candidates, each with its ``bidirectional`` flag set."""
    kept: list[dict[str, Any]] = []
    for cand in rank_candidates(space, source_vec, source_lang, target_lang, opts):
        if opts.bidirectional:
            cand["bidirectional"] = _round_trip_hits_source(
                space, cand, source_id, source_lang, target_lang, opts
            )
            if not cand["bidirectional"]:
                continue
        else:
            cand["bidirectional"] = False
        kept.append(cand)
        if len(kept) >= opts.top_k:
            break
    return kept


def _translation_row(target_lang: str, word: str, cand: dict[str, Any]) -> dict[str, Any]:
    """Build one result-table row for a kept candidate."""
    meta = cand["meta"]
    return {
        "target_lang": target_lang,
        "translation": word,
        "token": meta.get("token"),
        "score": round(float(cand["score"]), 6),
        "cosine": round(float(cand["cosine"]), 6),
        "rank": int(cand["rank"]),
        "bidirectional": bool(cand["bidirectional"]),
        "id": int(cand["global_id"]),
        "source_vec_file": meta.get("source_vec_file"),
    }


def translate_ui(
    query: str,
    source_lang: str,
    target_langs: list[str] | None,
    top_k: int,
    min_score: float,
    csls_k: int,
    candidate_retrieval_k: int,
    csls_prefetch_k: int,
    bidirectional: bool,
    score_method: str,
    filter_stopwords: bool,
    filter_bad_tokens: bool,
    use_surface: bool,
) -> tuple[pd.DataFrame, str]:
    """Translate a query word into the selected target languages.

    When no target languages are selected, every loaded language except the
    source is used. With ``bidirectional`` enabled, a candidate is kept only
    if ranking it back into the source language retrieves the query entry.

    Returns:
        ``(table, summary)``. On any failure the table is empty and the
        summary is ``"Error: <message>"``, so the UI never sees an exception.
    """
    try:
        space = load_space()
        targets = target_langs or [lang for lang in space.languages if lang != source_lang]
        opts = make_options(
            top_k,
            min_score,
            csls_k,
            candidate_retrieval_k,
            csls_prefetch_k,
            bidirectional,
            score_method,
            filter_stopwords,
            filter_bad_tokens,
            use_surface,
        )
        source_id, source_meta, match_message = resolve_query(space, source_lang, query)
        source_vec = get_vec(space, source_id)

        rows: list[dict[str, Any]] = []
        grouped: list[str] = [
            f"Source: `{source_lang}:{format_word(source_meta, opts)}` "
            f"(token `{source_meta.get('token')}`, id `{source_id}`)"
        ]
        if match_message:
            grouped.append(match_message)

        for target_lang in targets:
            if target_lang == source_lang or target_lang not in space.by_lang:
                continue
            kept = _select_translations(
                space, source_id, source_vec, source_lang, target_lang, opts
            )
            if not kept:
                grouped.append(f"\n{target_lang}: no candidates after filters")
                continue

            grouped.append(f"\n{target_lang}:")
            for i, cand in enumerate(kept, 1):
                word = format_word(cand["meta"], opts)
                grouped.append(f"{i}. {word} ({cand['score']:.4f})")
                rows.append(_translation_row(target_lang, word, cand))

        return pd.DataFrame(rows, columns=TRANSLATION_COLUMNS), "\n".join(grouped)
    except Exception as exc:
        # UI boundary: report the failure in the summary instead of raising.
        return translation_dataframe(), f"Error: {exc}"
628
+
629
+
630
def nearest_ui(
    query: str,
    source_lang: str,
    neighbor_langs: list[str] | None,
    top_n: int,
    min_score: float,
    csls_k: int,
    score_method: str,
    include_source_language: bool,
    use_surface: bool,
) -> tuple[pd.DataFrame, str]:
    """List the nearest neighbors of a query word in each selected language.

    No token or stopword filtering is applied and the query entry itself is
    never listed. At most ``top_n`` rows are kept per language, and the
    final table is sorted by score, best first.

    Returns:
        ``(table, status)``. On any failure the table is empty and the
        status is ``"Error: <message>"``.
    """
    try:
        space = load_space()
        opts = make_options(
            top_n,
            min_score,
            csls_k,
            max(top_n + 5, 20),
            max(top_n + 5, 50),
            False,
            score_method,
            False,
            False,
            use_surface,
        )
        source_id, source_meta, match_message = resolve_query(space, source_lang, query)
        source_vec = get_vec(space, source_id)
        targets = neighbor_langs or space.languages
        if not include_source_language:
            targets = [lang for lang in targets if lang != source_lang]

        rows: list[dict[str, Any]] = []
        # Running row count per language, instead of rescanning ``rows``
        # after every appended candidate.
        rows_per_lang: dict[str, int] = {}
        for target_lang in targets:
            if target_lang not in space.by_lang:
                continue
            candidates = rank_candidates(
                space,
                source_vec,
                source_lang,
                target_lang,
                opts,
                apply_filters=False,
            )
            for cand in candidates:
                if int(cand["global_id"]) == source_id:
                    continue
                meta = cand["meta"]
                rows.append(
                    {
                        "lang": target_lang,
                        "word": format_word(meta, opts),
                        "token": meta.get("token"),
                        "score": round(float(cand["score"]), 6),
                        "cosine": round(float(cand["cosine"]), 6),
                        "rank": int(cand["rank"]),
                        "id": int(cand["global_id"]),
                    }
                )
                rows_per_lang[target_lang] = rows_per_lang.get(target_lang, 0) + 1
                if rows_per_lang[target_lang] >= top_n:
                    break

        rows.sort(key=lambda row: row["score"], reverse=True)
        status = (
            f"Source: `{source_lang}:{format_word(source_meta, opts)}` "
            f"(token `{source_meta.get('token')}`, id `{source_id}`)"
        )
        if match_message:
            status += f"\n\n{match_message}"
        return pd.DataFrame(rows, columns=NEIGHBOR_COLUMNS), status
    except Exception as exc:
        # UI boundary: report the failure in the status instead of raising.
        return neighbor_dataframe(), f"Error: {exc}"
701
+
702
+
703
def browse_ui(lang: str, filter_text: str, limit: int) -> pd.DataFrame:
    """Return up to ``limit`` vocabulary rows of ``lang`` matching a filter.

    The filter is a substring match on the normalized lookup key of either
    the surface form or the token; an empty filter matches everything. An
    unknown language or a non-positive limit yields an empty table.

    Failures are logged to stderr and produce an empty table, so the UI
    callback never raises.
    """
    try:
        space = load_space()
        if lang not in space.by_lang:
            return vocabulary_dataframe()

        max_rows = int(limit)
        if max_rows <= 0:
            # Without this guard the first matching row would still be returned.
            return vocabulary_dataframe()

        needle = lookup_key(filter_text or "")
        vocab = space.by_lang[lang]
        rows = []
        for row_id, meta in zip(vocab.ids.tolist(), vocab.metas):
            surface = str(meta.get("surface") or "")
            token = str(meta.get("token") or "")
            if needle and needle not in lookup_key(surface) and needle not in lookup_key(token):
                continue
            rows.append(
                {
                    "id": int(row_id),
                    "lang": lang,
                    "surface": surface,
                    "token": token,
                    "source_vec_file": meta.get("source_vec_file"),
                }
            )
            if len(rows) >= max_rows:
                break
        return pd.DataFrame(rows, columns=VOCAB_COLUMNS)
    except Exception as exc:
        # Best-effort UI callback: keep the table empty but leave a trace.
        print(f"browse_ui failed: {exc}", file=sys.stderr)
        return vocabulary_dataframe()
729
+
730
+
731
def config_markdown(space: Space) -> str:
    """Render a Markdown summary of the loaded artifact and its config.

    Vocabulary sizes come from the config's ``vocab_sizes`` entry when it is
    present and non-empty, otherwise from the number of loaded entries per
    language. Missing config values are shown as ``unknown`` or as the
    built-in default.
    """
    cfg = space.config

    sizes = cfg.get("vocab_sizes")
    if not sizes:
        sizes = {lang: len(space.by_lang[lang].metas) for lang in space.languages}

    bidi_settings = cfg.get("bidirectional_consistency") or {}
    bidi_enabled = bool(bidi_settings.get("enabled", True))

    header = [
        f"Artifact: `{space.artifact_uri}`",
        f"Created: `{cfg.get('created_at', 'unknown')}`",
        f"Languages: `{', '.join(space.languages)}`",
        f"Pivot language: `{cfg.get('pivot_lang', 'unknown')}`",
        f"Vector dim: `{cfg.get('vector_dim', 'unknown')}`",
        f"Top N vocab: `{cfg.get('top_n_vocab', 'unknown')}`",
        f"Output top: `{cfg.get('out_top', 'unknown')}`",
        f"Default top_k: `{cfg.get('top_k', 3)}`",
        f"Default min_score: `{cfg.get('min_score', 0.15)}`",
        f"Default csls_k: `{cfg.get('csls_k', 10)}`",
        f"Bidirectional consistency: `{bidi_enabled}`",
        "",
        "Vocabulary sizes:",
    ]
    size_lines = [f"- `{lang}`: `{size}`" for lang, size in sorted(sizes.items())]
    return "\n".join(header + size_lines)
755
+
756
+
757
def initialize_ui():
    """Compute the initial values for the UI components.

    Returns a 14-item tuple: status text, source-language dropdown update,
    target-languages update, five retrieval defaults (top_k, min_score,
    csls_k, candidate_retrieval_k, csls_prefetch_k), the bidirectional flag,
    two more language updates, a second csls_k value, one more language
    update, and the config summary.

    If loading the space fails, built-in defaults are returned and both the
    status and the summary carry the load error.
    """

    def choose(choices, value):
        # A fresh update object per component.
        return gr.update(choices=choices, value=value)

    try:
        space = load_space()
        opts = default_options(space.config)
        langs = space.languages

        source = space.config.get("pivot_lang", "de")
        if source not in langs:
            source = langs[0]
        targets = [lang for lang in langs if lang != source]

        status = f"Loaded `{space.artifact_uri}` with `{sum(len(v.metas) for v in space.by_lang.values())}` vectors."
        return (
            status,
            choose(langs, source),
            choose(langs, targets),
            opts.top_k,
            opts.min_score,
            opts.csls_k,
            opts.candidate_retrieval_k,
            opts.csls_prefetch_k,
            opts.bidirectional,
            choose(langs, source),
            choose(langs, langs),
            opts.csls_k,
            choose(langs, source),
            config_markdown(space),
        )
    except Exception as exc:
        status = f"Load error: {exc}"

    # Fallback: static language list and hard-coded retrieval defaults.
    return (
        status,
        choose(DEFAULT_LANGUAGES, "de"),
        choose(DEFAULT_LANGUAGES, ["en", "fr", "lb"]),
        3,
        0.15,
        10,
        9,
        50,
        True,
        choose(DEFAULT_LANGUAGES, "de"),
        choose(DEFAULT_LANGUAGES, DEFAULT_LANGUAGES),
        10,
        choose(DEFAULT_LANGUAGES, "de"),
        status,
    )
800
+
801
+
802
def update_targets(source_lang: str) -> gr.CheckboxGroup:
    """Return an update selecting every language except ``source_lang``.

    The loaded space's languages are used when available; otherwise the
    built-in default language list.
    """
    try:
        languages = load_space().languages
    except Exception:
        languages = DEFAULT_LANGUAGES
    selected = [lang for lang in languages if lang != source_lang]
    return gr.update(choices=languages, value=selected)
814
+
815
+
816
def update_neighbor_langs(source_lang: str, include_source: bool) -> gr.CheckboxGroup:
    """Return an update for the neighbor-language selection.

    All languages are selected when ``include_source`` is set; otherwise the
    source language is left out. The built-in default language list is used
    when the space cannot be loaded.
    """
    try:
        available = load_space().languages
    except Exception:
        available = DEFAULT_LANGUAGES

    if include_source:
        selected = available
    else:
        selected = [lang for lang in available if lang != source_lang]
    return gr.update(choices=available, value=selected)
824
+
825
+
826
# Page-level CSS.  Each rule is keyed on a class that a component below opts
# into through ``elem_classes``.
css = """
.app-title h1 { margin-bottom: 0.15rem; }
.status-line { font-size: 0.9rem; color: #475569; }
"""


with gr.Blocks(title="Multilingual Static Word Embeddings", css=css) as demo:
    # The title carries the "app-title" class so the `.app-title h1` rule in
    # ``css`` actually applies to it.
    gr.Markdown(
        "# Multilingual Static Word Embeddings\n"
        "Search the aligned FAISS space for cross-lingual word neighbors.",
        elem_classes=["app-title"],
    )
    status_md = gr.Markdown("Loading artifacts...", elem_classes=["status-line"])

    # ---- Tab 1: translation search ------------------------------------
    with gr.Tab("Translate"):
        with gr.Row():
            # Left column: query controls and retrieval parameters.
            with gr.Column(scale=1, min_width=320):
                query = gr.Textbox(label="Query word", value="haus")
                source_lang = gr.Dropdown(
                    label="Source language",
                    choices=DEFAULT_LANGUAGES,
                    value="de",
                )
                target_langs = gr.CheckboxGroup(
                    label="Target languages",
                    choices=DEFAULT_LANGUAGES,
                    value=["en", "fr", "lb"],
                )
                translate_btn = gr.Button("Search", variant="primary")

                with gr.Accordion("Retrieval parameters", open=True):
                    top_k = gr.Slider(1, 20, value=3, step=1, label="Top K")
                    min_score = gr.Slider(-2.0, 2.0, value=0.15, step=0.01, label="Min score")
                    score_method = gr.Radio(
                        ["csls", "cosine"],
                        value="csls",
                        label="Score method",
                    )
                    csls_k = gr.Slider(1, 50, value=10, step=1, label="CSLS K")
                    candidate_retrieval_k = gr.Slider(
                        1,
                        100,
                        value=9,
                        step=1,
                        label="Candidate retrieval K",
                    )
                    csls_prefetch_k = gr.Slider(
                        10,
                        500,
                        value=50,
                        step=1,
                        label="CSLS prefetch K",
                    )
                    bidirectional = gr.Checkbox(value=True, label="Bidirectional consistency")
                    filter_stopwords = gr.Checkbox(value=True, label="Filter stopwords")
                    filter_bad_tokens = gr.Checkbox(value=True, label="Filter noisy tokens")
                    use_surface = gr.Checkbox(value=True, label="Show surface forms")

            # Right column: result summary and table.
            with gr.Column(scale=2):
                translate_summary = gr.Markdown()
                translation_results = gr.Dataframe(
                    headers=TRANSLATION_COLUMNS,
                    datatype=["str", "str", "str", "number", "number", "number", "bool", "number", "str"],
                    interactive=False,
                    wrap=True,
                )

    # ---- Tab 2: nearest neighbors -------------------------------------
    with gr.Tab("Nearest Neighbors"):
        with gr.Row():
            with gr.Column(scale=1, min_width=320):
                nn_query = gr.Textbox(label="Query word", value="haus")
                nn_source_lang = gr.Dropdown(
                    label="Source language",
                    choices=DEFAULT_LANGUAGES,
                    value="de",
                )
                nn_langs = gr.CheckboxGroup(
                    label="Neighbor languages",
                    choices=DEFAULT_LANGUAGES,
                    value=DEFAULT_LANGUAGES,
                )
                nn_top_n = gr.Slider(1, 50, value=20, step=1, label="Top N per language")
                nn_min_score = gr.Slider(-2.0, 2.0, value=-2.0, step=0.01, label="Min score")
                nn_score_method = gr.Radio(["csls", "cosine"], value="cosine", label="Score method")
                nn_csls_k = gr.Slider(1, 50, value=10, step=1, label="CSLS K")
                nn_include_source = gr.Checkbox(value=True, label="Include source language")
                nn_use_surface = gr.Checkbox(value=True, label="Show surface forms")
                nn_btn = gr.Button("Find neighbors", variant="primary")
            with gr.Column(scale=2):
                nn_summary = gr.Markdown()
                nn_results = gr.Dataframe(
                    headers=NEIGHBOR_COLUMNS,
                    datatype=["str", "str", "str", "number", "number", "number", "number"],
                    interactive=False,
                    wrap=True,
                )

    # ---- Tab 3: vocabulary browser ------------------------------------
    with gr.Tab("Browse Vocabulary"):
        with gr.Row():
            vocab_lang = gr.Dropdown(label="Language", choices=DEFAULT_LANGUAGES, value="de")
            vocab_filter = gr.Textbox(label="Filter", placeholder="Type part of a token or surface form")
            vocab_limit = gr.Slider(10, 500, value=100, step=10, label="Limit")
        vocab_results = gr.Dataframe(
            headers=VOCAB_COLUMNS,
            datatype=["number", "str", "str", "str", "str"],
            interactive=False,
            wrap=True,
        )

    # ---- Tab 4: artifact information ----------------------------------
    with gr.Tab("Artifact Info"):
        artifact_info = gr.Markdown("Loading config...")

    # ---- Event wiring --------------------------------------------------
    # Gradio passes input values to the handler positionally, so the order of
    # each list below has to stay in step with the handler it feeds.
    translate_inputs = [
        query,
        source_lang,
        target_langs,
        top_k,
        min_score,
        csls_k,
        candidate_retrieval_k,
        csls_prefetch_k,
        bidirectional,
        score_method,
        filter_stopwords,
        filter_bad_tokens,
        use_surface,
    ]
    translate_outputs = [translation_results, translate_summary]

    # Shared by the button click and the Enter-key submit so the two triggers
    # cannot drift apart.
    nearest_inputs = [
        nn_query,
        nn_source_lang,
        nn_langs,
        nn_top_n,
        nn_min_score,
        nn_csls_k,
        nn_score_method,
        nn_include_source,
        nn_use_surface,
    ]
    nearest_outputs = [nn_results, nn_summary]

    browse_inputs = [vocab_lang, vocab_filter, vocab_limit]

    # Translation: run on button click or when Enter is pressed in the query box.
    translate_btn.click(
        translate_ui,
        inputs=translate_inputs,
        outputs=translate_outputs,
    )
    query.submit(
        translate_ui,
        inputs=translate_inputs,
        outputs=translate_outputs,
    )
    # Changing the source language re-selects every other language as a target.
    source_lang.change(update_targets, inputs=source_lang, outputs=target_langs)

    # Nearest neighbors: same click/submit pairing as the translation tab.
    nn_btn.click(
        nearest_ui,
        inputs=nearest_inputs,
        outputs=nearest_outputs,
    )
    nn_query.submit(
        nearest_ui,
        inputs=nearest_inputs,
        outputs=nearest_outputs,
    )
    # Both the source language and the "include source" toggle decide which
    # neighbor languages are ticked.
    nn_source_lang.change(
        update_neighbor_langs,
        inputs=[nn_source_lang, nn_include_source],
        outputs=nn_langs,
    )
    nn_include_source.change(
        update_neighbor_langs,
        inputs=[nn_source_lang, nn_include_source],
        outputs=nn_langs,
    )

    # Vocabulary browser: any control change refreshes the table.
    vocab_lang.change(browse_ui, inputs=browse_inputs, outputs=vocab_results)
    vocab_filter.change(browse_ui, inputs=browse_inputs, outputs=vocab_results)
    vocab_limit.change(browse_ui, inputs=browse_inputs, outputs=vocab_results)

    # On page load: initialize the controls first, then run the default
    # translation query and fill the vocabulary table.  The output order here
    # must match the order of the values returned by ``initialize_ui``.
    demo.load(
        initialize_ui,
        outputs=[
            status_md,
            source_lang,
            target_langs,
            top_k,
            min_score,
            csls_k,
            candidate_retrieval_k,
            csls_prefetch_k,
            bidirectional,
            nn_source_lang,
            nn_langs,
            nn_csls_k,
            vocab_lang,
            artifact_info,
        ],
    ).then(
        translate_ui,
        inputs=translate_inputs,
        outputs=translate_outputs,
    ).then(
        browse_ui,
        inputs=browse_inputs,
        outputs=vocab_results,
    )
1036
+
1037
+
1038
if __name__ == "__main__":
    # Script entry point: enable the request queue, then start the server.
    queued_demo = demo.queue()
    queued_demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ faiss-cpu
3
+ numpy
4
+ pandas
5
+ boto3
6
+ botocore