Maslionok committed on
Commit
a69e0b2
·
1 Parent(s): 09a837f

cleaned up

Browse files
Files changed (3) hide show
  1. README.md +9 -6
  2. app.py +181 -500
  3. requirements.txt +0 -1
README.md CHANGED
@@ -1,14 +1,16 @@
1
  ---
2
- title: Multilingual Static Word Embeddings Demo
3
  sdk: gradio
4
  app_file: app.py
5
  pinned: false
6
  ---
7
 
8
- # Multilingual Static Word Embeddings Demo
9
 
10
- This Space loads a saved aligned multilingual embedding space and lets users
11
- search translations and nearest neighbors with adjustable retrieval parameters.
 
 
12
 
13
  Required artifact files:
14
 
@@ -20,9 +22,9 @@ The app does not use `aligned_all.vec`.
20
 
21
  ## Runtime configuration
22
 
23
- By default, the app lists the newest artifact folder under:
24
 
25
- `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/`
26
 
27
  Set these Hugging Face Space secrets for S3-compatible storage:
28
 
@@ -35,6 +37,7 @@ Optional environment overrides:
35
  - `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
36
  `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
37
  - `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_space_*.json`
 
38
  - `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
39
 
40
  Defaults for `top_k`, `min_score`, `csls_k`, `candidate_retrieval_k`,
 
1
  ---
2
+ title: Multilingual Dictionary Explorer
3
  sdk: gradio
4
  app_file: app.py
5
  pinned: false
6
  ---
7
 
8
+ # Multilingual Dictionary Explorer
9
 
10
+ This Space is a Gradio UI for the same lookup logic as
11
+ `query_multilingual_space.py`: enter a source language and query word, then get
12
+ translations to all other languages using FAISS, CSLS, and optional
13
+ bidirectional consistency.
14
 
15
  Required artifact files:
16
 
 
22
 
23
  ## Runtime configuration
24
 
25
+ By default, the app downloads this artifact folder:
26
 
27
+ `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
28
 
29
  Set these Hugging Face Space secrets for S3-compatible storage:
30
 
 
37
  - `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
38
  `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
39
  - `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_space_*.json`
40
+ - `SPACE_DIR`: local artifact folder, useful for local testing
41
  - `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
42
 
43
  Defaults for `top_k`, `min_score`, `csls_k`, `candidate_retrieval_k`,
app.py CHANGED
@@ -1,6 +1,5 @@
1
  from __future__ import annotations
2
 
3
- import difflib
4
  import gc
5
  import json
6
  import os
@@ -16,7 +15,6 @@ from urllib.parse import urlparse
16
  import boto3
17
  import gradio as gr
18
  import numpy as np
19
- import pandas as pd
20
  from botocore.config import Config
21
 
22
 
@@ -24,33 +22,13 @@ DEFAULT_ARTIFACT_PREFIX = (
24
  "s3://131-component-staging/"
25
  "multilingual-static-word-embeddings/stage-6/"
26
  )
27
- ARTIFACT_URI_ENV = "SPACE_ARTIFACT_S3_URI"
28
- ARTIFACT_PREFIX_ENV = "SPACE_ARTIFACT_S3_PREFIX"
29
- CACHE_ROOT = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
 
 
30
  REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
31
- DEFAULT_LANGUAGES = ["de", "en", "fr", "lb"]
32
-
33
- TRANSLATION_COLUMNS = [
34
- "target_lang",
35
- "translation",
36
- "token",
37
- "score",
38
- "cosine",
39
- "rank",
40
- "bidirectional",
41
- "id",
42
- "source_vec_file",
43
- ]
44
- NEIGHBOR_COLUMNS = [
45
- "lang",
46
- "word",
47
- "token",
48
- "score",
49
- "cosine",
50
- "rank",
51
- "id",
52
- ]
53
- VOCAB_COLUMNS = ["id", "lang", "surface", "token", "source_vec_file"]
54
 
55
 
56
  @dataclass
@@ -89,7 +67,7 @@ class Space:
89
  def parse_s3_uri(uri: str) -> tuple[str, str]:
90
  parsed = urlparse(uri)
91
  if parsed.scheme != "s3" or not parsed.netloc:
92
- raise ValueError(f"Expected s3://bucket/key URI, got: {uri}")
93
  return parsed.netloc, parsed.path.lstrip("/")
94
 
95
 
@@ -98,6 +76,7 @@ def make_s3_client():
98
  secret_key = os.getenv("SE_SECRET_KEY") or os.getenv("AWS_SECRET_ACCESS_KEY")
99
  endpoint_url = os.getenv("SE_HOST_URL") or os.getenv("AWS_ENDPOINT_URL")
100
  region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
 
101
  if endpoint_url and not endpoint_url.startswith(("http://", "https://")):
102
  endpoint_url = f"https://{endpoint_url}"
103
 
@@ -107,7 +86,7 @@ def make_s3_client():
107
  "config": Config(
108
  signature_version="s3v4",
109
  s3={"addressing_style": "path"},
110
- retries={"max_attempts": 5, "mode": "standard"},
111
  ),
112
  }
113
  if endpoint_url:
@@ -119,66 +98,76 @@ def make_s3_client():
119
  return boto3.client(**kwargs)
120
 
121
 
122
- def find_latest_artifact_uri(client) -> str:
123
- explicit_uri = os.getenv(ARTIFACT_URI_ENV, "").strip()
124
- if explicit_uri:
125
- explicit_uri = explicit_uri.rstrip("/")
126
- if "multilingual_space_" in explicit_uri:
127
- return explicit_uri
128
- bucket, prefix = parse_s3_uri(explicit_uri)
129
- return find_latest_artifact_uri_under_prefix(client, bucket, prefix)
130
-
131
- prefix_uri = os.getenv(ARTIFACT_PREFIX_ENV, DEFAULT_ARTIFACT_PREFIX).strip()
132
- bucket, prefix = parse_s3_uri(prefix_uri)
133
- return find_latest_artifact_uri_under_prefix(client, bucket, prefix)
134
 
 
 
 
135
 
136
- def find_latest_artifact_uri_under_prefix(client, bucket: str, prefix: str) -> str:
 
137
  prefix = prefix.rstrip("/") + "/"
138
-
139
  pattern = re.compile(r"(.*multilingual_space_(\d{8}_\d{6})\.json)/config\.json$")
140
  candidates: list[tuple[str, str]] = []
 
141
  paginator = client.get_paginator("list_objects_v2")
142
  for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
143
  for obj in page.get("Contents", []):
144
- key = obj["Key"]
145
- match = pattern.match(key)
146
  if match:
147
  candidates.append((match.group(2), match.group(1)))
148
 
149
  if not candidates:
150
  raise FileNotFoundError(
151
- f"No multilingual_space_*.json/config.json found under s3://{bucket}/{prefix}"
152
  )
153
 
154
- _, latest_key = sorted(candidates)[-1]
155
- return f"s3://{bucket}/{latest_key}"
156
 
157
 
158
- def artifact_cache_dir(artifact_uri: str) -> Path:
159
- _, key = parse_s3_uri(artifact_uri)
160
- name = Path(key.rstrip("/")).name
161
- return CACHE_ROOT / name
162
 
163
 
164
- def download_artifact() -> tuple[Path, str]:
165
  client = make_s3_client()
166
- artifact_uri = find_latest_artifact_uri(client)
167
- local_dir = artifact_cache_dir(artifact_uri)
168
  local_dir.mkdir(parents=True, exist_ok=True)
169
 
170
- bucket, key_prefix = parse_s3_uri(artifact_uri)
171
- key_prefix = key_prefix.rstrip("/")
172
-
173
  for filename in REQUIRED_FILES:
174
- local_path = local_dir / filename
175
- if local_path.exists() and local_path.stat().st_size > 0:
176
  continue
177
- key = f"{key_prefix}/{filename}"
178
- print(f"Downloading s3://{bucket}/{key} -> {local_path}", file=sys.stderr)
179
- client.download_file(bucket, key, str(local_path))
180
 
181
- return local_dir, artifact_uri
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
 
184
  def strip_diacritics(text: str) -> str:
@@ -202,18 +191,24 @@ def is_good_token(token: str, min_len: int = 4) -> bool:
202
 
203
 
204
  def read_config(space_dir: Path) -> dict[str, Any]:
205
- with (space_dir / "config.json").open("r", encoding="utf-8") as f:
 
 
 
206
  return json.load(f)
207
 
208
 
209
  def read_metadata(space_dir: Path) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
210
- metadata_path = space_dir / "all_metadata.jsonl"
 
 
 
211
  metadata: list[dict[str, Any] | None] = []
212
  ids_by_lang: dict[str, list[int]] = {}
213
-
214
- with metadata_path.open("r", encoding="utf-8") as f:
215
  for line in f:
216
- if not line.strip():
 
217
  continue
218
  meta = json.loads(line)
219
  row_id = int(meta["id"])
@@ -261,17 +256,17 @@ def normalize_rows(vecs: np.ndarray) -> np.ndarray:
261
 
262
 
263
  def load_vectors_from_faiss(space_dir: Path, ids_by_lang: dict[str, list[int]]) -> dict[str, np.ndarray]:
264
- faiss_path = space_dir / "aligned_all.faiss"
265
  try:
266
  import faiss # type: ignore
267
  except ImportError as exc:
268
- raise RuntimeError(
269
- "faiss-cpu is required. The Space must install faiss-cpu from requirements.txt."
270
- ) from exc
 
 
271
 
272
  print(f"Loading FAISS index: {faiss_path}", file=sys.stderr)
273
  index = faiss.read_index(str(faiss_path))
274
-
275
  vectors_by_lang: dict[str, np.ndarray] = {}
276
  for lang, ids in sorted(ids_by_lang.items()):
277
  print(f"Reconstructing {lang}: {len(ids)} vectors", file=sys.stderr)
@@ -288,16 +283,15 @@ def build_lookup(languages: dict[str, LangVectors]) -> dict[str, dict[str, list[
288
  lang_lookup: dict[str, list[int]] = {}
289
  for global_id, meta in zip(data.ids.tolist(), data.metas):
290
  for value in (meta.get("token"), meta.get("surface")):
291
- if not value:
292
- continue
293
- lang_lookup.setdefault(lookup_key(str(value)), []).append(int(global_id))
294
  lookup[lang] = lang_lookup
295
  return lookup
296
 
297
 
298
  @lru_cache(maxsize=1)
299
  def load_space() -> Space:
300
- space_dir, artifact_uri = download_artifact()
301
  config = read_config(space_dir)
302
  metadata, ids_by_lang = read_metadata(space_dir)
303
  vectors_by_lang = load_vectors_from_faiss(space_dir, ids_by_lang)
@@ -335,11 +329,12 @@ def load_space() -> Space:
335
 
336
  def default_options(config: dict[str, Any]) -> RuntimeOptions:
337
  bidi_config = config.get("bidirectional_consistency") or {}
 
338
  return RuntimeOptions(
339
- top_k=int(config.get("top_k", 3)),
340
  min_score=float(config.get("min_score", 0.15)),
341
  csls_k=int(config.get("csls_k", 10)),
342
- candidate_retrieval_k=int(config.get("candidate_retrieval_k", 9)),
343
  csls_prefetch_k=int(config.get("csls_prefetch_k", 50)),
344
  bidirectional=bool(bidi_config.get("enabled", True)),
345
  score_method="csls",
@@ -483,63 +478,31 @@ def format_word(meta: dict[str, Any], opts: RuntimeOptions) -> str:
483
  return str(meta.get("token") or meta.get("surface") or "")
484
 
485
 
486
- def suggestions(space: Space, lang: str, query: str, limit: int = 8) -> list[str]:
487
- lang_lookup = space.lookup.get(lang, {})
488
- key = lookup_key(query)
489
- close_keys = difflib.get_close_matches(key, lang_lookup.keys(), n=limit, cutoff=0.72)
490
- labels = []
491
- for close_key in close_keys:
492
- row_id = lang_lookup[close_key][0]
493
- meta = get_meta(space, row_id)
494
- label = str(meta.get("surface") or meta.get("token") or "")
495
- if label and label not in labels:
496
- labels.append(label)
497
- return labels
498
-
499
-
500
  def resolve_query(space: Space, lang: str, query: str) -> tuple[int, dict[str, Any], str]:
501
  if lang not in space.by_lang:
502
  raise ValueError(f"Unknown language {lang!r}. Available: {', '.join(space.languages)}")
503
-
504
- query = query.strip()
505
- if not query:
506
  raise ValueError("Enter a query word.")
507
 
508
  matches = space.lookup.get(lang, {}).get(lookup_key(query), [])
509
  if not matches:
510
- hint = suggestions(space, lang, query)
511
- if hint:
512
- raise LookupError(f"No exact match. Close matches: {', '.join(hint)}")
513
  raise LookupError(f"No exact token/surface match for {lang}:{query!r}")
514
 
515
- row_id = int(matches[0])
516
  message = ""
517
  if len(matches) > 1:
518
- shown = []
519
- for match_id in matches[:5]:
520
- meta = get_meta(space, match_id)
521
- shown.append(f"{meta.get('surface') or meta.get('token')} (id {match_id})")
522
- message = f"Matched {len(matches)} entries; using {shown[0]}."
523
 
 
524
  return row_id, get_meta(space, row_id), message
525
 
526
 
527
- def translation_dataframe() -> pd.DataFrame:
528
- return pd.DataFrame(columns=TRANSLATION_COLUMNS)
529
-
530
-
531
- def neighbor_dataframe() -> pd.DataFrame:
532
- return pd.DataFrame(columns=NEIGHBOR_COLUMNS)
533
-
534
-
535
- def vocabulary_dataframe() -> pd.DataFrame:
536
- return pd.DataFrame(columns=VOCAB_COLUMNS)
537
-
538
-
539
- def translate_ui(
540
  query: str,
541
  source_lang: str,
542
- target_langs: list[str] | None,
543
  top_k: int,
544
  min_score: float,
545
  csls_k: int,
@@ -550,10 +513,9 @@ def translate_ui(
550
  filter_stopwords: bool,
551
  filter_bad_tokens: bool,
552
  use_surface: bool,
553
- ) -> tuple[pd.DataFrame, str]:
554
  try:
555
  space = load_space()
556
- targets = target_langs or [lang for lang in space.languages if lang != source_lang]
557
  opts = make_options(
558
  top_k,
559
  min_score,
@@ -568,19 +530,25 @@ def translate_ui(
568
  )
569
  source_id, source_meta, match_message = resolve_query(space, source_lang, query)
570
  source_vec = get_vec(space, source_id)
571
- rows: list[dict[str, Any]] = []
572
- grouped: list[str] = [
573
- f"Source: `{source_lang}:{format_word(source_meta, opts)}` "
574
- f"(token `{source_meta.get('token')}`, id `{source_id}`)"
 
 
 
 
 
 
575
  ]
576
  if match_message:
577
- grouped.append(match_message)
578
 
579
- for target_lang in targets:
580
- if target_lang == source_lang or target_lang not in space.by_lang:
581
- continue
582
  candidates = rank_candidates(space, source_vec, source_lang, target_lang, opts)
583
  kept: list[dict[str, Any]] = []
 
584
  for cand in candidates:
585
  if opts.bidirectional:
586
  reverse = rank_candidates(
@@ -596,348 +564,129 @@ def translate_ui(
596
  continue
597
  else:
598
  cand["bidirectional"] = False
 
599
  kept.append(cand)
600
  if len(kept) >= opts.top_k:
601
  break
602
 
603
- if kept:
604
- grouped.append(f"\n{target_lang}:")
605
- for i, cand in enumerate(kept, 1):
606
- meta = cand["meta"]
607
- word = format_word(meta, opts)
608
- grouped.append(f"{i}. {word} ({cand['score']:.4f})")
609
- rows.append(
610
- {
611
- "target_lang": target_lang,
612
- "translation": word,
613
- "token": meta.get("token"),
614
- "score": round(float(cand["score"]), 6),
615
- "cosine": round(float(cand["cosine"]), 6),
616
- "rank": int(cand["rank"]),
617
- "bidirectional": bool(cand["bidirectional"]),
618
- "id": int(cand["global_id"]),
619
- "source_vec_file": meta.get("source_vec_file"),
620
- }
621
- )
622
- else:
623
- grouped.append(f"\n{target_lang}: no candidates after filters")
624
-
625
- return pd.DataFrame(rows, columns=TRANSLATION_COLUMNS), "\n".join(grouped)
626
- except Exception as exc:
627
- return translation_dataframe(), f"Error: {exc}"
628
-
629
-
630
- def nearest_ui(
631
- query: str,
632
- source_lang: str,
633
- neighbor_langs: list[str] | None,
634
- top_n: int,
635
- min_score: float,
636
- csls_k: int,
637
- score_method: str,
638
- include_source_language: bool,
639
- use_surface: bool,
640
- ) -> tuple[pd.DataFrame, str]:
641
- try:
642
- space = load_space()
643
- opts = make_options(
644
- top_n,
645
- min_score,
646
- csls_k,
647
- max(top_n + 5, 20),
648
- max(top_n + 5, 50),
649
- False,
650
- score_method,
651
- False,
652
- False,
653
- use_surface,
654
- )
655
- source_id, source_meta, match_message = resolve_query(space, source_lang, query)
656
- source_vec = get_vec(space, source_id)
657
- targets = neighbor_langs or space.languages
658
- if not include_source_language:
659
- targets = [lang for lang in targets if lang != source_lang]
660
-
661
- rows: list[dict[str, Any]] = []
662
- for target_lang in targets:
663
- if target_lang not in space.by_lang:
664
  continue
665
- candidates = rank_candidates(
666
- space,
667
- source_vec,
668
- source_lang,
669
- target_lang,
670
- opts,
671
- apply_filters=False,
672
- )
673
- for cand in candidates:
674
- if int(cand["global_id"]) == source_id:
675
- continue
676
  meta = cand["meta"]
 
 
 
 
 
 
 
 
677
  rows.append(
678
- {
679
- "lang": target_lang,
680
- "word": format_word(meta, opts),
681
- "token": meta.get("token"),
682
- "score": round(float(cand["score"]), 6),
683
- "cosine": round(float(cand["cosine"]), 6),
684
- "rank": int(cand["rank"]),
685
- "id": int(cand["global_id"]),
686
- }
687
  )
688
- if len([row for row in rows if row["lang"] == target_lang]) >= top_n:
689
- break
690
 
691
- rows = sorted(rows, key=lambda row: row["score"], reverse=True)
692
- status = (
693
- f"Source: `{source_lang}:{format_word(source_meta, opts)}` "
694
- f"(token `{source_meta.get('token')}`, id `{source_id}`)"
695
- )
696
- if match_message:
697
- status += f"\n\n{match_message}"
698
- return pd.DataFrame(rows, columns=NEIGHBOR_COLUMNS), status
699
  except Exception as exc:
700
- return neighbor_dataframe(), f"Error: {exc}"
701
-
702
-
703
- def browse_ui(lang: str, filter_text: str, limit: int) -> pd.DataFrame:
704
- try:
705
- space = load_space()
706
- if lang not in space.by_lang:
707
- return vocabulary_dataframe()
708
- needle = lookup_key(filter_text or "")
709
- rows = []
710
- for row_id, meta in zip(space.by_lang[lang].ids.tolist(), space.by_lang[lang].metas):
711
- surface = str(meta.get("surface") or "")
712
- token = str(meta.get("token") or "")
713
- if needle and needle not in lookup_key(surface) and needle not in lookup_key(token):
714
- continue
715
- rows.append(
716
- {
717
- "id": int(row_id),
718
- "lang": lang,
719
- "surface": surface,
720
- "token": token,
721
- "source_vec_file": meta.get("source_vec_file"),
722
- }
723
- )
724
- if len(rows) >= int(limit):
725
- break
726
- return pd.DataFrame(rows, columns=VOCAB_COLUMNS)
727
- except Exception:
728
- return vocabulary_dataframe()
729
-
730
 
731
- def config_markdown(space: Space) -> str:
732
- config = space.config
733
- vocab_sizes = config.get("vocab_sizes") or {
734
- lang: len(space.by_lang[lang].metas) for lang in space.languages
735
- }
736
- bidi = config.get("bidirectional_consistency") or {}
737
- lines = [
738
- f"Artifact: `{space.artifact_uri}`",
739
- f"Created: `{config.get('created_at', 'unknown')}`",
740
- f"Languages: `{', '.join(space.languages)}`",
741
- f"Pivot language: `{config.get('pivot_lang', 'unknown')}`",
742
- f"Vector dim: `{config.get('vector_dim', 'unknown')}`",
743
- f"Top N vocab: `{config.get('top_n_vocab', 'unknown')}`",
744
- f"Output top: `{config.get('out_top', 'unknown')}`",
745
- f"Default top_k: `{config.get('top_k', 3)}`",
746
- f"Default min_score: `{config.get('min_score', 0.15)}`",
747
- f"Default csls_k: `{config.get('csls_k', 10)}`",
748
- f"Bidirectional consistency: `{bool(bidi.get('enabled', True))}`",
749
- "",
750
- "Vocabulary sizes:",
751
- ]
752
- for lang, size in sorted(vocab_sizes.items()):
753
- lines.append(f"- `{lang}`: `{size}`")
754
- return "\n".join(lines)
755
 
756
-
757
- def initialize_ui():
758
  try:
759
  space = load_space()
760
  opts = default_options(space.config)
761
- source = space.config.get("pivot_lang", "de")
762
- if source not in space.languages:
763
- source = space.languages[0]
764
- targets = [lang for lang in space.languages if lang != source]
765
- status = f"Loaded `{space.artifact_uri}` with `{sum(len(v.metas) for v in space.by_lang.values())}` vectors."
 
 
766
  return (
767
  status,
768
- gr.update(choices=space.languages, value=source),
769
- gr.update(choices=space.languages, value=targets),
770
  opts.top_k,
771
  opts.min_score,
772
  opts.csls_k,
773
  opts.candidate_retrieval_k,
774
  opts.csls_prefetch_k,
775
  opts.bidirectional,
776
- gr.update(choices=space.languages, value=source),
777
- gr.update(choices=space.languages, value=space.languages),
778
- opts.csls_k,
779
- gr.update(choices=space.languages, value=source),
780
- config_markdown(space),
781
  )
782
  except Exception as exc:
783
- status = f"Load error: {exc}"
784
  return (
785
- status,
786
- gr.update(choices=DEFAULT_LANGUAGES, value="de"),
787
- gr.update(choices=DEFAULT_LANGUAGES, value=["en", "fr", "lb"]),
788
  3,
789
  0.15,
790
  10,
791
  9,
792
  50,
793
  True,
794
- gr.update(choices=DEFAULT_LANGUAGES, value="de"),
795
- gr.update(choices=DEFAULT_LANGUAGES, value=DEFAULT_LANGUAGES),
796
- 10,
797
- gr.update(choices=DEFAULT_LANGUAGES, value="de"),
798
- status,
799
- )
800
-
801
-
802
- def update_targets(source_lang: str) -> gr.CheckboxGroup:
803
- try:
804
- space = load_space()
805
- return gr.update(
806
- choices=space.languages,
807
- value=[lang for lang in space.languages if lang != source_lang],
808
  )
809
- except Exception:
810
- return gr.update(
811
- choices=DEFAULT_LANGUAGES,
812
- value=[lang for lang in DEFAULT_LANGUAGES if lang != source_lang],
813
- )
814
-
815
 
816
- def update_neighbor_langs(source_lang: str, include_source: bool) -> gr.CheckboxGroup:
817
- try:
818
- space = load_space()
819
- choices = space.languages
820
- except Exception:
821
- choices = DEFAULT_LANGUAGES
822
- values = choices if include_source else [lang for lang in choices if lang != source_lang]
823
- return gr.update(choices=choices, value=values)
824
 
825
-
826
- css = """
 
827
  .app-title h1 { margin-bottom: 0.15rem; }
828
- .status-line { font-size: 0.9rem; color: #475569; }
 
829
  """
830
 
831
 
832
- with gr.Blocks(title="Multilingual Static Word Embeddings", css=css) as demo:
833
  gr.Markdown(
834
- "# Multilingual Static Word Embeddings\n"
835
- "Search the aligned FAISS space for cross-lingual word neighbors."
 
836
  )
837
- status_md = gr.Markdown("Loading artifacts...", elem_classes=["status-line"])
838
-
839
- with gr.Tab("Translate"):
840
- with gr.Row():
841
- with gr.Column(scale=1, min_width=320):
842
- query = gr.Textbox(label="Query word", value="haus")
843
- source_lang = gr.Dropdown(
844
- label="Source language",
845
- choices=DEFAULT_LANGUAGES,
846
- value="de",
847
- )
848
- target_langs = gr.CheckboxGroup(
849
- label="Target languages",
850
- choices=DEFAULT_LANGUAGES,
851
- value=["en", "fr", "lb"],
852
- )
853
- translate_btn = gr.Button("Search", variant="primary")
854
-
855
- with gr.Accordion("Retrieval parameters", open=True):
856
- top_k = gr.Slider(1, 20, value=3, step=1, label="Top K")
857
- min_score = gr.Slider(-2.0, 2.0, value=0.15, step=0.01, label="Min score")
858
- score_method = gr.Radio(
859
- ["csls", "cosine"],
860
- value="csls",
861
- label="Score method",
862
- )
863
- csls_k = gr.Slider(1, 50, value=10, step=1, label="CSLS K")
864
- candidate_retrieval_k = gr.Slider(
865
- 1,
866
- 100,
867
- value=9,
868
- step=1,
869
- label="Candidate retrieval K",
870
- )
871
- csls_prefetch_k = gr.Slider(
872
- 10,
873
- 500,
874
- value=50,
875
- step=1,
876
- label="CSLS prefetch K",
877
- )
878
- bidirectional = gr.Checkbox(value=True, label="Bidirectional consistency")
879
- filter_stopwords = gr.Checkbox(value=True, label="Filter stopwords")
880
- filter_bad_tokens = gr.Checkbox(value=True, label="Filter noisy tokens")
881
- use_surface = gr.Checkbox(value=True, label="Show surface forms")
882
-
883
- with gr.Column(scale=2):
884
- translate_summary = gr.Markdown()
885
- translation_results = gr.Dataframe(
886
- headers=TRANSLATION_COLUMNS,
887
- datatype=["str", "str", "str", "number", "number", "number", "bool", "number", "str"],
888
- interactive=False,
889
- wrap=True,
890
  )
891
-
892
- with gr.Tab("Nearest Neighbors"):
893
- with gr.Row():
894
- with gr.Column(scale=1, min_width=320):
895
- nn_query = gr.Textbox(label="Query word", value="haus")
896
- nn_source_lang = gr.Dropdown(
897
- label="Source language",
898
- choices=DEFAULT_LANGUAGES,
899
- value="de",
900
  )
901
- nn_langs = gr.CheckboxGroup(
902
- label="Neighbor languages",
903
- choices=DEFAULT_LANGUAGES,
904
- value=DEFAULT_LANGUAGES,
905
- )
906
- nn_top_n = gr.Slider(1, 50, value=20, step=1, label="Top N per language")
907
- nn_min_score = gr.Slider(-2.0, 2.0, value=-2.0, step=0.01, label="Min score")
908
- nn_score_method = gr.Radio(["csls", "cosine"], value="cosine", label="Score method")
909
- nn_csls_k = gr.Slider(1, 50, value=10, step=1, label="CSLS K")
910
- nn_include_source = gr.Checkbox(value=True, label="Include source language")
911
- nn_use_surface = gr.Checkbox(value=True, label="Show surface forms")
912
- nn_btn = gr.Button("Find neighbors", variant="primary")
913
- with gr.Column(scale=2):
914
- nn_summary = gr.Markdown()
915
- nn_results = gr.Dataframe(
916
- headers=NEIGHBOR_COLUMNS,
917
- datatype=["str", "str", "str", "number", "number", "number", "number"],
918
- interactive=False,
919
- wrap=True,
920
- )
921
-
922
- with gr.Tab("Browse Vocabulary"):
923
- with gr.Row():
924
- vocab_lang = gr.Dropdown(label="Language", choices=DEFAULT_LANGUAGES, value="de")
925
- vocab_filter = gr.Textbox(label="Filter", placeholder="Type part of a token or surface form")
926
- vocab_limit = gr.Slider(10, 500, value=100, step=10, label="Limit")
927
- vocab_results = gr.Dataframe(
928
- headers=VOCAB_COLUMNS,
929
- datatype=["number", "str", "str", "str", "str"],
930
- interactive=False,
931
- wrap=True,
932
- )
933
-
934
- with gr.Tab("Artifact Info"):
935
- artifact_info = gr.Markdown("Loading config...")
936
 
937
- translate_inputs = [
938
  query,
939
  source_lang,
940
- target_langs,
941
  top_k,
942
  min_score,
943
  csls_k,
@@ -949,90 +698,22 @@ with gr.Blocks(title="Multilingual Static Word Embeddings", css=css) as demo:
949
  filter_bad_tokens,
950
  use_surface,
951
  ]
952
- translate_btn.click(
953
- translate_ui,
954
- inputs=translate_inputs,
955
- outputs=[translation_results, translate_summary],
956
- )
957
- query.submit(
958
- translate_ui,
959
- inputs=translate_inputs,
960
- outputs=[translation_results, translate_summary],
961
- )
962
- source_lang.change(update_targets, inputs=source_lang, outputs=target_langs)
963
-
964
- nn_btn.click(
965
- nearest_ui,
966
- inputs=[
967
- nn_query,
968
- nn_source_lang,
969
- nn_langs,
970
- nn_top_n,
971
- nn_min_score,
972
- nn_csls_k,
973
- nn_score_method,
974
- nn_include_source,
975
- nn_use_surface,
976
- ],
977
- outputs=[nn_results, nn_summary],
978
- )
979
- nn_query.submit(
980
- nearest_ui,
981
- inputs=[
982
- nn_query,
983
- nn_source_lang,
984
- nn_langs,
985
- nn_top_n,
986
- nn_min_score,
987
- nn_csls_k,
988
- nn_score_method,
989
- nn_include_source,
990
- nn_use_surface,
991
- ],
992
- outputs=[nn_results, nn_summary],
993
- )
994
- nn_source_lang.change(
995
- update_neighbor_langs,
996
- inputs=[nn_source_lang, nn_include_source],
997
- outputs=nn_langs,
998
- )
999
- nn_include_source.change(
1000
- update_neighbor_langs,
1001
- inputs=[nn_source_lang, nn_include_source],
1002
- outputs=nn_langs,
1003
- )
1004
-
1005
- vocab_lang.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
1006
- vocab_filter.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
1007
- vocab_limit.change(browse_ui, inputs=[vocab_lang, vocab_filter, vocab_limit], outputs=vocab_results)
1008
 
1009
  demo.load(
1010
- initialize_ui,
1011
  outputs=[
1012
- status_md,
1013
  source_lang,
1014
- target_langs,
1015
  top_k,
1016
  min_score,
1017
  csls_k,
1018
  candidate_retrieval_k,
1019
  csls_prefetch_k,
1020
  bidirectional,
1021
- nn_source_lang,
1022
- nn_langs,
1023
- nn_csls_k,
1024
- vocab_lang,
1025
- artifact_info,
1026
  ],
1027
- ).then(
1028
- translate_ui,
1029
- inputs=translate_inputs,
1030
- outputs=[translation_results, translate_summary],
1031
- ).then(
1032
- browse_ui,
1033
- inputs=[vocab_lang, vocab_filter, vocab_limit],
1034
- outputs=vocab_results,
1035
- )
1036
 
1037
 
1038
  if __name__ == "__main__":
 
1
  from __future__ import annotations
2
 
 
3
  import gc
4
  import json
5
  import os
 
15
  import boto3
16
  import gradio as gr
17
  import numpy as np
 
18
  from botocore.config import Config
19
 
20
 
 
22
  "s3://131-component-staging/"
23
  "multilingual-static-word-embeddings/stage-6/"
24
  )
25
+ DEFAULT_ARTIFACT_URI = (
26
+ DEFAULT_ARTIFACT_PREFIX + "multilingual_space_20260521_133953.json"
27
+ )
28
+ DEFAULT_LOCAL_SPACE = Path("multilingual_space_20260521_133953.json")
29
+ DEFAULT_LANGS = ["de", "en", "fr", "lb"]
30
  REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
31
+ CACHE_DIR = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
 
34
  @dataclass
 
67
  def parse_s3_uri(uri: str) -> tuple[str, str]:
68
  parsed = urlparse(uri)
69
  if parsed.scheme != "s3" or not parsed.netloc:
70
+ raise ValueError(f"Expected s3://bucket/key URI, got {uri!r}")
71
  return parsed.netloc, parsed.path.lstrip("/")
72
 
73
 
 
76
  secret_key = os.getenv("SE_SECRET_KEY") or os.getenv("AWS_SECRET_ACCESS_KEY")
77
  endpoint_url = os.getenv("SE_HOST_URL") or os.getenv("AWS_ENDPOINT_URL")
78
  region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
79
+
80
  if endpoint_url and not endpoint_url.startswith(("http://", "https://")):
81
  endpoint_url = f"https://{endpoint_url}"
82
 
 
86
  "config": Config(
87
  signature_version="s3v4",
88
  s3={"addressing_style": "path"},
89
+ retries={"max_attempts": 3, "mode": "standard"},
90
  ),
91
  }
92
  if endpoint_url:
 
98
  return boto3.client(**kwargs)
99
 
100
 
101
+ def latest_artifact_uri(client) -> str:
102
+ explicit = os.getenv("SPACE_ARTIFACT_S3_URI", "").strip().rstrip("/")
103
+ if explicit:
104
+ return explicit
 
 
 
 
 
 
 
 
105
 
106
+ prefix_override = os.getenv("SPACE_ARTIFACT_S3_PREFIX", "").strip()
107
+ if not prefix_override:
108
+ return DEFAULT_ARTIFACT_URI
109
 
110
+ prefix_uri = prefix_override
111
+ bucket, prefix = parse_s3_uri(prefix_uri)
112
  prefix = prefix.rstrip("/") + "/"
 
113
  pattern = re.compile(r"(.*multilingual_space_(\d{8}_\d{6})\.json)/config\.json$")
114
  candidates: list[tuple[str, str]] = []
115
+
116
  paginator = client.get_paginator("list_objects_v2")
117
  for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
118
  for obj in page.get("Contents", []):
119
+ match = pattern.match(obj["Key"])
 
120
  if match:
121
  candidates.append((match.group(2), match.group(1)))
122
 
123
  if not candidates:
124
  raise FileNotFoundError(
125
+ f"No multilingual_space_*.json/config.json found under {prefix_uri}"
126
  )
127
 
128
+ _, key = sorted(candidates)[-1]
129
+ return f"s3://{bucket}/{key}"
130
 
131
 
132
+ def local_cache_for_uri(uri: str) -> Path:
133
+ _, key = parse_s3_uri(uri)
134
+ return CACHE_DIR / Path(key.rstrip("/")).name
 
135
 
136
 
137
+ def download_space_from_s3() -> tuple[Path, str]:
138
  client = make_s3_client()
139
+ uri = latest_artifact_uri(client)
140
+ local_dir = local_cache_for_uri(uri)
141
  local_dir.mkdir(parents=True, exist_ok=True)
142
 
143
+ bucket, prefix = parse_s3_uri(uri)
144
+ prefix = prefix.rstrip("/")
 
145
  for filename in REQUIRED_FILES:
146
+ dst = local_dir / filename
147
+ if dst.exists() and dst.stat().st_size > 0:
148
  continue
149
+ key = f"{prefix}/{filename}"
150
+ print(f"Downloading s3://{bucket}/{key}", file=sys.stderr)
151
+ client.download_file(bucket, key, str(dst))
152
 
153
+ return local_dir, uri
154
+
155
+
156
+ def find_space_dir() -> tuple[Path, str]:
157
+ local_override = os.getenv("SPACE_DIR", "").strip()
158
+ if local_override:
159
+ path = Path(local_override)
160
+ if path.exists():
161
+ return path, str(path)
162
+
163
+ if DEFAULT_LOCAL_SPACE.exists():
164
+ return DEFAULT_LOCAL_SPACE, str(DEFAULT_LOCAL_SPACE)
165
+
166
+ local_candidates = sorted(Path(".").glob("multilingual_space_*.json"))
167
+ if local_candidates:
168
+ return local_candidates[-1], str(local_candidates[-1])
169
+
170
+ return download_space_from_s3()
171
 
172
 
173
  def strip_diacritics(text: str) -> str:
 
191
 
192
 
193
  def read_config(space_dir: Path) -> dict[str, Any]:
194
+ path = space_dir / "config.json"
195
+ if not path.exists():
196
+ raise FileNotFoundError(f"Missing config.json in {space_dir}")
197
+ with path.open("r", encoding="utf-8") as f:
198
  return json.load(f)
199
 
200
 
201
  def read_metadata(space_dir: Path) -> tuple[list[dict[str, Any]], dict[str, list[int]]]:
202
+ path = space_dir / "all_metadata.jsonl"
203
+ if not path.exists():
204
+ raise FileNotFoundError(f"Missing all_metadata.jsonl in {space_dir}")
205
+
206
  metadata: list[dict[str, Any] | None] = []
207
  ids_by_lang: dict[str, list[int]] = {}
208
+ with path.open("r", encoding="utf-8") as f:
 
209
  for line in f:
210
+ line = line.strip()
211
+ if not line:
212
  continue
213
  meta = json.loads(line)
214
  row_id = int(meta["id"])
 
256
 
257
 
258
  def load_vectors_from_faiss(space_dir: Path, ids_by_lang: dict[str, list[int]]) -> dict[str, np.ndarray]:
 
259
  try:
260
  import faiss # type: ignore
261
  except ImportError as exc:
262
+ raise RuntimeError("faiss-cpu is required to read aligned_all.faiss") from exc
263
+
264
+ faiss_path = space_dir / "aligned_all.faiss"
265
+ if not faiss_path.exists():
266
+ raise FileNotFoundError(f"Missing aligned_all.faiss in {space_dir}")
267
 
268
  print(f"Loading FAISS index: {faiss_path}", file=sys.stderr)
269
  index = faiss.read_index(str(faiss_path))
 
270
  vectors_by_lang: dict[str, np.ndarray] = {}
271
  for lang, ids in sorted(ids_by_lang.items()):
272
  print(f"Reconstructing {lang}: {len(ids)} vectors", file=sys.stderr)
 
283
  lang_lookup: dict[str, list[int]] = {}
284
  for global_id, meta in zip(data.ids.tolist(), data.metas):
285
  for value in (meta.get("token"), meta.get("surface")):
286
+ if value:
287
+ lang_lookup.setdefault(lookup_key(str(value)), []).append(int(global_id))
 
288
  lookup[lang] = lang_lookup
289
  return lookup
290
 
291
 
292
  @lru_cache(maxsize=1)
293
  def load_space() -> Space:
294
+ space_dir, artifact_uri = find_space_dir()
295
  config = read_config(space_dir)
296
  metadata, ids_by_lang = read_metadata(space_dir)
297
  vectors_by_lang = load_vectors_from_faiss(space_dir, ids_by_lang)
 
329
 
330
  def default_options(config: dict[str, Any]) -> RuntimeOptions:
331
  bidi_config = config.get("bidirectional_consistency") or {}
332
+ top_k = int(config.get("top_k", 3))
333
  return RuntimeOptions(
334
+ top_k=top_k,
335
  min_score=float(config.get("min_score", 0.15)),
336
  csls_k=int(config.get("csls_k", 10)),
337
+ candidate_retrieval_k=int(config.get("candidate_retrieval_k", top_k * 3)),
338
  csls_prefetch_k=int(config.get("csls_prefetch_k", 50)),
339
  bidirectional=bool(bidi_config.get("enabled", True)),
340
  score_method="csls",
 
478
  return str(meta.get("token") or meta.get("surface") or "")
479
 
480
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
481
def resolve_query(space: Space, lang: str, query: str) -> tuple[int, dict[str, Any], str]:
    """Resolve a query word to a single row of the multilingual space.

    The query is normalised with ``lookup_key`` and looked up in the
    per-language table ``space.lookup[lang]``. When several rows match, the
    first one is used and a note saying so is returned.

    Args:
        space: The loaded space; ``by_lang``, ``languages`` and ``lookup`` are read.
        lang: Source language code; must be a key of ``space.by_lang``.
        query: The word typed by the user.

    Returns:
        ``(row_id, meta, message)``; ``message`` is empty unless the query
        matched more than one row.

    Raises:
        ValueError: if ``lang`` is unknown or ``query`` is missing/blank.
        LookupError: if no row matches the query.
    """
    if lang not in space.by_lang:
        raise ValueError(f"Unknown language {lang!r}. Available: {', '.join(space.languages)}")
    # Treat a missing (None) query like a blank one so the user sees the
    # intended hint rather than an AttributeError message.
    if not query or not query.strip():
        raise ValueError("Enter a query word.")

    matches = space.lookup.get(lang, {}).get(lookup_key(query), [])
    if not matches:
        raise LookupError(f"No exact token/surface match for {lang}:{query!r}")

    row_id = int(matches[0])
    meta = get_meta(space, row_id)

    message = ""
    if len(matches) > 1:
        # Only the first match is ever reported, so only its metadata is fetched.
        first = f"{meta.get('surface') or meta.get('token')} (id {row_id})"
        message = f"Matched {len(matches)} entries; using the first: {first}"

    return row_id, meta, message
501
 
502
 
503
+ def translate_like_terminal(
 
 
 
 
 
 
 
 
 
 
 
 
504
  query: str,
505
  source_lang: str,
 
506
  top_k: int,
507
  min_score: float,
508
  csls_k: int,
 
513
  filter_stopwords: bool,
514
  filter_bad_tokens: bool,
515
  use_surface: bool,
516
+ ) -> tuple[str, list[list[Any]]]:
517
  try:
518
  space = load_space()
 
519
  opts = make_options(
520
  top_k,
521
  min_score,
 
530
  )
531
  source_id, source_meta, match_message = resolve_query(space, source_lang, query)
532
  source_vec = get_vec(space, source_id)
533
+ source_word = format_word(source_meta, opts)
534
+ target_langs = [lang for lang in space.languages if lang != source_lang]
535
+
536
+ lines = [
537
+ f"Query: {source_lang}:{source_word} "
538
+ f"(token={source_meta.get('token')}, id={source_id})",
539
+ f"Settings: score={opts.score_method}, top_k={opts.top_k}, "
540
+ f"min_score={opts.min_score}, csls_k={opts.csls_k}, "
541
+ f"candidate_retrieval_k={opts.candidate_retrieval_k}, "
542
+ f"bidirectional={opts.bidirectional}",
543
  ]
544
  if match_message:
545
+ lines.append(match_message)
546
 
547
+ rows: list[list[Any]] = []
548
+ for target_lang in target_langs:
 
549
  candidates = rank_candidates(space, source_vec, source_lang, target_lang, opts)
550
  kept: list[dict[str, Any]] = []
551
+
552
  for cand in candidates:
553
  if opts.bidirectional:
554
  reverse = rank_candidates(
 
564
  continue
565
  else:
566
  cand["bidirectional"] = False
567
+
568
  kept.append(cand)
569
  if len(kept) >= opts.top_k:
570
  break
571
 
572
+ lines.append("")
573
+ lines.append(f"{target_lang}:")
574
+ if not kept:
575
+ lines.append(" no candidates after filters")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  continue
577
+
578
+ for i, cand in enumerate(kept, 1):
 
 
 
 
 
 
 
 
 
579
  meta = cand["meta"]
580
+ word = format_word(meta, opts)
581
+ token = meta.get("token")
582
+ bidi = "yes" if cand["bidirectional"] else "no"
583
+ lines.append(
584
+ f" {i}. {word} "
585
+ f"(token={token}, score={cand['score']:.4f}, "
586
+ f"cosine={cand['cosine']:.4f}, bidi={bidi})"
587
+ )
588
  rows.append(
589
+ [
590
+ target_lang,
591
+ i,
592
+ word,
593
+ token,
594
+ round(float(cand["score"]), 6),
595
+ round(float(cand["cosine"]), 6),
596
+ bidi,
597
+ ]
598
  )
 
 
599
 
600
+ return "\n".join(lines), rows
 
 
 
 
 
 
 
601
  except Exception as exc:
602
+ return f"Error: {exc}", []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
603
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
 
605
def initialize() -> tuple[Any, ...]:
    """Produce the initial values for the status line and parameter widgets.

    On success the values come from the loaded space and its config: a status
    message, a dropdown update listing the available languages, and the
    default retrieval options. If anything fails while loading, a
    "Load error" status and fixed fallback values are returned instead, so
    the UI can still render.

    Returns:
        ``(status, language_update, top_k, min_score, csls_k,
        candidate_retrieval_k, csls_prefetch_k, bidirectional)``.
    """
    try:
        space = load_space()
        opts = default_options(space.config)

        # Fall back to the first available language when the configured
        # pivot language is not part of the space.
        preferred = space.config.get("pivot_lang", "de")
        source_lang = preferred if preferred in space.languages else space.languages[0]

        loaded_from = space.artifact_uri
        total_vectors = sum(len(item.metas) for item in space.by_lang.values())
        status = f"Loaded {loaded_from} with {total_vectors:,} vectors."

        language_update = gr.update(choices=space.languages, value=source_lang)
        return (
            status,
            language_update,
            opts.top_k,
            opts.min_score,
            opts.csls_k,
            opts.candidate_retrieval_k,
            opts.csls_prefetch_k,
            opts.bidirectional,
        )
    except Exception as exc:
        fallback_status = f"Load error: {exc}"
        fallback_languages = gr.update(choices=DEFAULT_LANGS, value="de")
        return (fallback_status, fallback_languages, 3, 0.15, 10, 9, 50, True)
 
 
 
 
 
 
637
 
 
 
 
 
 
 
 
 
638
 
639
+ CSS = """
640
+ body { background: #f7f5ef; }
641
+ .gradio-container { max-width: 1120px !important; }
642
  .app-title h1 { margin-bottom: 0.15rem; }
643
+ .status { color: #5f6b7a; font-size: 0.92rem; }
644
+ textarea { font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; }
645
  """
646
 
647
 
648
+ with gr.Blocks(title="Multilingual Dictionary Explorer", css=CSS) as demo:
649
  gr.Markdown(
650
+ "# Multilingual Dictionary Explorer\n"
651
+ "FAISS + CSLS translation lookup from the aligned multilingual space.",
652
+ elem_classes=["app-title"],
653
  )
654
+ status = gr.Markdown("Loading artifacts...", elem_classes=["status"])
655
+
656
+ with gr.Row():
657
+ with gr.Column(scale=1, min_width=320):
658
+ query = gr.Textbox(label="Query word", value="haus")
659
+ source_lang = gr.Dropdown(label="Language", choices=DEFAULT_LANGS, value="de")
660
+ search = gr.Button("Search", variant="primary")
661
+
662
+ with gr.Accordion("Parameters", open=False):
663
+ top_k = gr.Slider(1, 20, value=3, step=1, label="top_k")
664
+ min_score = gr.Slider(-2.0, 2.0, value=0.15, step=0.01, label="min_score")
665
+ csls_k = gr.Slider(1, 50, value=10, step=1, label="csls_k")
666
+ candidate_retrieval_k = gr.Slider(
667
+ 1, 100, value=9, step=1, label="candidate_retrieval_k"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
668
  )
669
+ csls_prefetch_k = gr.Slider(
670
+ 10, 500, value=50, step=1, label="csls_prefetch_k"
 
 
 
 
 
 
 
671
  )
672
+ score_method = gr.Radio(["csls", "cosine"], value="csls", label="score")
673
+ bidirectional = gr.Checkbox(value=True, label="bidirectional_consistency")
674
+ filter_stopwords = gr.Checkbox(value=True, label="filter stopwords")
675
+ filter_bad_tokens = gr.Checkbox(value=True, label="filter bad tokens")
676
+ use_surface = gr.Checkbox(value=True, label="show surface forms")
677
+
678
+ with gr.Column(scale=2):
679
+ output_text = gr.Textbox(label="Terminal-style output", lines=18)
680
+ output_table = gr.Dataframe(
681
+ headers=["target_lang", "rank", "word", "token", "score", "cosine", "bidi"],
682
+ datatype=["str", "number", "str", "str", "number", "number", "str"],
683
+ interactive=False,
684
+ wrap=True,
685
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
 
687
+ inputs = [
688
  query,
689
  source_lang,
 
690
  top_k,
691
  min_score,
692
  csls_k,
 
698
  filter_bad_tokens,
699
  use_surface,
700
  ]
701
+ search.click(translate_like_terminal, inputs=inputs, outputs=[output_text, output_table])
702
+ query.submit(translate_like_terminal, inputs=inputs, outputs=[output_text, output_table])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703
 
704
  demo.load(
705
+ initialize,
706
  outputs=[
707
+ status,
708
  source_lang,
 
709
  top_k,
710
  min_score,
711
  csls_k,
712
  candidate_retrieval_k,
713
  csls_prefetch_k,
714
  bidirectional,
 
 
 
 
 
715
  ],
716
+ ).then(translate_like_terminal, inputs=inputs, outputs=[output_text, output_table])
 
 
 
 
 
 
 
 
717
 
718
 
719
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,6 +1,5 @@
1
  gradio
2
  faiss-cpu
3
  numpy
4
- pandas
5
  boto3
6
  botocore
 
1
  gradio
2
  faiss-cpu
3
  numpy
 
4
  boto3
5
  botocore