Maslionok committed on
Commit
12c646e
·
1 Parent(s): a69e0b2

updated script to use the newest multilingual dictionary RUN

Browse files
Files changed (2) hide show
  1. README.md +5 -4
  2. app.py +14 -13
README.md CHANGED
@@ -22,9 +22,10 @@ The app does not use `aligned_all.vec`.
22
 
23
  ## Runtime configuration
24
 
25
- By default, the app downloads this artifact folder:
 
26
 
27
- `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
28
 
29
  Set these Hugging Face Space secrets for S3-compatible storage:
30
 
@@ -35,8 +36,8 @@ Set these Hugging Face Space secrets for S3-compatible storage:
35
  Optional environment overrides:
36
 
37
  - `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
38
- `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_space_20260521_133953.json`
39
- - `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_space_*.json`
40
  - `SPACE_DIR`: local artifact folder, useful for local testing
41
  - `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
42
 
 
22
 
23
  ## Runtime configuration
24
 
25
+ By default, the app scans the stage 6 prefix and downloads the newest artifact
26
+ folder that contains `config.json`:
27
 
28
+ `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/`
29
 
30
  Set these Hugging Face Space secrets for S3-compatible storage:
31
 
 
36
  Optional environment overrides:
37
 
38
  - `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
39
+ `s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_dict_20260603_122323`
40
+ - `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_dict_*` or `multilingual_space_*.json`
41
  - `SPACE_DIR`: local artifact folder, useful for local testing
42
  - `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
43
 
app.py CHANGED
@@ -22,10 +22,7 @@ DEFAULT_ARTIFACT_PREFIX = (
22
  "s3://131-component-staging/"
23
  "multilingual-static-word-embeddings/stage-6/"
24
  )
25
- DEFAULT_ARTIFACT_URI = (
26
- DEFAULT_ARTIFACT_PREFIX + "multilingual_space_20260521_133953.json"
27
- )
28
- DEFAULT_LOCAL_SPACE = Path("multilingual_space_20260521_133953.json")
29
  DEFAULT_LANGS = ["de", "en", "fr", "lb"]
30
  REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
31
  CACHE_DIR = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
@@ -104,13 +101,12 @@ def latest_artifact_uri(client) -> str:
104
  return explicit
105
 
106
  prefix_override = os.getenv("SPACE_ARTIFACT_S3_PREFIX", "").strip()
107
- if not prefix_override:
108
- return DEFAULT_ARTIFACT_URI
109
-
110
- prefix_uri = prefix_override
111
  bucket, prefix = parse_s3_uri(prefix_uri)
112
  prefix = prefix.rstrip("/") + "/"
113
- pattern = re.compile(r"(.*multilingual_space_(\d{8}_\d{6})\.json)/config\.json$")
 
 
114
  candidates: list[tuple[str, str]] = []
115
 
116
  paginator = client.get_paginator("list_objects_v2")
@@ -122,11 +118,14 @@ def latest_artifact_uri(client) -> str:
122
 
123
  if not candidates:
124
  raise FileNotFoundError(
125
- f"No multilingual_space_*.json/config.json found under {prefix_uri}"
126
  )
127
 
128
- _, key = sorted(candidates)[-1]
129
- return f"s3://{bucket}/{key}"
 
 
 
130
 
131
 
132
  def local_cache_for_uri(uri: str) -> Path:
@@ -163,7 +162,9 @@ def find_space_dir() -> tuple[Path, str]:
163
  if DEFAULT_LOCAL_SPACE.exists():
164
  return DEFAULT_LOCAL_SPACE, str(DEFAULT_LOCAL_SPACE)
165
 
166
- local_candidates = sorted(Path(".").glob("multilingual_space_*.json"))
 
 
167
  if local_candidates:
168
  return local_candidates[-1], str(local_candidates[-1])
169
 
 
22
  "s3://131-component-staging/"
23
  "multilingual-static-word-embeddings/stage-6/"
24
  )
25
+ DEFAULT_LOCAL_SPACE = Path("multilingual_dict_20260603_122323")
 
 
 
26
  DEFAULT_LANGS = ["de", "en", "fr", "lb"]
27
  REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
28
  CACHE_DIR = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
 
101
  return explicit
102
 
103
  prefix_override = os.getenv("SPACE_ARTIFACT_S3_PREFIX", "").strip()
104
+ prefix_uri = prefix_override or DEFAULT_ARTIFACT_PREFIX
 
 
 
105
  bucket, prefix = parse_s3_uri(prefix_uri)
106
  prefix = prefix.rstrip("/") + "/"
107
+ pattern = re.compile(
108
+ r"(.*multilingual_(?:dict|space)_(\d{8}_\d{6})(?:\.json)?)/config\.json$"
109
+ )
110
  candidates: list[tuple[str, str]] = []
111
 
112
  paginator = client.get_paginator("list_objects_v2")
 
118
 
119
  if not candidates:
120
  raise FileNotFoundError(
121
+ f"No multilingual_dict_*/config.json or multilingual_space_*.json/config.json found under {prefix_uri}"
122
  )
123
 
124
+ # Run ids are timestamps: YYYYMMDD_HHMMSS. Lexicographic sort gives newest run.
125
+ run_id, key = sorted(candidates)[-1]
126
+ uri = f"s3://{bucket}/{key}"
127
+ print(f"Selected latest stage 6 artifact {run_id}: {uri}", file=sys.stderr)
128
+ return uri
129
 
130
 
131
  def local_cache_for_uri(uri: str) -> Path:
 
162
  if DEFAULT_LOCAL_SPACE.exists():
163
  return DEFAULT_LOCAL_SPACE, str(DEFAULT_LOCAL_SPACE)
164
 
165
+ local_candidates = sorted(
166
+ [*Path(".").glob("multilingual_dict_*"), *Path(".").glob("multilingual_space_*.json")]
167
+ )
168
  if local_candidates:
169
  return local_candidates[-1], str(local_candidates[-1])
170