updated script to use the newest multilingual dictionary run
Browse files
README.md
CHANGED
|
@@ -22,9 +22,10 @@ The app does not use `aligned_all.vec`.
|
|
| 22 |
|
| 23 |
## Runtime configuration
|
| 24 |
|
| 25 |
-
By default, the app downloads
|
|
|
|
| 26 |
|
| 27 |
-
`s3://131-component-staging/multilingual-static-word-embeddings/stage-6/
|
| 28 |
|
| 29 |
Set these Hugging Face Space secrets for S3-compatible storage:
|
| 30 |
|
|
@@ -35,8 +36,8 @@ Set these Hugging Face Space secrets for S3-compatible storage:
|
|
| 35 |
Optional environment overrides:
|
| 36 |
|
| 37 |
- `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
|
| 38 |
-
`s3://131-component-staging/multilingual-static-word-embeddings/stage-6/
|
| 39 |
-
- `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_space_*.json`
|
| 40 |
- `SPACE_DIR`: local artifact folder, useful for local testing
|
| 41 |
- `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
|
| 42 |
|
|
|
|
| 22 |
|
| 23 |
## Runtime configuration
|
| 24 |
|
| 25 |
+
By default, the app scans the stage 6 prefix and downloads the newest artifact
|
| 26 |
+
folder that contains `config.json`:
|
| 27 |
|
| 28 |
+
`s3://131-component-staging/multilingual-static-word-embeddings/stage-6/`
|
| 29 |
|
| 30 |
Set these Hugging Face Space secrets for S3-compatible storage:
|
| 31 |
|
|
|
|
| 36 |
Optional environment overrides:
|
| 37 |
|
| 38 |
- `SPACE_ARTIFACT_S3_URI`: exact artifact folder, for example
|
| 39 |
+
`s3://131-component-staging/multilingual-static-word-embeddings/stage-6/multilingual_dict_20260603_122323`
|
| 40 |
+
- `SPACE_ARTIFACT_S3_PREFIX`: prefix to scan for the newest `multilingual_dict_*` or `multilingual_space_*.json`
|
| 41 |
- `SPACE_DIR`: local artifact folder, useful for local testing
|
| 42 |
- `ARTIFACT_CACHE_DIR`: local cache directory, default `/tmp/multilingual_space_artifacts`
|
| 43 |
|
app.py
CHANGED
|
@@ -22,10 +22,7 @@ DEFAULT_ARTIFACT_PREFIX = (
|
|
| 22 |
"s3://131-component-staging/"
|
| 23 |
"multilingual-static-word-embeddings/stage-6/"
|
| 24 |
)
|
| 25 |
-
|
| 26 |
-
DEFAULT_ARTIFACT_PREFIX + "multilingual_space_20260521_133953.json"
|
| 27 |
-
)
|
| 28 |
-
DEFAULT_LOCAL_SPACE = Path("multilingual_space_20260521_133953.json")
|
| 29 |
DEFAULT_LANGS = ["de", "en", "fr", "lb"]
|
| 30 |
REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
|
| 31 |
CACHE_DIR = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
|
|
@@ -104,13 +101,12 @@ def latest_artifact_uri(client) -> str:
|
|
| 104 |
return explicit
|
| 105 |
|
| 106 |
prefix_override = os.getenv("SPACE_ARTIFACT_S3_PREFIX", "").strip()
|
| 107 |
-
|
| 108 |
-
return DEFAULT_ARTIFACT_URI
|
| 109 |
-
|
| 110 |
-
prefix_uri = prefix_override
|
| 111 |
bucket, prefix = parse_s3_uri(prefix_uri)
|
| 112 |
prefix = prefix.rstrip("/") + "/"
|
| 113 |
-
pattern = re.compile(
|
|
|
|
|
|
|
| 114 |
candidates: list[tuple[str, str]] = []
|
| 115 |
|
| 116 |
paginator = client.get_paginator("list_objects_v2")
|
|
@@ -122,11 +118,14 @@ def latest_artifact_uri(client) -> str:
|
|
| 122 |
|
| 123 |
if not candidates:
|
| 124 |
raise FileNotFoundError(
|
| 125 |
-
f"No multilingual_space_*.json/config.json found under {prefix_uri}"
|
| 126 |
)
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
|
| 132 |
def local_cache_for_uri(uri: str) -> Path:
|
|
@@ -163,7 +162,9 @@ def find_space_dir() -> tuple[Path, str]:
|
|
| 163 |
if DEFAULT_LOCAL_SPACE.exists():
|
| 164 |
return DEFAULT_LOCAL_SPACE, str(DEFAULT_LOCAL_SPACE)
|
| 165 |
|
| 166 |
-
local_candidates = sorted(
|
|
|
|
|
|
|
| 167 |
if local_candidates:
|
| 168 |
return local_candidates[-1], str(local_candidates[-1])
|
| 169 |
|
|
|
|
| 22 |
"s3://131-component-staging/"
|
| 23 |
"multilingual-static-word-embeddings/stage-6/"
|
| 24 |
)
|
| 25 |
+
DEFAULT_LOCAL_SPACE = Path("multilingual_dict_20260603_122323")
|
|
|
|
|
|
|
|
|
|
| 26 |
DEFAULT_LANGS = ["de", "en", "fr", "lb"]
|
| 27 |
REQUIRED_FILES = ("aligned_all.faiss", "all_metadata.jsonl", "config.json")
|
| 28 |
CACHE_DIR = Path(os.getenv("ARTIFACT_CACHE_DIR", "/tmp/multilingual_space_artifacts"))
|
|
|
|
| 101 |
return explicit
|
| 102 |
|
| 103 |
prefix_override = os.getenv("SPACE_ARTIFACT_S3_PREFIX", "").strip()
|
| 104 |
+
prefix_uri = prefix_override or DEFAULT_ARTIFACT_PREFIX
|
|
|
|
|
|
|
|
|
|
| 105 |
bucket, prefix = parse_s3_uri(prefix_uri)
|
| 106 |
prefix = prefix.rstrip("/") + "/"
|
| 107 |
+
pattern = re.compile(
|
| 108 |
+
r"(.*multilingual_(?:dict|space)_(\d{8}_\d{6})(?:\.json)?)/config\.json$"
|
| 109 |
+
)
|
| 110 |
candidates: list[tuple[str, str]] = []
|
| 111 |
|
| 112 |
paginator = client.get_paginator("list_objects_v2")
|
|
|
|
| 118 |
|
| 119 |
if not candidates:
|
| 120 |
raise FileNotFoundError(
|
| 121 |
+
f"No multilingual_dict_*/config.json or multilingual_space_*.json/config.json found under {prefix_uri}"
|
| 122 |
)
|
| 123 |
|
| 124 |
+
# Run ids are timestamps: YYYYMMDD_HHMMSS. Lexicographic sort gives newest run.
|
| 125 |
+
run_id, key = sorted(candidates)[-1]
|
| 126 |
+
uri = f"s3://{bucket}/{key}"
|
| 127 |
+
print(f"Selected latest stage 6 artifact {run_id}: {uri}", file=sys.stderr)
|
| 128 |
+
return uri
|
| 129 |
|
| 130 |
|
| 131 |
def local_cache_for_uri(uri: str) -> Path:
|
|
|
|
| 162 |
if DEFAULT_LOCAL_SPACE.exists():
|
| 163 |
return DEFAULT_LOCAL_SPACE, str(DEFAULT_LOCAL_SPACE)
|
| 164 |
|
| 165 |
+
local_candidates = sorted(
|
| 166 |
+
[*Path(".").glob("multilingual_dict_*"), *Path(".").glob("multilingual_space_*.json")]
|
| 167 |
+
)
|
| 168 |
if local_candidates:
|
| 169 |
return local_candidates[-1], str(local_candidates[-1])
|
| 170 |
|