Spaces:
Running
Running
LLM runner: add --version to pin an exact submission version
Browse files
--version accepts a bare filename, full repo path, or URL-encoded %2B and
normalizes to the literal '+'. Without it, the latest version is used.
- llm/README.md +6 -0
- llm/run_llm.py +30 -5
llm/README.md
CHANGED
|
@@ -46,12 +46,18 @@ python run_llm.py \
|
|
| 46 |
|
| 47 |
```bash
|
| 48 |
python run_llm.py --submission NCT02578680__EricZ
|
|
|
|
|
|
|
|
|
|
| 49 |
# pick a specific SAP doc + models:
|
| 50 |
python run_llm.py --submission NCT02578680__EricZ \
|
| 51 |
--doc-id 10.1056_nejmoa1801005 \
|
| 52 |
--models claude-opus-4-8 gpt-4o
|
| 53 |
```
|
| 54 |
|
|
|
|
|
|
|
|
|
|
| 55 |
## What it does
|
| 56 |
|
| 57 |
1. **Submission** — loads the latest version of
|
|
|
|
| 46 |
|
| 47 |
```bash
|
| 48 |
python run_llm.py --submission NCT02578680__EricZ
|
| 49 |
+
# pin an exact submission version (default is the latest):
|
| 50 |
+
python run_llm.py --submission NCT02578680__EricZ \
|
| 51 |
+
--version 2026-06-07T17-23-05-870000+00-00.json
|
| 52 |
# pick a specific SAP doc + models:
|
| 53 |
python run_llm.py --submission NCT02578680__EricZ \
|
| 54 |
--doc-id 10.1056_nejmoa1801005 \
|
| 55 |
--models claude-opus-4-8 gpt-4o
|
| 56 |
```
|
| 57 |
|
| 58 |
+
`--version` accepts the bare filename, a full repo path, or a URL-encoded `%2B`
|
| 59 |
+
(it normalizes to the literal `+`).
|
| 60 |
+
|
| 61 |
## What it does
|
| 62 |
|
| 63 |
1. **Submission** — loads the latest version of
|
llm/run_llm.py
CHANGED
|
@@ -98,9 +98,33 @@ def _hf():
|
|
| 98 |
return HfApi(token=token), token
|
| 99 |
|
| 100 |
|
| 101 |
-
def load_submission(submission: str) -> dict:
|
| 102 |
-
"""Load
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
api, token = _hf()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
prefix = f"submissions/{submission}/"
|
| 105 |
try:
|
| 106 |
files = api.list_repo_files(repo_id=INTAKE_REPO, repo_type="dataset")
|
|
@@ -120,8 +144,6 @@ def load_submission(submission: str) -> dict:
|
|
| 120 |
# max basename is the most recent submission.
|
| 121 |
versions.sort(key=lambda f: f.rsplit("/", 1)[-1])
|
| 122 |
latest = versions[-1]
|
| 123 |
-
from huggingface_hub import hf_hub_download
|
| 124 |
-
|
| 125 |
path = hf_hub_download(
|
| 126 |
repo_id=INTAKE_REPO, repo_type="dataset", filename=latest, token=token
|
| 127 |
)
|
|
@@ -357,6 +379,9 @@ def main() -> None:
|
|
| 357 |
help="submission folder name <trial>__<user> (read from HF)")
|
| 358 |
ap.add_argument("--submission-file", default=None,
|
| 359 |
help="local submission JSON path (skips HF; no HF_TOKEN needed)")
|
|
|
|
|
|
|
|
|
|
| 360 |
ap.add_argument("--doc-id", default=None,
|
| 361 |
help="documents/<doc-id> folder (default: resolve from NCT via tdr.parquet)")
|
| 362 |
ap.add_argument("--sap-file", default=None,
|
|
@@ -375,7 +400,7 @@ def main() -> None:
|
|
| 375 |
if args.submission_file:
|
| 376 |
submission = load_submission_from_file(args.submission_file)
|
| 377 |
else:
|
| 378 |
-
submission = load_submission(args.submission)
|
| 379 |
# --- SAP: local file or HF ---
|
| 380 |
if args.sap_file:
|
| 381 |
sap_text = load_sap_from_file(args.sap_file)
|
|
|
|
| 98 |
return HfApi(token=token), token
|
| 99 |
|
| 100 |
|
| 101 |
+
def load_submission(submission: str, version: str | None = None) -> dict:
|
| 102 |
+
"""Load a submission version from HF.
|
| 103 |
+
|
| 104 |
+
If `version` is given, use that exact file; otherwise use the latest
|
| 105 |
+
(max timestamp) under submissions/<submission>/.
|
| 106 |
+
"""
|
| 107 |
api, token = _hf()
|
| 108 |
+
from huggingface_hub import hf_hub_download
|
| 109 |
+
|
| 110 |
+
# Pin a specific version file if requested.
|
| 111 |
+
if version:
|
| 112 |
+
# Accept a bare basename, a full repo path, or a URL-encoded '+'.
|
| 113 |
+
vname = version.rsplit("/", 1)[-1].replace("%2B", "+")
|
| 114 |
+
if not vname.endswith(".json"):
|
| 115 |
+
vname += ".json"
|
| 116 |
+
target = f"submissions/{submission}/{vname}"
|
| 117 |
+
try:
|
| 118 |
+
path = hf_hub_download(
|
| 119 |
+
repo_id=INTAKE_REPO, repo_type="dataset", filename=target, token=token
|
| 120 |
+
)
|
| 121 |
+
except Exception as e:
|
| 122 |
+
sys.exit(f"Could not download pinned version {target} from {INTAKE_REPO}: {e}")
|
| 123 |
+
with open(path, encoding="utf-8") as fh:
|
| 124 |
+
rec = json.load(fh)
|
| 125 |
+
print(f" using pinned version: {vname}")
|
| 126 |
+
return rec
|
| 127 |
+
|
| 128 |
prefix = f"submissions/{submission}/"
|
| 129 |
try:
|
| 130 |
files = api.list_repo_files(repo_id=INTAKE_REPO, repo_type="dataset")
|
|
|
|
| 144 |
# max basename is the most recent submission.
|
| 145 |
versions.sort(key=lambda f: f.rsplit("/", 1)[-1])
|
| 146 |
latest = versions[-1]
|
|
|
|
|
|
|
| 147 |
path = hf_hub_download(
|
| 148 |
repo_id=INTAKE_REPO, repo_type="dataset", filename=latest, token=token
|
| 149 |
)
|
|
|
|
| 379 |
help="submission folder name <trial>__<user> (read from HF)")
|
| 380 |
ap.add_argument("--submission-file", default=None,
|
| 381 |
help="local submission JSON path (skips HF; no HF_TOKEN needed)")
|
| 382 |
+
ap.add_argument("--version", default=None,
|
| 383 |
+
help="pin an exact version file under submissions/<submission>/ "
|
| 384 |
+
"(e.g. 2026-06-07T17-23-05-870000+00-00.json); default: latest")
|
| 385 |
ap.add_argument("--doc-id", default=None,
|
| 386 |
help="documents/<doc-id> folder (default: resolve from NCT via tdr.parquet)")
|
| 387 |
ap.add_argument("--sap-file", default=None,
|
|
|
|
| 400 |
if args.submission_file:
|
| 401 |
submission = load_submission_from_file(args.submission_file)
|
| 402 |
else:
|
| 403 |
+
submission = load_submission(args.submission, version=args.version)
|
| 404 |
# --- SAP: local file or HF ---
|
| 405 |
if args.sap_file:
|
| 406 |
sap_text = load_sap_from_file(args.sap_file)
|