tttjjj committed on
Commit
435a487
·
1 Parent(s): 2b1b795

LLM runner: add --version to pin an exact submission version

Browse files

--version accepts a bare filename, a full repo path, or a name containing a
URL-encoded %2B, which is normalized to the literal '+'. Without --version, the latest version is used.

Files changed (2) hide show
  1. llm/README.md +6 -0
  2. llm/run_llm.py +30 -5
llm/README.md CHANGED
@@ -46,12 +46,18 @@ python run_llm.py \
46
 
47
  ```bash
48
  python run_llm.py --submission NCT02578680__EricZ
 
 
 
49
  # pick a specific SAP doc + models:
50
  python run_llm.py --submission NCT02578680__EricZ \
51
  --doc-id 10.1056_nejmoa1801005 \
52
  --models claude-opus-4-8 gpt-4o
53
  ```
54
 
 
 
 
55
  ## What it does
56
 
57
  1. **Submission** — loads the latest version of
 
46
 
47
  ```bash
48
  python run_llm.py --submission NCT02578680__EricZ
49
+ # pin an exact submission version (default is the latest):
50
+ python run_llm.py --submission NCT02578680__EricZ \
51
+ --version 2026-06-07T17-23-05-870000+00-00.json
52
  # pick a specific SAP doc + models:
53
  python run_llm.py --submission NCT02578680__EricZ \
54
  --doc-id 10.1056_nejmoa1801005 \
55
  --models claude-opus-4-8 gpt-4o
56
  ```
57
 
58
+ `--version` accepts the bare filename, a full repo path, or a name containing a URL-encoded `%2B`
59
+ (any `%2B` is normalized to the literal `+`).
60
+
61
  ## What it does
62
 
63
  1. **Submission** — loads the latest version of
llm/run_llm.py CHANGED
@@ -98,9 +98,33 @@ def _hf():
98
  return HfApi(token=token), token
99
 
100
 
101
- def load_submission(submission: str) -> dict:
102
- """Load the latest version of submissions/<submission>/<stamp>.json."""
 
 
 
 
103
  api, token = _hf()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  prefix = f"submissions/{submission}/"
105
  try:
106
  files = api.list_repo_files(repo_id=INTAKE_REPO, repo_type="dataset")
@@ -120,8 +144,6 @@ def load_submission(submission: str) -> dict:
120
  # max basename is the most recent submission.
121
  versions.sort(key=lambda f: f.rsplit("/", 1)[-1])
122
  latest = versions[-1]
123
- from huggingface_hub import hf_hub_download
124
-
125
  path = hf_hub_download(
126
  repo_id=INTAKE_REPO, repo_type="dataset", filename=latest, token=token
127
  )
@@ -357,6 +379,9 @@ def main() -> None:
357
  help="submission folder name <trial>__<user> (read from HF)")
358
  ap.add_argument("--submission-file", default=None,
359
  help="local submission JSON path (skips HF; no HF_TOKEN needed)")
 
 
 
360
  ap.add_argument("--doc-id", default=None,
361
  help="documents/<doc-id> folder (default: resolve from NCT via tdr.parquet)")
362
  ap.add_argument("--sap-file", default=None,
@@ -375,7 +400,7 @@ def main() -> None:
375
  if args.submission_file:
376
  submission = load_submission_from_file(args.submission_file)
377
  else:
378
- submission = load_submission(args.submission)
379
  # --- SAP: local file or HF ---
380
  if args.sap_file:
381
  sap_text = load_sap_from_file(args.sap_file)
 
98
  return HfApi(token=token), token
99
 
100
 
101
+ def load_submission(submission: str, version: str | None = None) -> dict:
102
+ """Load a submission version from HF.
103
+
104
+ If `version` is given, use that exact file; otherwise use the latest
105
+ (max timestamp) under submissions/<submission>/.
106
+ """
107
  api, token = _hf()
108
+ from huggingface_hub import hf_hub_download
109
+
110
+ # Pin a specific version file if requested.
111
+ if version:
112
+ # Accept a bare basename, a full repo path, or a URL-encoded '+'.
113
+ vname = version.rsplit("/", 1)[-1].replace("%2B", "+")
114
+ if not vname.endswith(".json"):
115
+ vname += ".json"
116
+ target = f"submissions/{submission}/{vname}"
117
+ try:
118
+ path = hf_hub_download(
119
+ repo_id=INTAKE_REPO, repo_type="dataset", filename=target, token=token
120
+ )
121
+ except Exception as e:
122
+ sys.exit(f"Could not download pinned version {target} from {INTAKE_REPO}: {e}")
123
+ with open(path, encoding="utf-8") as fh:
124
+ rec = json.load(fh)
125
+ print(f" using pinned version: {vname}")
126
+ return rec
127
+
128
  prefix = f"submissions/{submission}/"
129
  try:
130
  files = api.list_repo_files(repo_id=INTAKE_REPO, repo_type="dataset")
 
144
  # max basename is the most recent submission.
145
  versions.sort(key=lambda f: f.rsplit("/", 1)[-1])
146
  latest = versions[-1]
 
 
147
  path = hf_hub_download(
148
  repo_id=INTAKE_REPO, repo_type="dataset", filename=latest, token=token
149
  )
 
379
  help="submission folder name <trial>__<user> (read from HF)")
380
  ap.add_argument("--submission-file", default=None,
381
  help="local submission JSON path (skips HF; no HF_TOKEN needed)")
382
+ ap.add_argument("--version", default=None,
383
+ help="pin an exact version file under submissions/<submission>/ "
384
+ "(e.g. 2026-06-07T17-23-05-870000+00-00.json); default: latest")
385
  ap.add_argument("--doc-id", default=None,
386
  help="documents/<doc-id> folder (default: resolve from NCT via tdr.parquet)")
387
  ap.add_argument("--sap-file", default=None,
 
400
  if args.submission_file:
401
  submission = load_submission_from_file(args.submission_file)
402
  else:
403
+ submission = load_submission(args.submission, version=args.version)
404
  # --- SAP: local file or HF ---
405
  if args.sap_file:
406
  sap_text = load_sap_from_file(args.sap_file)