"""Storage backend: Hugging Face Dataset repo, with local filesystem fallback for dev. Layout inside the dataset repo: submissions/____.json (immutable: the submission) reviews/____/__.json (one file per review) Submissions are never rewritten. Each review is a brand-new file, so multiple reviewers can review the same submission concurrently with no write conflict. The "current status" of a submission is derived from its most recent review. Env vars (set in HF Space → Settings → Variables and secrets): HF_TOKEN - HF user access token with Write permission HF_DATASET_REPO - e.g. "ttt-77/tdb-intake-submissions" HF_DATASET_BRANCH - optional, defaults to "main" ADMIN_PASSWORD - shared password for the Admin page If HF_TOKEN or HF_DATASET_REPO is missing, all I/O goes to ./data/... so local dev works without HF credentials. """ from __future__ import annotations import io import json import os import re from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional import requests from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "").strip() HF_DATASET_BRANCH = os.environ.get("HF_DATASET_BRANCH", "main").strip() or "main" ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "") hf_configured = bool(HF_TOKEN and HF_DATASET_REPO) _api = HfApi(token=HF_TOKEN) if hf_configured else None LOCAL_DATA_DIR = Path("data") SUBMISSIONS_PREFIX = "submissions" REVIEWS_PREFIX = "reviews" def _safe(s: str) -> str: return re.sub(r"[^a-zA-Z0-9_\-]", "_", (s or "").strip()) def _now_iso() -> str: return datetime.now(timezone.utc).isoformat() def _stamp(iso: Optional[str] = None) -> str: return (iso or _now_iso()).replace(":", "-").replace(".", "-") def _base_id(submission_id: str) -> str: """'submissions/foo.json' -> 'foo'""" name = submission_id.split("/")[-1] return name[:-5] if name.endswith(".json") else name 
# ---- low-level read/write/list (HF or local) -----------------------------


def _hf_upload_json(path_in_repo: str, payload: Dict[str, Any], commit_message: str) -> None:
    """Serialize *payload* as indented UTF-8 JSON and commit it to the dataset repo.

    Args:
        path_in_repo: Destination path inside the dataset repo.
        payload: JSON-serializable mapping to store.
        commit_message: Message recorded on the resulting commit.
    """
    assert _api is not None
    content = json.dumps(payload, indent=2, ensure_ascii=False).encode("utf-8")
    _api.upload_file(
        path_or_fileobj=io.BytesIO(content),
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
        revision=HF_DATASET_BRANCH,
        commit_message=commit_message,
    )


def _hf_read_json(path_in_repo: str) -> Optional[Dict[str, Any]]:
    """Fetch one JSON file from the dataset repo over HTTPS.

    Returns:
        The decoded JSON object, or ``None`` when the file does not exist
        (HTTP 404), the body is not valid JSON, or the JSON is not an object.

    Raises:
        requests.HTTPError: For any non-404 error status.
    """
    from urllib.parse import quote

    # Review file names embed a UTC offset such as "+00-00", so the path is
    # percent-encoded rather than interpolated verbatim. The revision is
    # encoded with no safe characters so a "/" in a ref name is not read as
    # a path separator.
    revision = quote(HF_DATASET_BRANCH, safe="")
    filename = quote(path_in_repo)
    url = (
        f"https://huggingface.co/datasets/{HF_DATASET_REPO}"
        f"/resolve/{revision}/{filename}"
    )
    r = requests.get(url, headers={"Authorization": f"Bearer {HF_TOKEN}"}, timeout=20)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    try:
        payload = r.json()
    except ValueError:
        # Same best-effort contract as the local backend: an unreadable
        # file is reported as missing instead of aborting the caller.
        return None
    return payload if isinstance(payload, dict) else None


def _write_json(path_in_repo: str, payload: Dict[str, Any], commit_message: str) -> None:
    """Store *payload* at *path_in_repo*, remotely if configured, else under ./data.

    *commit_message* is only used by the remote backend.
    """
    if hf_configured:
        _hf_upload_json(path_in_repo, payload, commit_message)
        return
    p = LOCAL_DATA_DIR / path_in_repo
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")


def _read_json(path_in_repo: str) -> Optional[Dict[str, Any]]:
    """Load the JSON object at *path_in_repo*, or ``None`` if it cannot be read.

    In local mode a missing file, an I/O error, undecodable bytes, invalid
    JSON and non-object JSON all yield ``None``.
    """
    if hf_configured:
        return _hf_read_json(path_in_repo)
    p = LOCAL_DATA_DIR / path_in_repo
    try:
        payload = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # UnicodeDecodeError and json.JSONDecodeError.
        return None
    return payload if isinstance(payload, dict) else None


def _all_files() -> List[str]:
    """List every file path in the repo (HF) or under ./data (local).

    The local listing only contains ``*.json`` files, as POSIX-style paths
    relative to the data directory. A 404 from the Hub is treated as an
    empty repo; any other Hub error propagates.
    """
    if hf_configured:
        assert _api is not None
        try:
            return _api.list_repo_files(
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                revision=HF_DATASET_BRANCH,
            )
        except HfHubHTTPError as e:
            if e.response is not None and e.response.status_code == 404:
                return []
            raise
    if not LOCAL_DATA_DIR.exists():
        return []
    return [p.relative_to(LOCAL_DATA_DIR).as_posix() for p in LOCAL_DATA_DIR.rglob("*.json")]


# ---- public API ----------------------------------------------------------


def submission_id_for(trial_id: str, username: str) -> str:
    """Stable submission id (path) for a (trial_id, username) pair.

    One submission per pair — submitting again updates the same file, so a
    submission can be loaded back and edited.
    """
    return f"{SUBMISSIONS_PREFIX}/{_safe(trial_id)}__{_safe(username)}.json"


def get_submission_by_key(trial_id: str, username: str) -> Optional[Dict[str, Any]]:
    """Load an existing submission by (trial_id, username), or None."""
    return get_submission(submission_id_for(trial_id, username))


def save_submission(trial_id: str, username: str, comparison: Dict[str, Any]) -> Dict[str, Any]:
    """Create or update the submission for (trial_id, username).

    If a submission already exists for this pair, it is updated in place
    (createdAt is preserved); otherwise a new one is created.

    Returns:
        A dict with ``submissionId``, ``url`` (``None`` in local mode),
        the stored ``record`` and an ``updated`` flag.
    """
    submission_id = submission_id_for(trial_id, username)
    now = _now_iso()
    existing = get_submission(submission_id)
    # Older records may only carry "submittedAt"; fall back to it, then to now.
    created_at = (existing or {}).get("createdAt") or (existing or {}).get("submittedAt") or now
    is_update = existing is not None
    record = {
        "submissionId": submission_id,
        "createdAt": created_at,
        "updatedAt": now,
        # kept for backward compatibility with older records / admin display
        "submittedAt": created_at,
        "trial_id": trial_id,
        "username": username,
        "comparison": comparison,
    }
    verb = "Update" if is_update else "Add"
    _write_json(submission_id, record, f"{verb} submission: {trial_id} — {username}")
    url = (
        f"https://huggingface.co/datasets/{HF_DATASET_REPO}"
        f"/blob/{HF_DATASET_BRANCH}/{submission_id}"
        if hf_configured
        else None
    )
    return {
        "submissionId": submission_id,
        "url": url,
        "record": record,
        "updated": is_update,
    }


def add_review(submission_id: str, status: str, reviewer: str, note: str = "") -> Dict[str, Any]:
    """Append a review as its own file under reviews/<base>/.

    ``<base>`` is the submission's file name without the ``.json`` suffix.
    Each review is a new file (never overwrites), so concurrent reviews by
    different people cannot conflict.

    Returns:
        The review record that was written.
    """
    base = _base_id(submission_id)
    now = _now_iso()
    review = {
        "submissionId": submission_id,
        "at": now,
        "reviewer": reviewer,
        "status": status,
        "note": note,
    }
    review_path = f"{REVIEWS_PREFIX}/{base}/{_stamp(now)}__{_safe(reviewer) or 'anon'}.json"
    _write_json(review_path, review, f"Review ({status}) by {reviewer or 'anon'} on {base}")
    return review


def list_reviews(submission_id: str, all_files: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """All reviews for a submission, oldest first.

    Args:
        submission_id: Submission path or bare id.
        all_files: Optional pre-fetched file listing; when omitted the
            storage backend is listed again.
    """
    base = _base_id(submission_id)
    prefix = f"{REVIEWS_PREFIX}/{base}/"
    files = all_files if all_files is not None else _all_files()
    paths = sorted(f for f in files if f.startswith(prefix) and f.endswith(".json"))
    # Unreadable or empty review files are skipped.
    reviews = [r for r in (_read_json(p) for p in paths) if r]
    # "or ''" keeps the sort total even if a record stores a null timestamp.
    reviews.sort(key=lambda r: r.get("at") or "")
    return reviews


def get_submission(submission_id: str) -> Optional[Dict[str, Any]]:
    """Load a submission by its id (path), or None.

    Ids that do not live under the submissions folder, or that contain a
    ``..`` segment, are rejected without touching storage so a caller-supplied
    id cannot escape that folder.
    """
    if not submission_id.startswith(f"{SUBMISSIONS_PREFIX}/"):
        return None
    # Backslashes are normalized because Path treats them as separators on Windows.
    if ".." in submission_id.replace("\\", "/").split("/"):
        return None
    return _read_json(submission_id)


def list_submissions() -> List[Dict[str, Any]]:
    """Every submission with its derived status and full review timeline.

    Each item: submissionId, trial_id, username, submittedAt, updatedAt,
    status, reviewedAt, reviewer, review_count, reviews (list),
    submission (full record). Items are ordered by updatedAt, newest first.
    A submission with no reviews, or whose latest review carries no status,
    is reported as "pending".
    """
    files = _all_files()
    sub_paths = sorted(
        f for f in files if f.startswith(f"{SUBMISSIONS_PREFIX}/") and f.endswith(".json")
    )
    result: List[Dict[str, Any]] = []
    for sp in sub_paths:
        sub = _read_json(sp)
        if not sub:
            continue
        # Reuse the single listing instead of re-listing storage per submission.
        reviews = list_reviews(sp, all_files=files)
        # .get() throughout: one malformed review file must not break the
        # whole listing.
        latest = reviews[-1] if reviews else {}
        result.append(
            {
                "submissionId": sp,
                "trial_id": sub.get("trial_id", ""),
                "username": sub.get("username", ""),
                "submittedAt": sub.get("submittedAt", ""),
                "updatedAt": sub.get("updatedAt") or sub.get("submittedAt") or "",
                "status": latest.get("status") or "pending",
                "reviewedAt": latest.get("at") or "",
                "reviewer": latest.get("reviewer") or "",
                "review_count": len(reviews),
                "reviews": reviews,
                "submission": sub,
            }
        )
    result.sort(key=lambda r: r.get("updatedAt") or "", reverse=True)
    return result


# ---- admin gate ----------------------------------------------------------


def check_admin_password(supplied: str) -> bool:
    """Return True when *supplied* matches the configured admin password.

    With no password configured the gate is open (dev mode). The comparison
    runs in constant time so response timing does not reveal how much of a
    guess was correct. Non-string input never matches.
    """
    import hmac

    if not ADMIN_PASSWORD:
        return True  # no password configured = open (dev mode)
    if not isinstance(supplied, str):
        return False
    # compare_digest only accepts non-ASCII text as bytes, hence the encode.
    return hmac.compare_digest(supplied.encode("utf-8"), ADMIN_PASSWORD.encode("utf-8"))