| """Load all 10,000 evaluation cases from output/*.csv into Postgres. |
| |
| This is YOUR eval pack — golden, adversarial, governance, retrieval, SME feedback. |
| These rows are benchmark *questions*, not documents to chunk. They belong in |
| ``eval_cases`` so runners and dashboards can query them at full scale. |
| |
| Usage: |
| python3 scripts/load_eval_pack_to_db.py |
| python3 scripts/load_eval_pack_to_db.py --verify-only |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import csv |
| import json |
| import os |
| import sys |
| from datetime import UTC, datetime |
| from pathlib import Path |
|
|
| REPO_ROOT = Path(__file__).resolve().parent.parent |
| OUTPUT_DIR = REPO_ROOT / "output" |
|
|
| _env_file = REPO_ROOT / ".env" |
| if _env_file.exists(): |
| for raw_line in _env_file.read_text().splitlines(): |
| line = raw_line.strip() |
| if not line or line.startswith("#") or "=" not in line: |
| continue |
| key, value = line.split("=", 1) |
| os.environ.setdefault(key.strip(), value.strip()) |
|
|
| import psycopg |
|
|
| DSN = os.getenv( |
| "AKS_DATABASE_URL", |
| "postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine", |
| ).replace("postgresql+psycopg://", "postgresql://", 1) |
|
|
| DATASETS = [ |
| ("golden", "golden_medical_qa.csv"), |
| ("adversarial", "adversarial_medical_qa.csv"), |
| ("governance", "governance_policy_cases.csv"), |
| ("retrieval", "retrieval_stress_cases.csv"), |
| ("sme", "smr_sme_feedback_examples.csv"), |
| ] |
|
|
| CREATE_TABLE_SQL = (REPO_ROOT / "schemas" / "eval_cases.sql").read_text() |
|
|
|
|
| def _normalize_row(dataset: str, row: dict) -> tuple[str, str, str | None, str | None, str | None, dict]: |
| case_id = row.get("id", "").strip() |
| if not case_id: |
| raise ValueError(f"Row missing id in {dataset}: {row}") |
| therapy = row.get("therapy_area") or row.get("user_geography") |
| geography = row.get("geography") or row.get("user_geography") |
| audience = row.get("audience") |
| return case_id, dataset, therapy, geography, audience, dict(row) |
|
|
|
|
| def load(*, replace: bool = True) -> dict: |
| conn = psycopg.connect(DSN) |
| summary: dict = {"datasets": {}, "total": 0} |
| try: |
| with conn: |
| with conn.cursor() as cur: |
| for statement in CREATE_TABLE_SQL.split(";"): |
| stmt = statement.strip() |
| if stmt: |
| cur.execute(stmt) |
| if replace: |
| cur.execute("TRUNCATE eval_cases") |
| now = datetime.now(UTC) |
| for dataset, filename in DATASETS: |
| path = OUTPUT_DIR / filename |
| if not path.exists(): |
| raise FileNotFoundError(f"Missing eval file: {path}") |
| count = 0 |
| with path.open(newline="", encoding="utf-8") as fh: |
| for row in csv.DictReader(fh): |
| case_id, ds, therapy, geography, audience, payload = _normalize_row( |
| dataset, row |
| ) |
| cur.execute( |
| """ |
| INSERT INTO eval_cases ( |
| case_id, dataset, therapy_area, geography, audience, |
| payload, loaded_at |
| ) VALUES (%s, %s, %s, %s, %s, %s::json, %s) |
| ON CONFLICT (case_id) DO UPDATE SET |
| dataset = EXCLUDED.dataset, |
| therapy_area = EXCLUDED.therapy_area, |
| geography = EXCLUDED.geography, |
| audience = EXCLUDED.audience, |
| payload = EXCLUDED.payload, |
| loaded_at = EXCLUDED.loaded_at |
| """, |
| ( |
| case_id, |
| ds, |
| therapy, |
| geography, |
| audience, |
| json.dumps(payload), |
| now, |
| ), |
| ) |
| count += 1 |
| summary["datasets"][dataset] = count |
| summary["total"] += count |
| finally: |
| conn.close() |
| return summary |
|
|
|
|
| def verify() -> None: |
| conn = psycopg.connect(DSN) |
| try: |
| with conn.cursor() as cur: |
| cur.execute( |
| "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'eval_cases')" |
| ) |
| if not cur.fetchone()[0]: |
| print("eval_cases table does not exist. Run: python3 scripts/load_eval_pack_to_db.py") |
| return |
| cur.execute("SELECT COUNT(*) FROM eval_cases") |
| total = cur.fetchone()[0] |
| print(f"eval_cases total: {total}") |
| cur.execute( |
| "SELECT dataset, COUNT(*) FROM eval_cases GROUP BY dataset ORDER BY dataset" |
| ) |
| for dataset, count in cur.fetchall(): |
| print(f" {dataset}: {count}") |
| finally: |
| conn.close() |
|
|
| |
| import re |
|
|
| conn = psycopg.connect(DSN) |
| eval_sources: set[str] = set() |
| try: |
| with conn.cursor() as cur: |
| cur.execute("SELECT payload FROM eval_cases") |
| for (payload,) in cur.fetchall(): |
| data = payload if isinstance(payload, dict) else json.loads(payload) |
| for key in ("required_sources", "expected_relevant_sources"): |
| raw = data.get(key, "") |
| if raw: |
| eval_sources.update(s.strip() for s in raw.split(";") if s.strip()) |
| cur.execute("SELECT source_id FROM sources") |
| db_sources = {r[0] for r in cur.fetchall()} |
| finally: |
| conn.close() |
|
|
| missing = sorted(eval_sources - db_sources) |
| print(f"unique sources referenced in eval_cases: {len(eval_sources)}") |
| print(f"sources registered in DB: {len(db_sources)}") |
| if missing: |
| print(f"MISSING in sources table ({len(missing)}):", ", ".join(missing[:15])) |
| else: |
| print("All eval-referenced sources are registered in sources.") |
|
|
|
|
| def main() -> int: |
| parser = argparse.ArgumentParser(description=__doc__) |
| parser.add_argument("--verify-only", action="store_true") |
| args = parser.parse_args() |
| if args.verify_only: |
| verify() |
| return 0 |
| summary = load() |
| print(json.dumps(summary, indent=2)) |
| print("\n--- verification ---") |
| verify() |
| print("\nRun full golden eval (2500 rows, needs gateway/memory up):") |
| print(" python3 eval/runners/run_golden_memory_eval.py") |
| print("Or sample:") |
| print(" python3 eval/runners/run_golden_memory_eval.py --limit 100") |
| return 0 |
|
|
|
|
| if __name__ == "__main__": |
| raise SystemExit(main()) |
|
|