File size: 7,066 Bytes
08fd094 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | """Load all 10,000 evaluation cases from output/*.csv into Postgres.
This is YOUR eval pack — golden, adversarial, governance, retrieval, SME feedback.
These rows are benchmark *questions*, not documents to chunk. They belong in
``eval_cases`` so runners and dashboards can query them at full scale.
Usage:
python3 scripts/load_eval_pack_to_db.py
python3 scripts/load_eval_pack_to_db.py --verify-only
"""
from __future__ import annotations
import argparse
import csv
import json
import os
import sys
from datetime import UTC, datetime
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parent.parent
OUTPUT_DIR = REPO_ROOT / "output"
_env_file = REPO_ROOT / ".env"
if _env_file.exists():
for raw_line in _env_file.read_text().splitlines():
line = raw_line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, value = line.split("=", 1)
os.environ.setdefault(key.strip(), value.strip())
import psycopg # noqa: E402
DSN = os.getenv(
"AKS_DATABASE_URL",
"postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine",
).replace("postgresql+psycopg://", "postgresql://", 1)
DATASETS = [
("golden", "golden_medical_qa.csv"),
("adversarial", "adversarial_medical_qa.csv"),
("governance", "governance_policy_cases.csv"),
("retrieval", "retrieval_stress_cases.csv"),
("sme", "smr_sme_feedback_examples.csv"),
]
CREATE_TABLE_SQL = (REPO_ROOT / "schemas" / "eval_cases.sql").read_text()
def _normalize_row(dataset: str, row: dict) -> tuple[str, str, str | None, str | None, str | None, dict]:
case_id = row.get("id", "").strip()
if not case_id:
raise ValueError(f"Row missing id in {dataset}: {row}")
therapy = row.get("therapy_area") or row.get("user_geography")
geography = row.get("geography") or row.get("user_geography")
audience = row.get("audience")
return case_id, dataset, therapy, geography, audience, dict(row)
def load(*, replace: bool = True) -> dict:
conn = psycopg.connect(DSN)
summary: dict = {"datasets": {}, "total": 0}
try:
with conn:
with conn.cursor() as cur:
for statement in CREATE_TABLE_SQL.split(";"):
stmt = statement.strip()
if stmt:
cur.execute(stmt)
if replace:
cur.execute("TRUNCATE eval_cases")
now = datetime.now(UTC)
for dataset, filename in DATASETS:
path = OUTPUT_DIR / filename
if not path.exists():
raise FileNotFoundError(f"Missing eval file: {path}")
count = 0
with path.open(newline="", encoding="utf-8") as fh:
for row in csv.DictReader(fh):
case_id, ds, therapy, geography, audience, payload = _normalize_row(
dataset, row
)
cur.execute(
"""
INSERT INTO eval_cases (
case_id, dataset, therapy_area, geography, audience,
payload, loaded_at
) VALUES (%s, %s, %s, %s, %s, %s::json, %s)
ON CONFLICT (case_id) DO UPDATE SET
dataset = EXCLUDED.dataset,
therapy_area = EXCLUDED.therapy_area,
geography = EXCLUDED.geography,
audience = EXCLUDED.audience,
payload = EXCLUDED.payload,
loaded_at = EXCLUDED.loaded_at
""",
(
case_id,
ds,
therapy,
geography,
audience,
json.dumps(payload),
now,
),
)
count += 1
summary["datasets"][dataset] = count
summary["total"] += count
finally:
conn.close()
return summary
def verify() -> None:
conn = psycopg.connect(DSN)
try:
with conn.cursor() as cur:
cur.execute(
"SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'eval_cases')"
)
if not cur.fetchone()[0]:
print("eval_cases table does not exist. Run: python3 scripts/load_eval_pack_to_db.py")
return
cur.execute("SELECT COUNT(*) FROM eval_cases")
total = cur.fetchone()[0]
print(f"eval_cases total: {total}")
cur.execute(
"SELECT dataset, COUNT(*) FROM eval_cases GROUP BY dataset ORDER BY dataset"
)
for dataset, count in cur.fetchall():
print(f" {dataset}: {count}")
finally:
conn.close()
# Source coverage check
import re
conn = psycopg.connect(DSN)
eval_sources: set[str] = set()
try:
with conn.cursor() as cur:
cur.execute("SELECT payload FROM eval_cases")
for (payload,) in cur.fetchall():
data = payload if isinstance(payload, dict) else json.loads(payload)
for key in ("required_sources", "expected_relevant_sources"):
raw = data.get(key, "")
if raw:
eval_sources.update(s.strip() for s in raw.split(";") if s.strip())
cur.execute("SELECT source_id FROM sources")
db_sources = {r[0] for r in cur.fetchall()}
finally:
conn.close()
missing = sorted(eval_sources - db_sources)
print(f"unique sources referenced in eval_cases: {len(eval_sources)}")
print(f"sources registered in DB: {len(db_sources)}")
if missing:
print(f"MISSING in sources table ({len(missing)}):", ", ".join(missing[:15]))
else:
print("All eval-referenced sources are registered in sources.")
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--verify-only", action="store_true")
args = parser.parse_args()
if args.verify_only:
verify()
return 0
summary = load()
print(json.dumps(summary, indent=2))
print("\n--- verification ---")
verify()
print("\nRun full golden eval (2500 rows, needs gateway/memory up):")
print(" python3 eval/runners/run_golden_memory_eval.py")
print("Or sample:")
print(" python3 eval/runners/run_golden_memory_eval.py --limit 100")
return 0
if __name__ == "__main__":
raise SystemExit(main())
|