File size: 7,066 Bytes
08fd094
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""Load all 10,000 evaluation cases from output/*.csv into Postgres.

This is YOUR eval pack — golden, adversarial, governance, retrieval, SME feedback.
These rows are benchmark *questions*, not documents to chunk. They belong in
``eval_cases`` so runners and dashboards can query them at full scale.

Usage:
    python3 scripts/load_eval_pack_to_db.py
    python3 scripts/load_eval_pack_to_db.py --verify-only
"""

from __future__ import annotations

import argparse
import csv
import json
import os
import sys
from datetime import UTC, datetime
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
OUTPUT_DIR = REPO_ROOT / "output"

_env_file = REPO_ROOT / ".env"
if _env_file.exists():
    for raw_line in _env_file.read_text().splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        os.environ.setdefault(key.strip(), value.strip())

import psycopg  # noqa: E402

DSN = os.getenv(
    "AKS_DATABASE_URL",
    "postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine",
).replace("postgresql+psycopg://", "postgresql://", 1)

DATASETS = [
    ("golden", "golden_medical_qa.csv"),
    ("adversarial", "adversarial_medical_qa.csv"),
    ("governance", "governance_policy_cases.csv"),
    ("retrieval", "retrieval_stress_cases.csv"),
    ("sme", "smr_sme_feedback_examples.csv"),
]

CREATE_TABLE_SQL = (REPO_ROOT / "schemas" / "eval_cases.sql").read_text()


def _normalize_row(dataset: str, row: dict) -> tuple[str, str, str | None, str | None, str | None, dict]:
    case_id = row.get("id", "").strip()
    if not case_id:
        raise ValueError(f"Row missing id in {dataset}: {row}")
    therapy = row.get("therapy_area") or row.get("user_geography")
    geography = row.get("geography") or row.get("user_geography")
    audience = row.get("audience")
    return case_id, dataset, therapy, geography, audience, dict(row)


def load(*, replace: bool = True) -> dict:
    conn = psycopg.connect(DSN)
    summary: dict = {"datasets": {}, "total": 0}
    try:
        with conn:
            with conn.cursor() as cur:
                for statement in CREATE_TABLE_SQL.split(";"):
                    stmt = statement.strip()
                    if stmt:
                        cur.execute(stmt)
                if replace:
                    cur.execute("TRUNCATE eval_cases")
                now = datetime.now(UTC)
                for dataset, filename in DATASETS:
                    path = OUTPUT_DIR / filename
                    if not path.exists():
                        raise FileNotFoundError(f"Missing eval file: {path}")
                    count = 0
                    with path.open(newline="", encoding="utf-8") as fh:
                        for row in csv.DictReader(fh):
                            case_id, ds, therapy, geography, audience, payload = _normalize_row(
                                dataset, row
                            )
                            cur.execute(
                                """
                                INSERT INTO eval_cases (
                                    case_id, dataset, therapy_area, geography, audience,
                                    payload, loaded_at
                                ) VALUES (%s, %s, %s, %s, %s, %s::json, %s)
                                ON CONFLICT (case_id) DO UPDATE SET
                                    dataset = EXCLUDED.dataset,
                                    therapy_area = EXCLUDED.therapy_area,
                                    geography = EXCLUDED.geography,
                                    audience = EXCLUDED.audience,
                                    payload = EXCLUDED.payload,
                                    loaded_at = EXCLUDED.loaded_at
                                """,
                                (
                                    case_id,
                                    ds,
                                    therapy,
                                    geography,
                                    audience,
                                    json.dumps(payload),
                                    now,
                                ),
                            )
                            count += 1
                    summary["datasets"][dataset] = count
                    summary["total"] += count
    finally:
        conn.close()
    return summary


def verify() -> None:
    conn = psycopg.connect(DSN)
    try:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'eval_cases')"
            )
            if not cur.fetchone()[0]:
                print("eval_cases table does not exist. Run: python3 scripts/load_eval_pack_to_db.py")
                return
            cur.execute("SELECT COUNT(*) FROM eval_cases")
            total = cur.fetchone()[0]
            print(f"eval_cases total: {total}")
            cur.execute(
                "SELECT dataset, COUNT(*) FROM eval_cases GROUP BY dataset ORDER BY dataset"
            )
            for dataset, count in cur.fetchall():
                print(f"  {dataset}: {count}")
    finally:
        conn.close()

    # Source coverage check
    import re

    conn = psycopg.connect(DSN)
    eval_sources: set[str] = set()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT payload FROM eval_cases")
            for (payload,) in cur.fetchall():
                data = payload if isinstance(payload, dict) else json.loads(payload)
                for key in ("required_sources", "expected_relevant_sources"):
                    raw = data.get(key, "")
                    if raw:
                        eval_sources.update(s.strip() for s in raw.split(";") if s.strip())
            cur.execute("SELECT source_id FROM sources")
            db_sources = {r[0] for r in cur.fetchall()}
    finally:
        conn.close()

    missing = sorted(eval_sources - db_sources)
    print(f"unique sources referenced in eval_cases: {len(eval_sources)}")
    print(f"sources registered in DB: {len(db_sources)}")
    if missing:
        print(f"MISSING in sources table ({len(missing)}):", ", ".join(missing[:15]))
    else:
        print("All eval-referenced sources are registered in sources.")


def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--verify-only", action="store_true")
    args = parser.parse_args()
    if args.verify_only:
        verify()
        return 0
    summary = load()
    print(json.dumps(summary, indent=2))
    print("\n--- verification ---")
    verify()
    print("\nRun full golden eval (2500 rows, needs gateway/memory up):")
    print("  python3 eval/runners/run_golden_memory_eval.py")
    print("Or sample:")
    print("  python3 eval/runners/run_golden_memory_eval.py --limit 100")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())