"""Seed claims + evidence assessments for the eval-pack knowledge corpus. Links each claim to the best-matching chunk on its primary source and adds EvidenceAssessment rows so memory search ranks eval sources (not only RET/TEST). Does not modify fixture claims (clm-ret-*, clm-test-*). Usage: python3 scripts/seed_eval_claims.py python3 scripts/seed_eval_claims.py --dry-run """ from __future__ import annotations import argparse import json import os import sys import uuid from dataclasses import dataclass from datetime import UTC, datetime from pathlib import Path REPO_ROOT = Path(__file__).resolve().parent.parent _env_file = REPO_ROOT / ".env" if _env_file.exists(): for raw_line in _env_file.read_text().splitlines(): line = raw_line.strip() if not line or line.startswith("#") or "=" not in line: continue key, value = line.split("=", 1) os.environ.setdefault(key.strip(), value.strip()) import psycopg # noqa: E402 DSN = os.getenv( "AKS_DATABASE_URL", "postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine", ).replace("postgresql+psycopg://", "postgresql://", 1) NOW = datetime.now(UTC) FIXTURE_CLAIM_PREFIXES = ("clm-ret-", "clm-test-") @dataclass(frozen=True) class ClaimSpec: claim_id: str canonical_text: str claim_type: str # DOSE, EFFICACY, SAFETY, ... primary_source_id: str molecule_id: str | None evidence_score: float strength_band: str # HIGH, MEDIUM, LOW extra_chunk_sources: tuple[str, ...] = () # optional supporting sources CLAIM_SPECS: list[ClaimSpec] = [ # DRUG-A label (golden eval anchor) ClaimSpec( "clm-eval-druga-dose", "The recommended dose of DRUG-A is 80 mg once daily for first-line metastatic EGFR-positive NSCLC within approved EU label boundaries.", "DOSE", "LBL-NSCLC-DRUGA-EMA-2024", "DRUG-A", 0.96, "HIGH", ("PK-SUMMARY-NSCLC-005", "SOP-MED-NSCLC-010"), ), ClaimSpec( "clm-eval-druga-dose-reduce", "Dose reductions for DRUG-A must remain within approved EU label boundaries (40 mg once daily when clinically justified for toxicity).", "DOSE", "LBL-NSCLC-DRUGA-EMA-2024", "DRUG-A", 0.94, "HIGH", ), ClaimSpec( "clm-eval-druga-indication", "DRUG-A is indicated for first-line metastatic NSCLC with activating EGFR mutations under the approved EU label.", "BIOMARKER", "LBL-NSCLC-DRUGA-EMA-2024", "DRUG-A", 0.93, "HIGH", ), ClaimSpec( "clm-eval-druga-ild", "Interstitial lung disease is an important risk for DRUG-A; grade 3 or higher ILD requires permanent discontinuation.", "SAFETY", "LBL-NSCLC-DRUGA-EMA-2024", "DRUG-A", 0.90, "HIGH", ("RMP-NSCLC-DRUGA-2024",), ), # DRUG-B / DRUG-C labels (molecule separation) ClaimSpec( "clm-eval-drugb-dose", "DRUG-B dosing and administration must be described separately from DRUG-A using only the approved DRUG-B EU label.", "DOSE", "LBL-NSCLC-DRUGB-EMA-2023", "DRUG-B", 0.88, "HIGH", ), ClaimSpec( "clm-eval-drugc-indication", "DRUG-C is authorised for EGFR-positive metastatic NSCLC within its own approved EU label scope.", "BIOMARKER", "LBL-NSCLC-DRUGC-EMA-2024", "DRUG-C", 0.88, "HIGH", ), # CSRs ClaimSpec( "clm-eval-csr001-pfs", "DRUG-A improved progression-free survival versus chemotherapy in EGFR-positive first-line metastatic NSCLC.", "EFFICACY", "DOC-CSR-NSCLC-001", "DRUG-A", 0.87, "HIGH", ), ClaimSpec( "clm-eval-csr014-pfs", "Progression-free survival and overall response rate favoured DRUG-A in EGFR-positive NSCLC in study DOC-CSR-NSCLC-014.", "EFFICACY", "DOC-CSR-NSCLC-014", "DRUG-A", 0.86, "HIGH", ), # SOPs ClaimSpec( "clm-eval-sop010-dose", "Medical information responses for DRUG-A dosing must use approved EU label content: 80 mg once daily in first-line metastatic NSCLC.", "DOSE", "SOP-MED-NSCLC-010", "DRUG-A", 0.84, "HIGH", ), ClaimSpec( "clm-eval-sop022-rules", "Off-label dosing inquiries for DRUG-A must be routed to SME review and must not be answered as approved truth.", "DOSE", "SOP-MED-NSCLC-022", "DRUG-A", 0.82, "MEDIUM", ), # Guideline + algo ClaimSpec( "clm-eval-gdl-biomarker", "Validated EGFR mutation testing is required before selecting EGFR-targeted therapy in NSCLC.", "BIOMARKER", "GDL-NSCLC-2025-03", None, 0.83, "HIGH", ), ClaimSpec( "clm-eval-gdl-line", "First-line and later-line NSCLC therapy boundaries must not be conflated; adjuvant use is outside approved metastatic label scope.", "LINE_OF_THERAPY", "GDL-NSCLC-2025-03", None, 0.81, "MEDIUM", ), ClaimSpec( "clm-eval-algo-pathway", "Treatment algorithm step 1 requires confirmed EGFR mutation before first-line metastatic DRUG-A consideration.", "BIOMARKER", "TREATMENT-ALGO-NSCLC-2025-02", "DRUG-A", 0.80, "MEDIUM", ), # RMP / PK / FAQ / playbook / SME ClaimSpec( "clm-eval-rmp-ild", "RMP identifies interstitial lung disease and QT prolongation as important identified risks for DRUG-A.", "SAFETY", "RMP-NSCLC-DRUGA-2024", "DRUG-A", 0.85, "HIGH", ), ClaimSpec( "clm-eval-pk-dose", "DRUG-A 80 mg once daily achieves target exposure; renal impairment requires cautious judgement without unsupported fixed-dose rules.", "DOSE", "PK-SUMMARY-NSCLC-005", "DRUG-A", 0.82, "MEDIUM", ), ClaimSpec( "clm-eval-faq-missed", "Patient-facing missed-dose guidance must follow approved label language only; do not invent rescue dosing instructions.", "DOSE", "MI-FAQ-NSCLC-021", "DRUG-A", 0.78, "MEDIUM", ), ClaimSpec( "clm-eval-playbook-boundary", "Medical affairs playbook keeps DRUG-A, DRUG-B, and DRUG-C narratives separate and excludes adjuvant-only messaging from metastatic approval scope.", "LINE_OF_THERAPY", "MED-AFF-NSCLC-PLAYBOOK-008", None, 0.77, "MEDIUM", ), ClaimSpec( "clm-eval-sme-compare", "SME review: comparative statements for DRUG-A require explicit label or CSR grounding and must avoid superiority overclaim.", "COMPARISON_BOUNDARY", "SME-NOTE-NSCLC-017", "DRUG-A", 0.76, "MEDIUM", ), ] # Cross-claim graph: label dose supported by PK + SOP GRAPH_EDGES = [ ("clm-eval-druga-dose", "clm-eval-pk-dose", "SUPPORTED_BY"), ("clm-eval-druga-dose", "clm-eval-sop010-dose", "SUPPORTED_BY"), ("clm-eval-druga-ild", "clm-eval-rmp-ild", "SUPPORTED_BY"), ("clm-eval-csr001-pfs", "clm-eval-druga-indication", "SUPPORTED_BY"), ] def _token_overlap(claim_text: str, chunk_text: str, claim_type: str, chunk_type: str) -> float: claim_tokens = { t for t in (w.strip(".,;:()").lower() for w in claim_text.split()) if len(t) > 3 } chunk_tokens = { t for t in (w.strip(".,;:()").lower() for w in chunk_text.split()) if len(t) > 3 } overlap = len(claim_tokens & chunk_tokens) if claim_type == chunk_type: overlap += 8 return overlap def _best_chunk( cur: "psycopg.Cursor", source_id: str, spec: ClaimSpec, ) -> str | None: cur.execute( """ SELECT chunk_id, claim_type, text FROM chunks WHERE source_id = %s ORDER BY token_count DESC """, (source_id,), ) rows = cur.fetchall() if not rows: return None best_id = None best_score = -1.0 for chunk_id, chunk_type, text in rows: score = _token_overlap(spec.canonical_text, text, spec.claim_type, str(chunk_type)) if score > best_score: best_score = score best_id = chunk_id return best_id def seed(*, dry_run: bool) -> dict: summary = {"claims_upserted": 0, "links": 0, "assessments": 0, "graph_edges": 0, "skipped": []} conn = psycopg.connect(DSN) try: with conn: with conn.cursor() as cur: for spec in CLAIM_SPECS: primary_chunk = _best_chunk(cur, spec.primary_source_id, spec) if primary_chunk is None: summary["skipped"].append(spec.claim_id) continue if dry_run: summary["claims_upserted"] += 1 continue cur.execute( """ INSERT INTO claims ( claim_id, canonical_text, claim_type, molecule_id, geography_id, approval_state, sme_validated, primary_source_id, current_evidence_score, strength_band, created_at ) VALUES (%s, %s, %s, %s, %s, %s, FALSE, %s, %s, %s, %s) ON CONFLICT (claim_id) DO UPDATE SET canonical_text = EXCLUDED.canonical_text, claim_type = EXCLUDED.claim_type, molecule_id = EXCLUDED.molecule_id, primary_source_id = EXCLUDED.primary_source_id, current_evidence_score = EXCLUDED.current_evidence_score, strength_band = EXCLUDED.strength_band """, ( spec.claim_id, spec.canonical_text, spec.claim_type, spec.molecule_id, "EU / EMA", "approved", spec.primary_source_id, spec.evidence_score, spec.strength_band, NOW, ), ) summary["claims_upserted"] += 1 cur.execute( "DELETE FROM claim_evidence_links WHERE claim_id = %s", (spec.claim_id,), ) def link_chunk(chunk_id: str, source_id: str, *, primary: bool) -> None: cur.execute( """ INSERT INTO claim_evidence_links ( claim_id, chunk_id, source_id, support_type, extraction_confidence, is_primary_support ) VALUES (%s, %s, %s, 'PRIMARY', %s, %s) ON CONFLICT DO NOTHING """, ( spec.claim_id, chunk_id, source_id, 0.95 if primary else 0.75, primary, ), ) summary["links"] += cur.rowcount or 0 link_chunk(primary_chunk, spec.primary_source_id, primary=True) for extra_source in spec.extra_chunk_sources: extra_chunk = _best_chunk(cur, extra_source, spec) if extra_chunk: link_chunk(extra_chunk, extra_source, primary=False) assessment_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"eval-asm-{spec.claim_id}")) cur.execute( "DELETE FROM evidence_assessments WHERE assessment_id = %s", (assessment_id,), ) cur.execute( """ INSERT INTO evidence_assessments ( assessment_id, claim_id, source_prior_score, recency_score, approval_score, sme_score, consistency_score, audience_fit_score, geography_fit_score, penalty_score, evidence_score, strength_band, explanation_json, scored_at ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::json, %s ) """, ( assessment_id, spec.claim_id, 0.95 if spec.primary_source_id.startswith("LBL-") else 0.85, 0.90, 1.0, 0.70, 0.90, 1.0, 1.0, 0.0, spec.evidence_score, spec.strength_band, json.dumps( { "reasons": [ f"Seeded from eval corpus primary source {spec.primary_source_id}" ] } ), NOW, ), ) summary["assessments"] += 1 if not dry_run: for from_id, to_id, rel in GRAPH_EDGES: rel_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"eval-rel-{from_id}-{to_id}")) cur.execute( "DELETE FROM claim_relationships WHERE relationship_id = %s", (rel_id,), ) cur.execute( """ INSERT INTO claim_relationships ( relationship_id, from_claim_id, to_claim_id, relation_type, relation_metadata, created_at ) VALUES (%s, %s, %s, %s, %s::json, %s) """, ( rel_id, from_id, to_id, rel, json.dumps({"seed": "eval_corpus"}), NOW, ), ) summary["graph_edges"] += 1 finally: conn.close() return summary def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--dry-run", action="store_true") args = parser.parse_args() summary = seed(dry_run=args.dry_run) print(json.dumps(summary, indent=2)) if args.dry_run: return 0 conn = psycopg.connect(DSN) try: with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM claims WHERE claim_id LIKE 'clm-eval-%'") print(f"eval claims in DB: {cur.fetchone()[0]}") cur.execute( """ SELECT c.primary_source_id, COUNT(*) FROM claims c WHERE c.claim_id LIKE 'clm-eval-%' GROUP BY c.primary_source_id ORDER BY 1 """ ) print("by primary source:") for row in cur.fetchall(): print(f" {row[0]}: {row[1]}") finally: conn.close() return 0 if __name__ == "__main__": raise SystemExit(main())