"""Seed graph ontology entities and link them to eval + fixture claims. Populates diseases, molecules, populations, geographies, studies, endpoints, safety_risks, and junction tables. Updates claim FK fields to use graph IDs. Adds molecule–disease and claim–study/endpoint/risk edges. Does not delete existing claim_relationships. Usage: python3 scripts/seed_eval_graph_entities.py """ from __future__ import annotations import json import os import sys import uuid from datetime import UTC, datetime from pathlib import Path REPO_ROOT = Path(__file__).resolve().parent.parent _env_file = REPO_ROOT / ".env" if _env_file.exists(): for raw_line in _env_file.read_text().splitlines(): line = raw_line.strip() if not line or line.startswith("#") or "=" not in line: continue key, value = line.split("=", 1) os.environ.setdefault(key.strip(), value.strip()) import psycopg # noqa: E402 DSN = os.getenv( "AKS_DATABASE_URL", "postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine", ).replace("postgresql+psycopg://", "postgresql://", 1) NOW = datetime.now(UTC) # --- Core ontology nodes --- DISEASES = [ ("dis-nsclc", "Non-Small Cell Lung Cancer (NSCLC)", "NSCLC"), ] MOLECULES = [ ("DRUG-A", "DRUG-A"), ("DRUG-B", "DRUG-B"), ("DRUG-C", "DRUG-C"), ] POPULATIONS = [ ("pop-egfr-metastatic", "Adults with EGFR-positive metastatic NSCLC"), ("pop-adult-standard", "Standard adult patients"), ] GEOGRAPHIES = [ ("geo-eu-ema", "EU / EMA", "EMA"), ] STUDIES = [ ("study-csr-001", "DOC-CSR-NSCLC-001 pivotal trial"), ("study-csr-014", "DOC-CSR-NSCLC-014 confirmatory trial"), ] ENDPOINTS = [ ("ep-pfs", "Progression-free survival"), ("ep-os", "Overall survival"), ("ep-orr", "Objective response rate"), ] SAFETY_RISKS = [ ("risk-ild", "Interstitial lung disease"), ("risk-qt", "QT prolongation"), ("risk-hepato", "Hepatotoxicity"), ] MOLECULE_DISEASE = [ ("DRUG-A", "dis-nsclc", "TREATS"), ("DRUG-B", "dis-nsclc", "TREATS"), ("DRUG-C", "dis-nsclc", "TREATS"), ] # claim_id -> study_ids CLAIM_STUDIES: dict[str, list[str]] = { "clm-eval-csr001-pfs": ["study-csr-001"], "clm-eval-csr014-pfs": ["study-csr-014"], "clm-ret-csr-1": ["study-csr-001"], "clm-test-csr-1": ["study-csr-014"], } # claim_id -> endpoint_ids CLAIM_ENDPOINTS: dict[str, list[str]] = { "clm-eval-csr001-pfs": ["ep-pfs", "ep-orr"], "clm-eval-csr014-pfs": ["ep-pfs", "ep-os", "ep-orr"], "clm-ret-csr-1": ["ep-pfs"], "clm-test-csr-1": ["ep-pfs"], } # claim_id -> risk_ids CLAIM_RISKS: dict[str, list[str]] = { "clm-eval-druga-ild": ["risk-ild", "risk-qt"], "clm-eval-rmp-ild": ["risk-ild"], } # Extra claim–claim edges (beyond seed_eval_claims.py) EXTRA_CLAIM_EDGES = [ ("clm-eval-druga-indication", "clm-eval-gdl-biomarker", "SUPPORTED_BY"), ("clm-eval-druga-dose-reduce", "clm-eval-sop022-rules", "SUPPORTED_BY"), ("clm-eval-drugb-dose", "clm-eval-drugc-indication", "CONTRADICTED_BY"), ] # molecule_id inferred from claim molecule_id or primary source SOURCE_MOLECULE = { "LBL-NSCLC-DRUGA-EMA-2024": "DRUG-A", "LBL-NSCLC-DRUGB-EMA-2023": "DRUG-B", "LBL-NSCLC-DRUGC-EMA-2024": "DRUG-C", "DOC-CSR-NSCLC-001": "DRUG-A", "DOC-CSR-NSCLC-014": "DRUG-A", "RMP-NSCLC-DRUGA-2024": "DRUG-A", "PK-SUMMARY-NSCLC-005": "DRUG-A", "SOP-MED-NSCLC-010": "DRUG-A", "SOP-MED-NSCLC-022": "DRUG-A", "GDL-NSCLC-2025-03": None, "MI-FAQ-NSCLC-021": "DRUG-A", "MED-AFF-NSCLC-PLAYBOOK-008": None, "TREATMENT-ALGO-NSCLC-2025-02": "DRUG-A", "SME-NOTE-NSCLC-017": "DRUG-A", "LBL-NSCLC-RET-EMA-2026": "DRUG-A", "LBL-NSCLC-TEST-EMA-2026": "DRUG-A", "DOC-CSR-NSCLC-RET-2026": "DRUG-A", "DOC-CSR-NSCLC-TEST-2026": "DRUG-A", } def _rel_id(from_id: str, to_id: str, rel: str) -> str: return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"eval-graph-{from_id}-{to_id}-{rel}")) def seed() -> dict: summary = { "entities": 0, "molecule_disease_links": 0, "claims_updated": 0, "claim_study_links": 0, "claim_endpoint_links": 0, "claim_risk_links": 0, "extra_claim_edges": 0, } conn = psycopg.connect(DSN) try: with conn: with conn.cursor() as cur: for disease_id, name, therapy in DISEASES: cur.execute( """ INSERT INTO diseases (disease_id, name, therapy_area) VALUES (%s, %s, %s) ON CONFLICT (disease_id) DO UPDATE SET name = EXCLUDED.name """, (disease_id, name, therapy), ) summary["entities"] += 1 for mol_id, name in MOLECULES: cur.execute( """ INSERT INTO molecules (molecule_id, name) VALUES (%s, %s) ON CONFLICT (molecule_id) DO UPDATE SET name = EXCLUDED.name """, (mol_id, name), ) summary["entities"] += 1 for pop_id, name in POPULATIONS: cur.execute( """ INSERT INTO populations (population_id, name) VALUES (%s, %s) ON CONFLICT (population_id) DO UPDATE SET name = EXCLUDED.name """, (pop_id, name), ) summary["entities"] += 1 for geo_id, name, scope in GEOGRAPHIES: cur.execute( """ INSERT INTO geographies (geography_id, name, regulatory_scope) VALUES (%s, %s, %s) ON CONFLICT (geography_id) DO UPDATE SET name = EXCLUDED.name """, (geo_id, name, scope), ) summary["entities"] += 1 for study_id, title in STUDIES: cur.execute( """ INSERT INTO studies (study_id, title) VALUES (%s, %s) ON CONFLICT (study_id) DO UPDATE SET title = EXCLUDED.title """, (study_id, title), ) summary["entities"] += 1 for ep_id, name in ENDPOINTS: cur.execute( """ INSERT INTO endpoints (endpoint_id, name) VALUES (%s, %s) ON CONFLICT (endpoint_id) DO UPDATE SET name = EXCLUDED.name """, (ep_id, name), ) summary["entities"] += 1 for risk_id, name in SAFETY_RISKS: cur.execute( """ INSERT INTO safety_risks (risk_id, name) VALUES (%s, %s) ON CONFLICT (risk_id) DO UPDATE SET name = EXCLUDED.name """, (risk_id, name), ) summary["entities"] += 1 for mol_id, dis_id, rel in MOLECULE_DISEASE: cur.execute( """ INSERT INTO molecule_disease_links (molecule_id, disease_id, relation_type) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING """, (mol_id, dis_id, rel), ) summary["molecule_disease_links"] += cur.rowcount or 0 cur.execute("SELECT claim_id, molecule_id, primary_source_id FROM claims") for claim_id, molecule_id, primary_source_id in cur.fetchall(): mol = molecule_id if not mol and primary_source_id: mol = SOURCE_MOLECULE.get(primary_source_id) cur.execute( """ UPDATE claims SET disease_id = %s, population_id = %s, geography_id = %s, molecule_id = COALESCE(molecule_id, %s) WHERE claim_id = %s """, ("dis-nsclc", "pop-egfr-metastatic", "geo-eu-ema", mol, claim_id), ) summary["claims_updated"] += 1 for claim_id, study_ids in CLAIM_STUDIES.items(): for study_id in study_ids: cur.execute( """ INSERT INTO claim_study_links (claim_id, study_id) VALUES (%s, %s) ON CONFLICT DO NOTHING """, (claim_id, study_id), ) summary["claim_study_links"] += cur.rowcount or 0 for claim_id, endpoint_ids in CLAIM_ENDPOINTS.items(): for endpoint_id in endpoint_ids: cur.execute( """ INSERT INTO claim_endpoint_links (claim_id, endpoint_id) VALUES (%s, %s) ON CONFLICT DO NOTHING """, (claim_id, endpoint_id), ) summary["claim_endpoint_links"] += cur.rowcount or 0 for claim_id, risk_ids in CLAIM_RISKS.items(): for risk_id in risk_ids: cur.execute( """ INSERT INTO claim_risk_links (claim_id, risk_id) VALUES (%s, %s) ON CONFLICT DO NOTHING """, (claim_id, risk_id), ) summary["claim_risk_links"] += cur.rowcount or 0 for from_id, to_id, rel in EXTRA_CLAIM_EDGES: rel_id = _rel_id(from_id, to_id, rel) cur.execute( "DELETE FROM claim_relationships WHERE relationship_id = %s", (rel_id,), ) cur.execute( """ INSERT INTO claim_relationships ( relationship_id, from_claim_id, to_claim_id, relation_type, relation_metadata, created_at ) VALUES (%s, %s, %s, %s, %s::json, %s) """, ( rel_id, from_id, to_id, rel, json.dumps({"seed": "eval_graph_entities"}), NOW, ), ) summary["extra_claim_edges"] += 1 finally: conn.close() return summary def main() -> int: summary = seed() print(json.dumps(summary, indent=2)) conn = psycopg.connect(DSN) try: with conn.cursor() as cur: for table in [ "diseases", "molecules", "populations", "geographies", "studies", "endpoints", "safety_risks", "molecule_disease_links", "claim_study_links", "claim_endpoint_links", "claim_risk_links", "claim_relationships", ]: cur.execute(f"SELECT COUNT(*) FROM {table}") print(f"{table}: {cur.fetchone()[0]}") finally: conn.close() return 0 if __name__ == "__main__": raise SystemExit(main())