"""Seed graph ontology entities and link them to eval + fixture claims.

Populates diseases, molecules, populations, geographies, studies, endpoints,
safety_risks, and junction tables. Updates claim FK fields to use graph IDs.
Adds molecule–disease and claim–study/endpoint/risk edges.

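Existing claim_relationships are left in place; only this script's own extra
edges (deterministic relationship IDs) are deleted and re-inserted on each run.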

Usage:
    python3 scripts/seed_eval_graph_entities.py
"""

from __future__ import annotations

import json
import os
import sys
import uuid
from datetime import UTC, datetime
from pathlib import Path

# Repository root: the parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
_env_file = REPO_ROOT / ".env"
if _env_file.exists():
    # Minimal KEY=VALUE loader. Blank lines, "#" comments and lines without
    # an "=" are ignored; everything else is split on the first "=".
    for raw_line in _env_file.read_text().splitlines():
        line = raw_line.strip()
        if line and not line.startswith("#") and "=" in line:
            key, value = line.split("=", maxsplit=1)
            # setdefault keeps any value already present in the environment.
            os.environ.setdefault(key.strip(), value.strip())

import psycopg  # noqa: E402

DSN = os.getenv(
    "AKS_DATABASE_URL",
    "postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine",
).replace("postgresql+psycopg://", "postgresql://", 1)

NOW = datetime.now(UTC)

# --- Core ontology nodes ---
# Each list below holds one tuple per row; seed() upserts every tuple into the
# table of the same name, keyed on the first element.

# (disease_id, name, therapy_area)
DISEASES = [
    ("dis-nsclc", "Non-Small Cell Lung Cancer (NSCLC)", "NSCLC"),
]

# (molecule_id, name)
MOLECULES = [
    ("DRUG-A", "DRUG-A"),
    ("DRUG-B", "DRUG-B"),
    ("DRUG-C", "DRUG-C"),
]

# (population_id, name)
# Note: seed() assigns only "pop-egfr-metastatic" to claims; the other row is
# created but not referenced by this script.
POPULATIONS = [
    ("pop-egfr-metastatic", "Adults with EGFR-positive metastatic NSCLC"),
    ("pop-adult-standard", "Standard adult patients"),
]

# (geography_id, name, regulatory_scope)
GEOGRAPHIES = [
    ("geo-eu-ema", "EU / EMA", "EMA"),
]

# (study_id, title)
STUDIES = [
    ("study-csr-001", "DOC-CSR-NSCLC-001 pivotal trial"),
    ("study-csr-014", "DOC-CSR-NSCLC-014 confirmatory trial"),
]

# (endpoint_id, name)
ENDPOINTS = [
    ("ep-pfs", "Progression-free survival"),
    ("ep-os", "Overall survival"),
    ("ep-orr", "Objective response rate"),
]

# (risk_id, name)
SAFETY_RISKS = [
    ("risk-ild", "Interstitial lung disease"),
    ("risk-qt", "QT prolongation"),
    ("risk-hepato", "Hepatotoxicity"),
]

# --- Edges ---
# (molecule_id, disease_id, relation_type) rows for molecule_disease_links.
MOLECULE_DISEASE = [
    ("DRUG-A", "dis-nsclc", "TREATS"),
    ("DRUG-B", "dis-nsclc", "TREATS"),
    ("DRUG-C", "dis-nsclc", "TREATS"),
]

# claim_id -> study_ids; one claim_study_links row is inserted per pair.
CLAIM_STUDIES: dict[str, list[str]] = {
    "clm-eval-csr001-pfs": ["study-csr-001"],
    "clm-eval-csr014-pfs": ["study-csr-014"],
    "clm-ret-csr-1": ["study-csr-001"],
    "clm-test-csr-1": ["study-csr-014"],
}

# claim_id -> endpoint_ids; one claim_endpoint_links row is inserted per pair.
CLAIM_ENDPOINTS: dict[str, list[str]] = {
    "clm-eval-csr001-pfs": ["ep-pfs", "ep-orr"],
    "clm-eval-csr014-pfs": ["ep-pfs", "ep-os", "ep-orr"],
    "clm-ret-csr-1": ["ep-pfs"],
    "clm-test-csr-1": ["ep-pfs"],
}

# claim_id -> risk_ids; one claim_risk_links row is inserted per pair.
CLAIM_RISKS: dict[str, list[str]] = {
    "clm-eval-druga-ild": ["risk-ild", "risk-qt"],
    "clm-eval-rmp-ild": ["risk-ild"],
}

# Extra claim-to-claim edges (beyond seed_eval_claims.py), as
# (from_claim_id, to_claim_id, relation_type). seed() writes each one to
# claim_relationships under a deterministic relationship_id from _rel_id().
EXTRA_CLAIM_EDGES = [
    ("clm-eval-druga-indication", "clm-eval-gdl-biomarker", "SUPPORTED_BY"),
    ("clm-eval-druga-dose-reduce", "clm-eval-sop022-rules", "SUPPORTED_BY"),
    ("clm-eval-drugb-dose", "clm-eval-drugc-indication", "CONTRADICTED_BY"),
]

# primary_source_id -> fallback molecule_id. seed() consults this only for
# claims whose own molecule_id is empty and whose primary_source_id is set.
# A None value (or a missing key) leaves the claim's molecule_id unchanged.
SOURCE_MOLECULE = {
    "LBL-NSCLC-DRUGA-EMA-2024": "DRUG-A",
    "LBL-NSCLC-DRUGB-EMA-2023": "DRUG-B",
    "LBL-NSCLC-DRUGC-EMA-2024": "DRUG-C",
    "DOC-CSR-NSCLC-001": "DRUG-A",
    "DOC-CSR-NSCLC-014": "DRUG-A",
    "RMP-NSCLC-DRUGA-2024": "DRUG-A",
    "PK-SUMMARY-NSCLC-005": "DRUG-A",
    "SOP-MED-NSCLC-010": "DRUG-A",
    "SOP-MED-NSCLC-022": "DRUG-A",
    "GDL-NSCLC-2025-03": None,
    "MI-FAQ-NSCLC-021": "DRUG-A",
    "MED-AFF-NSCLC-PLAYBOOK-008": None,
    "TREATMENT-ALGO-NSCLC-2025-02": "DRUG-A",
    "SME-NOTE-NSCLC-017": "DRUG-A",
    "LBL-NSCLC-RET-EMA-2026": "DRUG-A",
    "LBL-NSCLC-TEST-EMA-2026": "DRUG-A",
    "DOC-CSR-NSCLC-RET-2026": "DRUG-A",
    "DOC-CSR-NSCLC-TEST-2026": "DRUG-A",
}


def _rel_id(from_id: str, to_id: str, rel: str) -> str:
    """Return a deterministic relationship ID for a claim-to-claim edge.

    The ID is the string form of a version-5 UUID computed in the DNS
    namespace from ``eval-graph-<from_id>-<to_id>-<rel>``, so the same
    (from_id, to_id, rel) triple yields the same ID on every run.
    """
    edge_key = f"eval-graph-{from_id}-{to_id}-{rel}"
    edge_uuid = uuid.uuid5(uuid.NAMESPACE_DNS, edge_key)
    return str(edge_uuid)


def _execute_each(cur, sql: str, rows: list[tuple]) -> int:
    """Run *sql* once per parameter tuple in *rows* on cursor *cur*.

    Returns the number of tuples submitted. Entity upserts are counted this
    way: a row counts whether the statement inserted or updated it.
    """
    for params in rows:
        cur.execute(sql, params)
    return len(rows)


def _insert_each(cur, sql: str, rows: list[tuple]) -> int:
    """Run an ``ON CONFLICT DO NOTHING`` insert once per tuple in *rows*.

    Returns the sum of ``cur.rowcount`` over all statements, so a statement
    that inserted nothing adds nothing to the total.
    """
    inserted = 0
    for params in rows:
        cur.execute(sql, params)
        inserted += cur.rowcount or 0
    return inserted


def seed() -> dict:
    """Seed the ontology tables and link claims to them in one transaction.

    Steps, in order:

    1. Upsert diseases, molecules, populations, geographies, studies,
       endpoints and safety_risks. On conflict every seeded non-key column is
       refreshed from the seed data.
    2. Insert molecule-disease links (existing rows are left alone).
    3. For every row in ``claims``: set disease_id, population_id and
       geography_id to the NSCLC / EGFR-metastatic / EU-EMA graph IDs, and
       fill molecule_id only where it is NULL, using ``SOURCE_MOLECULE`` as
       the fallback keyed on primary_source_id.
    4. Insert claim-study, claim-endpoint and claim-risk links (existing rows
       are left alone).
    5. Delete and re-insert each ``EXTRA_CLAIM_EDGES`` row under its
       deterministic relationship_id; other claim_relationships rows are not
       touched.

    All statements run inside ``with conn``, which in psycopg 3 commits on
    normal exit and rolls back if an exception escapes.

    Returns:
        Counters keyed by ``entities``, ``molecule_disease_links``,
        ``claims_updated``, ``claim_study_links``, ``claim_endpoint_links``,
        ``claim_risk_links`` and ``extra_claim_edges``. Link counters count
        rows actually inserted; the others count statements issued.
    """
    summary = {
        "entities": 0,
        "molecule_disease_links": 0,
        "claims_updated": 0,
        "claim_study_links": 0,
        "claim_endpoint_links": 0,
        "claim_risk_links": 0,
        "extra_claim_edges": 0,
    }

    # (statement, parameter rows) per ontology table, in seeding order.
    entity_upserts = [
        (
            """
            INSERT INTO diseases (disease_id, name, therapy_area)
            VALUES (%s, %s, %s)
            ON CONFLICT (disease_id) DO UPDATE SET
                name = EXCLUDED.name,
                therapy_area = EXCLUDED.therapy_area
            """,
            DISEASES,
        ),
        (
            """
            INSERT INTO molecules (molecule_id, name) VALUES (%s, %s)
            ON CONFLICT (molecule_id) DO UPDATE SET name = EXCLUDED.name
            """,
            MOLECULES,
        ),
        (
            """
            INSERT INTO populations (population_id, name) VALUES (%s, %s)
            ON CONFLICT (population_id) DO UPDATE SET name = EXCLUDED.name
            """,
            POPULATIONS,
        ),
        (
            """
            INSERT INTO geographies (geography_id, name, regulatory_scope)
            VALUES (%s, %s, %s)
            ON CONFLICT (geography_id) DO UPDATE SET
                name = EXCLUDED.name,
                regulatory_scope = EXCLUDED.regulatory_scope
            """,
            GEOGRAPHIES,
        ),
        (
            """
            INSERT INTO studies (study_id, title) VALUES (%s, %s)
            ON CONFLICT (study_id) DO UPDATE SET title = EXCLUDED.title
            """,
            STUDIES,
        ),
        (
            """
            INSERT INTO endpoints (endpoint_id, name) VALUES (%s, %s)
            ON CONFLICT (endpoint_id) DO UPDATE SET name = EXCLUDED.name
            """,
            ENDPOINTS,
        ),
        (
            """
            INSERT INTO safety_risks (risk_id, name) VALUES (%s, %s)
            ON CONFLICT (risk_id) DO UPDATE SET name = EXCLUDED.name
            """,
            SAFETY_RISKS,
        ),
    ]

    # (summary key, statement, claim_id -> target ids) per claim junction table.
    claim_link_inserts = [
        (
            "claim_study_links",
            """
            INSERT INTO claim_study_links (claim_id, study_id)
            VALUES (%s, %s) ON CONFLICT DO NOTHING
            """,
            CLAIM_STUDIES,
        ),
        (
            "claim_endpoint_links",
            """
            INSERT INTO claim_endpoint_links (claim_id, endpoint_id)
            VALUES (%s, %s) ON CONFLICT DO NOTHING
            """,
            CLAIM_ENDPOINTS,
        ),
        (
            "claim_risk_links",
            """
            INSERT INTO claim_risk_links (claim_id, risk_id)
            VALUES (%s, %s) ON CONFLICT DO NOTHING
            """,
            CLAIM_RISKS,
        ),
    ]

    conn = psycopg.connect(DSN)
    try:
        with conn, conn.cursor() as cur:
            for sql, rows in entity_upserts:
                summary["entities"] += _execute_each(cur, sql, rows)

            summary["molecule_disease_links"] = _insert_each(
                cur,
                """
                INSERT INTO molecule_disease_links (molecule_id, disease_id, relation_type)
                VALUES (%s, %s, %s) ON CONFLICT DO NOTHING
                """,
                MOLECULE_DISEASE,
            )

            # Every claim in the table is re-pointed at the graph IDs; there
            # is no WHERE filter on the SELECT.
            cur.execute("SELECT claim_id, molecule_id, primary_source_id FROM claims")
            for claim_id, molecule_id, primary_source_id in cur.fetchall():
                fallback_molecule = molecule_id
                if not fallback_molecule and primary_source_id:
                    fallback_molecule = SOURCE_MOLECULE.get(primary_source_id)
                # COALESCE keeps an existing non-NULL molecule_id; the
                # fallback is applied only when the column is NULL.
                cur.execute(
                    """
                    UPDATE claims SET
                        disease_id = %s,
                        population_id = %s,
                        geography_id = %s,
                        molecule_id = COALESCE(molecule_id, %s)
                    WHERE claim_id = %s
                    """,
                    (
                        "dis-nsclc",
                        "pop-egfr-metastatic",
                        "geo-eu-ema",
                        fallback_molecule,
                        claim_id,
                    ),
                )
                summary["claims_updated"] += 1

            for summary_key, sql, mapping in claim_link_inserts:
                pairs = [
                    (claim_id, target_id)
                    for claim_id, target_ids in mapping.items()
                    for target_id in target_ids
                ]
                summary[summary_key] = _insert_each(cur, sql, pairs)

            for from_id, to_id, rel in EXTRA_CLAIM_EDGES:
                rel_id = _rel_id(from_id, to_id, rel)
                # Delete-then-insert keeps reruns idempotent while touching
                # only the row with this deterministic relationship_id.
                cur.execute(
                    "DELETE FROM claim_relationships WHERE relationship_id = %s",
                    (rel_id,),
                )
                cur.execute(
                    """
                    INSERT INTO claim_relationships (
                        relationship_id, from_claim_id, to_claim_id,
                        relation_type, relation_metadata, created_at
                    ) VALUES (%s, %s, %s, %s, %s::json, %s)
                    """,
                    (
                        rel_id,
                        from_id,
                        to_id,
                        rel,
                        json.dumps({"seed": "eval_graph_entities"}),
                        NOW,
                    ),
                )
                summary["extra_claim_edges"] += 1
    finally:
        conn.close()

    return summary


def main() -> int:
    """Run the seed, print its summary, then print a row count per table.

    The summary dict returned by ``seed()`` is printed as indented JSON. A
    second connection is then opened to print ``<table>: <count>`` for each
    table the seed writes to.

    Returns:
        Always ``0``; the entry-point guard uses it as the exit status.
    """
    print(json.dumps(seed(), indent=2))

    # Fixed list of tables to count; the names are interpolated into SQL
    # below, so they must stay hard-coded.
    tables_to_count = (
        "diseases",
        "molecules",
        "populations",
        "geographies",
        "studies",
        "endpoints",
        "safety_risks",
        "molecule_disease_links",
        "claim_study_links",
        "claim_endpoint_links",
        "claim_risk_links",
        "claim_relationships",
    )

    conn = psycopg.connect(DSN)
    try:
        with conn.cursor() as cur:
            for table in tables_to_count:
                cur.execute(f"SELECT COUNT(*) FROM {table}")
                row = cur.fetchone()
                print(f"{table}: {row[0]}")
    finally:
        conn.close()
    return 0


if __name__ == "__main__":
    # Exit with main()'s return value as the process status.
    sys.exit(main())