"""Seed claims + evidence assessments for the eval-pack knowledge corpus.

Links each claim to the best-matching chunk on its primary source and adds
EvidenceAssessment rows so memory search ranks eval sources (not only RET/TEST).

Does not modify fixture claims (clm-ret-*, clm-test-*).

Usage:
    python3 scripts/seed_eval_claims.py
    python3 scripts/seed_eval_claims.py --dry-run
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import uuid
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
_env_file = REPO_ROOT / ".env"
if _env_file.exists():
    for raw_line in _env_file.read_text().splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        os.environ.setdefault(key.strip(), value.strip())

import psycopg  # noqa: E402

DSN = os.getenv(
    "AKS_DATABASE_URL",
    "postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine",
).replace("postgresql+psycopg://", "postgresql://", 1)

NOW = datetime.now(UTC)

FIXTURE_CLAIM_PREFIXES = ("clm-ret-", "clm-test-")


@dataclass(frozen=True)
class ClaimSpec:
    """Immutable description of one eval claim to seed.

    Instances in ``CLAIM_SPECS`` are constructed positionally, so the field
    order below is part of the contract and must not be rearranged.
    """

    # Key of the ``claims`` row; the upsert conflicts on this id.
    claim_id: str
    # Claim wording; stored on the row and used for token-overlap chunk matching.
    canonical_text: str
    # Stored on the row; a chunk with the same claim_type gets a matching bonus.
    claim_type: str  # DOSE, EFFICACY, SAFETY, ...
    # Source whose chunks are searched for the primary evidence link.
    primary_source_id: str
    # Stored as claims.molecule_id; None when the claim names no single molecule.
    molecule_id: str | None
    # Written to claims.current_evidence_score and evidence_assessments.evidence_score.
    evidence_score: float
    # Written alongside evidence_score on both the claim and its assessment.
    strength_band: str  # HIGH, MEDIUM, LOW
    # Further source ids; the best chunk of each is linked as non-primary support.
    extra_chunk_sources: tuple[str, ...] = ()  # optional supporting sources


# Eval-corpus claims to seed. Arguments are positional, in ClaimSpec field
# order: claim_id, canonical_text, claim_type, primary_source_id, molecule_id,
# evidence_score, strength_band[, extra_chunk_sources].
# All claim ids use the "clm-eval-" prefix, which main() relies on when it
# counts the seeded rows.
CLAIM_SPECS: list[ClaimSpec] = [
    # DRUG-A label (golden eval anchor)
    ClaimSpec(
        "clm-eval-druga-dose",
        "The recommended dose of DRUG-A is 80 mg once daily for first-line metastatic EGFR-positive NSCLC within approved EU label boundaries.",
        "DOSE",
        "LBL-NSCLC-DRUGA-EMA-2024",
        "DRUG-A",
        0.96,
        "HIGH",
        ("PK-SUMMARY-NSCLC-005", "SOP-MED-NSCLC-010"),
    ),
    ClaimSpec(
        "clm-eval-druga-dose-reduce",
        "Dose reductions for DRUG-A must remain within approved EU label boundaries (40 mg once daily when clinically justified for toxicity).",
        "DOSE",
        "LBL-NSCLC-DRUGA-EMA-2024",
        "DRUG-A",
        0.94,
        "HIGH",
    ),
    ClaimSpec(
        "clm-eval-druga-indication",
        "DRUG-A is indicated for first-line metastatic NSCLC with activating EGFR mutations under the approved EU label.",
        "BIOMARKER",
        "LBL-NSCLC-DRUGA-EMA-2024",
        "DRUG-A",
        0.93,
        "HIGH",
    ),
    ClaimSpec(
        "clm-eval-druga-ild",
        "Interstitial lung disease is an important risk for DRUG-A; grade 3 or higher ILD requires permanent discontinuation.",
        "SAFETY",
        "LBL-NSCLC-DRUGA-EMA-2024",
        "DRUG-A",
        0.90,
        "HIGH",
        ("RMP-NSCLC-DRUGA-2024",),
    ),
    # DRUG-B / DRUG-C labels (molecule separation)
    ClaimSpec(
        "clm-eval-drugb-dose",
        "DRUG-B dosing and administration must be described separately from DRUG-A using only the approved DRUG-B EU label.",
        "DOSE",
        "LBL-NSCLC-DRUGB-EMA-2023",
        "DRUG-B",
        0.88,
        "HIGH",
    ),
    ClaimSpec(
        "clm-eval-drugc-indication",
        "DRUG-C is authorised for EGFR-positive metastatic NSCLC within its own approved EU label scope.",
        "BIOMARKER",
        "LBL-NSCLC-DRUGC-EMA-2024",
        "DRUG-C",
        0.88,
        "HIGH",
    ),
    # Clinical study reports (CSRs)
    ClaimSpec(
        "clm-eval-csr001-pfs",
        "DRUG-A improved progression-free survival versus chemotherapy in EGFR-positive first-line metastatic NSCLC.",
        "EFFICACY",
        "DOC-CSR-NSCLC-001",
        "DRUG-A",
        0.87,
        "HIGH",
    ),
    ClaimSpec(
        "clm-eval-csr014-pfs",
        "Progression-free survival and overall response rate favoured DRUG-A in EGFR-positive NSCLC in study DOC-CSR-NSCLC-014.",
        "EFFICACY",
        "DOC-CSR-NSCLC-014",
        "DRUG-A",
        0.86,
        "HIGH",
    ),
    # SOPs
    ClaimSpec(
        "clm-eval-sop010-dose",
        "Medical information responses for DRUG-A dosing must use approved EU label content: 80 mg once daily in first-line metastatic NSCLC.",
        "DOSE",
        "SOP-MED-NSCLC-010",
        "DRUG-A",
        0.84,
        "HIGH",
    ),
    ClaimSpec(
        "clm-eval-sop022-rules",
        "Off-label dosing inquiries for DRUG-A must be routed to SME review and must not be answered as approved truth.",
        "DOSE",
        "SOP-MED-NSCLC-022",
        "DRUG-A",
        0.82,
        "MEDIUM",
    ),
    # Guideline + treatment algorithm
    ClaimSpec(
        "clm-eval-gdl-biomarker",
        "Validated EGFR mutation testing is required before selecting EGFR-targeted therapy in NSCLC.",
        "BIOMARKER",
        "GDL-NSCLC-2025-03",
        None,
        0.83,
        "HIGH",
    ),
    ClaimSpec(
        "clm-eval-gdl-line",
        "First-line and later-line NSCLC therapy boundaries must not be conflated; adjuvant use is outside approved metastatic label scope.",
        "LINE_OF_THERAPY",
        "GDL-NSCLC-2025-03",
        None,
        0.81,
        "MEDIUM",
    ),
    ClaimSpec(
        "clm-eval-algo-pathway",
        "Treatment algorithm step 1 requires confirmed EGFR mutation before first-line metastatic DRUG-A consideration.",
        "BIOMARKER",
        "TREATMENT-ALGO-NSCLC-2025-02",
        "DRUG-A",
        0.80,
        "MEDIUM",
    ),
    # RMP / PK / FAQ / playbook / SME
    ClaimSpec(
        "clm-eval-rmp-ild",
        "RMP identifies interstitial lung disease and QT prolongation as important identified risks for DRUG-A.",
        "SAFETY",
        "RMP-NSCLC-DRUGA-2024",
        "DRUG-A",
        0.85,
        "HIGH",
    ),
    ClaimSpec(
        "clm-eval-pk-dose",
        "DRUG-A 80 mg once daily achieves target exposure; renal impairment requires cautious judgement without unsupported fixed-dose rules.",
        "DOSE",
        "PK-SUMMARY-NSCLC-005",
        "DRUG-A",
        0.82,
        "MEDIUM",
    ),
    ClaimSpec(
        "clm-eval-faq-missed",
        "Patient-facing missed-dose guidance must follow approved label language only; do not invent rescue dosing instructions.",
        "DOSE",
        "MI-FAQ-NSCLC-021",
        "DRUG-A",
        0.78,
        "MEDIUM",
    ),
    ClaimSpec(
        "clm-eval-playbook-boundary",
        "Medical affairs playbook keeps DRUG-A, DRUG-B, and DRUG-C narratives separate and excludes adjuvant-only messaging from metastatic approval scope.",
        "LINE_OF_THERAPY",
        "MED-AFF-NSCLC-PLAYBOOK-008",
        None,
        0.77,
        "MEDIUM",
    ),
    ClaimSpec(
        "clm-eval-sme-compare",
        "SME review: comparative statements for DRUG-A require explicit label or CSR grounding and must avoid superiority overclaim.",
        "COMPARISON_BOUNDARY",
        "SME-NOTE-NSCLC-017",
        "DRUG-A",
        0.76,
        "MEDIUM",
    ),
]

# Cross-claim graph written to ``claim_relationships``.
# Each tuple is (from_claim_id, to_claim_id, relation_type); both ids refer to
# claim_id values declared in CLAIM_SPECS.
GRAPH_EDGES = [
    # Label dose claim is supported by the PK-summary and SOP dose claims.
    ("clm-eval-druga-dose", "clm-eval-pk-dose", "SUPPORTED_BY"),
    ("clm-eval-druga-dose", "clm-eval-sop010-dose", "SUPPORTED_BY"),
    # Label ILD safety claim is supported by the RMP ILD claim.
    ("clm-eval-druga-ild", "clm-eval-rmp-ild", "SUPPORTED_BY"),
    # CSR-001 PFS claim is linked to the label indication claim.
    ("clm-eval-csr001-pfs", "clm-eval-druga-indication", "SUPPORTED_BY"),
]


def _token_overlap(claim_text: str, chunk_text: str, claim_type: str, chunk_type: str) -> float:
    claim_tokens = {
        t for t in (w.strip(".,;:()").lower() for w in claim_text.split()) if len(t) > 3
    }
    chunk_tokens = {
        t for t in (w.strip(".,;:()").lower() for w in chunk_text.split()) if len(t) > 3
    }
    overlap = len(claim_tokens & chunk_tokens)
    if claim_type == chunk_type:
        overlap += 8
    return overlap


def _best_chunk(
    cur: "psycopg.Cursor",
    source_id: str,
    spec: ClaimSpec,
) -> str | None:
    cur.execute(
        """
        SELECT chunk_id, claim_type, text FROM chunks
        WHERE source_id = %s ORDER BY token_count DESC
        """,
        (source_id,),
    )
    rows = cur.fetchall()
    if not rows:
        return None
    best_id = None
    best_score = -1.0
    for chunk_id, chunk_type, text in rows:
        score = _token_overlap(spec.canonical_text, text, spec.claim_type, str(chunk_type))
        if score > best_score:
            best_score = score
            best_id = chunk_id
    return best_id


def _upsert_claim(cur: "psycopg.Cursor", spec: ClaimSpec) -> None:
    """Insert the ``claims`` row for *spec*, or refresh it when the id already exists."""
    cur.execute(
        """
        INSERT INTO claims (
            claim_id, canonical_text, claim_type, molecule_id,
            geography_id, approval_state, sme_validated,
            primary_source_id, current_evidence_score, strength_band, created_at
        ) VALUES (%s, %s, %s, %s, %s, %s, FALSE, %s, %s, %s, %s)
        ON CONFLICT (claim_id) DO UPDATE SET
            canonical_text = EXCLUDED.canonical_text,
            claim_type = EXCLUDED.claim_type,
            molecule_id = EXCLUDED.molecule_id,
            primary_source_id = EXCLUDED.primary_source_id,
            current_evidence_score = EXCLUDED.current_evidence_score,
            strength_band = EXCLUDED.strength_band
        """,
        (
            spec.claim_id,
            spec.canonical_text,
            spec.claim_type,
            spec.molecule_id,
            "EU / EMA",
            "approved",
            spec.primary_source_id,
            spec.evidence_score,
            spec.strength_band,
            NOW,
        ),
    )


def _link_chunk(
    cur: "psycopg.Cursor",
    claim_id: str,
    chunk_id: str,
    source_id: str,
    *,
    primary: bool,
) -> int:
    """Insert one claim-to-chunk evidence link and return the number of rows inserted."""
    # NOTE(review): support_type is the literal 'PRIMARY' for every link, also
    # when primary=False; only is_primary_support and the confidence differ.
    # Confirm that this is intended.
    cur.execute(
        """
        INSERT INTO claim_evidence_links (
            claim_id, chunk_id, source_id, support_type,
            extraction_confidence, is_primary_support
        ) VALUES (%s, %s, %s, 'PRIMARY', %s, %s)
        ON CONFLICT DO NOTHING
        """,
        (
            claim_id,
            chunk_id,
            source_id,
            0.95 if primary else 0.75,
            primary,
        ),
    )
    return cur.rowcount or 0


def _replace_assessment(cur: "psycopg.Cursor", spec: ClaimSpec) -> None:
    """Delete and re-insert the deterministic evidence assessment row for *spec*."""
    # uuid5 makes the id a pure function of the claim id, so re-runs replace
    # the same row instead of accumulating new ones.
    assessment_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"eval-asm-{spec.claim_id}"))
    cur.execute(
        "DELETE FROM evidence_assessments WHERE assessment_id = %s",
        (assessment_id,),
    )
    explanation = json.dumps(
        {"reasons": [f"Seeded from eval corpus primary source {spec.primary_source_id}"]}
    )
    cur.execute(
        """
        INSERT INTO evidence_assessments (
            assessment_id, claim_id, source_prior_score, recency_score,
            approval_score, sme_score, consistency_score, audience_fit_score,
            geography_fit_score, penalty_score, evidence_score, strength_band,
            explanation_json, scored_at
        ) VALUES (
            %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::json, %s
        )
        """,
        (
            assessment_id,
            spec.claim_id,
            # Sources whose id starts with "LBL-" get a higher source prior.
            0.95 if spec.primary_source_id.startswith("LBL-") else 0.85,
            0.90,
            1.0,
            0.70,
            0.90,
            1.0,
            1.0,
            0.0,
            spec.evidence_score,
            spec.strength_band,
            explanation,
            NOW,
        ),
    )


def _replace_edge(cur: "psycopg.Cursor", from_id: str, to_id: str, rel: str) -> None:
    """Delete and re-insert the deterministic relationship row for one graph edge."""
    rel_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"eval-rel-{from_id}-{to_id}"))
    cur.execute(
        "DELETE FROM claim_relationships WHERE relationship_id = %s",
        (rel_id,),
    )
    cur.execute(
        """
        INSERT INTO claim_relationships (
            relationship_id, from_claim_id, to_claim_id,
            relation_type, relation_metadata, created_at
        ) VALUES (%s, %s, %s, %s, %s::json, %s)
        """,
        (
            rel_id,
            from_id,
            to_id,
            rel,
            json.dumps({"seed": "eval_corpus"}),
            NOW,
        ),
    )


def seed(*, dry_run: bool) -> dict:
    """Seed the eval claims, their evidence links, assessments and graph edges.

    For every entry of ``CLAIM_SPECS`` the best-matching chunk on the primary
    source is looked up; a claim whose primary source has no chunks is
    skipped.  Otherwise the claim row is upserted, its evidence links are
    rebuilt (primary chunk plus one chunk per extra source that has any), and
    its assessment row is replaced.  Finally the ``GRAPH_EDGES`` relationships
    are replaced.  Everything runs in one transaction on one connection.

    Args:
        dry_run: When true, only the chunk look-ups are executed; nothing is
            written and ``claims_upserted`` counts the claims that would be
            written.

    Returns:
        A summary dict with the counters ``claims_upserted``, ``links``,
        ``assessments`` and ``graph_edges``, the list ``skipped`` (claim ids
        without a primary chunk) and the list ``graph_edges_skipped``
        (``[from_id, to_id]`` pairs not written because an endpoint claim was
        skipped in this run).

    Raises:
        ValueError: If a spec's claim id starts with one of
            ``FIXTURE_CLAIM_PREFIXES``; fixture claims must never be modified.
    """
    # Enforce the "does not modify fixture claims" contract before any I/O.
    protected = [
        spec.claim_id for spec in CLAIM_SPECS if spec.claim_id.startswith(FIXTURE_CLAIM_PREFIXES)
    ]
    if protected:
        raise ValueError(f"CLAIM_SPECS must not target fixture claims: {', '.join(protected)}")

    summary: dict = {
        "claims_upserted": 0,
        "links": 0,
        "assessments": 0,
        "graph_edges": 0,
        "skipped": [],
        "graph_edges_skipped": [],
    }

    # The connection context manager commits on success, rolls back on error
    # and closes the connection when the block exits.
    with psycopg.connect(DSN) as conn, conn.cursor() as cur:
        for spec in CLAIM_SPECS:
            primary_chunk = _best_chunk(cur, spec.primary_source_id, spec)
            if primary_chunk is None:
                summary["skipped"].append(spec.claim_id)
                continue

            summary["claims_upserted"] += 1
            if dry_run:
                continue

            _upsert_claim(cur, spec)

            # Links are rebuilt from scratch so stale chunk matches disappear.
            cur.execute(
                "DELETE FROM claim_evidence_links WHERE claim_id = %s",
                (spec.claim_id,),
            )
            summary["links"] += _link_chunk(
                cur, spec.claim_id, primary_chunk, spec.primary_source_id, primary=True
            )
            for extra_source in spec.extra_chunk_sources:
                extra_chunk = _best_chunk(cur, extra_source, spec)
                if extra_chunk:
                    summary["links"] += _link_chunk(
                        cur, spec.claim_id, extra_chunk, extra_source, primary=False
                    )

            _replace_assessment(cur, spec)
            summary["assessments"] += 1

        if not dry_run:
            skipped = set(summary["skipped"])
            for from_id, to_id, rel in GRAPH_EDGES:
                # An edge touching a claim that was not written in this run is
                # not (re)written either, so no edge is created for a claim
                # this run did not produce.
                if from_id in skipped or to_id in skipped:
                    summary["graph_edges_skipped"].append([from_id, to_id])
                    continue
                _replace_edge(cur, from_id, to_id, rel)
                summary["graph_edges"] += 1

    return summary


def main() -> int:
    """Command-line entry point.

    Parses ``--dry-run``, runs ``seed`` and prints its summary as indented
    JSON.  Unless this is a dry run, a second connection is opened afterwards
    to print how many ``clm-eval-*`` claims exist in total and per primary
    source.

    Returns:
        Always ``0``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--dry-run", action="store_true")
    options = cli.parse_args()

    report = seed(dry_run=options.dry_run)
    print(json.dumps(report, indent=2))
    if options.dry_run:
        return 0

    # Read-only verification pass on a fresh connection.
    connection = psycopg.connect(DSN)
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT COUNT(*) FROM claims WHERE claim_id LIKE 'clm-eval-%'")
            total = cursor.fetchone()[0]
            print(f"eval claims in DB: {total}")
            cursor.execute(
                """
                SELECT c.primary_source_id, COUNT(*)
                FROM claims c WHERE c.claim_id LIKE 'clm-eval-%'
                GROUP BY c.primary_source_id ORDER BY 1
                """
            )
            print("by primary source:")
            for source_id, claim_count in cursor.fetchall():
                print(f"  {source_id}: {claim_count}")
    finally:
        connection.close()
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())