"""
ACMG Evidence Prediction — Sentence-Level Lookup (HuggingFace Spaces)

On first launch this app downloads sentence_predictions.parquet from the
HuggingFace dataset repo weijiang99/acmg-sentence-predictions.
Subsequent launches reuse the cached file.
"""

import os
import gradio as gr
import duckdb
import pandas as pd
from pathlib import Path
from huggingface_hub import hf_hub_download

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------

DATASET_REPO = "weijiang99/acmg-sentence-predictions"
PARQUET_FILENAME = "sentence_predictions.parquet"

# A copy of the parquet file in the working directory is used when present.
# Otherwise hf_hub_download fetches it; it caches to ~/.cache/huggingface by
# default (persists across Space restarts when a persistent-storage volume is
# attached, otherwise re-downloads on cold start — still faster than storing
# in the repo itself).
def get_parquet_path() -> str:
    """Return a filesystem path to the sentence-predictions parquet file.

    A file named ``PARQUET_FILENAME`` in the current working directory takes
    precedence.  When no such file exists, the file is fetched from the
    ``DATASET_REPO`` dataset repository via ``hf_hub_download`` and the path
    it reports is returned.
    """
    local_copy = Path(PARQUET_FILENAME)
    if not local_copy.exists():
        print(f"Downloading {PARQUET_FILENAME} from {DATASET_REPO} ...")
        downloaded = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=PARQUET_FILENAME,
            repo_type="dataset",
        )
        print(f"Downloaded to {downloaded}")
        return downloaded
    return str(local_copy)

# Resolved once when the module is imported; get_parquet_path() may download
# the file at this point if no local copy exists.
PARQUET_PATH = get_parquet_path()

# Evidence categories offered by every evidence-type selector in the UI.
# The query functions lower-case the selected value before filtering on it.
EVIDENCE_TYPES = ["Functional", "Computational", "Population"]
# One width per column produced by load_sentences, in order:
# "#", "Sentence", "Stage 1: Has Evidence", "Stage 1 Label",
# "Stage 2 P", "Stage 2 B", "Predicted".  The widths sum to 100%.
SENTENCE_COL_WIDTHS = ["4%", "40%", "12%", "12%", "10%", "10%", "12%"]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _con():
    """Open and return a new in-memory DuckDB connection."""
    connection = duckdb.connect(database=":memory:")
    return connection


def _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt):
    conditions, params = [], {}

    def add(col, val, param, expr=None):
        if val and str(val).strip():
            conditions.append(expr or f"{col} = ${param}")
            params[param] = val.strip()

    add("UPPER(gene)", gene,         "gene",        "UPPER(gene) = UPPER($gene)")
    add("VCV",         vcv,          "vcv")
    add("RCV",         rcv,          "rcv")
    add("SCV",         scv,          "scv")
    add("VariationID", variation_id, "variation_id")
    add("GRCh38_Chr",  chr_val,      "chr")

    if pos and str(pos).strip():
        try:
            params["pos"] = float(pos)
            conditions.append("GRCh38_Start = $pos")
        except ValueError:
            pass

    add("UPPER(GRCh38_ReferenceAllele)", ref, "ref",
        "UPPER(GRCh38_ReferenceAllele) = UPPER($ref)")
    add("UPPER(GRCh38_AlternateAllele)", alt, "alt",
        "UPPER(GRCh38_AlternateAllele) = UPPER($alt)")

    clause = " AND ".join(conditions) if conditions else "1=1"
    return clause, params


# ---------------------------------------------------------------------------
# Search
# ---------------------------------------------------------------------------

def search_submissions(evidence_type, gene, vcv, rcv, scv,
                       variation_id, chr_val, pos, ref, alt):
    """Find submissions matching the search fields for one evidence type.

    The filter is built by ``_where``; ``evidence_type`` is lower-cased and
    bound as a query parameter alongside the other filters.  At most 200
    distinct rows are returned, ordered by ``evidence_confidence`` descending
    with NULLs last.

    Returns:
        A ``(dataframe, choices)`` tuple.  ``choices`` holds one display
        string per result row, starting with the SCV followed by
        ``"  —  "``.  When nothing matches, or when any step raises, the
        dataframe carries a single "Message" or "Error" column and
        ``choices`` is empty.
    """
    try:
        where, params = _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt)
        params["evidence_type"] = evidence_type.lower()
        # Double any single quote so the path stays a valid SQL string literal.
        parquet = PARQUET_PATH.replace("'", "''")

        sql = f"""
            SELECT DISTINCT
                SCV,
                VCV,
                COALESCE(gene, Gene)  AS Gene,
                COALESCE(
                    'chr' || GRCh38_Chr || ':' || CAST(GRCh38_Start AS VARCHAR)
                    || ' ' || GRCh38_ReferenceAllele
                    || '>' || GRCh38_AlternateAllele,
                    ''
                ) AS Variant,
                SubmissionClassification               AS "ClinVar Class",
                Submitter,
                final_label                            AS "Model Label",
                CASE WHEN has_evidence THEN 'Yes' ELSE 'No' END AS "Has Evidence",
                ROUND(CAST(evidence_confidence AS DOUBLE), 4)   AS "Max Confidence",
                ROUND(CAST(P_Score AS DOUBLE), 4)               AS "P Score",
                ROUND(CAST(B_Score AS DOUBLE), 4)               AS "B Score",
                Comment
            FROM read_parquet('{parquet}')
            WHERE evidence_type = $evidence_type
              AND ({where})
            ORDER BY evidence_confidence DESC NULLS LAST
            LIMIT 200
        """

        con = _con()
        try:
            df = con.execute(sql, parameters=params).df()
        finally:
            # Release the connection even when the query fails.
            con.close()

        if df.empty:
            return pd.DataFrame({"Message": ["No results found."]}), []

        choices = [
            f"{scv_id}  —  {gene_name or '?'}  |  {clinvar_class or '?'}  |  label: {model_label or 'None'}"
            for scv_id, gene_name, clinvar_class, model_label in zip(
                df["SCV"], df["Gene"], df["ClinVar Class"], df["Model Label"]
            )
        ]
        return df, choices

    except Exception as e:
        # UI boundary: surface the failure in the results table instead of
        # raising into Gradio.
        return pd.DataFrame({"Error": [str(e)]}), []


# ---------------------------------------------------------------------------
# Sentence detail
# ---------------------------------------------------------------------------

def load_sentences(evidence_type, scv_choice):
    """Return the per-sentence scores of one submission for one evidence type.

    ``scv_choice`` is a dropdown entry as produced by ``search_submissions``;
    the SCV is the text before the first ``"  —  "`` separator.
    ``evidence_type`` is lower-cased and bound as a query parameter together
    with the SCV.  Rows are ordered by ``sentence_index`` ascending.

    Returns:
        A dataframe of sentence rows.  When no submission is selected, no
        rows match, or any step raises, the dataframe instead carries a
        single "Message" or "Error" column.
    """
    if not scv_choice:
        return pd.DataFrame({"Message": ["Select a submission above to see sentence details."]})

    try:
        scv = scv_choice.split("  —  ")[0].strip()
        params = {"scv": scv, "evidence_type": evidence_type.lower()}
        # Double any single quote so the path stays a valid SQL string literal.
        parquet = PARQUET_PATH.replace("'", "''")

        sql = f"""
            SELECT
                sentence_index        AS "#",
                sentence              AS "Sentence",
                ROUND(CAST(stage1_score AS DOUBLE), 4)            AS "Stage 1: Has Evidence",
                stage1_label                                       AS "Stage 1 Label",
                ROUND(CAST(stage2_pathogenic_score AS DOUBLE), 4) AS "Stage 2 P",
                ROUND(CAST(stage2_benign_score AS DOUBLE), 4)     AS "Stage 2 B",
                predicted_evidence_sentence                        AS "Predicted"
            FROM read_parquet('{parquet}')
            WHERE evidence_type = $evidence_type
              AND SCV = $scv
            ORDER BY sentence_index ASC
        """

        con = _con()
        try:
            df = con.execute(sql, parameters=params).df()
        finally:
            # Release the connection even when the query fails.
            con.close()

        if df.empty:
            return pd.DataFrame({"Message": [f"No sentence data found for {scv}."]})
        return df

    except Exception as e:
        # UI boundary: surface the failure in the table instead of raising
        # into Gradio.
        return pd.DataFrame({"Error": [str(e)]})


# ---------------------------------------------------------------------------
# Stats
# ---------------------------------------------------------------------------

def get_stats(evidence_type):
    """Summarise the dataset for one evidence type as a Markdown string.

    Reports the number of distinct SCVs, the number of rows, the number of
    distinct non-NULL genes, the distribution of ``final_label`` counted per
    distinct SCV, and the distribution of ``stage1_label`` counted per row.
    ``evidence_type`` is lower-cased and bound as a query parameter.

    Returns:
        The Markdown report, or ``"Error: <message>"`` when any step raises.
    """
    try:
        params = {"evidence_type": evidence_type.lower()}
        # Double any single quote so the path stays a valid SQL string literal.
        parquet = PARQUET_PATH.replace("'", "''")
        source = f"read_parquet('{parquet}') WHERE evidence_type = $evidence_type"

        con = _con()
        try:
            # One scan for all three totals.  COUNT(DISTINCT ...) ignores
            # NULLs, so no separate IS NOT NULL filter is needed for genes.
            total_sub, total_sent, unique_genes = con.execute(f"""
                SELECT
                    COUNT(DISTINCT SCV),
                    COUNT(*),
                    COUNT(DISTINCT COALESCE(gene, Gene))
                FROM {source}
            """, parameters=params).fetchone()

            label_dist = con.execute(f"""
                SELECT final_label, COUNT(DISTINCT SCV) AS n
                FROM {source}
                GROUP BY final_label ORDER BY n DESC
            """, parameters=params).fetchall()

            s1_dist = con.execute(f"""
                SELECT stage1_label, COUNT(*) AS n
                FROM {source}
                GROUP BY stage1_label ORDER BY n DESC
            """, parameters=params).fetchall()
        finally:
            # Release the connection even when a query fails.
            con.close()

        def bullets(rows, total):
            # One Markdown bullet per (label, count); percentage is 0 when
            # the total is 0 to avoid dividing by zero.
            lines = []
            for label, n in rows:
                pct = 100 * n / total if total else 0
                lines.append(f"- **{label or 'None'}**: {n:,} ({pct:.1f}%)\n")
            return lines

        header = f"""## Dataset Statistics — {evidence_type} Evidence

**Unique submissions (SCV):** {total_sub:,}
**Total sentences:** {total_sent:,}
**Unique genes:** {unique_genes:,}

### Submission-level final label distribution
"""
        parts = [header]
        parts.extend(bullets(label_dist, total_sub))
        parts.append("\n### Sentence-level Stage 1 label distribution\n")
        parts.extend(bullets(s1_dist, total_sent))
        return "".join(parts)

    except Exception as e:
        # UI boundary: report the failure as text instead of raising into
        # Gradio.
        return f"Error: {e}"


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

# Build the Gradio interface: three tabs (Sentence Lookup, Dataset Statistics,
# About) followed by the event wiring for the lookup tab.
with gr.Blocks(
    title="ACMG Evidence — Sentence-Level Lookup",
    theme=gr.themes.Base(
        primary_hue="blue",
        secondary_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
    ),
) as demo:

    gr.Markdown("""
    # ACMG Evidence Prediction — Sentence-Level Lookup

    Search for a ClinVar variant or submission, then select a result to see the
    model confidence score assigned to **each individual sentence** in the
    submission comment.

    Part of: *"Language models reveal evidence gaps in variants of uncertain significance"*
    (Li, Bhat, Yu, Lebo, Zitnik, Cassa)
    """)

    with gr.Tab("Sentence Lookup"):

        gr.Markdown("### 1. Select evidence type and enter search criteria")

        # Evidence type used by the submission search (search_submissions).
        evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES,
            value="Functional",
            label="Evidence Type",
        )

        # Search fields; each one is passed to search_submissions and blank
        # fields add no filter (see _where).
        with gr.Row():
            with gr.Column():
                gene_in  = gr.Textbox(label="Gene Symbol",       placeholder="e.g. BRCA1")
                vcv_in   = gr.Textbox(label="VCV",               placeholder="e.g. VCV000000390")
                rcv_in   = gr.Textbox(label="RCV",               placeholder="e.g. RCV000000087")
                scv_in   = gr.Textbox(label="SCV",               placeholder="e.g. SCV000020565")
                varid_in = gr.Textbox(label="VariationID",        placeholder="e.g. 390")
            with gr.Column():
                chr_in   = gr.Textbox(label="Chromosome",        placeholder="e.g. 17")
                pos_in   = gr.Textbox(label="Position (GRCh38)", placeholder="e.g. 43094464")
                ref_in   = gr.Textbox(label="Reference Allele",  placeholder="e.g. C")
                alt_in   = gr.Textbox(label="Alternate Allele",  placeholder="e.g. T")

        search_btn = gr.Button("Search", variant="primary")

        gr.Markdown("### 2. Matching submissions  _(max 200)_")

        # Twelve widths, one per column selected by search_submissions.
        results_table = gr.Dataframe(
            label="Search Results",
            wrap=True,
            interactive=False,
            column_widths=["10%", "10%", "6%", "14%", "10%", "10%",
                           "8%", "8%", "8%", "6%", "6%", "14%"],
        )

        gr.Markdown("### 3. Select a submission to inspect")
        # Starts empty; _search_and_populate fills the choices after a search.
        submission_dropdown = gr.Dropdown(
            label="Submission (SCV — Gene | ClinVar class | model label)",
            choices=[],
            interactive=True,
            allow_custom_value=False,
        )

        gr.Markdown("### 4. Select evidence type for sentence-level breakdown")
        gr.Markdown(
            "_Each evidence type is modeled independently. "
            "Choose which one to display per-sentence scores for._"
        )
        # Separate from evidence_sel: this one is the evidence type passed to
        # load_sentences, not to the submission search.
        sentence_evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES,
            value="Functional",
            label="Evidence Type for Sentence Breakdown",
            info="Functional = PS3/BS3 · Population = BA1/BS1/BS2/PS4/PM2 · Computational = PP3/BP4/BP1/BP7",
        )
        load_sentences_btn = gr.Button("Load Sentence Scores", variant="secondary")

        gr.Markdown("### 5. Per-sentence model scores")
        gr.Markdown("""
| Column | Meaning |
|--------|---------|
| **Stage 1: Has Evidence** | Probability the sentence contains this evidence type (0–1). Threshold = 0.8 |
| **Stage 1 Label** | `has_evidence` if score > 0.8, else `no_evidence` |
| **Stage 2 P** | Probability the evidence is **pathogenic** (meaningful only when Stage 1 > 0.8) |
| **Stage 2 B** | Probability the evidence is **benign** (meaningful only when Stage 1 > 0.8) |
| **Predicted** | Per-sentence label — for Functional: `PS3`/`BS3`; for Population/Computational: `P`/`B`; or `None` |
        """)

        sentence_table = gr.Dataframe(
            label="Per-Sentence Scores",
            wrap=True,
            interactive=False,
            column_widths=SENTENCE_COL_WIDTHS,
        )

        gr.Markdown("### Example searches")
        # Each example row supplies one value per component in `inputs`, in
        # the same order.
        gr.Examples(
            examples=[
                ["Functional",    "BRCA1", "",             "", "",             "", "",  "",          "", ""],
                ["Functional",    "",      "VCV000000390", "", "",             "", "",  "",          "", ""],
                ["Functional",    "",      "",             "", "SCV000020565", "", "",  "",          "", ""],
                ["Population",    "KCNQ1", "",             "", "",             "", "",  "",          "", ""],
                ["Computational", "TP53",  "",             "", "",             "", "",  "",          "", ""],
            ],
            inputs=[evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
                    varid_in, chr_in, pos_in, ref_in, alt_in],
            label="Click to load example",
        )

    with gr.Tab("Dataset Statistics"):
        stats_evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES, value="Functional", label="Evidence Type"
        )
        stats_btn    = gr.Button("Load Statistics", variant="primary")
        stats_output = gr.Markdown()
        # Statistics are computed on demand, only when the button is clicked.
        stats_btn.click(fn=get_stats, inputs=stats_evidence_sel, outputs=stats_output)

    with gr.Tab("About"):
        gr.Markdown("""
## About This Tool

This interface exposes the **sentence-level** output of the two-stage ACMG evidence
classifier described in:

> *"Language models reveal evidence gaps in variants of uncertain significance"*
> (Li, Bhat, Yu, Lebo, Zitnik, Cassa)

### How the model works

Each ClinVar submission comment is split into sentences.
Two BioBERT-large classifiers are applied in sequence:

1. **Stage 1** — Does this sentence contain functional / population / computational
   evidence?  Score > 0.8 → `has_evidence`.
2. **Stage 2** — Is the evidence pathogenic or benign?
   Runs only on sentences flagged by Stage 1.

The submission-level prediction uses the **maximum Stage 1 score** across all
sentences; direction (P/B) comes from the highest-confidence Stage 2 sentence.

### Evidence types

| Type | ACMG codes |
|------|-----------|
| Functional | PS3, BS3 |
| Population | BA1, BS1, BS2, PS4, PM2 |
| Computational | PP3, BP4, BP1, BP7 |

### Data
Predictions cover all ClinVar submissions with free-text comments as of the
dataset release date.  Full submission-level predictions are also available at
the companion space [weijiang99/acmg-evidence-prediction](https://huggingface.co/spaces/weijiang99/acmg-evidence-prediction).
        """)

    # ------------------------------------------------------------------
    # Event wiring
    # ------------------------------------------------------------------

    # Order must match the positional parameters of search_submissions.
    _search_inputs = [
        evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
        varid_in, chr_in, pos_in, ref_in, alt_in,
    ]

    def _search_and_populate(*args):
        """Run the search, refresh the dropdown choices, and clear the sentence table."""
        df, choices = search_submissions(*args)
        # value=None drops any previous selection; the empty frame blanks the
        # sentence table left over from an earlier search.
        return df, gr.Dropdown(choices=choices, value=None), pd.DataFrame()

    search_btn.click(
        fn=_search_and_populate,
        inputs=_search_inputs,
        outputs=[results_table, submission_dropdown, sentence_table],
    )

    # Load sentence scores when button clicked or evidence type changed
    # (no handler is registered on submission_dropdown itself in this block,
    # so picking a submission alone does not reload the table).
    load_sentences_btn.click(
        fn=load_sentences,
        inputs=[sentence_evidence_sel, submission_dropdown],
        outputs=sentence_table,
    )
    sentence_evidence_sel.change(
        fn=load_sentences,
        inputs=[sentence_evidence_sel, submission_dropdown],
        outputs=sentence_table,
    )


if __name__ == "__main__":
    # Script entry point: pass --share on the command line to request a
    # public share link from Gradio.
    import sys

    share_requested = "--share" in sys.argv
    demo.launch(share=share_requested)