""" ACMG Evidence Prediction — Sentence-Level Lookup (HuggingFace Spaces) On first launch this app downloads sentence_predictions.parquet from the HuggingFace dataset repo weijiang99/acmg-sentence-predictions. Subsequent launches reuse the cached file. """ import os import gradio as gr import duckdb import pandas as pd from pathlib import Path from huggingface_hub import hf_hub_download # --------------------------------------------------------------------------- # Data loading # --------------------------------------------------------------------------- DATASET_REPO = "weijiang99/acmg-sentence-predictions" PARQUET_FILENAME = "sentence_predictions.parquet" # hf_hub_download caches to ~/.cache/huggingface by default (persists across # Space restarts when a persistent-storage volume is attached, otherwise # re-downloads on cold start — still faster than storing in the repo itself) def get_parquet_path() -> str: local = Path(PARQUET_FILENAME) if local.exists(): return str(local) print(f"Downloading {PARQUET_FILENAME} from {DATASET_REPO} ...") path = hf_hub_download( repo_id=DATASET_REPO, filename=PARQUET_FILENAME, repo_type="dataset", ) print(f"Downloaded to {path}") return path PARQUET_PATH = get_parquet_path() EVIDENCE_TYPES = ["Functional", "Computational", "Population"] SENTENCE_COL_WIDTHS = ["4%", "44%", "12%", "12%", "12%", "16%"] # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _con(): return duckdb.connect(database=":memory:") def _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt): conditions, params = [], {} def add(col, val, param, expr=None): if val and str(val).strip(): conditions.append(expr or f"{col} = ${param}") params[param] = val.strip() add("UPPER(gene)", gene, "gene", "UPPER(gene) = UPPER($gene)") add("VCV", vcv, "vcv") add("RCV", rcv, "rcv") add("SCV", scv, "scv") add("VariationID", variation_id, "variation_id") 
add("GRCh38_Chr", chr_val, "chr") if pos and str(pos).strip(): try: params["pos"] = float(pos) conditions.append("GRCh38_Start = $pos") except ValueError: pass add("UPPER(GRCh38_ReferenceAllele)", ref, "ref", "UPPER(GRCh38_ReferenceAllele) = UPPER($ref)") add("UPPER(GRCh38_AlternateAllele)", alt, "alt", "UPPER(GRCh38_AlternateAllele) = UPPER($alt)") clause = " AND ".join(conditions) if conditions else "1=1" return clause, params # --------------------------------------------------------------------------- # Search # --------------------------------------------------------------------------- def search_submissions(evidence_type, gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt): where, params = _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt) prefix = evidence_type.lower() try: con = _con() sql = f""" SELECT DISTINCT SCV, VCV, COALESCE(gene, Gene) AS Gene, COALESCE( 'chr' || GRCh38_Chr || ':' || CAST(GRCh38_Start AS VARCHAR) || ' ' || GRCh38_ReferenceAllele || '>' || GRCh38_AlternateAllele, '' ) AS Variant, SubmissionClassification AS "ClinVar Class", Submitter, final_label AS "Model Label", CASE WHEN has_evidence THEN 'Yes' ELSE 'No' END AS "Has Evidence", ROUND(CAST(evidence_confidence AS DOUBLE), 4) AS "Max Confidence", ROUND(CAST(P_Score AS DOUBLE), 4) AS "P Score", ROUND(CAST(B_Score AS DOUBLE), 4) AS "B Score", Comment FROM read_parquet('{PARQUET_PATH}') WHERE evidence_type = '{prefix}' AND ({where}) ORDER BY evidence_confidence DESC NULLS LAST LIMIT 200 """ df = con.execute(sql, parameters=params).df() con.close() if df.empty: return pd.DataFrame({"Message": ["No results found."]}), [] choices = [ f"{row['SCV']} — {row['Gene'] or '?'} | {row['ClinVar Class'] or '?'} | label: {row['Model Label'] or 'None'}" for _, row in df.iterrows() ] return df, choices except Exception as e: return pd.DataFrame({"Error": [str(e)]}), [] # --------------------------------------------------------------------------- # Sentence detail # 
def load_sentences(evidence_type, scv_choice):
    """Return the per-sentence scores for the selected submission.

    Args:
        evidence_type: Evidence type label; lower-cased and matched against
            the ``evidence_type`` column.
        scv_choice: Dropdown label whose text before the first `` — `` is the
            SCV accession. Falsy when nothing is selected.

    Returns:
        A DataFrame of sentence rows ordered by ``sentence_index``, or a
        one-cell frame with a ``Message`` or ``Error`` column when nothing is
        selected, nothing matches, or the query fails.
    """
    if not scv_choice:
        return pd.DataFrame(
            {"Message": ["Select a submission above to see sentence details."]}
        )

    scv = scv_choice.split(" — ")[0].strip()

    # Broad catch on purpose: this is a UI callback, so failures are shown
    # in the table instead of propagating.
    try:
        # Both values are bound as parameters rather than interpolated.
        params = {"evidence_type": (evidence_type or "").lower(), "scv": scv}
        sql = f"""
            SELECT
                sentence_index AS "#",
                sentence AS "Sentence",
                ROUND(CAST(stage1_score AS DOUBLE), 4) AS "Stage 1: Has Evidence",
                stage1_label AS "Stage 1 Label",
                ROUND(CAST(stage2_pathogenic_score AS DOUBLE), 4) AS "Stage 2 P",
                ROUND(CAST(stage2_benign_score AS DOUBLE), 4) AS "Stage 2 B",
                predicted_evidence_sentence AS "Predicted"
            FROM read_parquet('{PARQUET_PATH}')
            WHERE evidence_type = $evidence_type AND SCV = $scv
            ORDER BY sentence_index ASC
        """
        con = _con()
        try:
            df = con.execute(sql, parameters=params).df()
        finally:
            # Close the connection even when the query raises.
            con.close()

        if df.empty:
            return pd.DataFrame({"Message": [f"No sentence data found for {scv}."]})
        return df
    except Exception as e:
        return pd.DataFrame({"Error": [str(e)]})


# ---------------------------------------------------------------------------
# Stats
# ---------------------------------------------------------------------------


def get_stats(evidence_type):
    """Return a Markdown summary of the dataset for one evidence type.

    The summary lists the number of distinct SCVs, total rows and distinct
    genes, then the distribution of ``final_label`` (counted per distinct
    SCV) and of ``stage1_label`` (counted per row).

    Returns:
        The Markdown text, or ``"Error: ..."`` when a query fails.
    """
    try:
        params = {"evidence_type": (evidence_type or "").lower()}
        source = f"read_parquet('{PARQUET_PATH}')"

        con = _con()
        try:
            # One scan for the three totals. COUNT(DISTINCT ...) already
            # skips NULLs, so no extra IS NOT NULL filter is needed.
            total_sub, total_sent, unique_genes = con.execute(
                f"""
                SELECT
                    COUNT(DISTINCT SCV),
                    COUNT(*),
                    COUNT(DISTINCT COALESCE(gene, Gene))
                FROM {source}
                WHERE evidence_type = $evidence_type
                """,
                parameters=params,
            ).fetchone()

            label_dist = con.execute(
                f"""
                SELECT final_label, COUNT(DISTINCT SCV) AS n
                FROM {source}
                WHERE evidence_type = $evidence_type
                GROUP BY final_label
                ORDER BY n DESC
                """,
                parameters=params,
            ).fetchall()

            s1_dist = con.execute(
                f"""
                SELECT stage1_label, COUNT(*) AS n
                FROM {source}
                WHERE evidence_type = $evidence_type
                GROUP BY stage1_label
                ORDER BY n DESC
                """,
                parameters=params,
            ).fetchall()
        finally:
            # Close the connection even when a query raises.
            con.close()

        def bullets(rows, total):
            # One Markdown bullet per (label, count); guard against total == 0.
            return [
                f"- **{label or 'None'}**: {n:,} "
                f"({(100 * n / total if total else 0):.1f}%)"
                for label, n in rows
            ]

        lines = [
            f"## Dataset Statistics — {evidence_type} Evidence",
            "",
            f"**Unique submissions (SCV):** {total_sub:,}",
            "",
            f"**Total sentences:** {total_sent:,}",
            "",
            f"**Unique genes:** {unique_genes:,}",
            "",
            "### Submission-level final label distribution",
            *bullets(label_dist, total_sub),
            "",
            "### Sentence-level Stage 1 label distribution",
            *bullets(s1_dist, total_sent),
        ]
        return "\n".join(lines) + "\n"
    except Exception as e:
        return f"Error: {e}"


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

with gr.Blocks(
    title="ACMG Evidence — Sentence-Level Lookup",
    theme=gr.themes.Base(
        primary_hue="blue",
        secondary_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
    ),
) as demo:

    gr.Markdown("""
    # ACMG Evidence Prediction — Sentence-Level Lookup
    Search for a ClinVar variant or submission, then select a result to see the
    model confidence score assigned to **each individual sentence** in the submission comment.

    Part of: *"Language models reveal evidence gaps in variants of uncertain significance"*
    (Li, Bhat, Yu, Lebo, Zitnik, Cassa)
    """)

    with gr.Tab("Sentence Lookup"):
        gr.Markdown("### 1. Select evidence type and enter search criteria")

        evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES,
            value="Functional",
            label="Evidence Type",
        )

        with gr.Row():
            with gr.Column():
                gene_in = gr.Textbox(label="Gene Symbol", placeholder="e.g. BRCA1")
                vcv_in = gr.Textbox(label="VCV", placeholder="e.g. VCV000000390")
                rcv_in = gr.Textbox(label="RCV", placeholder="e.g. RCV000000087")
                scv_in = gr.Textbox(label="SCV", placeholder="e.g. SCV000020565")
                varid_in = gr.Textbox(label="VariationID", placeholder="e.g. 390")
            with gr.Column():
                chr_in = gr.Textbox(label="Chromosome", placeholder="e.g. 17")
                pos_in = gr.Textbox(label="Position (GRCh38)", placeholder="e.g. 43094464")
                ref_in = gr.Textbox(label="Reference Allele", placeholder="e.g. C")
                alt_in = gr.Textbox(label="Alternate Allele", placeholder="e.g. T")

        search_btn = gr.Button("Search", variant="primary")

        gr.Markdown("### 2. Matching submissions _(max 200)_")
        results_table = gr.Dataframe(
            label="Search Results",
            wrap=True,
            interactive=False,
            column_widths=["10%", "10%", "6%", "14%", "10%", "10%",
                           "8%", "8%", "8%", "6%", "6%", "14%"],
        )

        gr.Markdown("### 3. Select a submission to inspect")
        submission_dropdown = gr.Dropdown(
            label="Submission (SCV — Gene | ClinVar class | model label)",
            choices=[],
            interactive=True,
            allow_custom_value=False,
        )

        gr.Markdown("### 4. Select evidence type for sentence-level breakdown")
        gr.Markdown(
            "_Each evidence type is modeled independently. "
            "Choose which one to display per-sentence scores for._"
        )
        sentence_evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES,
            value="Functional",
            label="Evidence Type for Sentence Breakdown",
            info="Functional = PS3/BS3 · Population = BA1/BS1/BS2/PS4/PM2 · Computational = PP3/BP4/BP1/BP7",
        )

        load_sentences_btn = gr.Button("Load Sentence Scores", variant="secondary")

        gr.Markdown("### 5. Per-sentence model scores")
        gr.Markdown("""
        | Column | Meaning |
        |--------|---------|
        | **Stage 1: Has Evidence** | Probability the sentence contains this evidence type (0–1). Threshold = 0.8 |
        | **Stage 1 Label** | `has_evidence` if score > 0.8, else `no_evidence` |
        | **Stage 2 P** | Probability the evidence is **pathogenic** (meaningful only when Stage 1 > 0.8) |
        | **Stage 2 B** | Probability the evidence is **benign** (meaningful only when Stage 1 > 0.8) |
        | **Predicted** | Per-sentence label — for Functional: `PS3`/`BS3`; for Population/Computational: `P`/`B`; or `None` |
        """)

        sentence_table = gr.Dataframe(
            label="Per-Sentence Scores",
            wrap=True,
            interactive=False,
            column_widths=SENTENCE_COL_WIDTHS,
        )

        gr.Markdown("### Example searches")
        gr.Examples(
            examples=[
                ["Functional", "BRCA1", "", "", "", "", "", "", "", ""],
                ["Functional", "", "VCV000000390", "", "", "", "", "", "", ""],
                ["Functional", "", "", "", "SCV000020565", "", "", "", "", ""],
                ["Population", "KCNQ1", "", "", "", "", "", "", "", ""],
                ["Computational", "TP53", "", "", "", "", "", "", "", ""],
            ],
            inputs=[evidence_sel, gene_in, vcv_in, rcv_in, scv_in, varid_in,
                    chr_in, pos_in, ref_in, alt_in],
            label="Click to load example",
        )

    with gr.Tab("Dataset Statistics"):
        stats_evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES, value="Functional", label="Evidence Type"
        )
        stats_btn = gr.Button("Load Statistics", variant="primary")
        stats_output = gr.Markdown()
        stats_btn.click(fn=get_stats, inputs=stats_evidence_sel, outputs=stats_output)

    with gr.Tab("About"):
        gr.Markdown("""
        ## About This Tool

        This interface exposes the **sentence-level** output of the two-stage ACMG evidence
        classifier described in:

        > *"Language models reveal evidence gaps in variants of uncertain significance"*
        > (Li, Bhat, Yu, Lebo, Zitnik, Cassa)

        ### How the model works

        Each ClinVar submission comment is split into sentences. Two BioBERT-large classifiers
        are applied in sequence:

        1. **Stage 1** — Does this sentence contain functional / population / computational evidence? Score > 0.8 → `has_evidence`.
        2. **Stage 2** — Is the evidence pathogenic or benign? Runs only on sentences flagged by Stage 1.

        The submission-level prediction uses the **maximum Stage 1 score** across all sentences;
        direction (P/B) comes from the highest-confidence Stage 2 sentence.

        ### Evidence types

        | Type | ACMG codes |
        |------|-----------|
        | Functional | PS3, BS3 |
        | Population | BA1, BS1, BS2, PS4, PM2 |
        | Computational | PP3, BP4, BP1, BP7 |

        ### Data

        Predictions cover all ClinVar submissions with free-text comments as of the dataset release date.
        Full submission-level predictions are also available at the companion space
        [weijiang99/acmg-evidence-prediction](https://huggingface.co/spaces/weijiang99/acmg-evidence-prediction).
        """)

    # ------------------------------------------------------------------
    # Event wiring
    # ------------------------------------------------------------------
    _search_inputs = [
        evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
        varid_in, chr_in, pos_in, ref_in, alt_in,
    ]

    def _search_and_populate(*args):
        """Run the search, refill the dropdown, and clear the sentence table."""
        df, choices = search_submissions(*args)
        return df, gr.Dropdown(choices=choices, value=None), pd.DataFrame()

    search_btn.click(
        fn=_search_and_populate,
        inputs=_search_inputs,
        outputs=[results_table, submission_dropdown, sentence_table],
    )

    # Load sentence scores when button clicked or evidence type changed
    load_sentences_btn.click(
        fn=load_sentences,
        inputs=[sentence_evidence_sel, submission_dropdown],
        outputs=sentence_table,
    )
    sentence_evidence_sel.change(
        fn=load_sentences,
        inputs=[sentence_evidence_sel, submission_dropdown],
        outputs=sentence_table,
    )


if __name__ == "__main__":
    import sys

    # Passing --share on the command line requests a public share link.
    demo.launch(share="--share" in sys.argv)