Spaces:
Sleeping
Sleeping
| """ | |
| ACMG Evidence Prediction — Sentence-Level Lookup (HuggingFace Spaces) | |
| On first launch this app downloads sentence_predictions.parquet from the | |
| HuggingFace dataset repo weijiang99/acmg-sentence-predictions. | |
| Subsequent launches reuse the cached file. | |
| """ | |
| import os | |
| import gradio as gr | |
| import duckdb | |
| import pandas as pd | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download | |
| # --------------------------------------------------------------------------- | |
| # Data loading | |
| # --------------------------------------------------------------------------- | |
# Hugging Face dataset repository that hosts the predictions file.
DATASET_REPO = "weijiang99/acmg-sentence-predictions"
# Name of the Parquet file: looked up in the working directory first, and
# requested under this name from DATASET_REPO otherwise.
PARQUET_FILENAME = "sentence_predictions.parquet"
| # hf_hub_download caches to ~/.cache/huggingface by default (persists across | |
| # Space restarts when a persistent-storage volume is attached, otherwise | |
| # re-downloads on cold start — still faster than storing in the repo itself) | |
def get_parquet_path() -> str:
    """Return a filesystem path to the sentence-predictions Parquet file.

    A file named ``PARQUET_FILENAME`` in the current working directory takes
    precedence. When no such file exists, the file is fetched from the
    ``DATASET_REPO`` dataset repository with ``hf_hub_download`` and the path
    returned by that call is used. Progress is reported with ``print``.
    """
    bundled_copy = Path(PARQUET_FILENAME)
    if not bundled_copy.exists():
        print(f"Downloading {PARQUET_FILENAME} from {DATASET_REPO} ...")
        download_args = {
            "repo_id": DATASET_REPO,
            "filename": PARQUET_FILENAME,
            "repo_type": "dataset",
        }
        fetched = hf_hub_download(**download_args)
        print(f"Downloaded to {fetched}")
        return fetched
    return str(bundled_copy)
# Resolved once at import time (this may trigger a download); the query
# functions in this module read the Parquet file from this path.
PARQUET_PATH = get_parquet_path()
# Evidence types offered in the UI; they are lower-cased before being matched
# against the `evidence_type` column.
EVIDENCE_TYPES = ["Functional", "Computational", "Population"]
# One width per column of the per-sentence table, in the order produced by
# load_sentences: "#", "Sentence", "Stage 1: Has Evidence", "Stage 1 Label",
# "Stage 2 P", "Stage 2 B", "Predicted". The previous list had only six
# entries for these seven columns, leaving the last column without a width.
SENTENCE_COL_WIDTHS = ["4%", "40%", "12%", "12%", "10%", "10%", "12%"]
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
def _con():
    """Open a fresh in-memory DuckDB connection and hand it to the caller.

    The caller is responsible for closing the returned connection.
    """
    in_memory_connection = duckdb.connect(database=":memory:")
    return in_memory_connection
| def _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt): | |
| conditions, params = [], {} | |
| def add(col, val, param, expr=None): | |
| if val and str(val).strip(): | |
| conditions.append(expr or f"{col} = ${param}") | |
| params[param] = val.strip() | |
| add("UPPER(gene)", gene, "gene", "UPPER(gene) = UPPER($gene)") | |
| add("VCV", vcv, "vcv") | |
| add("RCV", rcv, "rcv") | |
| add("SCV", scv, "scv") | |
| add("VariationID", variation_id, "variation_id") | |
| add("GRCh38_Chr", chr_val, "chr") | |
| if pos and str(pos).strip(): | |
| try: | |
| params["pos"] = float(pos) | |
| conditions.append("GRCh38_Start = $pos") | |
| except ValueError: | |
| pass | |
| add("UPPER(GRCh38_ReferenceAllele)", ref, "ref", | |
| "UPPER(GRCh38_ReferenceAllele) = UPPER($ref)") | |
| add("UPPER(GRCh38_AlternateAllele)", alt, "alt", | |
| "UPPER(GRCh38_AlternateAllele) = UPPER($alt)") | |
| clause = " AND ".join(conditions) if conditions else "1=1" | |
| return clause, params | |
| # --------------------------------------------------------------------------- | |
| # Search | |
| # --------------------------------------------------------------------------- | |
def search_submissions(evidence_type, gene, vcv, rcv, scv,
                       variation_id, chr_val, pos, ref, alt):
    """Find submissions matching the search fields for one evidence type.

    Args:
        evidence_type: Evidence type label; lower-cased and matched against
            the ``evidence_type`` column as a bound query parameter.
        gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt: Optional
            filters, combined by ``_where``.

    Returns:
        A ``(table, choices)`` tuple. ``table`` is a DataFrame of at most 200
        distinct rows ordered by ``evidence_confidence`` descending (NULLs
        last); ``choices`` holds one dropdown label per row, each starting
        with the SCV followed by " — ". When nothing matches, or when any
        step fails, ``table`` is a one-cell message/error frame and
        ``choices`` is empty; exceptions are not propagated.
    """
    con = None
    try:
        # Built inside the try so bad input surfaces as an error table too.
        where, where_params = _where(
            gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt
        )
        # evidence_type is bound rather than interpolated: the value can be
        # supplied by API callers and must not become part of the SQL text.
        params = {**where_params, "evidence_type": evidence_type.lower()}
        con = _con()
        sql = f"""
            SELECT DISTINCT
                SCV,
                VCV,
                COALESCE(gene, Gene) AS Gene,
                COALESCE(
                    'chr' || GRCh38_Chr || ':' || CAST(GRCh38_Start AS VARCHAR)
                    || ' ' || GRCh38_ReferenceAllele
                    || '>' || GRCh38_AlternateAllele,
                    ''
                ) AS Variant,
                SubmissionClassification AS "ClinVar Class",
                Submitter,
                final_label AS "Model Label",
                CASE WHEN has_evidence THEN 'Yes' ELSE 'No' END AS "Has Evidence",
                ROUND(CAST(evidence_confidence AS DOUBLE), 4) AS "Max Confidence",
                ROUND(CAST(P_Score AS DOUBLE), 4) AS "P Score",
                ROUND(CAST(B_Score AS DOUBLE), 4) AS "B Score",
                Comment
            FROM read_parquet('{PARQUET_PATH}')
            WHERE evidence_type = $evidence_type
              AND ({where})
            ORDER BY evidence_confidence DESC NULLS LAST
            LIMIT 200
        """
        df = con.execute(sql, parameters=params).df()
        if df.empty:
            return pd.DataFrame({"Message": ["No results found."]}), []
        choices = [
            f"{scv_id} — {gene_name or '?'} | {clin_class or '?'} | label: {label or 'None'}"
            for scv_id, gene_name, clin_class, label in zip(
                df["SCV"], df["Gene"], df["ClinVar Class"], df["Model Label"]
            )
        ]
        return df, choices
    except Exception as e:
        return pd.DataFrame({"Error": [str(e)]}), []
    finally:
        # Close the connection on both the success and the failure path.
        if con is not None:
            con.close()
| # --------------------------------------------------------------------------- | |
| # Sentence detail | |
| # --------------------------------------------------------------------------- | |
def load_sentences(evidence_type, scv_choice):
    """Return the per-sentence scores of the selected submission.

    Args:
        evidence_type: Evidence type label; lower-cased and matched against
            the ``evidence_type`` column as a bound query parameter.
        scv_choice: Dropdown label whose text before the first " — " is the
            SCV accession. A falsy value means nothing is selected.

    Returns:
        A DataFrame with one row per sentence, ordered by ``sentence_index``.
        When nothing is selected, nothing matches, or any step fails, a
        one-cell message/error frame is returned instead; exceptions are not
        propagated.
    """
    if not scv_choice:
        return pd.DataFrame({"Message": ["Select a submission above to see sentence details."]})
    con = None
    try:
        scv = scv_choice.partition(" — ")[0].strip()
        # Both values are bound rather than interpolated into the SQL text.
        params = {"scv": scv, "evidence_type": evidence_type.lower()}
        con = _con()
        sql = f"""
            SELECT
                sentence_index AS "#",
                sentence AS "Sentence",
                ROUND(CAST(stage1_score AS DOUBLE), 4) AS "Stage 1: Has Evidence",
                stage1_label AS "Stage 1 Label",
                ROUND(CAST(stage2_pathogenic_score AS DOUBLE), 4) AS "Stage 2 P",
                ROUND(CAST(stage2_benign_score AS DOUBLE), 4) AS "Stage 2 B",
                predicted_evidence_sentence AS "Predicted"
            FROM read_parquet('{PARQUET_PATH}')
            WHERE evidence_type = $evidence_type
              AND SCV = $scv
            ORDER BY sentence_index ASC
        """
        df = con.execute(sql, parameters=params).df()
        if df.empty:
            return pd.DataFrame({"Message": [f"No sentence data found for {scv}."]})
        return df
    except Exception as e:
        return pd.DataFrame({"Error": [str(e)]})
    finally:
        # Close the connection on both the success and the failure path.
        if con is not None:
            con.close()
| # --------------------------------------------------------------------------- | |
| # Stats | |
| # --------------------------------------------------------------------------- | |
def get_stats(evidence_type):
    """Summarise the dataset for one evidence type as a Markdown report.

    Args:
        evidence_type: Evidence type label. It is shown as given in the
            report heading and lower-cased before being matched against the
            ``evidence_type`` column as a bound query parameter.

    Returns:
        Markdown text with the number of distinct SCVs, rows and distinct
        genes, followed by the ``final_label`` distribution (counted per
        distinct SCV) and the ``stage1_label`` distribution (counted per
        row). If anything fails, the string ``"Error: <message>"`` is
        returned instead of raising.
    """
    con = None
    try:
        params = {"evidence_type": evidence_type.lower()}
        source = f"read_parquet('{PARQUET_PATH}')"
        con = _con()
        # One scan yields all three totals. COUNT(DISTINCT ...) skips NULLs,
        # so no separate IS NOT NULL filter is needed for the gene count.
        total_sub, total_sent, unique_genes = con.execute(f"""
            SELECT
                COUNT(DISTINCT SCV),
                COUNT(*),
                COUNT(DISTINCT COALESCE(gene, Gene))
            FROM {source}
            WHERE evidence_type = $evidence_type
        """, parameters=params).fetchone()
        label_dist = con.execute(f"""
            SELECT final_label, COUNT(DISTINCT SCV) AS n
            FROM {source}
            WHERE evidence_type = $evidence_type
            GROUP BY final_label ORDER BY n DESC
        """, parameters=params).fetchall()
        s1_dist = con.execute(f"""
            SELECT stage1_label, COUNT(*) AS n
            FROM {source}
            WHERE evidence_type = $evidence_type
            GROUP BY stage1_label ORDER BY n DESC
        """, parameters=params).fetchall()

        # Blank entries separate the summary lines so Markdown renders each
        # one as its own paragraph instead of merging them.
        lines = [
            f"## Dataset Statistics — {evidence_type} Evidence",
            "",
            f"**Unique submissions (SCV):** {total_sub:,}",
            "",
            f"**Total sentences:** {total_sent:,}",
            "",
            f"**Unique genes:** {unique_genes:,}",
            "",
            "### Submission-level final label distribution",
            "",
        ]
        for label, n in label_dist:
            pct = 100 * n / total_sub if total_sub else 0
            lines.append(f"- **{label or 'None'}**: {n:,} ({pct:.1f}%)")
        lines += ["", "### Sentence-level Stage 1 label distribution", ""]
        for label, n in s1_dist:
            pct = 100 * n / total_sent if total_sent else 0
            lines.append(f"- **{label or 'None'}**: {n:,} ({pct:.1f}%)")
        return "\n".join(lines) + "\n"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Close the connection on both the success and the failure path.
        if con is not None:
            con.close()
| # --------------------------------------------------------------------------- | |
| # UI | |
| # --------------------------------------------------------------------------- | |
# Builds the whole Gradio interface: a "Sentence Lookup" tab (search form,
# results table, submission picker, per-sentence table), a "Dataset
# Statistics" tab and an "About" tab, followed by the event wiring.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a copy without indentation — confirm Row/Column/Tab membership.
with gr.Blocks(
    title="ACMG Evidence — Sentence-Level Lookup",
    theme=gr.themes.Base(
        primary_hue="blue",
        secondary_hue="slate",
        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
    ),
) as demo:
    # Page header shown above the tabs.
    gr.Markdown("""
# ACMG Evidence Prediction — Sentence-Level Lookup
Search for a ClinVar variant or submission, then select a result to see the
model confidence score assigned to **each individual sentence** in the
submission comment.
Part of: *"Language models reveal evidence gaps in variants of uncertain significance"*
(Li, Bhat, Yu, Lebo, Zitnik, Cassa)
""")
    with gr.Tab("Sentence Lookup"):
        gr.Markdown("### 1. Select evidence type and enter search criteria")
        # Evidence type used to filter the submission search.
        evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES,
            value="Functional",
            label="Evidence Type",
        )
        with gr.Row():
            # Left column: identifier-based filters.
            with gr.Column():
                gene_in = gr.Textbox(label="Gene Symbol", placeholder="e.g. BRCA1")
                vcv_in = gr.Textbox(label="VCV", placeholder="e.g. VCV000000390")
                rcv_in = gr.Textbox(label="RCV", placeholder="e.g. RCV000000087")
                scv_in = gr.Textbox(label="SCV", placeholder="e.g. SCV000020565")
                varid_in = gr.Textbox(label="VariationID", placeholder="e.g. 390")
            # Right column: genomic-coordinate filters.
            with gr.Column():
                chr_in = gr.Textbox(label="Chromosome", placeholder="e.g. 17")
                pos_in = gr.Textbox(label="Position (GRCh38)", placeholder="e.g. 43094464")
                ref_in = gr.Textbox(label="Reference Allele", placeholder="e.g. C")
                alt_in = gr.Textbox(label="Alternate Allele", placeholder="e.g. T")
        search_btn = gr.Button("Search", variant="primary")
        gr.Markdown("### 2. Matching submissions _(max 200)_")
        # Twelve widths, one per column returned by search_submissions.
        results_table = gr.Dataframe(
            label="Search Results",
            wrap=True,
            interactive=False,
            column_widths=["10%", "10%", "6%", "14%", "10%", "10%",
                           "8%", "8%", "8%", "6%", "6%", "14%"],
        )
        gr.Markdown("### 3. Select a submission to inspect")
        # Starts empty; its choices are replaced after every search.
        submission_dropdown = gr.Dropdown(
            label="Submission (SCV — Gene | ClinVar class | model label)",
            choices=[],
            interactive=True,
            allow_custom_value=False,
        )
        gr.Markdown("### 4. Select evidence type for sentence-level breakdown")
        gr.Markdown(
            "_Each evidence type is modeled independently. "
            "Choose which one to display per-sentence scores for._"
        )
        # Separate from evidence_sel: this selector only drives load_sentences.
        sentence_evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES,
            value="Functional",
            label="Evidence Type for Sentence Breakdown",
            info="Functional = PS3/BS3 · Population = BA1/BS1/BS2/PS4/PM2 · Computational = PP3/BP4/BP1/BP7",
        )
        load_sentences_btn = gr.Button("Load Sentence Scores", variant="secondary")
        gr.Markdown("### 5. Per-sentence model scores")
        # Legend for the columns of the per-sentence table.
        gr.Markdown("""
| Column | Meaning |
|--------|---------|
| **Stage 1: Has Evidence** | Probability the sentence contains this evidence type (0–1). Threshold = 0.8 |
| **Stage 1 Label** | `has_evidence` if score > 0.8, else `no_evidence` |
| **Stage 2 P** | Probability the evidence is **pathogenic** (meaningful only when Stage 1 > 0.8) |
| **Stage 2 B** | Probability the evidence is **benign** (meaningful only when Stage 1 > 0.8) |
| **Predicted** | Per-sentence label — for Functional: `PS3`/`BS3`; for Population/Computational: `P`/`B`; or `None` |
""")
        sentence_table = gr.Dataframe(
            label="Per-Sentence Scores",
            wrap=True,
            interactive=False,
            column_widths=SENTENCE_COL_WIDTHS,
        )
        gr.Markdown("### Example searches")
        # Each example row lists values in the same order as `inputs`.
        gr.Examples(
            examples=[
                ["Functional", "BRCA1", "", "", "", "", "", "", "", ""],
                ["Functional", "", "VCV000000390", "", "", "", "", "", "", ""],
                ["Functional", "", "", "", "SCV000020565", "", "", "", "", ""],
                ["Population", "KCNQ1", "", "", "", "", "", "", "", ""],
                ["Computational", "TP53", "", "", "", "", "", "", "", ""],
            ],
            inputs=[evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
                    varid_in, chr_in, pos_in, ref_in, alt_in],
            label="Click to load example",
        )
    with gr.Tab("Dataset Statistics"):
        stats_evidence_sel = gr.Radio(
            choices=EVIDENCE_TYPES, value="Functional", label="Evidence Type"
        )
        stats_btn = gr.Button("Load Statistics", variant="primary")
        stats_output = gr.Markdown()
        # The report is computed on click, not when the page loads.
        stats_btn.click(fn=get_stats, inputs=stats_evidence_sel, outputs=stats_output)
    with gr.Tab("About"):
        gr.Markdown("""
## About This Tool
This interface exposes the **sentence-level** output of the two-stage ACMG evidence
classifier described in:
> *"Language models reveal evidence gaps in variants of uncertain significance"*
> (Li, Bhat, Yu, Lebo, Zitnik, Cassa)
### How the model works
Each ClinVar submission comment is split into sentences.
Two BioBERT-large classifiers are applied in sequence:
1. **Stage 1** — Does this sentence contain functional / population / computational
evidence? Score > 0.8 → `has_evidence`.
2. **Stage 2** — Is the evidence pathogenic or benign?
Runs only on sentences flagged by Stage 1.
The submission-level prediction uses the **maximum Stage 1 score** across all
sentences; direction (P/B) comes from the highest-confidence Stage 2 sentence.
### Evidence types
| Type | ACMG codes |
|------|-----------|
| Functional | PS3, BS3 |
| Population | BA1, BS1, BS2, PS4, PM2 |
| Computational | PP3, BP4, BP1, BP7 |
### Data
Predictions cover all ClinVar submissions with free-text comments as of the
dataset release date. Full submission-level predictions are also available at
the companion space [weijiang99/acmg-evidence-prediction](https://huggingface.co/spaces/weijiang99/acmg-evidence-prediction).
""")
    # ------------------------------------------------------------------
    # Event wiring
    # ------------------------------------------------------------------
    # Order matches the positional parameters of search_submissions.
    _search_inputs = [
        evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
        varid_in, chr_in, pos_in, ref_in, alt_in,
    ]

    def _search_and_populate(*args):
        """Run the search, refresh the dropdown and clear the sentence table.

        Returns the results table, a Dropdown carrying the new choices with
        no selection, and an empty DataFrame for the per-sentence table.
        """
        df, choices = search_submissions(*args)
        return df, gr.Dropdown(choices=choices, value=None), pd.DataFrame()

    search_btn.click(
        fn=_search_and_populate,
        inputs=_search_inputs,
        outputs=[results_table, submission_dropdown, sentence_table],
    )
    # Load sentence scores when button clicked or evidence type changed
    load_sentences_btn.click(
        fn=load_sentences,
        inputs=[sentence_evidence_sel, submission_dropdown],
        outputs=sentence_table,
    )
    sentence_evidence_sel.change(
        fn=load_sentences,
        inputs=[sentence_evidence_sel, submission_dropdown],
        outputs=sentence_table,
    )
# Script entry point: start the app, creating a public share link only when
# "--share" is present on the command line.
if __name__ == "__main__":
    import sys

    share_requested = "--share" in sys.argv
    demo.launch(share=share_requested)