# Weijiang Li — "Add sentence-level ACMG evidence lookup app" (commit bea64a4)
# (file-viewer residue: raw · history · blame · contribute · delete · 16.1 kB)
"""
ACMG Evidence Prediction — Sentence-Level Lookup (HuggingFace Spaces)
On first launch this app downloads sentence_predictions.parquet from the
HuggingFace dataset repo weijiang99/acmg-sentence-predictions.
Subsequent launches reuse the cached file.
"""
import os
import gradio as gr
import duckdb
import pandas as pd
from pathlib import Path
from huggingface_hub import hf_hub_download
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
DATASET_REPO = "weijiang99/acmg-sentence-predictions"
PARQUET_FILENAME = "sentence_predictions.parquet"
# hf_hub_download caches to ~/.cache/huggingface by default (persists across
# Space restarts when a persistent-storage volume is attached, otherwise
# re-downloads on cold start — still faster than storing in the repo itself)
def get_parquet_path() -> str:
    """Return a filesystem path to the sentence-predictions parquet file.

    A file named ``PARQUET_FILENAME`` in the current working directory takes
    precedence.  Otherwise the file is fetched from the ``DATASET_REPO``
    dataset repo with ``hf_hub_download`` and the path it reports is returned.
    """
    candidate = Path(PARQUET_FILENAME)
    if not candidate.exists():
        print(f"Downloading {PARQUET_FILENAME} from {DATASET_REPO} ...")
        fetched = hf_hub_download(
            repo_type="dataset",
            repo_id=DATASET_REPO,
            filename=PARQUET_FILENAME,
        )
        print(f"Downloaded to {fetched}")
        return fetched
    return str(candidate)
# Resolved once at import time; every query in this module reads this file.
PARQUET_PATH = get_parquet_path()
# Evidence categories offered by the radio buttons.  The lower-cased name is
# what the queries compare against the ``evidence_type`` column.
EVIDENCE_TYPES = ["Functional", "Computational", "Population"]
# One width per column of the per-sentence table, in SELECT order:
#   "#", "Sentence", "Stage 1: Has Evidence", "Stage 1 Label",
#   "Stage 2 P", "Stage 2 B", "Predicted"
# The query returns seven columns, so seven widths are listed (sum = 100%).
SENTENCE_COL_WIDTHS = ["4%", "40%", "12%", "12%", "10%", "10%", "12%"]
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _con():
    """Open and return a new in-memory DuckDB connection."""
    connection = duckdb.connect(database=":memory:")
    return connection
def _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt):
    """Build a parameterised SQL filter from the search-form fields.

    Falsy or whitespace-only fields contribute no condition.  Gene and the
    reference/alternate alleles are compared case-insensitively; the other
    text fields are compared exactly.  ``pos`` is parsed as a number and
    compared against ``GRCh38_Start``.

    Returns:
        A ``(clause, params)`` tuple.  ``clause`` is the conditions joined
        with ``AND`` (or ``"1=1"`` when no field was filled in) and
        ``params`` maps each ``$name`` placeholder used in ``clause`` to the
        value to bind.
    """
    conditions, params = [], {}

    def add(col, val, param, expr=None):
        # Skip falsy values (None, "", 0) exactly as before.
        if not val:
            return
        # Normalise through str() so a non-string value (e.g. an integer
        # VariationID passed programmatically) is stripped the same way as
        # text instead of raising AttributeError on ``val.strip()``.
        text = str(val).strip()
        if not text:
            return
        conditions.append(expr or f"{col} = ${param}")
        params[param] = text

    add("UPPER(gene)", gene, "gene", "UPPER(gene) = UPPER($gene)")
    add("VCV", vcv, "vcv")
    add("RCV", rcv, "rcv")
    add("SCV", scv, "scv")
    add("VariationID", variation_id, "variation_id")
    add("GRCh38_Chr", chr_val, "chr")

    if pos and str(pos).strip():
        try:
            position = float(pos)
        except (TypeError, ValueError):
            # Deliberate best-effort: an unparseable position is ignored
            # rather than failing the whole search.
            pass
        else:
            params["pos"] = position
            conditions.append("GRCh38_Start = $pos")

    add("UPPER(GRCh38_ReferenceAllele)", ref, "ref",
        "UPPER(GRCh38_ReferenceAllele) = UPPER($ref)")
    add("UPPER(GRCh38_AlternateAllele)", alt, "alt",
        "UPPER(GRCh38_AlternateAllele) = UPPER($alt)")

    clause = " AND ".join(conditions) if conditions else "1=1"
    return clause, params
# ---------------------------------------------------------------------------
# Search
# ---------------------------------------------------------------------------
def _format_choice(scv, gene, clinvar_class, model_label):
    """Build one dropdown label: ``SCV — Gene | ClinVar class | label: X``.

    The ``" — "`` separator after the SCV is what ``load_sentences`` splits on
    to recover the accession, so it must be present.  Missing values (None,
    NaN or empty string) are shown as ``?`` for gene and ClinVar class and as
    ``None`` for the model label.
    """
    def shown(value, fallback):
        # pd.isna covers both None and NaN, which ``value or fallback`` does not.
        if pd.isna(value) or value == "":
            return fallback
        return value

    return (
        f"{scv} — {shown(gene, '?')} | {shown(clinvar_class, '?')}"
        f" | label: {shown(model_label, 'None')}"
    )


def search_submissions(evidence_type, gene, vcv, rcv, scv,
                       variation_id, chr_val, pos, ref, alt):
    """Search submissions for one evidence type, filtered by the form fields.

    Args:
        evidence_type: Evidence category name; it is lower-cased and matched
            against the ``evidence_type`` column.
        gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt: Optional
            filter values, passed straight to ``_where``.

    Returns:
        A ``(df, choices)`` tuple.  ``df`` holds up to 200 distinct rows
        ordered by ``evidence_confidence`` descending, and ``choices`` holds
        one dropdown label per row.  When nothing matches, ``df`` is a
        one-cell ``Message`` frame; when anything fails, it is a one-cell
        ``Error`` frame.  In both cases ``choices`` is empty.
    """
    where, params = _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt)
    try:
        # Bind the evidence type as a parameter instead of pasting it into
        # the SQL text.
        params["evidence_type"] = str(evidence_type or "").lower()
        sql = f"""
            SELECT DISTINCT
                SCV,
                VCV,
                COALESCE(gene, Gene) AS Gene,
                COALESCE(
                    'chr' || GRCh38_Chr || ':' || CAST(GRCh38_Start AS VARCHAR)
                    || ' ' || GRCh38_ReferenceAllele
                    || '>' || GRCh38_AlternateAllele,
                    ''
                ) AS Variant,
                SubmissionClassification AS "ClinVar Class",
                Submitter,
                final_label AS "Model Label",
                CASE WHEN has_evidence THEN 'Yes' ELSE 'No' END AS "Has Evidence",
                ROUND(CAST(evidence_confidence AS DOUBLE), 4) AS "Max Confidence",
                ROUND(CAST(P_Score AS DOUBLE), 4) AS "P Score",
                ROUND(CAST(B_Score AS DOUBLE), 4) AS "B Score",
                Comment
            FROM read_parquet('{PARQUET_PATH}')
            WHERE evidence_type = $evidence_type
              AND ({where})
            ORDER BY evidence_confidence DESC NULLS LAST
            LIMIT 200
        """
        con = _con()
        try:
            df = con.execute(sql, parameters=params).df()
        finally:
            # Close the connection even when the query raises.
            con.close()

        if df.empty:
            return pd.DataFrame({"Message": ["No results found."]}), []

        choices = [
            _format_choice(scv_id, gene_name, clinvar_class, model_label)
            for scv_id, gene_name, clinvar_class, model_label in zip(
                df["SCV"], df["Gene"], df["ClinVar Class"], df["Model Label"]
            )
        ]
        return df, choices
    except Exception as e:
        # UI boundary: surface the failure in the results table.
        return pd.DataFrame({"Error": [str(e)]}), []
# ---------------------------------------------------------------------------
# Sentence detail
# ---------------------------------------------------------------------------
def load_sentences(evidence_type, scv_choice):
    """Return the per-sentence scores for the selected submission.

    Args:
        evidence_type: Evidence category name; it is lower-cased and matched
            against the ``evidence_type`` column.
        scv_choice: Dropdown label whose text before the first ``" — "`` is
            the SCV accession.

    Returns:
        A DataFrame with one row per sentence, ordered by sentence index.
        A one-cell ``Message`` frame is returned when nothing is selected or
        no rows match, and a one-cell ``Error`` frame when the query fails.
    """
    if not scv_choice:
        return pd.DataFrame({"Message": ["Select a submission above to see sentence details."]})
    scv = scv_choice.split(" — ")[0].strip()
    try:
        # Both filter values are bound as parameters rather than pasted into
        # the SQL text.
        params = {"scv": scv, "evidence_type": str(evidence_type or "").lower()}
        sql = f"""
            SELECT
                sentence_index AS "#",
                sentence AS "Sentence",
                ROUND(CAST(stage1_score AS DOUBLE), 4) AS "Stage 1: Has Evidence",
                stage1_label AS "Stage 1 Label",
                ROUND(CAST(stage2_pathogenic_score AS DOUBLE), 4) AS "Stage 2 P",
                ROUND(CAST(stage2_benign_score AS DOUBLE), 4) AS "Stage 2 B",
                predicted_evidence_sentence AS "Predicted"
            FROM read_parquet('{PARQUET_PATH}')
            WHERE evidence_type = $evidence_type
              AND SCV = $scv
            ORDER BY sentence_index ASC
        """
        con = _con()
        try:
            df = con.execute(sql, parameters=params).df()
        finally:
            # Close the connection even when the query raises.
            con.close()

        if df.empty:
            return pd.DataFrame({"Message": [f"No sentence data found for {scv}."]})
        return df
    except Exception as e:
        # UI boundary: surface the failure in the table.
        return pd.DataFrame({"Error": [str(e)]})
# ---------------------------------------------------------------------------
# Stats
# ---------------------------------------------------------------------------
def get_stats(evidence_type):
    """Return a Markdown summary of the dataset for one evidence type.

    The summary lists the number of unique submissions (SCV), total
    sentences and unique genes, followed by the submission-level
    ``final_label`` distribution and the sentence-level ``stage1_label``
    distribution, each with counts and percentages.

    Args:
        evidence_type: Evidence category name; it is lower-cased and matched
            against the ``evidence_type`` column, and shown as given in the
            heading.

    Returns:
        The Markdown text, or ``"Error: <message>"`` if anything fails.
    """
    try:
        # Bound as a parameter rather than pasted into the SQL text.
        params = {"evidence_type": str(evidence_type or "").lower()}
        source = f"read_parquet('{PARQUET_PATH}')"
        con = _con()
        try:
            # One scan for the three scalar counts.  COUNT(DISTINCT ...)
            # ignores NULLs, so no separate IS NOT NULL filter is needed for
            # the gene count.
            total_sub, total_sent, unique_genes = con.execute(f"""
                SELECT
                    COUNT(DISTINCT SCV),
                    COUNT(*),
                    COUNT(DISTINCT COALESCE(gene, Gene))
                FROM {source}
                WHERE evidence_type = $evidence_type
            """, parameters=params).fetchone()
            label_dist = con.execute(f"""
                SELECT final_label, COUNT(DISTINCT SCV) AS n
                FROM {source}
                WHERE evidence_type = $evidence_type
                GROUP BY final_label ORDER BY n DESC
            """, parameters=params).fetchall()
            s1_dist = con.execute(f"""
                SELECT stage1_label, COUNT(*) AS n
                FROM {source}
                WHERE evidence_type = $evidence_type
                GROUP BY stage1_label ORDER BY n DESC
            """, parameters=params).fetchall()
        finally:
            # Close the connection even when a query raises.
            con.close()

        parts = [
            f"## Dataset Statistics — {evidence_type} Evidence\n",
            f"**Unique submissions (SCV):** {total_sub:,}\n",
            f"**Total sentences:** {total_sent:,}\n",
            f"**Unique genes:** {unique_genes:,}\n",
            "### Submission-level final label distribution\n",
        ]
        for label, n in label_dist:
            pct = 100 * n / total_sub if total_sub else 0
            parts.append(f"- **{label or 'None'}**: {n:,} ({pct:.1f}%)\n")
        parts.append("\n### Sentence-level Stage 1 label distribution\n")
        for label, n in s1_dist:
            pct = 100 * n / total_sent if total_sent else 0
            parts.append(f"- **{label or 'None'}**: {n:,} ({pct:.1f}%)\n")
        return "".join(parts)
    except Exception as e:
        # UI boundary: surface the failure as text in the Markdown panel.
        return f"Error: {e}"
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Build the Gradio app: three tabs ("Sentence Lookup", "Dataset Statistics",
# "About") followed by the event wiring for the lookup tab.
# NOTE(review): this copy of the file has lost its indentation, so the nesting
# of the Tab/Row/Column containers below cannot be confirmed from here; the
# code is left untouched and only annotated.
with gr.Blocks(
title="ACMG Evidence — Sentence-Level Lookup",
theme=gr.themes.Base(
primary_hue="blue",
secondary_hue="slate",
font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
),
) as demo:
# Page header shown above the tabs.
gr.Markdown("""
# ACMG Evidence Prediction — Sentence-Level Lookup
Search for a ClinVar variant or submission, then select a result to see the
model confidence score assigned to **each individual sentence** in the
submission comment.
Part of: *"Language models reveal evidence gaps in variants of uncertain significance"*
(Li, Bhat, Yu, Lebo, Zitnik, Cassa)
""")
# Tab 1: search form, results table, submission picker and per-sentence table.
with gr.Tab("Sentence Lookup"):
gr.Markdown("### 1. Select evidence type and enter search criteria")
# Evidence type used by the submission search.
evidence_sel = gr.Radio(
choices=EVIDENCE_TYPES,
value="Functional",
label="Evidence Type",
)
# Search fields: identifiers in one column, genomic coordinates in the other.
with gr.Row():
with gr.Column():
gene_in = gr.Textbox(label="Gene Symbol", placeholder="e.g. BRCA1")
vcv_in = gr.Textbox(label="VCV", placeholder="e.g. VCV000000390")
rcv_in = gr.Textbox(label="RCV", placeholder="e.g. RCV000000087")
scv_in = gr.Textbox(label="SCV", placeholder="e.g. SCV000020565")
varid_in = gr.Textbox(label="VariationID", placeholder="e.g. 390")
with gr.Column():
chr_in = gr.Textbox(label="Chromosome", placeholder="e.g. 17")
pos_in = gr.Textbox(label="Position (GRCh38)", placeholder="e.g. 43094464")
ref_in = gr.Textbox(label="Reference Allele", placeholder="e.g. C")
alt_in = gr.Textbox(label="Alternate Allele", placeholder="e.g. T")
search_btn = gr.Button("Search", variant="primary")
gr.Markdown("### 2. Matching submissions _(max 200)_")
# Twelve widths, one per column selected by search_submissions.
results_table = gr.Dataframe(
label="Search Results",
wrap=True,
interactive=False,
column_widths=["10%", "10%", "6%", "14%", "10%", "10%",
"8%", "8%", "8%", "6%", "6%", "14%"],
)
gr.Markdown("### 3. Select a submission to inspect")
# Starts empty; its choices are filled in by the search callback below.
submission_dropdown = gr.Dropdown(
label="Submission (SCV — Gene | ClinVar class | model label)",
choices=[],
interactive=True,
allow_custom_value=False,
)
gr.Markdown("### 4. Select evidence type for sentence-level breakdown")
gr.Markdown(
"_Each evidence type is modeled independently. "
"Choose which one to display per-sentence scores for._"
)
# Separate from evidence_sel: this one selects the evidence type passed to
# load_sentences.
sentence_evidence_sel = gr.Radio(
choices=EVIDENCE_TYPES,
value="Functional",
label="Evidence Type for Sentence Breakdown",
info="Functional = PS3/BS3 · Population = BA1/BS1/BS2/PS4/PM2 · Computational = PP3/BP4/BP1/BP7",
)
load_sentences_btn = gr.Button("Load Sentence Scores", variant="secondary")
gr.Markdown("### 5. Per-sentence model scores")
# Legend for the columns of the per-sentence table.
gr.Markdown("""
| Column | Meaning |
|--------|---------|
| **Stage 1: Has Evidence** | Probability the sentence contains this evidence type (0–1). Threshold = 0.8 |
| **Stage 1 Label** | `has_evidence` if score > 0.8, else `no_evidence` |
| **Stage 2 P** | Probability the evidence is **pathogenic** (meaningful only when Stage 1 > 0.8) |
| **Stage 2 B** | Probability the evidence is **benign** (meaningful only when Stage 1 > 0.8) |
| **Predicted** | Per-sentence label — for Functional: `PS3`/`BS3`; for Population/Computational: `P`/`B`; or `None` |
""")
# Column widths come from the module-level SENTENCE_COL_WIDTHS constant.
sentence_table = gr.Dataframe(
label="Per-Sentence Scores",
wrap=True,
interactive=False,
column_widths=SENTENCE_COL_WIDTHS,
)
gr.Markdown("### Example searches")
# Each example row supplies values for the ten search inputs, in the order
# given by `inputs`.
gr.Examples(
examples=[
["Functional", "BRCA1", "", "", "", "", "", "", "", ""],
["Functional", "", "VCV000000390", "", "", "", "", "", "", ""],
["Functional", "", "", "", "SCV000020565", "", "", "", "", ""],
["Population", "KCNQ1", "", "", "", "", "", "", "", ""],
["Computational", "TP53", "", "", "", "", "", "", "", ""],
],
inputs=[evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
varid_in, chr_in, pos_in, ref_in, alt_in],
label="Click to load example",
)
# Tab 2: statistics computed by get_stats when the button is clicked and
# rendered as Markdown.
with gr.Tab("Dataset Statistics"):
stats_evidence_sel = gr.Radio(
choices=EVIDENCE_TYPES, value="Functional", label="Evidence Type"
)
stats_btn = gr.Button("Load Statistics", variant="primary")
stats_output = gr.Markdown()
stats_btn.click(fn=get_stats, inputs=stats_evidence_sel, outputs=stats_output)
# Tab 3: static description of the model, evidence types and data.
with gr.Tab("About"):
gr.Markdown("""
## About This Tool
This interface exposes the **sentence-level** output of the two-stage ACMG evidence
classifier described in:
> *"Language models reveal evidence gaps in variants of uncertain significance"*
> (Li, Bhat, Yu, Lebo, Zitnik, Cassa)
### How the model works
Each ClinVar submission comment is split into sentences.
Two BioBERT-large classifiers are applied in sequence:
1. **Stage 1** — Does this sentence contain functional / population / computational
evidence? Score > 0.8 → `has_evidence`.
2. **Stage 2** — Is the evidence pathogenic or benign?
Runs only on sentences flagged by Stage 1.
The submission-level prediction uses the **maximum Stage 1 score** across all
sentences; direction (P/B) comes from the highest-confidence Stage 2 sentence.
### Evidence types
| Type | ACMG codes |
|------|-----------|
| Functional | PS3, BS3 |
| Population | BA1, BS1, BS2, PS4, PM2 |
| Computational | PP3, BP4, BP1, BP7 |
### Data
Predictions cover all ClinVar submissions with free-text comments as of the
dataset release date. Full submission-level predictions are also available at
the companion space [weijiang99/acmg-evidence-prediction](https://huggingface.co/spaces/weijiang99/acmg-evidence-prediction).
""")
# ------------------------------------------------------------------
# Event wiring
# ------------------------------------------------------------------
# Same order as the positional parameters of search_submissions.
_search_inputs = [
evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
varid_in, chr_in, pos_in, ref_in, alt_in,
]
# Runs the search, then returns the result frame, a dropdown update with the
# new choices and no selection, and an empty frame for the sentence table.
def _search_and_populate(*args):
df, choices = search_submissions(*args)
return df, gr.Dropdown(choices=choices, value=None), pd.DataFrame()
search_btn.click(
fn=_search_and_populate,
inputs=_search_inputs,
outputs=[results_table, submission_dropdown, sentence_table],
)
# Load sentence scores when button clicked or evidence type changed
load_sentences_btn.click(
fn=load_sentences,
inputs=[sentence_evidence_sel, submission_dropdown],
outputs=sentence_table,
)
sentence_evidence_sel.change(
fn=load_sentences,
inputs=[sentence_evidence_sel, submission_dropdown],
outputs=sentence_table,
)
if __name__ == "__main__":
    # Script entry point: sharing is enabled only when "--share" appears on
    # the command line.
    import sys

    share_requested = "--share" in sys.argv
    demo.launch(share=share_requested)