Spaces:

weijiang99
/

acmg-evidence-sentence

Sleeping

Weijiang Li

Add sentence-level ACMG evidence lookup app

bea64a4 about 2 months ago

16.1 kB

	"""
	ACMG Evidence Prediction — Sentence-Level Lookup (HuggingFace Spaces)

	On first launch this app downloads sentence_predictions.parquet from the
	HuggingFace dataset repo weijiang99/acmg-sentence-predictions.
	Subsequent launches reuse the cached file.
	"""

	import os
	import gradio as gr
	import duckdb
	import pandas as pd
	from pathlib import Path
	from huggingface_hub import hf_hub_download

	# ---------------------------------------------------------------------------
	# Data loading
	# ---------------------------------------------------------------------------

	# Hugging Face dataset repo the predictions file is downloaded from.
	DATASET_REPO = "weijiang99/acmg-sentence-predictions"
	# File name requested from that repo; a file with this name in the working
	# directory is used instead of downloading (see get_parquet_path).
	PARQUET_FILENAME = "sentence_predictions.parquet"

	# hf_hub_download caches to ~/.cache/huggingface by default (persists across
	# Space restarts when a persistent-storage volume is attached, otherwise
	# re-downloads on cold start — still faster than storing in the repo itself)
	def get_parquet_path() -> str:
	    """Return a filesystem path to the sentence-predictions parquet file.

	    A file named ``PARQUET_FILENAME`` in the current working directory takes
	    precedence. Otherwise the file is requested from the ``DATASET_REPO``
	    dataset repo through ``hf_hub_download`` and the path it reports is
	    returned.
	    """
	    candidate = Path(PARQUET_FILENAME)
	    if not candidate.exists():
	        print(f"Downloading {PARQUET_FILENAME} from {DATASET_REPO} ...")
	        fetched = hf_hub_download(
	            repo_type="dataset",
	            repo_id=DATASET_REPO,
	            filename=PARQUET_FILENAME,
	        )
	        print(f"Downloaded to {fetched}")
	        return fetched
	    return str(candidate)

	# Resolved once at import time; the query functions below read this file
	# through read_parquet().
	PARQUET_PATH = get_parquet_path()

	# Choices shown in the evidence-type radio buttons. The query functions
	# lower-case the selected value before comparing it to evidence_type.
	EVIDENCE_TYPES = ["Functional", "Computational", "Population"]
	# Column widths passed to the per-sentence results table.
	# NOTE(review): six widths are listed, while load_sentences selects seven
	# columns — confirm whether the last column is meant to be auto-sized.
	SENTENCE_COL_WIDTHS = ["4%", "44%", "12%", "12%", "12%", "16%"]


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	def _con():
	    """Open a fresh in-memory DuckDB connection and hand it to the caller."""
	    connection = duckdb.connect(database=":memory:")
	    return connection


	def _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt):
	conditions, params = [], {}

	def add(col, val, param, expr=None):
	if val and str(val).strip():
	conditions.append(expr or f"{col} = ${param}")
	params[param] = val.strip()

	add("UPPER(gene)", gene, "gene", "UPPER(gene) = UPPER($gene)")
	add("VCV", vcv, "vcv")
	add("RCV", rcv, "rcv")
	add("SCV", scv, "scv")
	add("VariationID", variation_id, "variation_id")
	add("GRCh38_Chr", chr_val, "chr")

	if pos and str(pos).strip():
	try:
	params["pos"] = float(pos)
	conditions.append("GRCh38_Start = $pos")
	except ValueError:
	pass

	add("UPPER(GRCh38_ReferenceAllele)", ref, "ref",
	"UPPER(GRCh38_ReferenceAllele) = UPPER($ref)")
	add("UPPER(GRCh38_AlternateAllele)", alt, "alt",
	"UPPER(GRCh38_AlternateAllele) = UPPER($alt)")

	clause = " AND ".join(conditions) if conditions else "1=1"
	return clause, params


	# ---------------------------------------------------------------------------
	# Search
	# ---------------------------------------------------------------------------

	def search_submissions(evidence_type, gene, vcv, rcv, scv,
	                       variation_id, chr_val, pos, ref, alt):
	    """Find submissions matching the search form for one evidence type.

	    Args:
	        evidence_type: Selected evidence type; lower-cased before it is
	            compared with the ``evidence_type`` column.
	        gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt: Search
	            fields, forwarded unchanged to ``_where``.

	    Returns:
	        tuple[pandas.DataFrame, list[str]]: up to 200 distinct rows ordered
	        by ``evidence_confidence`` (highest first), plus one dropdown label
	        per row. Each label starts with the SCV followed by ``" — "``,
	        which is how ``load_sentences`` recovers the SCV. When nothing
	        matches, or the query fails, a one-cell ``Message`` / ``Error``
	        frame and an empty list are returned instead.
	    """
	    where, params = _where(gene, vcv, rcv, scv, variation_id, chr_val, pos, ref, alt)
	    # Bind the evidence type as a parameter instead of splicing it into the
	    # SQL text; str(... or "") keeps a missing selection from raising here.
	    params = {**params, "etype": str(evidence_type or "").lower()}
	    # The path is still interpolated, so double any single quote in it.
	    parquet_literal = str(PARQUET_PATH).replace("'", "''")

	    def shown(value, fallback):
	        # NULLs may arrive as None or NaN; NaN is truthy, so a bare
	        # "value or fallback" would let it through.
	        return fallback if pd.isna(value) or not value else value

	    sql = f"""
	        SELECT DISTINCT
	            SCV,
	            VCV,
	            COALESCE(gene, Gene) AS Gene,
	            COALESCE(
	                'chr' || GRCh38_Chr || ':' || CAST(GRCh38_Start AS VARCHAR)
	                || ' ' || GRCh38_ReferenceAllele
	                || '>' || GRCh38_AlternateAllele,
	                ''
	            ) AS Variant,
	            SubmissionClassification AS "ClinVar Class",
	            Submitter,
	            final_label AS "Model Label",
	            CASE WHEN has_evidence THEN 'Yes' ELSE 'No' END AS "Has Evidence",
	            ROUND(CAST(evidence_confidence AS DOUBLE), 4) AS "Max Confidence",
	            ROUND(CAST(P_Score AS DOUBLE), 4) AS "P Score",
	            ROUND(CAST(B_Score AS DOUBLE), 4) AS "B Score",
	            Comment
	        FROM read_parquet('{parquet_literal}')
	        WHERE evidence_type = $etype
	          AND ({where})
	        ORDER BY evidence_confidence DESC NULLS LAST
	        LIMIT 200
	    """

	    con = None
	    try:
	        con = _con()
	        df = con.execute(sql, parameters=params).df()

	        if df.empty:
	            return pd.DataFrame({"Message": ["No results found."]}), []

	        labels = [
	            f"{scv_id} — {shown(gene_name, '?')} | {shown(clin_class, '?')}"
	            f" | label: {shown(model_label, 'None')}"
	            for scv_id, gene_name, clin_class, model_label in zip(
	                df["SCV"], df["Gene"], df["ClinVar Class"], df["Model Label"]
	            )
	        ]
	        # Identical labels add nothing to a dropdown; keep first occurrences.
	        choices = list(dict.fromkeys(labels))
	        return df, choices

	    except Exception as e:
	        # UI boundary: show the failure in the table instead of raising.
	        return pd.DataFrame({"Error": [str(e)]}), []
	    finally:
	        # Close the connection on the error path as well.
	        if con is not None:
	            con.close()


	# ---------------------------------------------------------------------------
	# Sentence detail
	# ---------------------------------------------------------------------------

	def load_sentences(evidence_type, scv_choice):
	    """Load the per-sentence scores of one submission.

	    Args:
	        evidence_type: Selected evidence type; lower-cased before it is
	            compared with the ``evidence_type`` column.
	        scv_choice: Dropdown label whose text before the first ``" — "``
	            is the SCV accession. A falsy value means nothing is selected.

	    Returns:
	        pandas.DataFrame: one row per sentence ordered by
	        ``sentence_index``, or a one-cell ``Message`` / ``Error`` frame when
	        nothing is selected, nothing is found, or the query fails.
	    """
	    if not scv_choice:
	        return pd.DataFrame({"Message": ["Select a submission above to see sentence details."]})

	    scv = scv_choice.split(" — ")[0].strip()
	    # Both user-supplied values are bound as parameters; str(... or "")
	    # keeps a missing evidence type from raising outside the try block.
	    params = {"scv": scv, "etype": str(evidence_type or "").lower()}
	    # The path is still interpolated, so double any single quote in it.
	    parquet_literal = str(PARQUET_PATH).replace("'", "''")

	    sql = f"""
	        SELECT
	            sentence_index AS "#",
	            sentence AS "Sentence",
	            ROUND(CAST(stage1_score AS DOUBLE), 4) AS "Stage 1: Has Evidence",
	            stage1_label AS "Stage 1 Label",
	            ROUND(CAST(stage2_pathogenic_score AS DOUBLE), 4) AS "Stage 2 P",
	            ROUND(CAST(stage2_benign_score AS DOUBLE), 4) AS "Stage 2 B",
	            predicted_evidence_sentence AS "Predicted"
	        FROM read_parquet('{parquet_literal}')
	        WHERE evidence_type = $etype
	          AND SCV = $scv
	        ORDER BY sentence_index ASC
	    """

	    con = None
	    try:
	        con = _con()
	        df = con.execute(sql, parameters=params).df()

	        if df.empty:
	            return pd.DataFrame({"Message": [f"No sentence data found for {scv}."]})
	        return df

	    except Exception as e:
	        # UI boundary: show the failure in the table instead of raising.
	        return pd.DataFrame({"Error": [str(e)]})
	    finally:
	        # Close the connection on the error path as well.
	        if con is not None:
	            con.close()


	# ---------------------------------------------------------------------------
	# Stats
	# ---------------------------------------------------------------------------

	def get_stats(evidence_type):
	    """Summarise the dataset for one evidence type as Markdown text.

	    Reports the number of distinct SCVs, the number of rows (sentences),
	    the number of distinct non-null genes, the distribution of
	    ``final_label`` over distinct SCVs and the distribution of
	    ``stage1_label`` over rows.

	    Args:
	        evidence_type: Selected evidence type; shown in the heading as given
	            and lower-cased before it is compared with the
	            ``evidence_type`` column.

	    Returns:
	        str: the Markdown report, or ``"Error: ..."`` when a query fails.
	    """
	    # Bound as a parameter rather than spliced into the SQL text;
	    # str(... or "") keeps a missing selection from raising here.
	    etype = {"etype": str(evidence_type or "").lower()}
	    # The path is still interpolated, so double any single quote in it.
	    parquet_literal = str(PARQUET_PATH).replace("'", "''")
	    scoped = f"FROM read_parquet('{parquet_literal}') WHERE evidence_type = $etype"

	    def share(part, whole):
	        # Percentage of the whole; 0 when the whole is empty.
	        return 100 * part / whole if whole else 0

	    con = None
	    try:
	        con = _con()

	        # One scan for the three scalar counts. COUNT(DISTINCT ...) already
	        # ignores NULLs, so no separate IS NOT NULL filter is needed.
	        total_sub, total_sent, unique_genes = con.execute(
	            f"""
	            SELECT COUNT(DISTINCT SCV),
	                   COUNT(*),
	                   COUNT(DISTINCT COALESCE(gene, Gene))
	            {scoped}
	            """,
	            parameters=etype,
	        ).fetchone()

	        label_dist = con.execute(
	            f"""
	            SELECT final_label, COUNT(DISTINCT SCV) AS n
	            {scoped}
	            GROUP BY final_label ORDER BY n DESC
	            """,
	            parameters=etype,
	        ).fetchall()

	        s1_dist = con.execute(
	            f"""
	            SELECT stage1_label, COUNT(*) AS n
	            {scoped}
	            GROUP BY stage1_label ORDER BY n DESC
	            """,
	            parameters=etype,
	        ).fetchall()

	        # The headline numbers are list items so that each renders on its
	        # own line; bare consecutive lines merge into one paragraph.
	        lines = [
	            f"## Dataset Statistics — {evidence_type} Evidence",
	            "",
	            f"- Unique submissions (SCV): {total_sub:,}",
	            f"- Total sentences: {total_sent:,}",
	            f"- Unique genes: {unique_genes:,}",
	            "",
	            "### Submission-level final label distribution",
	        ]
	        lines += [
	            f"- {label or 'None'}: {n:,} ({share(n, total_sub):.1f}%)"
	            for label, n in label_dist
	        ]
	        lines += ["", "### Sentence-level Stage 1 label distribution"]
	        lines += [
	            f"- {label or 'None'}: {n:,} ({share(n, total_sent):.1f}%)"
	            for label, n in s1_dist
	        ]
	        return "\n".join(lines) + "\n"

	    except Exception as e:
	        # UI boundary: show the failure as text instead of raising.
	        return f"Error: {e}"
	    finally:
	        # Close the connection on the error path as well.
	        if con is not None:
	            con.close()


	# ---------------------------------------------------------------------------
	# UI
	# ---------------------------------------------------------------------------

	# Builds the Gradio interface: a "Sentence Lookup" tab (search form, results
	# table, submission dropdown, per-sentence table, examples), a "Dataset
	# Statistics" tab and an "About" tab, followed by the event wiring.
	# Pipe characters in the Markdown tables and labels are written as plain "|";
	# "\|" is an invalid escape sequence in a normal Python string and leaves a
	# literal backslash in the rendered text.
	with gr.Blocks(
	    title="ACMG Evidence — Sentence-Level Lookup",
	    theme=gr.themes.Base(
	        primary_hue="blue",
	        secondary_hue="slate",
	        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
	    ),
	) as demo:

	    gr.Markdown("""
	    # ACMG Evidence Prediction — Sentence-Level Lookup

	    Search for a ClinVar variant or submission, then select a result to see the
	    model confidence score assigned to each individual sentence in the
	    submission comment.

	    Part of: "Language models reveal evidence gaps in variants of uncertain significance"
	    (Li, Bhat, Yu, Lebo, Zitnik, Cassa)
	    """)

	    with gr.Tab("Sentence Lookup"):

	        gr.Markdown("### 1. Select evidence type and enter search criteria")

	        evidence_sel = gr.Radio(
	            choices=EVIDENCE_TYPES,
	            value="Functional",
	            label="Evidence Type",
	        )

	        with gr.Row():
	            with gr.Column():
	                gene_in = gr.Textbox(label="Gene Symbol", placeholder="e.g. BRCA1")
	                vcv_in = gr.Textbox(label="VCV", placeholder="e.g. VCV000000390")
	                rcv_in = gr.Textbox(label="RCV", placeholder="e.g. RCV000000087")
	                scv_in = gr.Textbox(label="SCV", placeholder="e.g. SCV000020565")
	                varid_in = gr.Textbox(label="VariationID", placeholder="e.g. 390")
	            with gr.Column():
	                chr_in = gr.Textbox(label="Chromosome", placeholder="e.g. 17")
	                pos_in = gr.Textbox(label="Position (GRCh38)", placeholder="e.g. 43094464")
	                ref_in = gr.Textbox(label="Reference Allele", placeholder="e.g. C")
	                alt_in = gr.Textbox(label="Alternate Allele", placeholder="e.g. T")

	        search_btn = gr.Button("Search", variant="primary")

	        gr.Markdown("### 2. Matching submissions _(max 200)_")

	        # One width per column selected by search_submissions.
	        results_table = gr.Dataframe(
	            label="Search Results",
	            wrap=True,
	            interactive=False,
	            column_widths=["10%", "10%", "6%", "14%", "10%", "10%",
	                           "8%", "8%", "8%", "6%", "6%", "14%"],
	        )

	        gr.Markdown("### 3. Select a submission to inspect")
	        submission_dropdown = gr.Dropdown(
	            label="Submission (SCV — Gene | ClinVar class | model label)",
	            choices=[],
	            interactive=True,
	            allow_custom_value=False,
	        )

	        gr.Markdown("### 4. Select evidence type for sentence-level breakdown")
	        gr.Markdown(
	            "_Each evidence type is modeled independently. "
	            "Choose which one to display per-sentence scores for._"
	        )
	        sentence_evidence_sel = gr.Radio(
	            choices=EVIDENCE_TYPES,
	            value="Functional",
	            label="Evidence Type for Sentence Breakdown",
	            info="Functional = PS3/BS3 · Population = BA1/BS1/BS2/PS4/PM2 · Computational = PP3/BP4/BP1/BP7",
	        )
	        load_sentences_btn = gr.Button("Load Sentence Scores", variant="secondary")

	        gr.Markdown("### 5. Per-sentence model scores")
	        gr.Markdown("""
	        | Column | Meaning |
	        |--------|---------|
	        | Stage 1: Has Evidence | Probability the sentence contains this evidence type (0–1). Threshold = 0.8 |
	        | Stage 1 Label | `has_evidence` if score > 0.8, else `no_evidence` |
	        | Stage 2 P | Probability the evidence is pathogenic (meaningful only when Stage 1 > 0.8) |
	        | Stage 2 B | Probability the evidence is benign (meaningful only when Stage 1 > 0.8) |
	        | Predicted | Per-sentence label — for Functional: `PS3`/`BS3`; for Population/Computational: `P`/`B`; or `None` |
	        """)

	        sentence_table = gr.Dataframe(
	            label="Per-Sentence Scores",
	            wrap=True,
	            interactive=False,
	            column_widths=SENTENCE_COL_WIDTHS,
	        )

	        gr.Markdown("### Example searches")
	        # Each example row lines up with the ten inputs listed below.
	        gr.Examples(
	            examples=[
	                ["Functional", "BRCA1", "", "", "", "", "", "", "", ""],
	                ["Functional", "", "VCV000000390", "", "", "", "", "", "", ""],
	                ["Functional", "", "", "", "SCV000020565", "", "", "", "", ""],
	                ["Population", "KCNQ1", "", "", "", "", "", "", "", ""],
	                ["Computational", "TP53", "", "", "", "", "", "", "", ""],
	            ],
	            inputs=[evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
	                    varid_in, chr_in, pos_in, ref_in, alt_in],
	            label="Click to load example",
	        )

	    with gr.Tab("Dataset Statistics"):
	        stats_evidence_sel = gr.Radio(
	            choices=EVIDENCE_TYPES, value="Functional", label="Evidence Type"
	        )
	        stats_btn = gr.Button("Load Statistics", variant="primary")
	        stats_output = gr.Markdown()
	        stats_btn.click(fn=get_stats, inputs=stats_evidence_sel, outputs=stats_output)

	    with gr.Tab("About"):
	        gr.Markdown("""
	        ## About This Tool

	        This interface exposes the sentence-level output of the two-stage ACMG evidence
	        classifier described in:

	        > "Language models reveal evidence gaps in variants of uncertain significance"
	        > (Li, Bhat, Yu, Lebo, Zitnik, Cassa)

	        ### How the model works

	        Each ClinVar submission comment is split into sentences.
	        Two BioBERT-large classifiers are applied in sequence:

	        1. Stage 1 — Does this sentence contain functional / population / computational
	        evidence? Score > 0.8 → `has_evidence`.
	        2. Stage 2 — Is the evidence pathogenic or benign?
	        Runs only on sentences flagged by Stage 1.

	        The submission-level prediction uses the maximum Stage 1 score across all
	        sentences; direction (P/B) comes from the highest-confidence Stage 2 sentence.

	        ### Evidence types

	        | Type | ACMG codes |
	        |------|-----------|
	        | Functional | PS3, BS3 |
	        | Population | BA1, BS1, BS2, PS4, PM2 |
	        | Computational | PP3, BP4, BP1, BP7 |

	        ### Data
	        Predictions cover all ClinVar submissions with free-text comments as of the
	        dataset release date. Full submission-level predictions are also available at
	        the companion space [weijiang99/acmg-evidence-prediction](https://huggingface.co/spaces/weijiang99/acmg-evidence-prediction).
	        """)

	    # ------------------------------------------------------------------
	    # Event wiring
	    # ------------------------------------------------------------------

	    # Order matches the positional parameters of search_submissions.
	    _search_inputs = [
	        evidence_sel, gene_in, vcv_in, rcv_in, scv_in,
	        varid_in, chr_in, pos_in, ref_in, alt_in,
	    ]

	    def _search_and_populate(*args):
	        """Run the search, refill the dropdown and clear the sentence table."""
	        df, choices = search_submissions(*args)
	        return df, gr.Dropdown(choices=choices, value=None), pd.DataFrame()

	    search_btn.click(
	        fn=_search_and_populate,
	        inputs=_search_inputs,
	        outputs=[results_table, submission_dropdown, sentence_table],
	    )

	    # Load sentence scores when button clicked or evidence type changed
	    load_sentences_btn.click(
	        fn=load_sentences,
	        inputs=[sentence_evidence_sel, submission_dropdown],
	        outputs=sentence_table,
	    )
	    sentence_evidence_sel.change(
	        fn=load_sentences,
	        inputs=[sentence_evidence_sel, submission_dropdown],
	        outputs=sentence_table,
	    )


	if __name__ == "__main__":
	    import sys

	    # A public share link is requested only when "--share" is on the
	    # command line.
	    wants_share = "--share" in sys.argv
	    demo.launch(share=wants_share)