Spaces:

rufasharon
/

metadata_hierarchy_tfm2026

Sleeping

metadata_hierarchy_tfm2026 / demo.py

RoophaSharon

Sync demo (downloads, build summary, HCP depth fix) + latest approach_1; clean canonical outputs

2b56f2e 8 days ago

26.3 kB

	"""
	Metadata Hierarchy Explorer — TFM 2026
	Pre-built results viewer for Baseline, Approach 1, and Approach 2.

	Rendering faithfully replicates each app's display pipeline:
	- Baseline : raw tree, Greens, Sunburst + Treemap
	- Approach 1 : raw tree, Blues, Sunburst + Treemap + Node-link + Facets
	- Approach 2 : compress one-child chains, Viridis, Sunburst + Treemap + Node-link

	Level-of-Detail controls (depth, leaf labels, hidden nodes, compress chains)
	match the controls in the individual apps.
	"""
	from __future__ import annotations
	import json
	from collections import defaultdict
	from pathlib import Path

	import numpy as np
	import plotly.graph_objects as go
	import streamlit as st

	# ─────────────────────────────────────────────────────────────────────────────
	# PAGE CONFIG
	# ─────────────────────────────────────────────────────────────────────────────
	# Configure the Streamlit page: browser-tab title, icon and wide layout.
	# NOTE(review): Streamlit expects this to run before other st.* calls —
	# confirm against the Streamlit docs before moving it.
	st.set_page_config(
	    page_title="Metadata Hierarchy Explorer",
	    page_icon="🌿",
	    layout="wide",
	)

	# Folder with the bundled pre-built results, located next to this file.
	ROOT = Path(__file__).parent / "outputs"

	# Initial value of the "Depth" slider and default max depth of the plot helpers.
	DEFAULT_DEPTH = 7

	# ─────────────────────────────────────────────────────────────────────────────
	# PRE-BUILT OUTPUT PATHS
	# ─────────────────────────────────────────────────────────────────────────────
	# Lookup table: approach name -> dataset name -> {"hierarchy": Path, "facets": Path}.
	# Only the "Approach 1" entries carry a "facets" path; the script reads the
	# optional key with paths.get("facets").
	PREBUILT = {
	    "Baseline": {
	        "AI-MIND": {"hierarchy": ROOT / "baseline" / "ai-mind-variable-descriptions_in__baseline_hierarchy.json"},
	        "HCP": {"hierarchy": ROOT / "baseline" / "HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json"},
	    },
	    "Approach 1": {
	        "AI-MIND": {
	            "hierarchy": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_hierarchy.json",
	            "facets": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
	        },
	        "HCP": {
	            "hierarchy": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
	            "facets": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
	        },
	    },
	    "Approach 2": {
	        "AI-MIND": {"hierarchy": ROOT / "approach_2" / "ai-mind-variable-descriptions_in__approach2_lod.json"},
	        "HCP": {"hierarchy": ROOT / "approach_2" / "HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json"},
	    },
	}

	# Per-approach rendering config (matches each source app)
	#   color     : colorscale name handed to the sunburst/treemap helpers
	#   compress  : whether the "Compress chains" checkbox is offered
	#   node_link : whether "Node-link tree" is added to the view modes
	CONFIG = {
	    "Baseline": {"color": "Greens", "compress": False, "node_link": False},
	    "Approach 1": {"color": "Blues", "compress": False, "node_link": True},
	    "Approach 2": {"color": "Viridis", "compress": True, "node_link": True},
	}

	# One-paragraph description per approach; the main script renders the
	# selected entry as a Markdown block quote under the page title.
	APPROACH_DESC = {
	    "Baseline": (
	        "Pure clustering baseline — TF-IDF representation + recursive agglomerative "
	        "(cosine) clustering, number of clusters chosen by silhouette. No external APIs, "
	        "no neural embeddings. Node labels are the most discriminative terms per cluster."
	    ),
	    "Approach 1": (
	        "Global embedding pipeline — SBERT + N×M concept-table alignment (Gonçalves 2019) "
	        "+ HiExpan refinement (Shen et al. KDD 2018) + Castanet parallel facets. Optionally "
	        "retrieves concept context from Wikidata / Wikipedia / WordNet / BioPortal."
	    ),
	    "Approach 2": (
	        "Dataset-constrained multi-aspect hierarchy — group-anchored L1/L2 → phrase-slot "
	        "mining → FASTopic semantic aspect discovery (Wu et al. NeurIPS 2024) → GMM/KMeans "
	        "clustering → deterministic 5-stage label generation. Optional local-LLM refinement."
	    ),
	}

	# ─────────────────────────────────────────────────────────────────────────────
	# TREE TRANSFORMS (copied from approach_2.py — display-only, exact behaviour)
	# ─────────────────────────────────────────────────────────────────────────────
	def _filter_dissolved(nodes: list) -> list:
	drop_ids = {int(n["id"]) for n in nodes
	if n.get("type") == "dissolved" or n.get("isShown") is False}
	if not drop_ids:
	return nodes
	out = []
	for n in nodes:
	if int(n["id"]) in drop_ids:
	continue
	m = dict(n)
	m["related"] = [int(c) for c in n.get("related", []) if int(c) not in drop_ids]
	out.append(m)
	return out

	def compress_one_child_chains(nodes: list) -> list:
	    """Collapse chains where an aggregation node has exactly one aggregation child
	    (e.g. 'DMS → DMS Recommended Standard' becomes 'DMS / DMS Recommended Standard').

	    Dissolved/hidden nodes are removed first via ``_filter_dissolved``. Each
	    merge keeps the parent's id, takes every other field from the child,
	    joins the two names with " / " and the two descriptions with " | ", and
	    re-points any remaining reference to the removed child at the merged
	    node. The input dicts are not modified: work happens on shallow copies
	    and ``related`` lists are replaced, never mutated in place.

	    Args:
	        nodes: node dicts carrying ``id``, ``type`` and ``related``.

	    Returns:
	        A new list of node dicts with all one-child chains merged.
	    """
	    nodes = _filter_dissolved(nodes)
	    nm = {int(n["id"]): dict(n) for n in nodes}

	    def _is_chain_link(nid, n):
	        if n.get("type") != "aggregation":
	            return False
	        children = n.get("related", [])
	        if len(children) != 1:
	            return False
	        child_id = int(children[0])
	        # A node listing itself as its only child is not a chain; merging
	        # it would delete the node from the map.
	        return (child_id != nid
	                and nm.get(child_id, {}).get("type") == "aggregation")

	    changed = True
	    while changed:
	        changed = False
	        for nid, n in list(nm.items()):
	            if not _is_chain_link(nid, n):
	                continue
	            child_id = int(n["related"][0])
	            child = nm.pop(child_id)
	            merged = dict(child)
	            merged["id"] = nid
	            merged["name"] = f"{n.get('name', '')} / {child.get('name', '')}"
	            merged["desc"] = f"{n.get('desc', '')} | {child.get('desc', '')}"
	            nm[nid] = merged
	            # Any other parent that pointed at the removed child now
	            # points at the merged node.
	            for other in nm.values():
	                other["related"] = [nid if int(c) == child_id else int(c)
	                                    for c in other.get("related", [])]
	            changed = True
	            break  # the map changed; rescan from a fresh snapshot
	    return list(nm.values())

	# ─────────────────────────────────────────────────────────────────────────────
	# RENDER HELPERS (DAG-safe value map — copied from approach_2.py)
	# ─────────────────────────────────────────────────────────────────────────────
	def _leaf_ids(nodes: list, nid: int) -> list:
	m = {int(n["id"]): n for n in nodes}
	out = []
	def rec(x):
	n = m.get(int(x))
	if not n:
	return
	if n.get("type") == "attribute":
	out.append(int(x)); return
	for c in n.get("related", []):
	rec(int(c))
	rec(nid)
	return list(dict.fromkeys(out))

	def _parent_map(nodes: list) -> dict:
	pm = {}
	for n in nodes:
	for c in n.get("related", []):
	if int(c) not in pm:
	pm[int(c)] = int(n["id"])
	return pm

	def _tree_value_map(nodes: list, pm: dict) -> dict:
	kids = {}
	for child, par in pm.items():
	kids.setdefault(int(par), []).append(int(child))
	nodemap = {int(n["id"]): n for n in nodes}
	memo = {}
	def count(nid: int) -> int:
	if nid in memo:
	return memo[nid]
	memo[nid] = 1
	n = nodemap.get(nid)
	if n is not None and n.get("type") == "attribute":
	memo[nid] = 1
	return 1
	ch = kids.get(nid, [])
	v = sum(count(c) for c in ch) if ch else 1
	memo[nid] = max(1, v)
	return memo[nid]
	return {nid: count(nid) for nid in nodemap}

	def _wrap_hover(text: str, width: int = 80) -> str:
	import textwrap as _tw
	s = str(text or "")
	if not s:
	return ""
	lines = []
	for raw_line in s.split("\n"):
	lines.extend(_tw.wrap(raw_line, width=width) or [""])
	return "<br>".join(lines)

	def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
	    """Build a Plotly sunburst figure for a node list.

	    Args:
	        nodes: node dicts with ``id``, ``name``, ``type``, ``desc`` and
	            ``related`` (child ids); dissolved/hidden nodes are dropped first.
	        color: value passed as the marker colorscale.
	        max_depth: number of levels drawn at once (Plotly ``maxdepth``).

	    Returns:
	        A ``go.Figure`` holding one Sunburst trace.
	    """
	    nodes = _filter_dissolved(nodes)
	    pm = _parent_map(nodes)
	    # Non-root nodes without a parent (e.g. children of a filtered-out node)
	    # are drawn under node 0 below. Register them as its children so node 0's
	    # value includes them: with branchvalues="total" a parent's value must
	    # cover the sum of its children or the trace is not rendered.
	    for n in nodes:
	        nid = int(n["id"])
	        if nid != 0:
	            pm.setdefault(nid, 0)
	    vm = _tree_value_map(nodes, pm)
	    ids, labels, parents, values, hover = [], [], [], [], []
	    for n in nodes:
	        nid = int(n["id"])
	        lc = len(_leaf_ids(nodes, nid))
	        ids.append(str(nid))
	        labels.append(str(n.get("name", ""))[:40])  # keep sector labels short
	        parents.append("" if nid == 0 else str(pm.get(nid, 0)))
	        values.append(vm.get(nid, 1))
	        hover.append(f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
	                     f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc', ''))}")
	    # NOTE(review): Plotly documents marker.colorscale as taking effect only
	    # when marker.colors is a numeric array — confirm the palette is applied.
	    fig = go.Figure(go.Sunburst(
	        ids=ids, labels=labels, parents=parents, values=values,
	        branchvalues="total", hovertext=hover, hoverinfo="text",
	        maxdepth=max_depth, insidetextorientation="radial",
	        marker=dict(colorscale=color, line=dict(width=1, color="white"))))
	    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
	                      title=dict(text="Click sector to drill down — click centre to go back",
	                                 font=dict(size=13), x=0.5))
	    return fig

	def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
	    """Build a Plotly treemap figure for a node list.

	    Args:
	        nodes: node dicts with ``id``, ``name``, ``desc`` and ``related``
	            (child ids); dissolved/hidden nodes are dropped first.
	        color: value passed as the marker colorscale.
	        max_depth: number of levels drawn at once (Plotly ``maxdepth``).

	    Returns:
	        A ``go.Figure`` holding one Treemap trace.
	    """
	    nodes = _filter_dissolved(nodes)
	    pm = _parent_map(nodes)
	    # Non-root nodes without a parent (e.g. children of a filtered-out node)
	    # are drawn under node 0 below. Register them as its children so node 0's
	    # value includes them: with branchvalues="total" a parent's value must
	    # cover the sum of its children or the trace is not rendered.
	    for n in nodes:
	        nid = int(n["id"])
	        if nid != 0:
	            pm.setdefault(nid, 0)
	    vm = _tree_value_map(nodes, pm)
	    ids, labels, parents, values, hover = [], [], [], [], []
	    for n in nodes:
	        nid = int(n["id"])
	        lc = len(_leaf_ids(nodes, nid))
	        ids.append(str(nid))
	        labels.append(str(n.get("name", ""))[:40])  # keep tile labels short
	        parents.append("" if nid == 0 else str(pm.get(nid, 0)))
	        values.append(vm.get(nid, 1))
	        hover.append(f"<b>{n.get('name', '')}</b><br>Variables: {lc}<br>"
	                     f"{_wrap_hover(n.get('desc', ''))}")
	    # NOTE(review): Plotly documents marker.colorscale as taking effect only
	    # when marker.colors is a numeric array — confirm the palette is applied.
	    fig = go.Figure(go.Treemap(
	        ids=ids, labels=labels, parents=parents, values=values,
	        branchvalues="total", hovertext=hover, hoverinfo="text",
	        textinfo="label+value", maxdepth=max_depth,
	        marker=dict(colorscale=color, line=dict(width=1, color="white"))))
	    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
	    return fig

	# ─────────────────────────────────────────────────────────────────────────────
	# NODE-LINK TREE (Reingold-Tilford layout — copied from approach_2.py)
	# ─────────────────────────────────────────────────────────────────────────────
	def _node_color(n: dict) -> str:
	t = n.get("type", "")
	if t == "root": return "#c44e52"
	if t == "attribute": return "#4C72B0"
	if t == "collapsed": return "#bbbbbb"
	return "#8C8C8C"

	def _display_graph(nodes: list, max_depth: int, show_hidden: bool):
	m = {int(n["id"]): n for n in nodes}
	dnodes: dict = {}
	edges: list = []
	counter = 10 ** 9

	def rec(nid, depth):
	nonlocal counter
	n = m.get(int(nid))
	if not n:
	return
	if not show_hidden and n.get("isShown") is False and depth > 0:
	return
	dnodes[int(nid)] = n
	if depth >= max_depth and n.get("related"):
	counter += 1
	cid = counter
	n_leaves = len(_leaf_ids(nodes, nid))
	dnodes[cid] = {"id": cid, "name": f"… {n_leaves} variables",
	"type": "collapsed", "related": [],
	"desc": f"Collapsed: {n.get('name')}"}
	edges.append((int(nid), cid))
	return
	for c in n.get("related", []):
	ch = m.get(int(c))
	if not ch:
	continue
	if not show_hidden and ch.get("isShown") is False:
	continue
	edges.append((int(nid), int(c)))
	rec(int(c), depth + 1)

	rec(0, 0)
	return list(dnodes.values()), edges

	def _positions(edges: list):
	H_SCALE, V_SPACE = 3.0, 1.8
	children: dict = defaultdict(list)
	for p, c in edges:
	children[p].append(c)
	pos: dict = {}
	counter = {"v": 0}

	def rec(nid, depth):
	ch = children.get(nid, [])
	if not ch:
	y_pos = counter["v"] * V_SPACE
	counter["v"] += 1
	pos[nid] = (depth * H_SCALE, y_pos)
	return y_pos
	child_ys = [rec(c, depth + 1) for c in ch]
	y_pos = float(np.mean(child_ys))
	pos[nid] = (depth * H_SCALE, y_pos)
	return y_pos

	rec(0, 0)
	return pos

	def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_labels: bool):
	    """Build the node-link tree figure.

	    Three scatter traces are produced in this order: elbow-shaped edges,
	    attribute (leaf) markers, and markers for every other node. Leaf text
	    is drawn only when *show_leaf_labels* is set; other nodes are always
	    labelled. The figure height grows with the larger of the two node groups.

	    NOTE(review): dissolved/hidden nodes are filtered out before the display
	    graph is built, so *show_hidden* may have no visible effect — confirm.
	    """
	    nodes = _filter_dissolved(nodes)
	    dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
	    pos = _positions(edges)

	    # Each edge is drawn as horizontal - vertical - horizontal; the None
	    # entry separates one edge from the next.
	    line_x, line_y = [], []
	    for parent_id, child_id in edges:
	        if parent_id in pos and child_id in pos:
	            px, py = pos[parent_id]
	            cx, cy = pos[child_id]
	            mid_x = (px + cx) / 2
	            line_x.extend([px, mid_x, mid_x, cx, None])
	            line_y.extend([py, py, cy, cy, None])
	    edge_trace = go.Scatter(x=line_x, y=line_y, mode="lines",
	                            line=dict(width=1, color="#c8c8c8"),
	                            hoverinfo="skip", showlegend=False)

	    leaf = {"x": [], "y": [], "text": [], "color": [], "hover": []}
	    inner = {"x": [], "y": [], "text": [], "color": [], "hover": []}
	    for node in dnodes:
	        node_id = int(node["id"])
	        if node_id not in pos:
	            continue
	        x, y = pos[node_id]
	        n_vars = len(_leaf_ids(nodes, node_id))
	        label = str(node.get("name", ""))[:32]
	        hover_text = (f"<b>{node.get('name', '')}</b><br>Type: {node.get('type', '')}<br>"
	                      f"Variables: {n_vars}")
	        if node.get("type") == "attribute":
	            group = leaf
	            text = label if show_leaf_labels else ""
	        else:
	            group = inner
	            text = label
	        group["x"].append(x)
	        group["y"].append(y)
	        group["color"].append(_node_color(node))
	        group["text"].append(text)
	        group["hover"].append(hover_text)

	    leaf_trace = go.Scatter(
	        x=leaf["x"], y=leaf["y"],
	        mode="markers+text" if show_leaf_labels else "markers",
	        text=leaf["text"], textposition="middle right", textfont=dict(size=9),
	        marker=dict(size=7, color=leaf["color"], line=dict(width=0.5, color="white")),
	        hovertext=leaf["hover"], hoverinfo="text", showlegend=False)
	    inner_trace = go.Scatter(
	        x=inner["x"], y=inner["y"], mode="markers+text", text=inner["text"],
	        textposition="middle right", textfont=dict(size=10),
	        marker=dict(size=13, color=inner["color"], line=dict(width=1, color="white")),
	        hovertext=inner["hover"], hoverinfo="text", showlegend=False)

	    row_count = max(len(leaf["y"]), len(inner["y"]), 1)
	    fig = go.Figure([edge_trace, leaf_trace, inner_trace])
	    fig.update_layout(
	        height=max(600, row_count * 16),
	        margin=dict(l=10, r=140, t=10, b=10),
	        xaxis=dict(visible=False), yaxis=dict(visible=False),
	        plot_bgcolor="white",
	    )
	    return fig

	# ─────────────────────────────────────────────────────────────────────────────
	# STATS / SAFE RENDERING
	# ─────────────────────────────────────────────────────────────────────────────
	def _tree_depth(nodes: list) -> int:
	    """Return the length of the longest ``related`` chain starting at node 0.

	    Dissolved/hidden nodes are removed first; node 0 counts as depth 0 and
	    child ids that are not present in the node list are ignored.
	    """
	    by_id = {int(n["id"]): n for n in _filter_dissolved(nodes)}

	    def _deepest(node_id, level):
	        below = [
	            _deepest(int(child), level + 1)
	            for child in by_id.get(int(node_id), {}).get("related", [])
	            if int(child) in by_id
	        ]
	        return max([level, *below])

	    return _deepest(0, 0)

	def safe_render_depth(nodes: list, requested: int) -> int:
	    """Cap the initial render depth for large hierarchies.

	    Plotly sunburst/treemap silently blank when asked to draw too many
	    sectors at once (large hierarchies like HCP). With more than 400 visible
	    nodes the depth is capped at 3, with more than 150 at 4; otherwise
	    *requested* is returned unchanged. The chart stays fully drillable by
	    clicking, so no data is lost.
	    """
	    visible = len(_filter_dissolved(nodes))
	    # (node-count threshold, depth cap), largest threshold first
	    for threshold, cap in ((400, 3), (150, 4)):
	        if visible > threshold:
	            return min(requested, cap)
	    return requested

	# ─────────────────────────────────────────────────────────────────────────────
	# IO
	# ─────────────────────────────────────────────────────────────────────────────
	@st.cache_data(show_spinner=False)
	def _load_json(path_str: str):
	    """Read the UTF-8 JSON file at *path_str* and return the parsed value.

	    Wrapped in ``st.cache_data``; the path is taken as a string.
	    """
	    raw_text = Path(path_str).read_text(encoding="utf-8")
	    return json.loads(raw_text)

	def _read_bytes(path_str: str) -> bytes:
	with open(path_str, "rb") as f:
	return f.read()

	@st.cache_data(show_spinner=False)
	def _outputs_zip(root_str: str) -> bytes:
	    """Return a deflate-compressed ZIP of every file under *root_str*.

	    Files are added in sorted path order. Archive names are relative to the
	    parent of the root folder, so they start with the root folder's own name.
	    """
	    import io
	    import zipfile

	    base = Path(root_str)
	    files = [entry for entry in sorted(base.rglob("*")) if entry.is_file()]
	    payload = io.BytesIO()
	    with zipfile.ZipFile(payload, "w", zipfile.ZIP_DEFLATED) as archive:
	        for file_path in files:
	            archive.write(file_path,
	                          arcname=file_path.relative_to(base.parent).as_posix())
	    return payload.getvalue()

	def count_nodes(nodes: list) -> tuple[int, int]:
	    """Count visible nodes by type.

	    Returns:
	        ``(leaves, aggs)`` — the number of ``attribute`` nodes and the number
	        of ``aggregation`` nodes left after dissolved/hidden ones are removed.
	    """
	    leaves = aggs = 0
	    for node in _filter_dissolved(nodes):
	        kind = node.get("type")
	        if kind == "attribute":
	            leaves += 1
	        elif kind == "aggregation":
	            aggs += 1
	    return leaves, aggs

	def concept_aligned_pct(nodes: list) -> float | None:
	    """% of aggregation nodes that carry a concept/provenance label (Approach 1).

	    A node counts as aligned when any of its ``provenance``, ``concept`` or
	    ``source_evidence`` fields is truthy.

	    Returns:
	        The percentage (0-100), or ``None`` when there are no aggregation
	        nodes or none of them is aligned, so callers can skip the figure.
	    """
	    aggs = [n for n in _filter_dissolved(nodes) if n.get("type") == "aggregation"]
	    if not aggs:
	        return None
	    aligned = sum(1 for n in aggs
	                  if n.get("provenance") or n.get("concept") or n.get("source_evidence"))
	    return 100.0 * aligned / len(aggs) if aligned else None

	# ─────────────────────────────────────────────────────────────────────────────
	# SIDEBAR
	# ─────────────────────────────────────────────────────────────────────────────
	# Sidebar: app title plus the two selectors (approach, dataset) that pick
	# which pre-built result the main page shows.
	with st.sidebar:
	    st.title("🌿 Hierarchy Explorer")
	    st.caption("TFM 2026 — Metadata hierarchy construction")
	    st.markdown("---")

	    # Both radios default to their first option (index=0).
	    approach = st.radio("Select Approach",
	                        ["Baseline", "Approach 1", "Approach 2"], index=0)
	    dataset = st.radio("Select Dataset", ["AI-MIND", "HCP"], index=0)

	    st.markdown("---")
	    st.caption("Results are pre-built from the thesis experiments. To run on your "
	               "own data, clone the repository and run the individual apps.")
	    st.markdown("[📦 GitHub Repository]"
	                "(https://github.com/RoophaSharon/tfm_metadata_hierarchy_2026)")

	# ─────────────────────────────────────────────────────────────────────────────
	# MAIN
	# ─────────────────────────────────────────────────────────────────────────────
	# Rendering options for the selected approach.
	cfg = CONFIG[approach]
	color = cfg["color"]

	st.title(f"📊 {approach} — {dataset} Dataset")
	st.markdown(f"> {APPROACH_DESC[approach]}")

	# Locate the pre-built hierarchy; report an error and stop the script run
	# when the file is missing.
	paths = PREBUILT[approach][dataset]
	hier_path = paths.get("hierarchy")
	if hier_path is None or not hier_path.exists():
	    st.error(f"Pre-built result not found: `{hier_path}`")
	    st.stop()

	raw_nodes = _load_json(str(hier_path))

	# Headline metrics; "Total Nodes" is the sum of the two counted types.
	leaves, aggs = count_nodes(raw_nodes)
	c1, c2, c3 = st.columns(3)
	c1.metric("Leaf Variables", leaves)
	c2.metric("Aggregation Nodes", aggs)
	c3.metric("Total Nodes", leaves + aggs)

	# ── Build summary (collapsed) ────────────────────────────────────────────────
	# The facet count is best-effort: any failure while reading the facets file
	# leaves it unset and the summary shows a dash instead.
	facet_path = paths.get("facets")
	n_facets = None
	if facet_path is not None and facet_path.exists():
	    try:
	        n_facets = len(_load_json(str(facet_path)))
	    except Exception:
	        n_facets = None

	with st.expander("ℹ️ Build summary", expanded=False):
	    bs1, bs2, bs3, bs4 = st.columns(4)
	    for column, label, value in (
	        (bs1, "Variables", leaves),
	        (bs2, "Internal nodes", aggs),
	        (bs3, "Tree depth", _tree_depth(raw_nodes)),
	        (bs4, "Facets", "—" if n_facets is None else n_facets),
	    ):
	        column.metric(label, value)
	    pct = concept_aligned_pct(raw_nodes)
	    if pct is not None:
	        st.caption(f"Concept-aligned aggregation nodes: {pct:.1f}%")
	    st.caption(
	        f"Source file: `{hier_path.name}` · "
	        f"Approach: {approach} · Dataset: {dataset}. "
	        "Tree topology and labels are reproduced exactly from the pre-built "
	        "thesis output (the algorithms are not re-run in this viewer)."
	    )

	# ── Downloads ────────────────────────────────────────────────────────────────
	# Three buttons side by side: the hierarchy JSON, the facets JSON (a
	# disabled placeholder when the selection has no facets file) and a ZIP of
	# the whole outputs folder.
	d1, d2, d3 = st.columns(3)
	with d1:
	    st.download_button("⬇️ Hierarchy JSON", data=_read_bytes(str(hier_path)),
	                       file_name=hier_path.name, mime="application/json",
	                       use_container_width=True)
	with d2:
	    if facet_path is not None and facet_path.exists():
	        st.download_button("⬇️ Facets JSON", data=_read_bytes(str(facet_path)),
	                           file_name=facet_path.name, mime="application/json",
	                           use_container_width=True)
	    else:
	        st.button("⬇️ Facets JSON", disabled=True, use_container_width=True,
	                  help="This approach/dataset has no facet tree.")
	with d3:
	    st.download_button("⬇️ All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
	                       file_name="metadata_hierarchy_outputs.zip",
	                       mime="application/zip", use_container_width=True)

	st.markdown("---")

	st.markdown("---")

	# ── Level-of-Detail controls (above chart — matches the apps) ────────────────
	# The node-link view and the fifth "Compress chains" column are offered
	# only when the selected approach's config enables them.
	view_options = ["Sunburst (drill-down)", "Treemap"] + (
	    ["Node-link tree"] if cfg["node_link"] else []
	)

	_control_cols = st.columns([2.4, 2, 1, 1] + ([1.2] if cfg["compress"] else []))
	vc1, vc2, vc3, vc4 = _control_cols[:4]
	vc5 = _control_cols[4] if cfg["compress"] else None

	with vc1:
	    viz_mode = st.radio("View mode", view_options, horizontal=True, index=0,
	                        help="Sunburst best for large hierarchies [Taxonomizer]. "
	                             "Node-link best for moderate-depth structure inspection.")
	with vc2:
	    depth = st.slider("Depth (Level of Detail)", 1, 8, DEFAULT_DEPTH, 1)
	with vc3:
	    show_leaf_labels = st.checkbox("Leaf labels", value=False)
	with vc4:
	    show_hidden = st.checkbox("Hidden nodes", value=False)

	# Without the fifth column, chain compression stays off.
	compress_chains = False
	if vc5 is not None:
	    with vc5:
	        compress_chains = st.checkbox("Compress chains", value=True,
	                                      help="Merge one-child aggregation chains "
	                                           '(e.g. "DMS → DMS Recommended Standard") for '
	                                           "display. Export JSON keeps original structure.")

	st.divider()

	# Nodes actually drawn: chain-compressed on request, otherwise as loaded.
	display_nodes = raw_nodes
	if compress_chains:
	    display_nodes = compress_one_child_chains(raw_nodes)

	# Sunburst and treemap share the depth cap; only the drill hint and the
	# plotting function differ. Any other view mode is the node-link tree.
	_capped_views = {
	    "Sunburst (drill-down)": ("Click any sector to drill deeper.", plot_sunburst),
	    "Treemap": ("Click a tile to drill deeper.", plot_treemap),
	}
	if viz_mode in _capped_views:
	    _drill_hint, _plotter = _capped_views[viz_mode]
	    eff = safe_render_depth(display_nodes, depth)
	    if eff < depth:
	        st.caption(f"Large hierarchy — showing {eff} levels initially to render "
	                   f"reliably. {_drill_hint}")
	    st.plotly_chart(_plotter(display_nodes, color, eff), use_container_width=True)
	else:
	    st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
	                    use_container_width=True)

	# ── Facets (Approach 1 only) ─────────────────────────────────────────────────
	# Shown whenever the selection has a facets file. The file is read as a
	# mapping of facet name -> node list; the chosen facet is drawn as a
	# sunburst and as a treemap, both following the Depth slider.
	if facet_path is not None and facet_path.exists():
	    st.markdown("---")
	    st.subheader("🔀 Parallel facets")
	    facets = _load_json(str(facet_path))
	    names = list(facets.keys())
	    if not names:
	        st.info("No facets available for this dataset.")
	    else:
	        sel = st.selectbox("Select facet", names)
	        fnodes = facets[sel]
	        ft1, ft2 = st.tabs(["Sunburst", "Treemap"])
	        with ft1:
	            st.plotly_chart(plot_sunburst(fnodes, color, depth), use_container_width=True)
	        with ft2:
	            # Pass the slider depth here too, so both tabs show the same levels.
	            st.plotly_chart(plot_treemap(fnodes, color, depth), use_container_width=True)