RoophaSharon committed on
Commit ·
ef5da89
1
Parent(s): 2b56f2e
Navigation router (branding + Demo View + collapsible Build hierarchy); full-range LoD slider (1-9, default 7); replace deprecated use_container_width
Browse files- demo.py +26 -560
- requirements.txt +1 -1
- pages/2_Approach_1.py → views/run_approach_1.py +22 -22
- pages/3_Approach_2.py → views/run_approach_2.py +16 -17
- pages/1_Baseline.py → views/run_baseline.py +10 -10
- views/viewer.py +562 -0
demo.py
CHANGED
|
@@ -1,580 +1,46 @@
|
|
| 1 |
"""
|
| 2 |
Metadata Hierarchy Explorer — TFM 2026
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
match the controls in the individual apps.
|
| 12 |
"""
|
| 13 |
-
from __future__ import annotations
|
| 14 |
-
import json
|
| 15 |
-
from collections import defaultdict
|
| 16 |
-
from pathlib import Path
|
| 17 |
-
|
| 18 |
-
import numpy as np
|
| 19 |
-
import plotly.graph_objects as go
|
| 20 |
import streamlit as st
|
| 21 |
|
| 22 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 23 |
-
# PAGE CONFIG
|
| 24 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
st.set_page_config(
|
| 26 |
page_title="Metadata Hierarchy Explorer",
|
| 27 |
page_icon="πΏ",
|
| 28 |
layout="wide",
|
| 29 |
)
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
# PRE-BUILT OUTPUT PATHS
|
| 37 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 38 |
-
PREBUILT = {
|
| 39 |
-
"Baseline": {
|
| 40 |
-
"AI-MIND": {"hierarchy": ROOT / "baseline" / "ai-mind-variable-descriptions_in__baseline_hierarchy.json"},
|
| 41 |
-
"HCP": {"hierarchy": ROOT / "baseline" / "HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json"},
|
| 42 |
-
},
|
| 43 |
-
"Approach 1": {
|
| 44 |
-
"AI-MIND": {
|
| 45 |
-
"hierarchy": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_hierarchy.json",
|
| 46 |
-
"facets": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
|
| 47 |
-
},
|
| 48 |
-
"HCP": {
|
| 49 |
-
"hierarchy": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
|
| 50 |
-
"facets": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
|
| 51 |
-
},
|
| 52 |
-
},
|
| 53 |
-
"Approach 2": {
|
| 54 |
-
"AI-MIND": {"hierarchy": ROOT / "approach_2" / "ai-mind-variable-descriptions_in__approach2_lod.json"},
|
| 55 |
-
"HCP": {"hierarchy": ROOT / "approach_2" / "HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json"},
|
| 56 |
-
},
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
# Per-approach rendering config (matches each source app)
|
| 60 |
-
CONFIG = {
|
| 61 |
-
"Baseline": {"color": "Greens", "compress": False, "node_link": False},
|
| 62 |
-
"Approach 1": {"color": "Blues", "compress": False, "node_link": True},
|
| 63 |
-
"Approach 2": {"color": "Viridis", "compress": True, "node_link": True},
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
APPROACH_DESC = {
|
| 67 |
-
"Baseline": (
|
| 68 |
-
"Pure clustering baseline β TF-IDF representation + recursive agglomerative "
|
| 69 |
-
"(cosine) clustering, number of clusters chosen by silhouette. No external APIs, "
|
| 70 |
-
"no neural embeddings. Node labels are the most discriminative terms per cluster."
|
| 71 |
-
),
|
| 72 |
-
"Approach 1": (
|
| 73 |
-
"Global embedding pipeline β SBERT + NΓM concept-table alignment (GonΓ§alves 2019) "
|
| 74 |
-
"+ HiExpan refinement (Shen et al. KDD 2018) + Castanet parallel facets. Optionally "
|
| 75 |
-
"retrieves concept context from Wikidata / Wikipedia / WordNet / BioPortal."
|
| 76 |
-
),
|
| 77 |
-
"Approach 2": (
|
| 78 |
-
"Dataset-constrained multi-aspect hierarchy β group-anchored L1/L2 β phrase-slot "
|
| 79 |
-
"mining β FASTopic semantic aspect discovery (Wu et al. NeurIPS 2024) β GMM/KMeans "
|
| 80 |
-
"clustering β deterministic 5-stage label generation. Optional local-LLM refinement."
|
| 81 |
-
),
|
| 82 |
-
}
|
| 83 |
-
|
| 84 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 85 |
-
# TREE TRANSFORMS (copied from approach_2.py β display-only, exact behaviour)
|
| 86 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββββββ
|
| 87 |
-
def _filter_dissolved(nodes: list) -> list:
|
| 88 |
-
drop_ids = {int(n["id"]) for n in nodes
|
| 89 |
-
if n.get("type") == "dissolved" or n.get("isShown") is False}
|
| 90 |
-
if not drop_ids:
|
| 91 |
-
return nodes
|
| 92 |
-
out = []
|
| 93 |
-
for n in nodes:
|
| 94 |
-
if int(n["id"]) in drop_ids:
|
| 95 |
-
continue
|
| 96 |
-
m = dict(n)
|
| 97 |
-
m["related"] = [int(c) for c in n.get("related", []) if int(c) not in drop_ids]
|
| 98 |
-
out.append(m)
|
| 99 |
-
return out
|
| 100 |
-
|
| 101 |
-
def compress_one_child_chains(nodes: list) -> list:
|
| 102 |
-
"""Collapse chains where an aggregation node has exactly one aggregation child
|
| 103 |
-
(e.g. 'DMS β DMS Recommended Standard' becomes 'DMS / DMS Recommended Standard')."""
|
| 104 |
-
nodes = _filter_dissolved(nodes)
|
| 105 |
-
nm = {int(n["id"]): dict(n) for n in nodes}
|
| 106 |
-
|
| 107 |
-
def _is_chain_link(n):
|
| 108 |
-
if n.get("type") != "aggregation":
|
| 109 |
-
return False
|
| 110 |
-
children = n.get("related", [])
|
| 111 |
-
return (len(children) == 1
|
| 112 |
-
and nm.get(int(children[0]), {}).get("type") == "aggregation")
|
| 113 |
-
|
| 114 |
-
changed = True
|
| 115 |
-
while changed:
|
| 116 |
-
changed = False
|
| 117 |
-
for nid, n in list(nm.items()):
|
| 118 |
-
if _is_chain_link(n):
|
| 119 |
-
child_id = int(n["related"][0])
|
| 120 |
-
child = nm[child_id]
|
| 121 |
-
new_node = dict(child)
|
| 122 |
-
new_node["id"] = nid
|
| 123 |
-
new_node["name"] = f"{n['name']} / {child['name']}"
|
| 124 |
-
new_node["desc"] = f"{n.get('desc', '')} | {child.get('desc', '')}"
|
| 125 |
-
nm[nid] = new_node
|
| 126 |
-
if child_id in nm:
|
| 127 |
-
del nm[child_id]
|
| 128 |
-
for other in nm.values():
|
| 129 |
-
other["related"] = [nid if int(c) == child_id else int(c)
|
| 130 |
-
for c in other.get("related", [])]
|
| 131 |
-
changed = True
|
| 132 |
-
break
|
| 133 |
-
return list(nm.values())
|
| 134 |
-
|
| 135 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 136 |
-
# RENDER HELPERS (DAG-safe value map β copied from approach_2.py)
|
| 137 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 138 |
-
def _leaf_ids(nodes: list, nid: int) -> list:
|
| 139 |
-
m = {int(n["id"]): n for n in nodes}
|
| 140 |
-
out = []
|
| 141 |
-
def rec(x):
|
| 142 |
-
n = m.get(int(x))
|
| 143 |
-
if not n:
|
| 144 |
-
return
|
| 145 |
-
if n.get("type") == "attribute":
|
| 146 |
-
out.append(int(x)); return
|
| 147 |
-
for c in n.get("related", []):
|
| 148 |
-
rec(int(c))
|
| 149 |
-
rec(nid)
|
| 150 |
-
return list(dict.fromkeys(out))
|
| 151 |
-
|
| 152 |
-
def _parent_map(nodes: list) -> dict:
|
| 153 |
-
pm = {}
|
| 154 |
-
for n in nodes:
|
| 155 |
-
for c in n.get("related", []):
|
| 156 |
-
if int(c) not in pm:
|
| 157 |
-
pm[int(c)] = int(n["id"])
|
| 158 |
-
return pm
|
| 159 |
-
|
| 160 |
-
def _tree_value_map(nodes: list, pm: dict) -> dict:
|
| 161 |
-
kids = {}
|
| 162 |
-
for child, par in pm.items():
|
| 163 |
-
kids.setdefault(int(par), []).append(int(child))
|
| 164 |
-
nodemap = {int(n["id"]): n for n in nodes}
|
| 165 |
-
memo = {}
|
| 166 |
-
def count(nid: int) -> int:
|
| 167 |
-
if nid in memo:
|
| 168 |
-
return memo[nid]
|
| 169 |
-
memo[nid] = 1
|
| 170 |
-
n = nodemap.get(nid)
|
| 171 |
-
if n is not None and n.get("type") == "attribute":
|
| 172 |
-
memo[nid] = 1
|
| 173 |
-
return 1
|
| 174 |
-
ch = kids.get(nid, [])
|
| 175 |
-
v = sum(count(c) for c in ch) if ch else 1
|
| 176 |
-
memo[nid] = max(1, v)
|
| 177 |
-
return memo[nid]
|
| 178 |
-
return {nid: count(nid) for nid in nodemap}
|
| 179 |
-
|
| 180 |
-
def _wrap_hover(text: str, width: int = 80) -> str:
|
| 181 |
-
import textwrap as _tw
|
| 182 |
-
s = str(text or "")
|
| 183 |
-
if not s:
|
| 184 |
-
return ""
|
| 185 |
-
lines = []
|
| 186 |
-
for raw_line in s.split("\n"):
|
| 187 |
-
lines.extend(_tw.wrap(raw_line, width=width) or [""])
|
| 188 |
-
return "<br>".join(lines)
|
| 189 |
-
|
| 190 |
-
def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
|
| 191 |
-
nodes = _filter_dissolved(nodes)
|
| 192 |
-
pm = _parent_map(nodes)
|
| 193 |
-
vm = _tree_value_map(nodes, pm)
|
| 194 |
-
ids, labels, parents, values, hover = [], [], [], [], []
|
| 195 |
-
for n in nodes:
|
| 196 |
-
nid = int(n["id"])
|
| 197 |
-
lc = len(_leaf_ids(nodes, nid))
|
| 198 |
-
ids.append(str(nid))
|
| 199 |
-
labels.append(str(n.get("name", ""))[:40])
|
| 200 |
-
parents.append("" if nid == 0 else str(pm.get(nid, 0)))
|
| 201 |
-
values.append(vm.get(nid, 1))
|
| 202 |
-
hover.append(f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
|
| 203 |
-
f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc', ''))}")
|
| 204 |
-
fig = go.Figure(go.Sunburst(
|
| 205 |
-
ids=ids, labels=labels, parents=parents, values=values,
|
| 206 |
-
branchvalues="total", hovertext=hover, hoverinfo="text",
|
| 207 |
-
maxdepth=max_depth, insidetextorientation="radial",
|
| 208 |
-
marker=dict(colorscale=color, line=dict(width=1, color="white"))))
|
| 209 |
-
fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
|
| 210 |
-
title=dict(text="Click sector to drill down β click centre to go back",
|
| 211 |
-
font=dict(size=13), x=0.5))
|
| 212 |
-
return fig
|
| 213 |
-
|
| 214 |
-
def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
|
| 215 |
-
nodes = _filter_dissolved(nodes)
|
| 216 |
-
pm = _parent_map(nodes)
|
| 217 |
-
vm = _tree_value_map(nodes, pm)
|
| 218 |
-
ids, labels, parents, values, hover = [], [], [], [], []
|
| 219 |
-
for n in nodes:
|
| 220 |
-
nid = int(n["id"])
|
| 221 |
-
lc = len(_leaf_ids(nodes, nid))
|
| 222 |
-
ids.append(str(nid))
|
| 223 |
-
labels.append(str(n.get("name", ""))[:40])
|
| 224 |
-
parents.append("" if nid == 0 else str(pm.get(nid, 0)))
|
| 225 |
-
values.append(vm.get(nid, 1))
|
| 226 |
-
hover.append(f"<b>{n.get('name', '')}</b><br>Variables: {lc}<br>"
|
| 227 |
-
f"{_wrap_hover(n.get('desc', ''))}")
|
| 228 |
-
fig = go.Figure(go.Treemap(
|
| 229 |
-
ids=ids, labels=labels, parents=parents, values=values,
|
| 230 |
-
branchvalues="total", hovertext=hover, hoverinfo="text",
|
| 231 |
-
textinfo="label+value", maxdepth=max_depth,
|
| 232 |
-
marker=dict(colorscale=color, line=dict(width=1, color="white"))))
|
| 233 |
-
fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
|
| 234 |
-
return fig
|
| 235 |
-
|
| 236 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 237 |
-
# NODE-LINK TREE (Reingold-Tilford layout β copied from approach_2.py)
|
| 238 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 239 |
-
def _node_color(n: dict) -> str:
|
| 240 |
-
t = n.get("type", "")
|
| 241 |
-
if t == "root": return "#c44e52"
|
| 242 |
-
if t == "attribute": return "#4C72B0"
|
| 243 |
-
if t == "collapsed": return "#bbbbbb"
|
| 244 |
-
return "#8C8C8C"
|
| 245 |
-
|
| 246 |
-
def _display_graph(nodes: list, max_depth: int, show_hidden: bool):
|
| 247 |
-
m = {int(n["id"]): n for n in nodes}
|
| 248 |
-
dnodes: dict = {}
|
| 249 |
-
edges: list = []
|
| 250 |
-
counter = 10 ** 9
|
| 251 |
-
|
| 252 |
-
def rec(nid, depth):
|
| 253 |
-
nonlocal counter
|
| 254 |
-
n = m.get(int(nid))
|
| 255 |
-
if not n:
|
| 256 |
-
return
|
| 257 |
-
if not show_hidden and n.get("isShown") is False and depth > 0:
|
| 258 |
-
return
|
| 259 |
-
dnodes[int(nid)] = n
|
| 260 |
-
if depth >= max_depth and n.get("related"):
|
| 261 |
-
counter += 1
|
| 262 |
-
cid = counter
|
| 263 |
-
n_leaves = len(_leaf_ids(nodes, nid))
|
| 264 |
-
dnodes[cid] = {"id": cid, "name": f"β¦ {n_leaves} variables",
|
| 265 |
-
"type": "collapsed", "related": [],
|
| 266 |
-
"desc": f"Collapsed: {n.get('name')}"}
|
| 267 |
-
edges.append((int(nid), cid))
|
| 268 |
-
return
|
| 269 |
-
for c in n.get("related", []):
|
| 270 |
-
ch = m.get(int(c))
|
| 271 |
-
if not ch:
|
| 272 |
-
continue
|
| 273 |
-
if not show_hidden and ch.get("isShown") is False:
|
| 274 |
-
continue
|
| 275 |
-
edges.append((int(nid), int(c)))
|
| 276 |
-
rec(int(c), depth + 1)
|
| 277 |
-
|
| 278 |
-
rec(0, 0)
|
| 279 |
-
return list(dnodes.values()), edges
|
| 280 |
-
|
| 281 |
-
def _positions(edges: list):
|
| 282 |
-
H_SCALE, V_SPACE = 3.0, 1.8
|
| 283 |
-
children: dict = defaultdict(list)
|
| 284 |
-
for p, c in edges:
|
| 285 |
-
children[p].append(c)
|
| 286 |
-
pos: dict = {}
|
| 287 |
-
counter = {"v": 0}
|
| 288 |
-
|
| 289 |
-
def rec(nid, depth):
|
| 290 |
-
ch = children.get(nid, [])
|
| 291 |
-
if not ch:
|
| 292 |
-
y_pos = counter["v"] * V_SPACE
|
| 293 |
-
counter["v"] += 1
|
| 294 |
-
pos[nid] = (depth * H_SCALE, y_pos)
|
| 295 |
-
return y_pos
|
| 296 |
-
child_ys = [rec(c, depth + 1) for c in ch]
|
| 297 |
-
y_pos = float(np.mean(child_ys))
|
| 298 |
-
pos[nid] = (depth * H_SCALE, y_pos)
|
| 299 |
-
return y_pos
|
| 300 |
-
|
| 301 |
-
rec(0, 0)
|
| 302 |
-
return pos
|
| 303 |
-
|
| 304 |
-
def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_labels: bool):
|
| 305 |
-
nodes = _filter_dissolved(nodes)
|
| 306 |
-
dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
|
| 307 |
-
pos = _positions(edges)
|
| 308 |
-
|
| 309 |
-
ex, ey = [], []
|
| 310 |
-
for p, c in edges:
|
| 311 |
-
if p not in pos or c not in pos:
|
| 312 |
-
continue
|
| 313 |
-
x0, y0 = pos[p]
|
| 314 |
-
x1, y1 = pos[c]
|
| 315 |
-
xm = (x0 + x1) / 2
|
| 316 |
-
ex += [x0, xm, xm, x1, None]
|
| 317 |
-
ey += [y0, y0, y1, y1, None]
|
| 318 |
-
traces = [go.Scatter(x=ex, y=ey, mode="lines",
|
| 319 |
-
line=dict(width=1, color="#c8c8c8"),
|
| 320 |
-
hoverinfo="skip", showlegend=False)]
|
| 321 |
-
|
| 322 |
-
agg_x, agg_y, agg_lab, agg_col, agg_hov = [], [], [], [], []
|
| 323 |
-
lf_x, lf_y, lf_lab, lf_col, lf_hov = [], [], [], [], []
|
| 324 |
-
for n in dnodes:
|
| 325 |
-
nid = int(n["id"])
|
| 326 |
-
if nid not in pos:
|
| 327 |
-
continue
|
| 328 |
-
x, y = pos[nid]
|
| 329 |
-
lc = len(_leaf_ids(nodes, nid))
|
| 330 |
-
lab = str(n.get("name", ""))[:32]
|
| 331 |
-
hov = (f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
|
| 332 |
-
f"Variables: {lc}")
|
| 333 |
-
if n.get("type") == "attribute":
|
| 334 |
-
lf_x.append(x); lf_y.append(y); lf_col.append(_node_color(n))
|
| 335 |
-
lf_lab.append(lab if show_leaf_labels else "")
|
| 336 |
-
lf_hov.append(hov)
|
| 337 |
-
else:
|
| 338 |
-
agg_x.append(x); agg_y.append(y); agg_col.append(_node_color(n))
|
| 339 |
-
agg_lab.append(lab); agg_hov.append(hov)
|
| 340 |
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
text=lf_lab, textposition="middle right", textfont=dict(size=9),
|
| 344 |
-
marker=dict(size=7, color=lf_col, line=dict(width=0.5, color="white")),
|
| 345 |
-
hovertext=lf_hov, hoverinfo="text", showlegend=False))
|
| 346 |
-
traces.append(go.Scatter(
|
| 347 |
-
x=agg_x, y=agg_y, mode="markers+text", text=agg_lab,
|
| 348 |
-
textposition="middle right", textfont=dict(size=10),
|
| 349 |
-
marker=dict(size=13, color=agg_col, line=dict(width=1, color="white")),
|
| 350 |
-
hovertext=agg_hov, hoverinfo="text", showlegend=False))
|
| 351 |
|
| 352 |
-
|
| 353 |
-
fig = go.Figure(traces)
|
| 354 |
-
fig.update_layout(
|
| 355 |
-
height=max(600, n_rows * 16),
|
| 356 |
-
margin=dict(l=10, r=140, t=10, b=10),
|
| 357 |
-
xaxis=dict(visible=False), yaxis=dict(visible=False),
|
| 358 |
-
plot_bgcolor="white",
|
| 359 |
-
)
|
| 360 |
-
return fig
|
| 361 |
-
|
| 362 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 363 |
-
# STATS / SAFE RENDERING
|
| 364 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 365 |
-
def _tree_depth(nodes: list) -> int:
|
| 366 |
-
"""Max depth of the rendered single-parent tree (root = depth 0)."""
|
| 367 |
-
nodes = _filter_dissolved(nodes)
|
| 368 |
-
m = {int(n["id"]): n for n in nodes}
|
| 369 |
-
best = {"d": 0}
|
| 370 |
-
def rec(nid, d):
|
| 371 |
-
best["d"] = max(best["d"], d)
|
| 372 |
-
for c in m.get(int(nid), {}).get("related", []):
|
| 373 |
-
if int(c) in m:
|
| 374 |
-
rec(int(c), d + 1)
|
| 375 |
-
rec(0, 0)
|
| 376 |
-
return best["d"]
|
| 377 |
-
|
| 378 |
-
def safe_render_depth(nodes: list, requested: int) -> int:
|
| 379 |
-
"""Plotly sunburst/treemap silently blank when asked to draw too many sectors
|
| 380 |
-
at once (large hierarchies like HCP). Cap the *initial* render depth β the
|
| 381 |
-
chart stays fully drillable by clicking, so no data is lost."""
|
| 382 |
-
n = len(_filter_dissolved(nodes))
|
| 383 |
-
if n > 400:
|
| 384 |
-
return min(requested, 3)
|
| 385 |
-
if n > 150:
|
| 386 |
-
return min(requested, 4)
|
| 387 |
-
return requested
|
| 388 |
-
|
| 389 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 390 |
-
# IO
|
| 391 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 392 |
-
@st.cache_data(show_spinner=False)
|
| 393 |
-
def _load_json(path_str: str):
|
| 394 |
-
with open(path_str, encoding="utf-8") as f:
|
| 395 |
-
return json.load(f)
|
| 396 |
-
|
| 397 |
-
def _read_bytes(path_str: str) -> bytes:
|
| 398 |
-
with open(path_str, "rb") as f:
|
| 399 |
-
return f.read()
|
| 400 |
-
|
| 401 |
-
@st.cache_data(show_spinner=False)
|
| 402 |
-
def _outputs_zip(root_str: str) -> bytes:
|
| 403 |
-
"""Zip the entire bundled outputs/ folder for one-click download."""
|
| 404 |
-
import io, zipfile
|
| 405 |
-
root = Path(root_str)
|
| 406 |
-
buf = io.BytesIO()
|
| 407 |
-
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
|
| 408 |
-
for p in sorted(root.rglob("*")):
|
| 409 |
-
if p.is_file():
|
| 410 |
-
zf.write(p, arcname=p.relative_to(root.parent).as_posix())
|
| 411 |
-
return buf.getvalue()
|
| 412 |
-
|
| 413 |
-
def count_nodes(nodes: list) -> tuple[int, int]:
|
| 414 |
-
nodes = _filter_dissolved(nodes)
|
| 415 |
-
leaves = sum(1 for n in nodes if n.get("type") == "attribute")
|
| 416 |
-
aggs = sum(1 for n in nodes if n.get("type") == "aggregation")
|
| 417 |
-
return leaves, aggs
|
| 418 |
-
|
| 419 |
-
def concept_aligned_pct(nodes: list) -> float | None:
|
| 420 |
-
"""% of aggregation nodes that carry a concept/provenance label (Approach 1)."""
|
| 421 |
-
aggs = [n for n in _filter_dissolved(nodes) if n.get("type") == "aggregation"]
|
| 422 |
-
if not aggs:
|
| 423 |
-
return None
|
| 424 |
-
aligned = sum(1 for n in aggs
|
| 425 |
-
if n.get("provenance") or n.get("concept") or n.get("source_evidence"))
|
| 426 |
-
return 100.0 * aligned / len(aggs) if aligned else None
|
| 427 |
-
|
| 428 |
-
# ββββββββοΏ½οΏ½ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 429 |
-
# SIDEBAR
|
| 430 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 431 |
with st.sidebar:
|
| 432 |
st.title("πΏ Hierarchy Explorer")
|
| 433 |
st.caption("TFM 2026 β Metadata hierarchy construction")
|
| 434 |
st.markdown("---")
|
|
|
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
dataset = st.radio("**Select Dataset**", ["AI-MIND", "HCP"], index=0)
|
| 439 |
-
|
| 440 |
-
st.markdown("---")
|
| 441 |
-
st.caption("Results are pre-built from the thesis experiments. To run on your "
|
| 442 |
-
"own data, clone the repository and run the individual apps.")
|
| 443 |
-
st.markdown("[π¦ GitHub Repository]"
|
| 444 |
-
"(https://github.com/RoophaSharon/tfm_metadata_hierarchy_2026)")
|
| 445 |
-
|
| 446 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 447 |
-
# MAIN
|
| 448 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 449 |
-
cfg = CONFIG[approach]
|
| 450 |
-
color = cfg["color"]
|
| 451 |
-
|
| 452 |
-
st.title(f"π {approach} β {dataset} Dataset")
|
| 453 |
-
st.markdown(f"> {APPROACH_DESC[approach]}")
|
| 454 |
-
|
| 455 |
-
paths = PREBUILT[approach][dataset]
|
| 456 |
-
hier_path = paths.get("hierarchy")
|
| 457 |
-
if hier_path is None or not hier_path.exists():
|
| 458 |
-
st.error(f"Pre-built result not found: `{hier_path}`")
|
| 459 |
-
st.stop()
|
| 460 |
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
leaves, aggs = count_nodes(raw_nodes)
|
| 464 |
-
c1, c2, c3 = st.columns(3)
|
| 465 |
-
c1.metric("Leaf Variables", leaves)
|
| 466 |
-
c2.metric("Aggregation Nodes", aggs)
|
| 467 |
-
c3.metric("Total Nodes", leaves + aggs)
|
| 468 |
-
|
| 469 |
-
# ββ Build summary (collapsed) ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 470 |
-
facet_path = paths.get("facets")
|
| 471 |
-
n_facets = None
|
| 472 |
-
if facet_path is not None and facet_path.exists():
|
| 473 |
-
try:
|
| 474 |
-
n_facets = len(_load_json(str(facet_path)))
|
| 475 |
-
except Exception:
|
| 476 |
-
n_facets = None
|
| 477 |
-
|
| 478 |
-
with st.expander("βΉοΈ Build summary", expanded=False):
|
| 479 |
-
bs1, bs2, bs3, bs4 = st.columns(4)
|
| 480 |
-
bs1.metric("Variables", leaves)
|
| 481 |
-
bs2.metric("Internal nodes", aggs)
|
| 482 |
-
bs3.metric("Tree depth", _tree_depth(raw_nodes))
|
| 483 |
-
bs4.metric("Facets", n_facets if n_facets is not None else "β")
|
| 484 |
-
pct = concept_aligned_pct(raw_nodes)
|
| 485 |
-
if pct is not None:
|
| 486 |
-
st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
|
| 487 |
-
st.caption(
|
| 488 |
-
f"Source file: `{hier_path.name}` Β· "
|
| 489 |
-
f"Approach: **{approach}** Β· Dataset: **{dataset}**. "
|
| 490 |
-
"Tree topology and labels are reproduced exactly from the pre-built "
|
| 491 |
-
"thesis output (the algorithms are not re-run in this viewer)."
|
| 492 |
-
)
|
| 493 |
-
|
| 494 |
-
# ββ Downloads ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 495 |
-
d1, d2, d3 = st.columns(3)
|
| 496 |
-
with d1:
|
| 497 |
-
st.download_button("β¬οΈ Hierarchy JSON", data=_read_bytes(str(hier_path)),
|
| 498 |
-
file_name=hier_path.name, mime="application/json",
|
| 499 |
-
use_container_width=True)
|
| 500 |
-
with d2:
|
| 501 |
-
if facet_path is not None and facet_path.exists():
|
| 502 |
-
st.download_button("β¬οΈ Facets JSON", data=_read_bytes(str(facet_path)),
|
| 503 |
-
file_name=facet_path.name, mime="application/json",
|
| 504 |
-
use_container_width=True)
|
| 505 |
-
else:
|
| 506 |
-
st.button("β¬οΈ Facets JSON", disabled=True, use_container_width=True,
|
| 507 |
-
help="This approach/dataset has no facet tree.")
|
| 508 |
-
with d3:
|
| 509 |
-
st.download_button("β¬οΈ All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
|
| 510 |
-
file_name="metadata_hierarchy_outputs.zip",
|
| 511 |
-
mime="application/zip", use_container_width=True)
|
| 512 |
-
|
| 513 |
-
st.markdown("---")
|
| 514 |
-
|
| 515 |
-
# ββ Level-of-Detail controls (above chart β matches the apps) ββββββββββββββββ
|
| 516 |
-
view_options = ["Sunburst (drill-down)", "Treemap"]
|
| 517 |
-
if cfg["node_link"]:
|
| 518 |
-
view_options.append("Node-link tree")
|
| 519 |
-
|
| 520 |
-
if cfg["compress"]:
|
| 521 |
-
vc1, vc2, vc3, vc4, vc5 = st.columns([2.4, 2, 1, 1, 1.2])
|
| 522 |
-
else:
|
| 523 |
-
vc1, vc2, vc3, vc4 = st.columns([2.4, 2, 1, 1])
|
| 524 |
-
vc5 = None
|
| 525 |
-
|
| 526 |
-
with vc1:
|
| 527 |
-
viz_mode = st.radio("View mode", view_options, horizontal=True, index=0,
|
| 528 |
-
help="Sunburst best for large hierarchies [Taxonomizer]. "
|
| 529 |
-
"Node-link best for moderate-depth structure inspection.")
|
| 530 |
-
with vc2:
|
| 531 |
-
depth = st.slider("Depth (Level of Detail)", 1, 8, DEFAULT_DEPTH, 1)
|
| 532 |
-
with vc3:
|
| 533 |
-
show_leaf_labels = st.checkbox("Leaf labels", value=False)
|
| 534 |
-
with vc4:
|
| 535 |
-
show_hidden = st.checkbox("Hidden nodes", value=False)
|
| 536 |
-
if vc5 is not None:
|
| 537 |
-
with vc5:
|
| 538 |
-
compress_chains = st.checkbox("Compress chains", value=True,
|
| 539 |
-
help="Merge one-child aggregation chains "
|
| 540 |
-
'(e.g. "DMS β DMS Recommended Standard") for '
|
| 541 |
-
"display. Export JSON keeps original structure.")
|
| 542 |
-
else:
|
| 543 |
-
compress_chains = False
|
| 544 |
-
|
| 545 |
-
st.divider()
|
| 546 |
-
|
| 547 |
-
display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
|
| 548 |
-
|
| 549 |
-
if viz_mode == "Sunburst (drill-down)":
|
| 550 |
-
eff = safe_render_depth(display_nodes, depth)
|
| 551 |
-
if eff < depth:
|
| 552 |
-
st.caption(f"Large hierarchy β showing {eff} levels initially to render "
|
| 553 |
-
"reliably. **Click any sector to drill deeper.**")
|
| 554 |
-
st.plotly_chart(plot_sunburst(display_nodes, color, eff), use_container_width=True)
|
| 555 |
-
elif viz_mode == "Treemap":
|
| 556 |
-
eff = safe_render_depth(display_nodes, depth)
|
| 557 |
-
if eff < depth:
|
| 558 |
-
st.caption(f"Large hierarchy β showing {eff} levels initially to render "
|
| 559 |
-
"reliably. **Click a tile to drill deeper.**")
|
| 560 |
-
st.plotly_chart(plot_treemap(display_nodes, color, eff), use_container_width=True)
|
| 561 |
-
else:
|
| 562 |
-
st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
|
| 563 |
-
use_container_width=True)
|
| 564 |
-
|
| 565 |
-
# ββ Facets (Approach 1 only) βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 566 |
-
if facet_path is not None and facet_path.exists():
|
| 567 |
st.markdown("---")
|
| 568 |
-
st.
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
st.
|
| 573 |
-
else:
|
| 574 |
-
sel = st.selectbox("Select facet", names)
|
| 575 |
-
fnodes = facets[sel]
|
| 576 |
-
ft1, ft2 = st.tabs(["Sunburst", "Treemap"])
|
| 577 |
-
with ft1:
|
| 578 |
-
st.plotly_chart(plot_sunburst(fnodes, color, depth), use_container_width=True)
|
| 579 |
-
with ft2:
|
| 580 |
-
st.plotly_chart(plot_treemap(fnodes, color), use_container_width=True)
|
|
|
|
| 1 |
"""
|
| 2 |
Metadata Hierarchy Explorer — TFM 2026
|
| 3 |
+
Navigation router (Streamlit st.navigation).
|
| 4 |
+
|
| 5 |
+
Sidebar layout:
|
| 6 |
+
🌿 Hierarchy Explorer / TFM 2026 (branding, top)
|
| 7 |
+
π Demo View (pre-built results viewer)
|
| 8 |
+
… the Demo View's own controls … (Select Approach / Dataset, etc.)
|
| 9 |
+
🛠️ Build hierarchy (collapsible) (upload a CSV and run an app)
|
| 10 |
+
• Baseline • Approach 1 • Approach 2
|
|
|
|
| 11 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
import streamlit as st
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
st.set_page_config(
|
| 15 |
page_title="Metadata Hierarchy Explorer",
|
| 16 |
page_icon="πΏ",
|
| 17 |
layout="wide",
|
| 18 |
)
|
| 19 |
|
| 20 |
+
# ββ Pages ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
viewer = st.Page("views/viewer.py", title="Demo View", icon="π", default=True)
|
| 22 |
+
base = st.Page("views/run_baseline.py", title="Baseline", icon="π’")
|
| 23 |
+
appr1 = st.Page("views/run_approach_1.py", title="Approach 1", icon="π³")
|
| 24 |
+
appr2 = st.Page("views/run_approach_2.py", title="Approach 2", icon="π¬")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
# Hidden default nav β we render our own links so we control the order.
|
| 27 |
+
pg = st.navigation([viewer, base, appr1, appr2], position="hidden")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
# ββ Sidebar TOP: branding + Demo View link ββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
with st.sidebar:
|
| 31 |
st.title("πΏ Hierarchy Explorer")
|
| 32 |
st.caption("TFM 2026 β Metadata hierarchy construction")
|
| 33 |
st.markdown("---")
|
| 34 |
+
st.page_link(viewer, label="Demo View", icon="π")
|
| 35 |
|
| 36 |
+
# ββ The selected page renders here (its own sidebar controls included) βββββββ
|
| 37 |
+
pg.run()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
# ββ Sidebar BOTTOM: collapsible "Build hierarchy" group βββββββββββββββββββββ
|
| 40 |
+
with st.sidebar:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
st.markdown("---")
|
| 42 |
+
with st.expander("π οΈ Build hierarchy", expanded=False):
|
| 43 |
+
st.caption("Upload your own CSV and run an algorithm live.")
|
| 44 |
+
st.page_link(base, label="Baseline", icon="π’")
|
| 45 |
+
st.page_link(appr1, label="Approach 1", icon="π³")
|
| 46 |
+
st.page_link(appr2, label="Approach 2", icon="π¬")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
ο»Ώstreamlit>=1.
|
| 2 |
pandas>=2.0
|
| 3 |
numpy>=1.24
|
| 4 |
scikit-learn>=1.3
|
|
|
|
| 1 |
+
streamlit>=1.43
|
| 2 |
pandas>=2.0
|
| 3 |
numpy>=1.24
|
| 4 |
scikit-learn>=1.3
|
pages/2_Approach_1.py → views/run_approach_1.py
RENAMED
|
@@ -57,7 +57,7 @@ except Exception:
|
|
| 57 |
|
| 58 |
warnings.filterwarnings('ignore')
|
| 59 |
|
| 60 |
-
|
| 61 |
st.title('Metadata Hierarchy Builder β Approach 1')
|
| 62 |
st.caption(
|
| 63 |
'Automatic concept-label extraction from metadata text + HiExpan refinement + Castanet facets. '
|
|
@@ -3876,7 +3876,7 @@ if uploads:
|
|
| 3876 |
if warn:
|
| 3877 |
st.warning('Looked like raw data β columns converted to metadata rows.')
|
| 3878 |
st.write(f'Rows: **{len(df):,}**, Columns: **{len(df.columns)}**')
|
| 3879 |
-
st.dataframe(df.head(10),
|
| 3880 |
except Exception as e:
|
| 3881 |
st.error(f'Failed to load {p.name}: {e}')
|
| 3882 |
|
|
@@ -3902,7 +3902,7 @@ if uploads:
|
|
| 3902 |
key=f'meta_{name}')
|
| 3903 |
prev = list(dict.fromkeys(leaf + group + text + meta))
|
| 3904 |
if prev:
|
| 3905 |
-
st.dataframe(df[prev].head(6),
|
| 3906 |
configs[name] = {'leaf_cols': leaf, 'group_cols': group,
|
| 3907 |
'text_cols': text, 'metadata_cols': meta}
|
| 3908 |
|
|
@@ -4202,12 +4202,12 @@ with tabs[0]:
|
|
| 4202 |
|
| 4203 |
if viz_mode == 'Sunburst (drill-down)':
|
| 4204 |
st.caption('Hover for concept provenance (confidence, source, alternatives). Click to drill down.')
|
| 4205 |
-
st.plotly_chart(plot_sunburst(nodes, depth),
|
| 4206 |
elif viz_mode == 'Treemap':
|
| 4207 |
-
st.plotly_chart(plot_treemap(nodes),
|
| 4208 |
else:
|
| 4209 |
st.plotly_chart(plot_node_link(nodes, depth, show_hidden, show_leaf_labels),
|
| 4210 |
-
|
| 4211 |
pr = path_rows(nodes)
|
| 4212 |
max_d = max((r['depth'] for r in pr), default=0)
|
| 4213 |
c1, c2, c3 = st.columns(3)
|
|
@@ -4225,7 +4225,7 @@ with tabs[0]:
|
|
| 4225 |
exp_rows = [{'Segment': seg, 'Expansion': v['expansion'],
|
| 4226 |
'Evidence': ', '.join(v['evidence'])}
|
| 4227 |
for seg, v in code_exp.items()]
|
| 4228 |
-
st.dataframe(pd.DataFrame(exp_rows),
|
| 4229 |
|
| 4230 |
# Concept label provenance for internal nodes
|
| 4231 |
prov_rows = []
|
|
@@ -4241,7 +4241,7 @@ with tabs[0]:
|
|
| 4241 |
})
|
| 4242 |
if prov_rows:
|
| 4243 |
with st.expander('Concept label provenance for internal nodes', expanded=False):
|
| 4244 |
-
st.dataframe(pd.DataFrame(prov_rows),
|
| 4245 |
|
| 4246 |
# ββ Tab 1: Faceted view βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4247 |
with tabs[1]:
|
|
@@ -4251,11 +4251,11 @@ with tabs[1]:
|
|
| 4251 |
'Concept facet uses automatically assigned labels from embedding alignment.'
|
| 4252 |
)
|
| 4253 |
if facet_trees:
|
| 4254 |
-
st.plotly_chart(plot_facets_parallel(facet_trees),
|
| 4255 |
st.markdown('### Per-facet detail')
|
| 4256 |
sel_facet = st.selectbox('Inspect facet tree', list(facet_trees.keys()))
|
| 4257 |
ft = facet_trees[sel_facet]
|
| 4258 |
-
st.plotly_chart(plot_sunburst(ft, max_depth=3),
|
| 4259 |
n_groups = len([n for n in ft if n.get('type') == 'aggregation'])
|
| 4260 |
st.info(f'Facet **{sel_facet}**: {n_groups} groups, '
|
| 4261 |
f'{len([n for n in ft if n.get("type")=="attribute"])} variables')
|
|
@@ -4273,11 +4273,11 @@ with tabs[2]:
|
|
| 4273 |
st.markdown('### Sibling coherence β before refinement (worst first)')
|
| 4274 |
before = hiexpan_report.get('coherence_before', [])
|
| 4275 |
if before:
|
| 4276 |
-
st.dataframe(pd.DataFrame(before),
|
| 4277 |
st.markdown('### Sibling coherence β after refinement')
|
| 4278 |
after = hiexpan_report.get('coherence_after', [])
|
| 4279 |
if after:
|
| 4280 |
-
st.dataframe(pd.DataFrame(after),
|
| 4281 |
b_mean = np.mean([r['coherence_score'] for r in before]) if before else float('nan')
|
| 4282 |
a_mean = np.mean([r['coherence_score'] for r in after])
|
| 4283 |
st.metric('Mean coherence improvement',
|
|
@@ -4324,7 +4324,7 @@ with tabs[3]:
|
|
| 4324 |
if can is not None:
|
| 4325 |
conflict_df = compute_conflict_table(can, nodes)
|
| 4326 |
if len(conflict_df):
|
| 4327 |
-
st.dataframe(conflict_df,
|
| 4328 |
else:
|
| 4329 |
st.success('No low-confidence placements detected.')
|
| 4330 |
else:
|
|
@@ -4393,7 +4393,7 @@ with tabs[4]:
|
|
| 4393 |
'type': c.get('type'),
|
| 4394 |
'relation': c.get('info', {}).get('relation_label', ''),
|
| 4395 |
'desc': str(c.get('desc', ''))[:120]}
|
| 4396 |
-
for c in cns if c]),
|
| 4397 |
|
| 4398 |
# ββ Tab 5: Search βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4399 |
with tabs[5]:
|
|
@@ -4407,14 +4407,14 @@ with tabs[5]:
|
|
| 4407 |
'relation': n.get('info', {}).get('relation_label', ''),
|
| 4408 |
'n_children': len(n.get('related', [])),
|
| 4409 |
'desc': str(n.get('desc', ''))[:200]})
|
| 4410 |
-
st.dataframe(pd.DataFrame(out_),
|
| 4411 |
|
| 4412 |
# ββ Tab 6: Semantic map βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4413 |
with tabs[6]:
|
| 4414 |
if can is None or len(can) < 3:
|
| 4415 |
st.info('Semantic map available after build.')
|
| 4416 |
else:
|
| 4417 |
-
st.plotly_chart(semantic_map(can),
|
| 4418 |
|
| 4419 |
# ββ Tab 7: Metadata βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4420 |
with tabs[7]:
|
|
@@ -4422,7 +4422,7 @@ with tabs[7]:
|
|
| 4422 |
st.info('Available after build.')
|
| 4423 |
else:
|
| 4424 |
show_cols = [c for c in can.columns if c != '_raw']
|
| 4425 |
-
st.dataframe(can[show_cols],
|
| 4426 |
|
| 4427 |
# ββ Tab 8: Export βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4428 |
with tabs[8]:
|
|
@@ -4438,7 +4438,7 @@ with tabs[8]:
|
|
| 4438 |
data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
|
| 4439 |
file_name=f'{_base}_approach1_hierarchy.json',
|
| 4440 |
mime='application/json',
|
| 4441 |
-
|
| 4442 |
)
|
| 4443 |
with col2:
|
| 4444 |
if facet_trees:
|
|
@@ -4447,7 +4447,7 @@ with tabs[8]:
|
|
| 4447 |
data=json.dumps(facet_trees, indent=2, ensure_ascii=False).encode('utf-8'),
|
| 4448 |
file_name=f'{_base}_approach1_facets.json',
|
| 4449 |
mime='application/json',
|
| 4450 |
-
|
| 4451 |
)
|
| 4452 |
|
| 4453 |
col3, col4 = st.columns(2)
|
|
@@ -4458,7 +4458,7 @@ with tabs[8]:
|
|
| 4458 |
data=can.drop(columns=['_raw'], errors='ignore').to_csv(index=False).encode('utf-8'),
|
| 4459 |
file_name=f'{_base}_approach1_canonical.csv',
|
| 4460 |
mime='text/csv',
|
| 4461 |
-
|
| 4462 |
)
|
| 4463 |
with col4:
|
| 4464 |
_prov_df = st.session_state.get('prov_df', pd.DataFrame())
|
|
@@ -4468,7 +4468,7 @@ with tabs[8]:
|
|
| 4468 |
data=_prov_df.to_csv(index=False).encode('utf-8'),
|
| 4469 |
file_name=f'{_base}_approach1_concept_labels.csv',
|
| 4470 |
mime='text/csv',
|
| 4471 |
-
|
| 4472 |
)
|
| 4473 |
|
| 4474 |
st.divider()
|
|
@@ -4481,7 +4481,7 @@ with tabs[8]:
|
|
| 4481 |
'dataset name β convenient for `evaluate_all.py`.'
|
| 4482 |
)
|
| 4483 |
if st.button('πΎ Save all to outputs/approach_1/', type='primary',
|
| 4484 |
-
|
| 4485 |
try:
|
| 4486 |
_out_dir.mkdir(parents=True, exist_ok=True)
|
| 4487 |
saved = []
|
|
|
|
| 57 |
|
| 58 |
warnings.filterwarnings('ignore')
|
| 59 |
|
| 60 |
+
# set_page_config handled by the navigation router (demo.py)
|
| 61 |
st.title('Metadata Hierarchy Builder β Approach 1')
|
| 62 |
st.caption(
|
| 63 |
'Automatic concept-label extraction from metadata text + HiExpan refinement + Castanet facets. '
|
|
|
|
| 3876 |
if warn:
|
| 3877 |
st.warning('Looked like raw data β columns converted to metadata rows.')
|
| 3878 |
st.write(f'Rows: **{len(df):,}**, Columns: **{len(df.columns)}**')
|
| 3879 |
+
st.dataframe(df.head(10), width='stretch')
|
| 3880 |
except Exception as e:
|
| 3881 |
st.error(f'Failed to load {p.name}: {e}')
|
| 3882 |
|
|
|
|
| 3902 |
key=f'meta_{name}')
|
| 3903 |
prev = list(dict.fromkeys(leaf + group + text + meta))
|
| 3904 |
if prev:
|
| 3905 |
+
st.dataframe(df[prev].head(6), width='stretch')
|
| 3906 |
configs[name] = {'leaf_cols': leaf, 'group_cols': group,
|
| 3907 |
'text_cols': text, 'metadata_cols': meta}
|
| 3908 |
|
|
|
|
| 4202 |
|
| 4203 |
if viz_mode == 'Sunburst (drill-down)':
|
| 4204 |
st.caption('Hover for concept provenance (confidence, source, alternatives). Click to drill down.')
|
| 4205 |
+
st.plotly_chart(plot_sunburst(nodes, depth), width='stretch')
|
| 4206 |
elif viz_mode == 'Treemap':
|
| 4207 |
+
st.plotly_chart(plot_treemap(nodes), width='stretch')
|
| 4208 |
else:
|
| 4209 |
st.plotly_chart(plot_node_link(nodes, depth, show_hidden, show_leaf_labels),
|
| 4210 |
+
width='stretch')
|
| 4211 |
pr = path_rows(nodes)
|
| 4212 |
max_d = max((r['depth'] for r in pr), default=0)
|
| 4213 |
c1, c2, c3 = st.columns(3)
|
|
|
|
| 4225 |
exp_rows = [{'Segment': seg, 'Expansion': v['expansion'],
|
| 4226 |
'Evidence': ', '.join(v['evidence'])}
|
| 4227 |
for seg, v in code_exp.items()]
|
| 4228 |
+
st.dataframe(pd.DataFrame(exp_rows), width='stretch')
|
| 4229 |
|
| 4230 |
# Concept label provenance for internal nodes
|
| 4231 |
prov_rows = []
|
|
|
|
| 4241 |
})
|
| 4242 |
if prov_rows:
|
| 4243 |
with st.expander('Concept label provenance for internal nodes', expanded=False):
|
| 4244 |
+
st.dataframe(pd.DataFrame(prov_rows), width='stretch')
|
| 4245 |
|
| 4246 |
# ββ Tab 1: Faceted view βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4247 |
with tabs[1]:
|
|
|
|
| 4251 |
'Concept facet uses automatically assigned labels from embedding alignment.'
|
| 4252 |
)
|
| 4253 |
if facet_trees:
|
| 4254 |
+
st.plotly_chart(plot_facets_parallel(facet_trees), width='stretch')
|
| 4255 |
st.markdown('### Per-facet detail')
|
| 4256 |
sel_facet = st.selectbox('Inspect facet tree', list(facet_trees.keys()))
|
| 4257 |
ft = facet_trees[sel_facet]
|
| 4258 |
+
st.plotly_chart(plot_sunburst(ft, max_depth=3), width='stretch')
|
| 4259 |
n_groups = len([n for n in ft if n.get('type') == 'aggregation'])
|
| 4260 |
st.info(f'Facet **{sel_facet}**: {n_groups} groups, '
|
| 4261 |
f'{len([n for n in ft if n.get("type")=="attribute"])} variables')
|
|
|
|
| 4273 |
st.markdown('### Sibling coherence β before refinement (worst first)')
|
| 4274 |
before = hiexpan_report.get('coherence_before', [])
|
| 4275 |
if before:
|
| 4276 |
+
st.dataframe(pd.DataFrame(before), width='stretch')
|
| 4277 |
st.markdown('### Sibling coherence β after refinement')
|
| 4278 |
after = hiexpan_report.get('coherence_after', [])
|
| 4279 |
if after:
|
| 4280 |
+
st.dataframe(pd.DataFrame(after), width='stretch')
|
| 4281 |
b_mean = np.mean([r['coherence_score'] for r in before]) if before else float('nan')
|
| 4282 |
a_mean = np.mean([r['coherence_score'] for r in after])
|
| 4283 |
st.metric('Mean coherence improvement',
|
|
|
|
| 4324 |
if can is not None:
|
| 4325 |
conflict_df = compute_conflict_table(can, nodes)
|
| 4326 |
if len(conflict_df):
|
| 4327 |
+
st.dataframe(conflict_df, width='stretch')
|
| 4328 |
else:
|
| 4329 |
st.success('No low-confidence placements detected.')
|
| 4330 |
else:
|
|
|
|
| 4393 |
'type': c.get('type'),
|
| 4394 |
'relation': c.get('info', {}).get('relation_label', ''),
|
| 4395 |
'desc': str(c.get('desc', ''))[:120]}
|
| 4396 |
+
for c in cns if c]), width='stretch')
|
| 4397 |
|
| 4398 |
# ββ Tab 5: Search βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4399 |
with tabs[5]:
|
|
|
|
| 4407 |
'relation': n.get('info', {}).get('relation_label', ''),
|
| 4408 |
'n_children': len(n.get('related', [])),
|
| 4409 |
'desc': str(n.get('desc', ''))[:200]})
|
| 4410 |
+
st.dataframe(pd.DataFrame(out_), width='stretch')
|
| 4411 |
|
| 4412 |
# ββ Tab 6: Semantic map βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4413 |
with tabs[6]:
|
| 4414 |
if can is None or len(can) < 3:
|
| 4415 |
st.info('Semantic map available after build.')
|
| 4416 |
else:
|
| 4417 |
+
st.plotly_chart(semantic_map(can), width='stretch')
|
| 4418 |
|
| 4419 |
# ββ Tab 7: Metadata βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4420 |
with tabs[7]:
|
|
|
|
| 4422 |
st.info('Available after build.')
|
| 4423 |
else:
|
| 4424 |
show_cols = [c for c in can.columns if c != '_raw']
|
| 4425 |
+
st.dataframe(can[show_cols], width='stretch')
|
| 4426 |
|
| 4427 |
# ββ Tab 8: Export βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4428 |
with tabs[8]:
|
|
|
|
| 4438 |
data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
|
| 4439 |
file_name=f'{_base}_approach1_hierarchy.json',
|
| 4440 |
mime='application/json',
|
| 4441 |
+
width='stretch',
|
| 4442 |
)
|
| 4443 |
with col2:
|
| 4444 |
if facet_trees:
|
|
|
|
| 4447 |
data=json.dumps(facet_trees, indent=2, ensure_ascii=False).encode('utf-8'),
|
| 4448 |
file_name=f'{_base}_approach1_facets.json',
|
| 4449 |
mime='application/json',
|
| 4450 |
+
width='stretch',
|
| 4451 |
)
|
| 4452 |
|
| 4453 |
col3, col4 = st.columns(2)
|
|
|
|
| 4458 |
data=can.drop(columns=['_raw'], errors='ignore').to_csv(index=False).encode('utf-8'),
|
| 4459 |
file_name=f'{_base}_approach1_canonical.csv',
|
| 4460 |
mime='text/csv',
|
| 4461 |
+
width='stretch',
|
| 4462 |
)
|
| 4463 |
with col4:
|
| 4464 |
_prov_df = st.session_state.get('prov_df', pd.DataFrame())
|
|
|
|
| 4468 |
data=_prov_df.to_csv(index=False).encode('utf-8'),
|
| 4469 |
file_name=f'{_base}_approach1_concept_labels.csv',
|
| 4470 |
mime='text/csv',
|
| 4471 |
+
width='stretch',
|
| 4472 |
)
|
| 4473 |
|
| 4474 |
st.divider()
|
|
|
|
| 4481 |
'dataset name β convenient for `evaluate_all.py`.'
|
| 4482 |
)
|
| 4483 |
if st.button('πΎ Save all to outputs/approach_1/', type='primary',
|
| 4484 |
+
width='stretch'):
|
| 4485 |
try:
|
| 4486 |
_out_dir.mkdir(parents=True, exist_ok=True)
|
| 4487 |
saved = []
|
pages/3_Approach_2.py → views/run_approach_2.py
RENAMED
|
@@ -3467,8 +3467,7 @@ def plot_node_link(nodes: list, max_depth: int = 4,
|
|
| 3467 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3468 |
# STREAMLIT APP
|
| 3469 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3470 |
-
|
| 3471 |
-
layout='wide')
|
| 3472 |
st.title('π¬ Approach 2 β Role-Decomposed Metadata Hierarchy')
|
| 3473 |
st.caption('Group anchoring β LLM role extraction β role-nested LoD tree. '
|
| 3474 |
'Full method details and citations in the Method tab.')
|
|
@@ -3613,7 +3612,7 @@ if uploads:
|
|
| 3613 |
cfg_by[f.name] = detect_roles(df)
|
| 3614 |
with st.expander(f'π {f.name}', expanded=False):
|
| 3615 |
st.write(f'Rows: **{len(df):,}** Columns: **{len(df.columns)}**')
|
| 3616 |
-
st.dataframe(df.head(8),
|
| 3617 |
except Exception as e:
|
| 3618 |
st.error(f'Could not load {f.name}: {e}')
|
| 3619 |
|
|
@@ -3813,13 +3812,13 @@ with tabs[0]:
|
|
| 3813 |
|
| 3814 |
if viz_mode == 'Sunburst (drill-down)':
|
| 3815 |
st.plotly_chart(plot_sunburst(display_nodes, max_depth=depth_display),
|
| 3816 |
-
|
| 3817 |
elif viz_mode == 'Treemap':
|
| 3818 |
-
st.plotly_chart(plot_treemap(display_nodes),
|
| 3819 |
else:
|
| 3820 |
st.plotly_chart(plot_node_link(display_nodes, depth_display,
|
| 3821 |
show_hidden, show_leaf_labels),
|
| 3822 |
-
|
| 3823 |
|
| 3824 |
n_l = len([n for n in nodes if n.get('type') == 'attribute'])
|
| 3825 |
n_i = len([n for n in nodes if n.get('type') == 'aggregation'])
|
|
@@ -3912,7 +3911,7 @@ with tabs[1]:
|
|
| 3912 |
W_df = pd.DataFrame(
|
| 3913 |
W, columns=[f'Aspect {k+1}: {alabs[k][:30]}' for k in range(W.shape[1])])
|
| 3914 |
W_df.insert(0, 'Variable', can['_label'].tolist())
|
| 3915 |
-
st.dataframe(W_df.round(4),
|
| 3916 |
|
| 3917 |
with tabs[2]:
|
| 3918 |
st.markdown('### Role decomposition')
|
|
@@ -3935,7 +3934,7 @@ with tabs[2]:
|
|
| 3935 |
if reg_rows:
|
| 3936 |
reg_df = pd.DataFrame(reg_rows).sort_values(
|
| 3937 |
'Regularity', ascending=False, na_position='last')
|
| 3938 |
-
st.dataframe(reg_df,
|
| 3939 |
|
| 3940 |
# ββ Per-variable role table βββββββββββββββββββββββββββββββββββββββββββ
|
| 3941 |
st.markdown('#### Per-variable role table')
|
|
@@ -3996,7 +3995,7 @@ with tabs[2]:
|
|
| 3996 |
|
| 3997 |
if role_rows:
|
| 3998 |
role_df = pd.DataFrame(role_rows)
|
| 3999 |
-
st.dataframe(role_df,
|
| 4000 |
st.download_button(
|
| 4001 |
'β¬οΈ Download per-variable role CSV',
|
| 4002 |
data=role_df.to_csv(index=False).encode('utf-8'),
|
|
@@ -4021,7 +4020,7 @@ with tabs[2]:
|
|
| 4021 |
'Reasons': ', '.join(f'{k}:{v}' for k, v in
|
| 4022 |
(a.get('summary', {}) or {}).items()),
|
| 4023 |
})
|
| 4024 |
-
st.dataframe(pd.DataFrame(sum_rows),
|
| 4025 |
hide_index=True)
|
| 4026 |
|
| 4027 |
# Drill-down per group
|
|
@@ -4050,7 +4049,7 @@ with tabs[2]:
|
|
| 4050 |
})
|
| 4051 |
if row_rows:
|
| 4052 |
st.dataframe(pd.DataFrame(row_rows),
|
| 4053 |
-
|
| 4054 |
# Download as CSV for offline analysis
|
| 4055 |
csv_bytes = pd.DataFrame(row_rows).to_csv(index=False).encode('utf-8')
|
| 4056 |
st.download_button(
|
|
@@ -4129,16 +4128,16 @@ with tabs[3]:
|
|
| 4129 |
& (prov_df['LLM proposed'].astype(str).str.len() > 0)]
|
| 4130 |
if len(rej):
|
| 4131 |
st.dataframe(rej[['Node', 'LLM proposed', 'LLM reason']],
|
| 4132 |
-
|
| 4133 |
|
| 4134 |
# ββ Full provenance table βββββββββββββββββββββββββββββββββββββββββββββ
|
| 4135 |
st.write('**Full per-node provenance**')
|
| 4136 |
-
st.dataframe(prov_df,
|
| 4137 |
|
| 4138 |
with tabs[4]:
|
| 4139 |
if can is not None:
|
| 4140 |
st.dataframe(can.drop(columns=['_row'], errors='ignore'),
|
| 4141 |
-
|
| 4142 |
|
| 4143 |
with tabs[5]:
|
| 4144 |
# ββ derive a per-CSV base name from the uploaded files ββββββββββββββββββββ
|
|
@@ -4169,7 +4168,7 @@ with tabs[5]:
|
|
| 4169 |
data=json.dumps(nodes, indent=2, ensure_ascii=False).encode(),
|
| 4170 |
file_name=f'{csv_basis}_approach2_lod.json',
|
| 4171 |
mime='application/json',
|
| 4172 |
-
|
| 4173 |
)
|
| 4174 |
with col2:
|
| 4175 |
if can is not None:
|
|
@@ -4178,7 +4177,7 @@ with tabs[5]:
|
|
| 4178 |
data=can.to_csv(index=False).encode('utf-8'),
|
| 4179 |
file_name=f'{csv_basis}_approach2_canonical.csv',
|
| 4180 |
mime='text/csv',
|
| 4181 |
-
|
| 4182 |
)
|
| 4183 |
|
| 4184 |
st.divider()
|
|
@@ -4191,7 +4190,7 @@ with tabs[5]:
|
|
| 4191 |
'dataset name β convenient for `evaluate_all.py`.'
|
| 4192 |
)
|
| 4193 |
if st.button('πΎ Save all to outputs/approach_2/', type='primary',
|
| 4194 |
-
|
| 4195 |
try:
|
| 4196 |
_out_dir.mkdir(parents=True, exist_ok=True)
|
| 4197 |
saved = []
|
|
|
|
| 3467 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3468 |
# STREAMLIT APP
|
| 3469 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 3470 |
+
# set_page_config handled by the navigation router (demo.py)
|
|
|
|
| 3471 |
st.title('π¬ Approach 2 β Role-Decomposed Metadata Hierarchy')
|
| 3472 |
st.caption('Group anchoring β LLM role extraction β role-nested LoD tree. '
|
| 3473 |
'Full method details and citations in the Method tab.')
|
|
|
|
| 3612 |
cfg_by[f.name] = detect_roles(df)
|
| 3613 |
with st.expander(f'π {f.name}', expanded=False):
|
| 3614 |
st.write(f'Rows: **{len(df):,}** Columns: **{len(df.columns)}**')
|
| 3615 |
+
st.dataframe(df.head(8), width='stretch')
|
| 3616 |
except Exception as e:
|
| 3617 |
st.error(f'Could not load {f.name}: {e}')
|
| 3618 |
|
|
|
|
| 3812 |
|
| 3813 |
if viz_mode == 'Sunburst (drill-down)':
|
| 3814 |
st.plotly_chart(plot_sunburst(display_nodes, max_depth=depth_display),
|
| 3815 |
+
width='stretch')
|
| 3816 |
elif viz_mode == 'Treemap':
|
| 3817 |
+
st.plotly_chart(plot_treemap(display_nodes), width='stretch')
|
| 3818 |
else:
|
| 3819 |
st.plotly_chart(plot_node_link(display_nodes, depth_display,
|
| 3820 |
show_hidden, show_leaf_labels),
|
| 3821 |
+
width='stretch')
|
| 3822 |
|
| 3823 |
n_l = len([n for n in nodes if n.get('type') == 'attribute'])
|
| 3824 |
n_i = len([n for n in nodes if n.get('type') == 'aggregation'])
|
|
|
|
| 3911 |
W_df = pd.DataFrame(
|
| 3912 |
W, columns=[f'Aspect {k+1}: {alabs[k][:30]}' for k in range(W.shape[1])])
|
| 3913 |
W_df.insert(0, 'Variable', can['_label'].tolist())
|
| 3914 |
+
st.dataframe(W_df.round(4), width='stretch')
|
| 3915 |
|
| 3916 |
with tabs[2]:
|
| 3917 |
st.markdown('### Role decomposition')
|
|
|
|
| 3934 |
if reg_rows:
|
| 3935 |
reg_df = pd.DataFrame(reg_rows).sort_values(
|
| 3936 |
'Regularity', ascending=False, na_position='last')
|
| 3937 |
+
st.dataframe(reg_df, width='stretch', hide_index=True)
|
| 3938 |
|
| 3939 |
# ββ Per-variable role table βββββββββββββββββββββββββββββββββββββββββββ
|
| 3940 |
st.markdown('#### Per-variable role table')
|
|
|
|
| 3995 |
|
| 3996 |
if role_rows:
|
| 3997 |
role_df = pd.DataFrame(role_rows)
|
| 3998 |
+
st.dataframe(role_df, width='stretch', hide_index=True)
|
| 3999 |
st.download_button(
|
| 4000 |
'β¬οΈ Download per-variable role CSV',
|
| 4001 |
data=role_df.to_csv(index=False).encode('utf-8'),
|
|
|
|
| 4020 |
'Reasons': ', '.join(f'{k}:{v}' for k, v in
|
| 4021 |
(a.get('summary', {}) or {}).items()),
|
| 4022 |
})
|
| 4023 |
+
st.dataframe(pd.DataFrame(sum_rows), width='stretch',
|
| 4024 |
hide_index=True)
|
| 4025 |
|
| 4026 |
# Drill-down per group
|
|
|
|
| 4049 |
})
|
| 4050 |
if row_rows:
|
| 4051 |
st.dataframe(pd.DataFrame(row_rows),
|
| 4052 |
+
width='stretch', hide_index=True)
|
| 4053 |
# Download as CSV for offline analysis
|
| 4054 |
csv_bytes = pd.DataFrame(row_rows).to_csv(index=False).encode('utf-8')
|
| 4055 |
st.download_button(
|
|
|
|
| 4128 |
& (prov_df['LLM proposed'].astype(str).str.len() > 0)]
|
| 4129 |
if len(rej):
|
| 4130 |
st.dataframe(rej[['Node', 'LLM proposed', 'LLM reason']],
|
| 4131 |
+
width='stretch', hide_index=True)
|
| 4132 |
|
| 4133 |
# ββ Full provenance table βββββββββββββββββββββββββββββββββββββββββββββ
|
| 4134 |
st.write('**Full per-node provenance**')
|
| 4135 |
+
st.dataframe(prov_df, width='stretch', hide_index=True)
|
| 4136 |
|
| 4137 |
with tabs[4]:
|
| 4138 |
if can is not None:
|
| 4139 |
st.dataframe(can.drop(columns=['_row'], errors='ignore'),
|
| 4140 |
+
width='stretch')
|
| 4141 |
|
| 4142 |
with tabs[5]:
|
| 4143 |
# ββ derive a per-CSV base name from the uploaded files ββββββββββββββββββββ
|
|
|
|
| 4168 |
data=json.dumps(nodes, indent=2, ensure_ascii=False).encode(),
|
| 4169 |
file_name=f'{csv_basis}_approach2_lod.json',
|
| 4170 |
mime='application/json',
|
| 4171 |
+
width='stretch',
|
| 4172 |
)
|
| 4173 |
with col2:
|
| 4174 |
if can is not None:
|
|
|
|
| 4177 |
data=can.to_csv(index=False).encode('utf-8'),
|
| 4178 |
file_name=f'{csv_basis}_approach2_canonical.csv',
|
| 4179 |
mime='text/csv',
|
| 4180 |
+
width='stretch',
|
| 4181 |
)
|
| 4182 |
|
| 4183 |
st.divider()
|
|
|
|
| 4190 |
'dataset name β convenient for `evaluate_all.py`.'
|
| 4191 |
)
|
| 4192 |
if st.button('πΎ Save all to outputs/approach_2/', type='primary',
|
| 4193 |
+
width='stretch'):
|
| 4194 |
try:
|
| 4195 |
_out_dir.mkdir(parents=True, exist_ok=True)
|
| 4196 |
saved = []
|
pages/1_Baseline.py → views/run_baseline.py
RENAMED
|
@@ -40,7 +40,7 @@ from sklearn.preprocessing import LabelEncoder
|
|
| 40 |
|
| 41 |
warnings.filterwarnings('ignore')
|
| 42 |
|
| 43 |
-
|
| 44 |
st.title('Metadata Hierarchy Builder β Baseline (Taxonomizer)')
|
| 45 |
st.caption(
|
| 46 |
'Pure Taxonomizer baseline: TF-IDF text objects + recursive agglomerative '
|
|
@@ -562,11 +562,11 @@ with st.spinner('Loading fileβ¦'):
|
|
| 562 |
st.subheader('Step 1 β File preview')
|
| 563 |
with st.expander(f'π {uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
|
| 564 |
expanded=False):
|
| 565 |
-
st.dataframe(df.head(10),
|
| 566 |
score_cols = [c for c in ['column', 'leaf_score', 'group_score', 'text_score', 'metadata_score']
|
| 567 |
if c in prof.columns]
|
| 568 |
st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
|
| 569 |
-
|
| 570 |
|
| 571 |
st.subheader('Step 2 β Confirm column roles')
|
| 572 |
cols = list(df.columns)
|
|
@@ -639,11 +639,11 @@ c4.metric('Avg branching', _sm['avg_branching_factor'])
|
|
| 639 |
tabs = st.tabs(['Sunburst', 'Treemap', 'Node detail', 'Canonical table', 'Export', 'π Evaluation'])
|
| 640 |
|
| 641 |
with tabs[0]:
|
| 642 |
-
st.plotly_chart(plot_sunburst(nodes, max_depth=display_depth),
|
| 643 |
st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')
|
| 644 |
|
| 645 |
with tabs[1]:
|
| 646 |
-
st.plotly_chart(plot_treemap(nodes),
|
| 647 |
|
| 648 |
with tabs[2]:
|
| 649 |
nm = _nmap(nodes)
|
|
@@ -661,10 +661,10 @@ with tabs[2]:
|
|
| 661 |
sub = can[can['_leaf_id'].isin(leaf_ids_set)]
|
| 662 |
st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
|
| 663 |
st.dataframe(sub[['_leaf_label', '_group_path', '_text']].reset_index(drop=True),
|
| 664 |
-
|
| 665 |
|
| 666 |
with tabs[3]:
|
| 667 |
-
st.dataframe(can,
|
| 668 |
|
| 669 |
with tabs[4]:
|
| 670 |
_base = safe_name(project_name)
|
|
@@ -675,7 +675,7 @@ with tabs[4]:
|
|
| 675 |
data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
|
| 676 |
file_name=f'{_base}_baseline_hierarchy.json',
|
| 677 |
mime='application/json',
|
| 678 |
-
|
| 679 |
)
|
| 680 |
with col2:
|
| 681 |
st.download_button(
|
|
@@ -683,7 +683,7 @@ with tabs[4]:
|
|
| 683 |
data=can.to_csv(index=False).encode('utf-8'),
|
| 684 |
file_name=f'{_base}_baseline_canonical.csv',
|
| 685 |
mime='text/csv',
|
| 686 |
-
|
| 687 |
)
|
| 688 |
|
| 689 |
st.divider()
|
|
@@ -696,7 +696,7 @@ with tabs[4]:
|
|
| 696 |
'dataset name β convenient for `evaluate_all.py`.'
|
| 697 |
)
|
| 698 |
if st.button('πΎ Save all to outputs/baseline/', type='primary',
|
| 699 |
-
|
| 700 |
try:
|
| 701 |
_out_dir.mkdir(parents=True, exist_ok=True)
|
| 702 |
(_out_dir / f'{_base}_baseline_hierarchy.json').write_text(
|
|
|
|
| 40 |
|
| 41 |
warnings.filterwarnings('ignore')
|
| 42 |
|
| 43 |
+
# set_page_config handled by the navigation router (demo.py)
|
| 44 |
st.title('Metadata Hierarchy Builder β Baseline (Taxonomizer)')
|
| 45 |
st.caption(
|
| 46 |
'Pure Taxonomizer baseline: TF-IDF text objects + recursive agglomerative '
|
|
|
|
| 562 |
st.subheader('Step 1 β File preview')
|
| 563 |
with st.expander(f'π {uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
|
| 564 |
expanded=False):
|
| 565 |
+
st.dataframe(df.head(10), width='stretch')
|
| 566 |
score_cols = [c for c in ['column', 'leaf_score', 'group_score', 'text_score', 'metadata_score']
|
| 567 |
if c in prof.columns]
|
| 568 |
st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
|
| 569 |
+
width='stretch')
|
| 570 |
|
| 571 |
st.subheader('Step 2 β Confirm column roles')
|
| 572 |
cols = list(df.columns)
|
|
|
|
| 639 |
tabs = st.tabs(['Sunburst', 'Treemap', 'Node detail', 'Canonical table', 'Export', 'π Evaluation'])
|
| 640 |
|
| 641 |
with tabs[0]:
|
| 642 |
+
st.plotly_chart(plot_sunburst(nodes, max_depth=display_depth), width='stretch')
|
| 643 |
st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')
|
| 644 |
|
| 645 |
with tabs[1]:
|
| 646 |
+
st.plotly_chart(plot_treemap(nodes), width='stretch')
|
| 647 |
|
| 648 |
with tabs[2]:
|
| 649 |
nm = _nmap(nodes)
|
|
|
|
| 661 |
sub = can[can['_leaf_id'].isin(leaf_ids_set)]
|
| 662 |
st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
|
| 663 |
st.dataframe(sub[['_leaf_label', '_group_path', '_text']].reset_index(drop=True),
|
| 664 |
+
width='stretch')
|
| 665 |
|
| 666 |
with tabs[3]:
|
| 667 |
+
st.dataframe(can, width='stretch')
|
| 668 |
|
| 669 |
with tabs[4]:
|
| 670 |
_base = safe_name(project_name)
|
|
|
|
| 675 |
data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
|
| 676 |
file_name=f'{_base}_baseline_hierarchy.json',
|
| 677 |
mime='application/json',
|
| 678 |
+
width='stretch',
|
| 679 |
)
|
| 680 |
with col2:
|
| 681 |
st.download_button(
|
|
|
|
| 683 |
data=can.to_csv(index=False).encode('utf-8'),
|
| 684 |
file_name=f'{_base}_baseline_canonical.csv',
|
| 685 |
mime='text/csv',
|
| 686 |
+
width='stretch',
|
| 687 |
)
|
| 688 |
|
| 689 |
st.divider()
|
|
|
|
| 696 |
'dataset name β convenient for `evaluate_all.py`.'
|
| 697 |
)
|
| 698 |
if st.button('πΎ Save all to outputs/baseline/', type='primary',
|
| 699 |
+
width='stretch'):
|
| 700 |
try:
|
| 701 |
_out_dir.mkdir(parents=True, exist_ok=True)
|
| 702 |
(_out_dir / f'{_base}_baseline_hierarchy.json').write_text(
|
views/viewer.py
ADDED
|
@@ -0,0 +1,562 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Metadata Hierarchy Explorer — TFM 2026
|
| 3 |
+
Pre-built results viewer for Baseline, Approach 1, and Approach 2.
|
| 4 |
+
|
| 5 |
+
Rendering faithfully replicates each app's display pipeline:
|
| 6 |
+
- Baseline : raw tree, Greens, Sunburst + Treemap
|
| 7 |
+
- Approach 1 : raw tree, Blues, Sunburst + Treemap + Node-link + Facets
|
| 8 |
+
- Approach 2 : compress one-child chains, Viridis, Sunburst + Treemap + Node-link
|
| 9 |
+
|
| 10 |
+
Level-of-Detail controls (depth, leaf labels, hidden nodes, compress chains)
|
| 11 |
+
match the controls in the individual apps.
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
import json
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
import numpy as np
|
| 19 |
+
import plotly.graph_objects as go
|
| 20 |
+
import streamlit as st
|
| 21 |
+
|
| 22 |
+
# Page config is set by the navigation router (demo.py).
|
| 23 |
+
# Directory holding the bundled, pre-built result files: the "outputs" folder
# located two levels above this module's resolved location.
ROOT = Path(__file__).resolve().parent.parent.joinpath("outputs")

# Default value for the depth (level-of-detail) controls and plot functions.
DEFAULT_DEPTH = 7
|
| 26 |
+
|
| 27 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
# PRE-BUILT OUTPUT PATHS
|
| 29 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
# Sub-folders of ROOT, one per approach.
_BASELINE_DIR = ROOT / "baseline"
_APPROACH_1_DIR = ROOT / "approach_1"
_APPROACH_2_DIR = ROOT / "approach_2"

# approach name -> dataset name -> {"hierarchy": Path[, "facets": Path]}.
# Only the Approach 1 entries declare a "facets" file.
PREBUILT = {
    "Baseline": {
        "AI-MIND": {
            "hierarchy": _BASELINE_DIR / "ai-mind-variable-descriptions_in__baseline_hierarchy.json",
        },
        "HCP": {
            "hierarchy": _BASELINE_DIR / "HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json",
        },
    },
    "Approach 1": {
        "AI-MIND": {
            "hierarchy": _APPROACH_1_DIR / "ai-mind-variable-descriptions_in__approach1_hierarchy.json",
            "facets": _APPROACH_1_DIR / "ai-mind-variable-descriptions_in__approach1_facets.json",
        },
        "HCP": {
            "hierarchy": _APPROACH_1_DIR / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
            "facets": _APPROACH_1_DIR / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
        },
    },
    "Approach 2": {
        "AI-MIND": {
            "hierarchy": _APPROACH_2_DIR / "ai-mind-variable-descriptions_in__approach2_lod.json",
        },
        "HCP": {
            "hierarchy": _APPROACH_2_DIR / "HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json",
        },
    },
}
|
| 50 |
+
|
| 51 |
+
# Per-approach rendering config (matches each source app)
|
| 52 |
+
# Per-approach display settings:
#   "color"     - Plotly colorscale name handed to the sunburst/treemap plots
#   "compress"  - whether the "Compress chains" control is offered
#   "node_link" - whether the "Node-link tree" view mode is offered
CONFIG = {
    approach_name: {"color": scale, "compress": offer_compress, "node_link": offer_tree}
    for approach_name, scale, offer_compress, offer_tree in (
        ("Baseline", "Greens", False, False),
        ("Approach 1", "Blues", False, True),
        ("Approach 2", "Viridis", True, True),
    )
}
|
| 57 |
+
|
| 58 |
+
# One-paragraph description per approach; the main page renders the selected
# entry as a block quote directly under the title. Keys must match the
# approach names used by CONFIG and PREBUILT.
APPROACH_DESC = {
    "Baseline": (
        "Pure clustering baseline β TF-IDF representation + recursive agglomerative "
        "(cosine) clustering, number of clusters chosen by silhouette. No external APIs, "
        "no neural embeddings. Node labels are the most discriminative terms per cluster."
    ),
    "Approach 1": (
        "Global embedding pipeline β SBERT + NΓM concept-table alignment (GonΓ§alves 2019) "
        "+ HiExpan refinement (Shen et al. KDD 2018) + Castanet parallel facets. Optionally "
        "retrieves concept context from Wikidata / Wikipedia / WordNet / BioPortal."
    ),
    "Approach 2": (
        "Dataset-constrained multi-aspect hierarchy β group-anchored L1/L2 β phrase-slot "
        "mining β FASTopic semantic aspect discovery (Wu et al. NeurIPS 2024) β GMM/KMeans "
        "clustering β deterministic 5-stage label generation. Optional local-LLM refinement."
    ),
}
|
| 75 |
+
|
| 76 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 77 |
+
# TREE TRANSFORMS (copied from approach_2.py — display-only, exact behaviour)
|
| 78 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 79 |
+
def _filter_dissolved(nodes: list) -> list:
|
| 80 |
+
drop_ids = {int(n["id"]) for n in nodes
|
| 81 |
+
if n.get("type") == "dissolved" or n.get("isShown") is False}
|
| 82 |
+
if not drop_ids:
|
| 83 |
+
return nodes
|
| 84 |
+
out = []
|
| 85 |
+
for n in nodes:
|
| 86 |
+
if int(n["id"]) in drop_ids:
|
| 87 |
+
continue
|
| 88 |
+
m = dict(n)
|
| 89 |
+
m["related"] = [int(c) for c in n.get("related", []) if int(c) not in drop_ids]
|
| 90 |
+
out.append(m)
|
| 91 |
+
return out
|
| 92 |
+
|
| 93 |
+
def compress_one_child_chains(nodes: list) -> list:
    """Merge aggregation nodes that have exactly one aggregation child.

    Example: the chain ``DMS -> DMS Recommended Standard`` becomes a single
    node named ``DMS / DMS Recommended Standard``. The merged node keeps the
    parent's id, takes every other field from the child, and joins the two
    descriptions with ``" | "``.

    Dissolved / hidden nodes are removed first via ``_filter_dissolved``.
    Merging repeats until no such chain remains.

    Args:
        nodes: List of node dicts (``id``, ``type``, ``name``, ``related`` ...).

    Returns:
        A new list of node dicts; the input dicts are not modified.
    """
    by_id = {int(item["id"]): dict(item) for item in _filter_dissolved(nodes)}

    def _sole_aggregation_child(item: dict):
        """Id of the only child when item and that child are both aggregations, else None."""
        if item.get("type") != "aggregation":
            return None
        kids = item.get("related", [])
        if len(kids) != 1:
            return None
        kid_id = int(kids[0])
        if by_id.get(kid_id, {}).get("type") != "aggregation":
            return None
        return kid_id

    def _first_chain_link():
        """First (parent_id, parent, child_id) in dict order that can be merged, else None."""
        for parent_id, parent in by_id.items():
            kid_id = _sole_aggregation_child(parent)
            if kid_id is not None:
                return parent_id, parent, kid_id
        return None

    # One merge per pass; rescan from the start afterwards because the merge
    # rewrites the mapping.
    while True:
        link = _first_chain_link()
        if link is None:
            break
        parent_id, parent, kid_id = link
        kid = by_id[kid_id]

        merged = dict(kid)
        merged["id"] = parent_id
        merged["name"] = f"{parent['name']} / {kid['name']}"
        merged["desc"] = f"{parent.get('desc', '')} | {kid.get('desc', '')}"
        by_id[parent_id] = merged
        by_id.pop(kid_id, None)

        # Any remaining reference to the absorbed child now points at the merged node.
        for item in by_id.values():
            item["related"] = [
                parent_id if int(ref) == kid_id else int(ref)
                for ref in item.get("related", [])
            ]

    return list(by_id.values())
|
| 126 |
+
|
| 127 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 128 |
+
# RENDER HELPERS (DAG-safe value map — copied from approach_2.py)
|
| 129 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
def _leaf_ids(nodes: list, nid: int) -> list:
|
| 131 |
+
m = {int(n["id"]): n for n in nodes}
|
| 132 |
+
out = []
|
| 133 |
+
def rec(x):
|
| 134 |
+
n = m.get(int(x))
|
| 135 |
+
if not n:
|
| 136 |
+
return
|
| 137 |
+
if n.get("type") == "attribute":
|
| 138 |
+
out.append(int(x)); return
|
| 139 |
+
for c in n.get("related", []):
|
| 140 |
+
rec(int(c))
|
| 141 |
+
rec(nid)
|
| 142 |
+
return list(dict.fromkeys(out))
|
| 143 |
+
|
| 144 |
+
def _parent_map(nodes: list) -> dict:
|
| 145 |
+
pm = {}
|
| 146 |
+
for n in nodes:
|
| 147 |
+
for c in n.get("related", []):
|
| 148 |
+
if int(c) not in pm:
|
| 149 |
+
pm[int(c)] = int(n["id"])
|
| 150 |
+
return pm
|
| 151 |
+
|
| 152 |
+
def _tree_value_map(nodes: list, pm: dict) -> dict:
|
| 153 |
+
kids = {}
|
| 154 |
+
for child, par in pm.items():
|
| 155 |
+
kids.setdefault(int(par), []).append(int(child))
|
| 156 |
+
nodemap = {int(n["id"]): n for n in nodes}
|
| 157 |
+
memo = {}
|
| 158 |
+
def count(nid: int) -> int:
|
| 159 |
+
if nid in memo:
|
| 160 |
+
return memo[nid]
|
| 161 |
+
memo[nid] = 1
|
| 162 |
+
n = nodemap.get(nid)
|
| 163 |
+
if n is not None and n.get("type") == "attribute":
|
| 164 |
+
memo[nid] = 1
|
| 165 |
+
return 1
|
| 166 |
+
ch = kids.get(nid, [])
|
| 167 |
+
v = sum(count(c) for c in ch) if ch else 1
|
| 168 |
+
memo[nid] = max(1, v)
|
| 169 |
+
return memo[nid]
|
| 170 |
+
return {nid: count(nid) for nid in nodemap}
|
| 171 |
+
|
| 172 |
+
def _wrap_hover(text: str, width: int = 80) -> str:
|
| 173 |
+
import textwrap as _tw
|
| 174 |
+
s = str(text or "")
|
| 175 |
+
if not s:
|
| 176 |
+
return ""
|
| 177 |
+
lines = []
|
| 178 |
+
for raw_line in s.split("\n"):
|
| 179 |
+
lines.extend(_tw.wrap(raw_line, width=width) or [""])
|
| 180 |
+
return "<br>".join(lines)
|
| 181 |
+
|
| 182 |
+
def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
    """Build a Plotly sunburst figure for a node hierarchy.

    Dissolved / hidden nodes are removed first. Each remaining node becomes a
    sector whose parent is its first listing node (``_parent_map``) and whose
    value comes from ``_tree_value_map``. Node id 0 is treated as the root;
    any other node without a recorded parent is attached to id 0.

    Args:
        nodes: List of node dicts.
        color: Plotly colorscale name for the sector markers.
        max_depth: Passed to the trace as ``maxdepth``.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    visible = _filter_dissolved(nodes)
    parent_of = _parent_map(visible)
    sizes = _tree_value_map(visible, parent_of)

    ids, labels, parents, values, hover = [], [], [], [], []
    for node in visible:
        key = int(node["id"])
        variable_count = len(_leaf_ids(visible, key))
        tooltip = (f"<b>{node.get('name', '')}</b><br>Type: {node.get('type', '')}<br>"
                   f"Variables: {variable_count}<br><br>{_wrap_hover(node.get('desc', ''))}")
        ids.append(str(key))
        labels.append(str(node.get("name", ""))[:40])  # labels are cut to 40 characters
        parents.append("" if key == 0 else str(parent_of.get(key, 0)))
        values.append(sizes.get(key, 1))
        hover.append(tooltip)

    trace = go.Sunburst(
        ids=ids,
        labels=labels,
        parents=parents,
        values=values,
        branchvalues="total",
        hovertext=hover,
        hoverinfo="text",
        maxdepth=max_depth,
        insidetextorientation="radial",
        marker={"colorscale": color, "line": {"width": 1, "color": "white"}},
    )
    figure = go.Figure(trace)
    figure.update_layout(
        height=700,
        margin={"l": 10, "r": 10, "t": 40, "b": 10},
        title={
            "text": "Click sector to drill down β click centre to go back",
            "font": {"size": 13},
            "x": 0.5,
        },
    )
    return figure
|
| 205 |
+
|
| 206 |
+
def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
    """Build a Plotly treemap figure for a node hierarchy.

    Dissolved / hidden nodes are removed first. Each remaining node becomes a
    tile whose parent is its first listing node (``_parent_map``) and whose
    value comes from ``_tree_value_map``. Node id 0 is treated as the root;
    any other node without a recorded parent is attached to id 0.

    Args:
        nodes: List of node dicts.
        color: Plotly colorscale name for the tile markers.
        max_depth: Passed to the trace as ``maxdepth``.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    visible = _filter_dissolved(nodes)
    parent_of = _parent_map(visible)
    sizes = _tree_value_map(visible, parent_of)

    ids, labels, parents, values, hover = [], [], [], [], []
    for node in visible:
        key = int(node["id"])
        variable_count = len(_leaf_ids(visible, key))
        tooltip = (f"<b>{node.get('name', '')}</b><br>Variables: {variable_count}<br>"
                   f"{_wrap_hover(node.get('desc', ''))}")
        ids.append(str(key))
        labels.append(str(node.get("name", ""))[:40])  # labels are cut to 40 characters
        parents.append("" if key == 0 else str(parent_of.get(key, 0)))
        values.append(sizes.get(key, 1))
        hover.append(tooltip)

    trace = go.Treemap(
        ids=ids,
        labels=labels,
        parents=parents,
        values=values,
        branchvalues="total",
        hovertext=hover,
        hoverinfo="text",
        textinfo="label+value",
        maxdepth=max_depth,
        marker={"colorscale": color, "line": {"width": 1, "color": "white"}},
    )
    figure = go.Figure(trace)
    figure.update_layout(height=700, margin={"l": 10, "r": 10, "t": 10, "b": 10})
    return figure
|
| 227 |
+
|
| 228 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 229 |
+
# NODE-LINK TREE (Reingold-Tilford layout — copied from approach_2.py)
|
| 230 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 231 |
+
def _node_color(n: dict) -> str:
|
| 232 |
+
t = n.get("type", "")
|
| 233 |
+
if t == "root": return "#c44e52"
|
| 234 |
+
if t == "attribute": return "#4C72B0"
|
| 235 |
+
if t == "collapsed": return "#bbbbbb"
|
| 236 |
+
return "#8C8C8C"
|
| 237 |
+
|
| 238 |
+
def _display_graph(nodes: list, max_depth: int, show_hidden: bool):
    """Select the nodes and edges to draw in the node-link view.

    Walks ``related`` links depth-first from node id 0. A node that sits at
    ``max_depth`` or deeper and still has children is not expanded; instead a
    synthetic ``collapsed`` child (ids count up from 10**9 + 1) summarising
    its variable count is attached. Unless ``show_hidden`` is true, nodes
    whose ``isShown`` is exactly ``False`` are skipped (the start node is
    always kept).

    Args:
        nodes: List of node dicts.
        max_depth: Depth at which sub-trees are collapsed.
        show_hidden: Whether ``isShown is False`` nodes are included.

    Returns:
        ``(display_nodes, edges)`` where ``edges`` is a list of
        ``(parent_id, child_id)`` tuples.
    """
    by_id = {int(n["id"]): n for n in nodes}
    shown: dict = {}
    links: list = []
    last_stub_id = 10 ** 9

    def _is_hidden(node: dict) -> bool:
        return (not show_hidden) and node.get("isShown") is False

    def walk(key, level):
        nonlocal last_stub_id
        node = by_id.get(int(key))
        if not node:
            return
        if level > 0 and _is_hidden(node):
            return
        shown[int(key)] = node

        if level >= max_depth and node.get("related"):
            # Depth limit reached: stand in one placeholder for the whole sub-tree.
            last_stub_id += 1
            stub_id = last_stub_id
            leaf_total = len(_leaf_ids(nodes, key))
            shown[stub_id] = {"id": stub_id, "name": f"β¦ {leaf_total} variables",
                              "type": "collapsed", "related": [],
                              "desc": f"Collapsed: {node.get('name')}"}
            links.append((int(key), stub_id))
            return

        for raw_child in node.get("related", []):
            child = by_id.get(int(raw_child))
            if not child or _is_hidden(child):
                continue
            links.append((int(key), int(raw_child)))
            walk(int(raw_child), level + 1)

    walk(0, 0)
    return list(shown.values()), links
|
| 272 |
+
|
| 273 |
+
def _positions(edges: list):
|
| 274 |
+
H_SCALE, V_SPACE = 3.0, 1.8
|
| 275 |
+
children: dict = defaultdict(list)
|
| 276 |
+
for p, c in edges:
|
| 277 |
+
children[p].append(c)
|
| 278 |
+
pos: dict = {}
|
| 279 |
+
counter = {"v": 0}
|
| 280 |
+
|
| 281 |
+
def rec(nid, depth):
|
| 282 |
+
ch = children.get(nid, [])
|
| 283 |
+
if not ch:
|
| 284 |
+
y_pos = counter["v"] * V_SPACE
|
| 285 |
+
counter["v"] += 1
|
| 286 |
+
pos[nid] = (depth * H_SCALE, y_pos)
|
| 287 |
+
return y_pos
|
| 288 |
+
child_ys = [rec(c, depth + 1) for c in ch]
|
| 289 |
+
y_pos = float(np.mean(child_ys))
|
| 290 |
+
pos[nid] = (depth * H_SCALE, y_pos)
|
| 291 |
+
return y_pos
|
| 292 |
+
|
| 293 |
+
rec(0, 0)
|
| 294 |
+
return pos
|
| 295 |
+
|
| 296 |
+
def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_labels: bool):
    """Build a left-to-right node-link tree as a Plotly figure.

    Edges are drawn as elbow connectors in one line trace; ``attribute`` nodes
    and all other nodes are drawn as two separate marker traces so they can
    use different sizes and label settings.

    Args:
        nodes: List of node dicts.
        max_depth: Depth at which sub-trees are collapsed (see ``_display_graph``).
        show_hidden: Forwarded to ``_display_graph``.
        show_leaf_labels: Whether ``attribute`` nodes get a text label.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    # NOTE(review): _filter_dissolved already removes nodes with isShown False,
    # so show_hidden looks like it cannot bring them back here - confirm intent.
    visible = _filter_dissolved(nodes)
    drawn_nodes, links = _display_graph(visible, max_depth, show_hidden)
    coords = _positions(links)

    # Elbow connectors: horizontal, vertical, horizontal; None breaks the line.
    line_x, line_y = [], []
    for parent, child in links:
        if parent not in coords or child not in coords:
            continue
        (px, py), (cx, cy) = coords[parent], coords[child]
        mid_x = (px + cx) / 2
        line_x.extend([px, mid_x, mid_x, cx, None])
        line_y.extend([py, py, cy, cy, None])

    leaf = {"x": [], "y": [], "text": [], "color": [], "hover": []}
    inner = {"x": [], "y": [], "text": [], "color": [], "hover": []}
    for node in drawn_nodes:
        key = int(node["id"])
        if key not in coords:
            continue
        x, y = coords[key]
        variable_count = len(_leaf_ids(visible, key))
        label = str(node.get("name", ""))[:32]
        tooltip = (f"<b>{node.get('name', '')}</b><br>Type: {node.get('type', '')}<br>"
                   f"Variables: {variable_count}")
        is_leaf = node.get("type") == "attribute"
        bucket = leaf if is_leaf else inner
        bucket["x"].append(x)
        bucket["y"].append(y)
        bucket["color"].append(_node_color(node))
        bucket["text"].append(label if (show_leaf_labels or not is_leaf) else "")
        bucket["hover"].append(tooltip)

    edge_trace = go.Scatter(
        x=line_x, y=line_y, mode="lines",
        line={"width": 1, "color": "#c8c8c8"},
        hoverinfo="skip", showlegend=False,
    )
    leaf_trace = go.Scatter(
        x=leaf["x"], y=leaf["y"],
        mode="markers+text" if show_leaf_labels else "markers",
        text=leaf["text"], textposition="middle right", textfont={"size": 9},
        marker={"size": 7, "color": leaf["color"], "line": {"width": 0.5, "color": "white"}},
        hovertext=leaf["hover"], hoverinfo="text", showlegend=False,
    )
    inner_trace = go.Scatter(
        x=inner["x"], y=inner["y"], mode="markers+text", text=inner["text"],
        textposition="middle right", textfont={"size": 10},
        marker={"size": 13, "color": inner["color"], "line": {"width": 1, "color": "white"}},
        hovertext=inner["hover"], hoverinfo="text", showlegend=False,
    )

    # Figure height grows with the larger of the two node groups.
    row_count = max(len(leaf["y"]), len(inner["y"]), 1)
    figure = go.Figure([edge_trace, leaf_trace, inner_trace])
    figure.update_layout(
        height=max(600, row_count * 16),
        margin={"l": 10, "r": 140, "t": 10, "b": 10},
        xaxis={"visible": False},
        yaxis={"visible": False},
        plot_bgcolor="white",
    )
    return figure
|
| 353 |
+
|
| 354 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 355 |
+
# STATS / SAFE RENDERING
|
| 356 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 357 |
+
def _tree_depth(nodes: list) -> int:
    """Return the maximum depth reachable from node id 0 (root = depth 0).

    Dissolved / hidden nodes are removed first. The walk follows ``related``
    links and ignores ids that are not present in ``nodes``. A child that is
    already on the current root-to-node path is skipped, so a cycle in
    ``related`` ends the branch instead of recursing without bound; results
    for acyclic input are unaffected by that guard.

    Args:
        nodes: List of node dicts with an ``id`` convertible to int.

    Returns:
        The deepest level found; 0 when node 0 is missing or has no children.
    """
    visible = _filter_dissolved(nodes)
    by_id = {int(n["id"]): n for n in visible}
    deepest = 0
    on_path: set = set()

    def rec(key: int, level: int) -> None:
        nonlocal deepest
        deepest = max(deepest, level)
        on_path.add(key)
        for raw_child in by_id.get(key, {}).get("related", []):
            child = int(raw_child)
            if child in by_id and child not in on_path:
                rec(child, level + 1)
        on_path.discard(key)

    rec(0, 0)
    return deepest
|
| 369 |
+
|
| 370 |
+
def safe_render_depth(nodes: list, requested: int) -> int:
    """Limit the initial render depth for large hierarchies.

    Per the original author's note, Plotly sunburst/treemap charts can come up
    blank when asked to draw too many sectors at once (e.g. HCP); capping only
    the initial depth keeps the chart drillable by clicking.

    Args:
        nodes: List of node dicts; dissolved / hidden nodes are not counted.
        requested: Depth the caller would like to render.

    Returns:
        ``min(requested, 3)`` for more than 400 nodes, ``min(requested, 4)``
        for more than 150 nodes, otherwise ``requested`` unchanged.
    """
    node_total = len(_filter_dissolved(nodes))
    # (node-count threshold, depth cap), checked from the largest threshold down.
    for threshold, cap in ((400, 3), (150, 4)):
        if node_total > threshold:
            return min(requested, cap)
    return requested
|
| 380 |
+
|
| 381 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 382 |
+
# IO
|
| 383 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 384 |
+
@st.cache_data(show_spinner=False)
def _load_json(path_str: str):
    """Read a UTF-8 JSON file and return the decoded object.

    Wrapped in ``st.cache_data`` with the spinner disabled; the path is taken
    as a string.
    """
    with open(path_str, encoding="utf-8") as handle:
        payload = handle.read()
    return json.loads(payload)
|
| 388 |
+
|
| 389 |
+
def _read_bytes(path_str: str) -> bytes:
|
| 390 |
+
with open(path_str, "rb") as f:
|
| 391 |
+
return f.read()
|
| 392 |
+
|
| 393 |
+
@st.cache_data(show_spinner=False)
def _outputs_zip(root_str: str) -> bytes:
    """Return a deflate-compressed ZIP archive of every file under ``root_str``.

    Files are added in sorted path order. Archive names are POSIX paths
    relative to the *parent* of the root, so they start with the root folder's
    own name.

    Args:
        root_str: Directory to archive, as a string.

    Returns:
        The complete archive as bytes.
    """
    import io
    import zipfile

    root = Path(root_str)
    archive_base = root.parent
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in sorted(root.rglob("*")):
            if not entry.is_file():
                continue
            archive.write(entry, arcname=entry.relative_to(archive_base).as_posix())
    return buffer.getvalue()
|
| 404 |
+
|
| 405 |
+
def count_nodes(nodes: list) -> tuple[int, int]:
    """Count leaf and aggregation nodes, ignoring dissolved / hidden ones.

    Returns:
        ``(leaves, aggs)``: the number of nodes whose ``type`` is
        ``"attribute"`` and ``"aggregation"`` respectively. Nodes of any other
        type are not counted.
    """
    leaf_total = 0
    agg_total = 0
    for node in _filter_dissolved(nodes):
        kind = node.get("type")
        if kind == "attribute":
            leaf_total += 1
        elif kind == "aggregation":
            agg_total += 1
    return leaf_total, agg_total
|
| 410 |
+
|
| 411 |
+
def concept_aligned_pct(nodes: list) -> float | None:
    """Percentage of aggregation nodes that carry a concept/provenance label.

    A node counts as aligned when any of its ``provenance``, ``concept`` or
    ``source_evidence`` fields is truthy. Dissolved / hidden nodes are ignored.

    Returns:
        The percentage as a float, or ``None`` when there are no aggregation
        nodes or none of them is aligned.
    """
    label_fields = ("provenance", "concept", "source_evidence")
    agg_nodes = [
        node for node in _filter_dissolved(nodes)
        if node.get("type") == "aggregation"
    ]
    if not agg_nodes:
        return None

    aligned_total = 0
    for node in agg_nodes:
        if any(node.get(field) for field in label_fields):
            aligned_total += 1

    if not aligned_total:
        return None
    return 100.0 * aligned_total / len(agg_nodes)
|
| 419 |
+
|
| 420 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 421 |
+
# SIDEBAR
|
| 422 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 423 |
+
# Sidebar: choose which pre-built result to show. `approach` and `dataset`
# are used below as keys into CONFIG, APPROACH_DESC and PREBUILT.
with st.sidebar:
    approach = st.radio("**Select Approach**",
                        ["Baseline", "Approach 1", "Approach 2"], index=0)
    dataset = st.radio("**Select Dataset**", ["AI-MIND", "HCP"], index=0)

    st.markdown("---")
    st.caption("Results are pre-built from the thesis experiments. To run on your "
               "own data, clone the repository and run the individual apps.")
    st.markdown("[π¦ GitHub Repository]"
                "(https://github.com/RoophaSharon/tfm_metadata_hierarchy_2026)")
|
| 433 |
+
|
| 434 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 435 |
+
# MAIN
|
| 436 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 437 |
+
# Rendering settings for the selected approach.
cfg = CONFIG[approach]
color = cfg["color"]

st.title(f"π {approach} β {dataset} Dataset")
st.markdown(f"> {APPROACH_DESC[approach]}")

# Locate the pre-built hierarchy file; stop the script run if it is missing.
paths = PREBUILT[approach][dataset]
hier_path = paths.get("hierarchy")
if hier_path is None or not hier_path.exists():
    st.error(f"Pre-built result not found: `{hier_path}`")
    st.stop()

raw_nodes = _load_json(str(hier_path))

# Headline metrics. "Total Nodes" is leaves + aggregation nodes only; nodes of
# any other type are not part of either count.
leaves, aggs = count_nodes(raw_nodes)
c1, c2, c3 = st.columns(3)
c1.metric("Leaf Variables", leaves)
c2.metric("Aggregation Nodes", aggs)
c3.metric("Total Nodes", leaves + aggs)
|
| 456 |
+
|
| 457 |
+
# ββ Build summary (collapsed) ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 458 |
+
# Optional facets file: only some PREBUILT entries declare one.
facet_path = paths.get("facets")
n_facets = None
if facet_path is not None and facet_path.exists():
    try:
        n_facets = len(_load_json(str(facet_path)))
    except Exception:
        # Best effort: an unreadable facets file only blanks the "Facets" metric.
        n_facets = None

# Collapsed expander with summary numbers for the selected result.
with st.expander("βΉοΈ Build summary", expanded=False):
    bs1, bs2, bs3, bs4 = st.columns(4)
    bs1.metric("Variables", leaves)
    bs2.metric("Internal nodes", aggs)
    bs3.metric("Tree depth", _tree_depth(raw_nodes))
    bs4.metric("Facets", n_facets if n_facets is not None else "β")
    # concept_aligned_pct returns None when nothing is aligned, which hides
    # the caption below.
    pct = concept_aligned_pct(raw_nodes)
    if pct is not None:
        st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
    st.caption(
        f"Source file: `{hier_path.name}` Β· "
        f"Approach: **{approach}** Β· Dataset: **{dataset}**. "
        "Tree topology and labels are reproduced exactly from the pre-built "
        "thesis output (the algorithms are not re-run in this viewer)."
    )
|
| 481 |
+
|
| 482 |
+
# ββ Downloads ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 483 |
+
# Download row: hierarchy JSON, facets JSON (a disabled placeholder button when
# there is no facets file) and a ZIP of the whole outputs folder.
d1, d2, d3 = st.columns(3)
with d1:
    st.download_button("β¬οΈ Hierarchy JSON", data=_read_bytes(str(hier_path)),
                       file_name=hier_path.name, mime="application/json",
                       width='stretch')
with d2:
    if facet_path is not None and facet_path.exists():
        st.download_button("β¬οΈ Facets JSON", data=_read_bytes(str(facet_path)),
                           file_name=facet_path.name, mime="application/json",
                           width='stretch')
    else:
        st.button("β¬οΈ Facets JSON", disabled=True, width='stretch',
                  help="This approach/dataset has no facet tree.")
with d3:
    st.download_button("β¬οΈ All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
                       file_name="metadata_hierarchy_outputs.zip",
                       mime="application/zip", width='stretch')

st.markdown("---")
|
| 502 |
+
|
| 503 |
+
# ββ Level-of-Detail controls (above chart β matches the apps) ββββββββββββββββ
|
| 504 |
+
# View modes; the node-link tree is offered only when the approach enables it.
view_options = ["Sunburst (drill-down)", "Treemap"]
if cfg["node_link"]:
    view_options.append("Node-link tree")

# A fifth column for the "Compress chains" checkbox exists only for approaches
# configured with compress=True.
if cfg["compress"]:
    vc1, vc2, vc3, vc4, vc5 = st.columns([2.4, 2, 1, 1, 1.2])
else:
    vc1, vc2, vc3, vc4 = st.columns([2.4, 2, 1, 1])
    vc5 = None

with vc1:
    viz_mode = st.radio("View mode", view_options, horizontal=True, index=0,
                        help="Sunburst best for large hierarchies [Taxonomizer]. "
                             "Node-link best for moderate-depth structure inspection.")
with vc2:
    depth = st.slider("Depth (Level of Detail)", 1, 9, DEFAULT_DEPTH, 1,
                      help="Maximum tree levels shown. Set high to see the whole "
                           "hierarchy, lower to peel back to the interior.")
with vc3:
    show_leaf_labels = st.checkbox("Leaf labels", value=False)
with vc4:
    show_hidden = st.checkbox("Hidden nodes", value=False)
if vc5 is not None:
    with vc5:
        compress_chains = st.checkbox("Compress chains", value=True,
                                      help="Merge one-child aggregation chains "
                                           '(e.g. "DMS β DMS Recommended Standard") for '
                                           "display. Export JSON keeps original structure.")
else:
    compress_chains = False

st.divider()

# Compression affects only what is drawn; raw_nodes itself is left untouched.
display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes

# show_leaf_labels and show_hidden are passed to the node-link view only.
if viz_mode == "Sunburst (drill-down)":
    st.plotly_chart(plot_sunburst(display_nodes, color, depth), width='stretch')
elif viz_mode == "Treemap":
    st.plotly_chart(plot_treemap(display_nodes, color, depth), width='stretch')
else:
    st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
                    width='stretch')
|
| 546 |
+
|
| 547 |
+
# ββ Facets (Approach 1 only) βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 548 |
+
# Parallel facets: shown only when a facets file is configured for the selected
# approach/dataset and exists on disk (in PREBUILT only Approach 1 declares one).
if facet_path is not None and facet_path.exists():
    st.markdown("---")
    st.subheader("π Parallel facets")
    # The facets file is read as a mapping: facet name -> list of node dicts.
    facets = _load_json(str(facet_path))
    names = list(facets.keys())
    if not names:
        st.info("No facets available for this dataset.")
    else:
        sel = st.selectbox("Select facet", names)
        fnodes = facets[sel]
        ft1, ft2 = st.tabs(["Sunburst", "Treemap"])
        with ft1:
            st.plotly_chart(plot_sunburst(fnodes, color, depth), width='stretch')
        with ft2:
            # Pass the depth slider value here too, so both facet tabs follow
            # the same level-of-detail setting as the main charts.
            st.plotly_chart(plot_treemap(fnodes, color, depth), width='stretch')
|