RoophaSharon committed on
Commit
ef5da89
·
1 Parent(s): 2b56f2e

Navigation router (branding + Demo View + collapsible Build hierarchy); full-range LoD slider (1-9, default 7); replace deprecated use_container_width

Browse files
demo.py CHANGED
@@ -1,580 +1,46 @@
1
  """
2
  Metadata Hierarchy Explorer — TFM 2026
3
- Pre-built results viewer for Baseline, Approach 1, and Approach 2.
4
-
5
- Rendering faithfully replicates each app's display pipeline:
6
- - Baseline : raw tree, Greens, Sunburst + Treemap
7
- - Approach 1 : raw tree, Blues, Sunburst + Treemap + Node-link + Facets
8
- - Approach 2 : compress one-child chains, Viridis, Sunburst + Treemap + Node-link
9
-
10
- Level-of-Detail controls (depth, leaf labels, hidden nodes, compress chains)
11
- match the controls in the individual apps.
12
  """
13
- from __future__ import annotations
14
- import json
15
- from collections import defaultdict
16
- from pathlib import Path
17
-
18
- import numpy as np
19
- import plotly.graph_objects as go
20
  import streamlit as st
21
 
22
- # ─────────────────────────────────────────────────────────────────────────────
23
- # PAGE CONFIG
24
- # ─────────────────────────────────────────────────────────────────────────────
25
  st.set_page_config(
26
  page_title="Metadata Hierarchy Explorer",
27
  page_icon="🌿",
28
  layout="wide",
29
  )
30
 
31
- ROOT = Path(__file__).parent / "outputs"
32
-
33
- DEFAULT_DEPTH = 7
34
-
35
- # ─────────────────────────────────────────────────────────────────────────────
36
- # PRE-BUILT OUTPUT PATHS
37
- # ─────────────────────────────────────────────────────────────────────────────
38
- PREBUILT = {
39
- "Baseline": {
40
- "AI-MIND": {"hierarchy": ROOT / "baseline" / "ai-mind-variable-descriptions_in__baseline_hierarchy.json"},
41
- "HCP": {"hierarchy": ROOT / "baseline" / "HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json"},
42
- },
43
- "Approach 1": {
44
- "AI-MIND": {
45
- "hierarchy": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_hierarchy.json",
46
- "facets": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
47
- },
48
- "HCP": {
49
- "hierarchy": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
50
- "facets": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
51
- },
52
- },
53
- "Approach 2": {
54
- "AI-MIND": {"hierarchy": ROOT / "approach_2" / "ai-mind-variable-descriptions_in__approach2_lod.json"},
55
- "HCP": {"hierarchy": ROOT / "approach_2" / "HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json"},
56
- },
57
- }
58
-
59
- # Per-approach rendering config (matches each source app)
60
- CONFIG = {
61
- "Baseline": {"color": "Greens", "compress": False, "node_link": False},
62
- "Approach 1": {"color": "Blues", "compress": False, "node_link": True},
63
- "Approach 2": {"color": "Viridis", "compress": True, "node_link": True},
64
- }
65
-
66
- APPROACH_DESC = {
67
- "Baseline": (
68
- "Pure clustering baseline β€” TF-IDF representation + recursive agglomerative "
69
- "(cosine) clustering, number of clusters chosen by silhouette. No external APIs, "
70
- "no neural embeddings. Node labels are the most discriminative terms per cluster."
71
- ),
72
- "Approach 1": (
73
- "Global embedding pipeline β€” SBERT + NΓ—M concept-table alignment (GonΓ§alves 2019) "
74
- "+ HiExpan refinement (Shen et al. KDD 2018) + Castanet parallel facets. Optionally "
75
- "retrieves concept context from Wikidata / Wikipedia / WordNet / BioPortal."
76
- ),
77
- "Approach 2": (
78
- "Dataset-constrained multi-aspect hierarchy β€” group-anchored L1/L2 β†’ phrase-slot "
79
- "mining β†’ FASTopic semantic aspect discovery (Wu et al. NeurIPS 2024) β†’ GMM/KMeans "
80
- "clustering β†’ deterministic 5-stage label generation. Optional local-LLM refinement."
81
- ),
82
- }
83
-
84
- # ─────────────────────────────────────────────────────────────────────────────
85
- # TREE TRANSFORMS (copied from approach_2.py β€” display-only, exact behaviour)
86
- # ─────────────────────────────────────────────────���───────────────────────────
87
- def _filter_dissolved(nodes: list) -> list:
88
- drop_ids = {int(n["id"]) for n in nodes
89
- if n.get("type") == "dissolved" or n.get("isShown") is False}
90
- if not drop_ids:
91
- return nodes
92
- out = []
93
- for n in nodes:
94
- if int(n["id"]) in drop_ids:
95
- continue
96
- m = dict(n)
97
- m["related"] = [int(c) for c in n.get("related", []) if int(c) not in drop_ids]
98
- out.append(m)
99
- return out
100
-
101
- def compress_one_child_chains(nodes: list) -> list:
102
- """Collapse chains where an aggregation node has exactly one aggregation child
103
- (e.g. 'DMS β†’ DMS Recommended Standard' becomes 'DMS / DMS Recommended Standard')."""
104
- nodes = _filter_dissolved(nodes)
105
- nm = {int(n["id"]): dict(n) for n in nodes}
106
-
107
- def _is_chain_link(n):
108
- if n.get("type") != "aggregation":
109
- return False
110
- children = n.get("related", [])
111
- return (len(children) == 1
112
- and nm.get(int(children[0]), {}).get("type") == "aggregation")
113
-
114
- changed = True
115
- while changed:
116
- changed = False
117
- for nid, n in list(nm.items()):
118
- if _is_chain_link(n):
119
- child_id = int(n["related"][0])
120
- child = nm[child_id]
121
- new_node = dict(child)
122
- new_node["id"] = nid
123
- new_node["name"] = f"{n['name']} / {child['name']}"
124
- new_node["desc"] = f"{n.get('desc', '')} | {child.get('desc', '')}"
125
- nm[nid] = new_node
126
- if child_id in nm:
127
- del nm[child_id]
128
- for other in nm.values():
129
- other["related"] = [nid if int(c) == child_id else int(c)
130
- for c in other.get("related", [])]
131
- changed = True
132
- break
133
- return list(nm.values())
134
-
135
- # ─────────────────────────────────────────────────────────────────────────────
136
- # RENDER HELPERS (DAG-safe value map β€” copied from approach_2.py)
137
- # ─────────────────────────────────────────────────────────────────────────────
138
- def _leaf_ids(nodes: list, nid: int) -> list:
139
- m = {int(n["id"]): n for n in nodes}
140
- out = []
141
- def rec(x):
142
- n = m.get(int(x))
143
- if not n:
144
- return
145
- if n.get("type") == "attribute":
146
- out.append(int(x)); return
147
- for c in n.get("related", []):
148
- rec(int(c))
149
- rec(nid)
150
- return list(dict.fromkeys(out))
151
-
152
- def _parent_map(nodes: list) -> dict:
153
- pm = {}
154
- for n in nodes:
155
- for c in n.get("related", []):
156
- if int(c) not in pm:
157
- pm[int(c)] = int(n["id"])
158
- return pm
159
-
160
- def _tree_value_map(nodes: list, pm: dict) -> dict:
161
- kids = {}
162
- for child, par in pm.items():
163
- kids.setdefault(int(par), []).append(int(child))
164
- nodemap = {int(n["id"]): n for n in nodes}
165
- memo = {}
166
- def count(nid: int) -> int:
167
- if nid in memo:
168
- return memo[nid]
169
- memo[nid] = 1
170
- n = nodemap.get(nid)
171
- if n is not None and n.get("type") == "attribute":
172
- memo[nid] = 1
173
- return 1
174
- ch = kids.get(nid, [])
175
- v = sum(count(c) for c in ch) if ch else 1
176
- memo[nid] = max(1, v)
177
- return memo[nid]
178
- return {nid: count(nid) for nid in nodemap}
179
-
180
- def _wrap_hover(text: str, width: int = 80) -> str:
181
- import textwrap as _tw
182
- s = str(text or "")
183
- if not s:
184
- return ""
185
- lines = []
186
- for raw_line in s.split("\n"):
187
- lines.extend(_tw.wrap(raw_line, width=width) or [""])
188
- return "<br>".join(lines)
189
-
190
- def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
191
- nodes = _filter_dissolved(nodes)
192
- pm = _parent_map(nodes)
193
- vm = _tree_value_map(nodes, pm)
194
- ids, labels, parents, values, hover = [], [], [], [], []
195
- for n in nodes:
196
- nid = int(n["id"])
197
- lc = len(_leaf_ids(nodes, nid))
198
- ids.append(str(nid))
199
- labels.append(str(n.get("name", ""))[:40])
200
- parents.append("" if nid == 0 else str(pm.get(nid, 0)))
201
- values.append(vm.get(nid, 1))
202
- hover.append(f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
203
- f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc', ''))}")
204
- fig = go.Figure(go.Sunburst(
205
- ids=ids, labels=labels, parents=parents, values=values,
206
- branchvalues="total", hovertext=hover, hoverinfo="text",
207
- maxdepth=max_depth, insidetextorientation="radial",
208
- marker=dict(colorscale=color, line=dict(width=1, color="white"))))
209
- fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
210
- title=dict(text="Click sector to drill down β€” click centre to go back",
211
- font=dict(size=13), x=0.5))
212
- return fig
213
-
214
- def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
215
- nodes = _filter_dissolved(nodes)
216
- pm = _parent_map(nodes)
217
- vm = _tree_value_map(nodes, pm)
218
- ids, labels, parents, values, hover = [], [], [], [], []
219
- for n in nodes:
220
- nid = int(n["id"])
221
- lc = len(_leaf_ids(nodes, nid))
222
- ids.append(str(nid))
223
- labels.append(str(n.get("name", ""))[:40])
224
- parents.append("" if nid == 0 else str(pm.get(nid, 0)))
225
- values.append(vm.get(nid, 1))
226
- hover.append(f"<b>{n.get('name', '')}</b><br>Variables: {lc}<br>"
227
- f"{_wrap_hover(n.get('desc', ''))}")
228
- fig = go.Figure(go.Treemap(
229
- ids=ids, labels=labels, parents=parents, values=values,
230
- branchvalues="total", hovertext=hover, hoverinfo="text",
231
- textinfo="label+value", maxdepth=max_depth,
232
- marker=dict(colorscale=color, line=dict(width=1, color="white"))))
233
- fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
234
- return fig
235
-
236
- # ─────────────────────────────────────────────────────────────────────────────
237
- # NODE-LINK TREE (Reingold-Tilford layout β€” copied from approach_2.py)
238
- # ─────────────────────────────────────────────────────────────────────────────
239
- def _node_color(n: dict) -> str:
240
- t = n.get("type", "")
241
- if t == "root": return "#c44e52"
242
- if t == "attribute": return "#4C72B0"
243
- if t == "collapsed": return "#bbbbbb"
244
- return "#8C8C8C"
245
-
246
- def _display_graph(nodes: list, max_depth: int, show_hidden: bool):
247
- m = {int(n["id"]): n for n in nodes}
248
- dnodes: dict = {}
249
- edges: list = []
250
- counter = 10 ** 9
251
-
252
- def rec(nid, depth):
253
- nonlocal counter
254
- n = m.get(int(nid))
255
- if not n:
256
- return
257
- if not show_hidden and n.get("isShown") is False and depth > 0:
258
- return
259
- dnodes[int(nid)] = n
260
- if depth >= max_depth and n.get("related"):
261
- counter += 1
262
- cid = counter
263
- n_leaves = len(_leaf_ids(nodes, nid))
264
- dnodes[cid] = {"id": cid, "name": f"… {n_leaves} variables",
265
- "type": "collapsed", "related": [],
266
- "desc": f"Collapsed: {n.get('name')}"}
267
- edges.append((int(nid), cid))
268
- return
269
- for c in n.get("related", []):
270
- ch = m.get(int(c))
271
- if not ch:
272
- continue
273
- if not show_hidden and ch.get("isShown") is False:
274
- continue
275
- edges.append((int(nid), int(c)))
276
- rec(int(c), depth + 1)
277
-
278
- rec(0, 0)
279
- return list(dnodes.values()), edges
280
-
281
- def _positions(edges: list):
282
- H_SCALE, V_SPACE = 3.0, 1.8
283
- children: dict = defaultdict(list)
284
- for p, c in edges:
285
- children[p].append(c)
286
- pos: dict = {}
287
- counter = {"v": 0}
288
-
289
- def rec(nid, depth):
290
- ch = children.get(nid, [])
291
- if not ch:
292
- y_pos = counter["v"] * V_SPACE
293
- counter["v"] += 1
294
- pos[nid] = (depth * H_SCALE, y_pos)
295
- return y_pos
296
- child_ys = [rec(c, depth + 1) for c in ch]
297
- y_pos = float(np.mean(child_ys))
298
- pos[nid] = (depth * H_SCALE, y_pos)
299
- return y_pos
300
-
301
- rec(0, 0)
302
- return pos
303
-
304
- def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_labels: bool):
305
- nodes = _filter_dissolved(nodes)
306
- dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
307
- pos = _positions(edges)
308
-
309
- ex, ey = [], []
310
- for p, c in edges:
311
- if p not in pos or c not in pos:
312
- continue
313
- x0, y0 = pos[p]
314
- x1, y1 = pos[c]
315
- xm = (x0 + x1) / 2
316
- ex += [x0, xm, xm, x1, None]
317
- ey += [y0, y0, y1, y1, None]
318
- traces = [go.Scatter(x=ex, y=ey, mode="lines",
319
- line=dict(width=1, color="#c8c8c8"),
320
- hoverinfo="skip", showlegend=False)]
321
-
322
- agg_x, agg_y, agg_lab, agg_col, agg_hov = [], [], [], [], []
323
- lf_x, lf_y, lf_lab, lf_col, lf_hov = [], [], [], [], []
324
- for n in dnodes:
325
- nid = int(n["id"])
326
- if nid not in pos:
327
- continue
328
- x, y = pos[nid]
329
- lc = len(_leaf_ids(nodes, nid))
330
- lab = str(n.get("name", ""))[:32]
331
- hov = (f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
332
- f"Variables: {lc}")
333
- if n.get("type") == "attribute":
334
- lf_x.append(x); lf_y.append(y); lf_col.append(_node_color(n))
335
- lf_lab.append(lab if show_leaf_labels else "")
336
- lf_hov.append(hov)
337
- else:
338
- agg_x.append(x); agg_y.append(y); agg_col.append(_node_color(n))
339
- agg_lab.append(lab); agg_hov.append(hov)
340
 
341
- traces.append(go.Scatter(
342
- x=lf_x, y=lf_y, mode="markers+text" if show_leaf_labels else "markers",
343
- text=lf_lab, textposition="middle right", textfont=dict(size=9),
344
- marker=dict(size=7, color=lf_col, line=dict(width=0.5, color="white")),
345
- hovertext=lf_hov, hoverinfo="text", showlegend=False))
346
- traces.append(go.Scatter(
347
- x=agg_x, y=agg_y, mode="markers+text", text=agg_lab,
348
- textposition="middle right", textfont=dict(size=10),
349
- marker=dict(size=13, color=agg_col, line=dict(width=1, color="white")),
350
- hovertext=agg_hov, hoverinfo="text", showlegend=False))
351
 
352
- n_rows = max(len(lf_y), len(agg_y), 1)
353
- fig = go.Figure(traces)
354
- fig.update_layout(
355
- height=max(600, n_rows * 16),
356
- margin=dict(l=10, r=140, t=10, b=10),
357
- xaxis=dict(visible=False), yaxis=dict(visible=False),
358
- plot_bgcolor="white",
359
- )
360
- return fig
361
-
362
- # ─────────────────────────────────────────────────────────────────────────────
363
- # STATS / SAFE RENDERING
364
- # ─────────────────────────────────────────────────────────────────────────────
365
- def _tree_depth(nodes: list) -> int:
366
- """Max depth of the rendered single-parent tree (root = depth 0)."""
367
- nodes = _filter_dissolved(nodes)
368
- m = {int(n["id"]): n for n in nodes}
369
- best = {"d": 0}
370
- def rec(nid, d):
371
- best["d"] = max(best["d"], d)
372
- for c in m.get(int(nid), {}).get("related", []):
373
- if int(c) in m:
374
- rec(int(c), d + 1)
375
- rec(0, 0)
376
- return best["d"]
377
-
378
- def safe_render_depth(nodes: list, requested: int) -> int:
379
- """Plotly sunburst/treemap silently blank when asked to draw too many sectors
380
- at once (large hierarchies like HCP). Cap the *initial* render depth β€” the
381
- chart stays fully drillable by clicking, so no data is lost."""
382
- n = len(_filter_dissolved(nodes))
383
- if n > 400:
384
- return min(requested, 3)
385
- if n > 150:
386
- return min(requested, 4)
387
- return requested
388
-
389
- # ─────────────────────────────────────────────────────────────────────────────
390
- # IO
391
- # ─────────────────────────────────────────────────────────────────────────────
392
- @st.cache_data(show_spinner=False)
393
- def _load_json(path_str: str):
394
- with open(path_str, encoding="utf-8") as f:
395
- return json.load(f)
396
-
397
- def _read_bytes(path_str: str) -> bytes:
398
- with open(path_str, "rb") as f:
399
- return f.read()
400
-
401
- @st.cache_data(show_spinner=False)
402
- def _outputs_zip(root_str: str) -> bytes:
403
- """Zip the entire bundled outputs/ folder for one-click download."""
404
- import io, zipfile
405
- root = Path(root_str)
406
- buf = io.BytesIO()
407
- with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
408
- for p in sorted(root.rglob("*")):
409
- if p.is_file():
410
- zf.write(p, arcname=p.relative_to(root.parent).as_posix())
411
- return buf.getvalue()
412
-
413
- def count_nodes(nodes: list) -> tuple[int, int]:
414
- nodes = _filter_dissolved(nodes)
415
- leaves = sum(1 for n in nodes if n.get("type") == "attribute")
416
- aggs = sum(1 for n in nodes if n.get("type") == "aggregation")
417
- return leaves, aggs
418
-
419
- def concept_aligned_pct(nodes: list) -> float | None:
420
- """% of aggregation nodes that carry a concept/provenance label (Approach 1)."""
421
- aggs = [n for n in _filter_dissolved(nodes) if n.get("type") == "aggregation"]
422
- if not aggs:
423
- return None
424
- aligned = sum(1 for n in aggs
425
- if n.get("provenance") or n.get("concept") or n.get("source_evidence"))
426
- return 100.0 * aligned / len(aggs) if aligned else None
427
-
428
- # ────────��────────────────────────────────────────────────────────────────────
429
- # SIDEBAR
430
- # ─────────────────────────────────────────────────────────────────────────────
431
  with st.sidebar:
432
  st.title("🌿 Hierarchy Explorer")
433
  st.caption("TFM 2026 β€” Metadata hierarchy construction")
434
  st.markdown("---")
 
435
 
436
- approach = st.radio("**Select Approach**",
437
- ["Baseline", "Approach 1", "Approach 2"], index=0)
438
- dataset = st.radio("**Select Dataset**", ["AI-MIND", "HCP"], index=0)
439
-
440
- st.markdown("---")
441
- st.caption("Results are pre-built from the thesis experiments. To run on your "
442
- "own data, clone the repository and run the individual apps.")
443
- st.markdown("[πŸ“¦ GitHub Repository]"
444
- "(https://github.com/RoophaSharon/tfm_metadata_hierarchy_2026)")
445
-
446
- # ─────────────────────────────────────────────────────────────────────────────
447
- # MAIN
448
- # ─────────────────────────────────────────────────────────────────────────────
449
- cfg = CONFIG[approach]
450
- color = cfg["color"]
451
-
452
- st.title(f"πŸ“Š {approach} β€” {dataset} Dataset")
453
- st.markdown(f"> {APPROACH_DESC[approach]}")
454
-
455
- paths = PREBUILT[approach][dataset]
456
- hier_path = paths.get("hierarchy")
457
- if hier_path is None or not hier_path.exists():
458
- st.error(f"Pre-built result not found: `{hier_path}`")
459
- st.stop()
460
 
461
- raw_nodes = _load_json(str(hier_path))
462
-
463
- leaves, aggs = count_nodes(raw_nodes)
464
- c1, c2, c3 = st.columns(3)
465
- c1.metric("Leaf Variables", leaves)
466
- c2.metric("Aggregation Nodes", aggs)
467
- c3.metric("Total Nodes", leaves + aggs)
468
-
469
- # ── Build summary (collapsed) ────────────────────────────────────────────────
470
- facet_path = paths.get("facets")
471
- n_facets = None
472
- if facet_path is not None and facet_path.exists():
473
- try:
474
- n_facets = len(_load_json(str(facet_path)))
475
- except Exception:
476
- n_facets = None
477
-
478
- with st.expander("ℹ️ Build summary", expanded=False):
479
- bs1, bs2, bs3, bs4 = st.columns(4)
480
- bs1.metric("Variables", leaves)
481
- bs2.metric("Internal nodes", aggs)
482
- bs3.metric("Tree depth", _tree_depth(raw_nodes))
483
- bs4.metric("Facets", n_facets if n_facets is not None else "β€”")
484
- pct = concept_aligned_pct(raw_nodes)
485
- if pct is not None:
486
- st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
487
- st.caption(
488
- f"Source file: `{hier_path.name}` Β· "
489
- f"Approach: **{approach}** Β· Dataset: **{dataset}**. "
490
- "Tree topology and labels are reproduced exactly from the pre-built "
491
- "thesis output (the algorithms are not re-run in this viewer)."
492
- )
493
-
494
- # ── Downloads ────────────────────────────────────────────────────────────────
495
- d1, d2, d3 = st.columns(3)
496
- with d1:
497
- st.download_button("⬇️ Hierarchy JSON", data=_read_bytes(str(hier_path)),
498
- file_name=hier_path.name, mime="application/json",
499
- use_container_width=True)
500
- with d2:
501
- if facet_path is not None and facet_path.exists():
502
- st.download_button("⬇️ Facets JSON", data=_read_bytes(str(facet_path)),
503
- file_name=facet_path.name, mime="application/json",
504
- use_container_width=True)
505
- else:
506
- st.button("⬇️ Facets JSON", disabled=True, use_container_width=True,
507
- help="This approach/dataset has no facet tree.")
508
- with d3:
509
- st.download_button("⬇️ All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
510
- file_name="metadata_hierarchy_outputs.zip",
511
- mime="application/zip", use_container_width=True)
512
-
513
- st.markdown("---")
514
-
515
- # ── Level-of-Detail controls (above chart β€” matches the apps) ────────────────
516
- view_options = ["Sunburst (drill-down)", "Treemap"]
517
- if cfg["node_link"]:
518
- view_options.append("Node-link tree")
519
-
520
- if cfg["compress"]:
521
- vc1, vc2, vc3, vc4, vc5 = st.columns([2.4, 2, 1, 1, 1.2])
522
- else:
523
- vc1, vc2, vc3, vc4 = st.columns([2.4, 2, 1, 1])
524
- vc5 = None
525
-
526
- with vc1:
527
- viz_mode = st.radio("View mode", view_options, horizontal=True, index=0,
528
- help="Sunburst best for large hierarchies [Taxonomizer]. "
529
- "Node-link best for moderate-depth structure inspection.")
530
- with vc2:
531
- depth = st.slider("Depth (Level of Detail)", 1, 8, DEFAULT_DEPTH, 1)
532
- with vc3:
533
- show_leaf_labels = st.checkbox("Leaf labels", value=False)
534
- with vc4:
535
- show_hidden = st.checkbox("Hidden nodes", value=False)
536
- if vc5 is not None:
537
- with vc5:
538
- compress_chains = st.checkbox("Compress chains", value=True,
539
- help="Merge one-child aggregation chains "
540
- '(e.g. "DMS β†’ DMS Recommended Standard") for '
541
- "display. Export JSON keeps original structure.")
542
- else:
543
- compress_chains = False
544
-
545
- st.divider()
546
-
547
- display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
548
-
549
- if viz_mode == "Sunburst (drill-down)":
550
- eff = safe_render_depth(display_nodes, depth)
551
- if eff < depth:
552
- st.caption(f"Large hierarchy β€” showing {eff} levels initially to render "
553
- "reliably. **Click any sector to drill deeper.**")
554
- st.plotly_chart(plot_sunburst(display_nodes, color, eff), use_container_width=True)
555
- elif viz_mode == "Treemap":
556
- eff = safe_render_depth(display_nodes, depth)
557
- if eff < depth:
558
- st.caption(f"Large hierarchy β€” showing {eff} levels initially to render "
559
- "reliably. **Click a tile to drill deeper.**")
560
- st.plotly_chart(plot_treemap(display_nodes, color, eff), use_container_width=True)
561
- else:
562
- st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
563
- use_container_width=True)
564
-
565
- # ── Facets (Approach 1 only) ─────────────────────────────────────────────────
566
- if facet_path is not None and facet_path.exists():
567
  st.markdown("---")
568
- st.subheader("πŸ”€ Parallel facets")
569
- facets = _load_json(str(facet_path))
570
- names = list(facets.keys())
571
- if not names:
572
- st.info("No facets available for this dataset.")
573
- else:
574
- sel = st.selectbox("Select facet", names)
575
- fnodes = facets[sel]
576
- ft1, ft2 = st.tabs(["Sunburst", "Treemap"])
577
- with ft1:
578
- st.plotly_chart(plot_sunburst(fnodes, color, depth), use_container_width=True)
579
- with ft2:
580
- st.plotly_chart(plot_treemap(fnodes, color), use_container_width=True)
 
1
  """
2
  Metadata Hierarchy Explorer — TFM 2026
3
+ Navigation router (Streamlit st.navigation).
4
+
5
+ Sidebar layout:
6
+ 🌿 Hierarchy Explorer / TFM 2026 (branding, top)
7
+ 📊 Demo View (pre-built results viewer)
8
+ … the Demo View's own controls … (Select Approach / Dataset, etc.)
9
+ 🛠️ Build hierarchy (collapsible) (upload a CSV and run an app)
10
+ • Baseline • Approach 1 • Approach 2
 
11
  """
 
 
 
 
 
 
 
12
  import streamlit as st
13
 
 
 
 
14
  st.set_page_config(
15
  page_title="Metadata Hierarchy Explorer",
16
  page_icon="🌿",
17
  layout="wide",
18
  )
19
 
20
+ # ── Pages ────────────────────────────────────────────────────────────────────
21
+ viewer = st.Page("views/viewer.py", title="Demo View", icon="πŸ“Š", default=True)
22
+ base = st.Page("views/run_baseline.py", title="Baseline", icon="🟒")
23
+ appr1 = st.Page("views/run_approach_1.py", title="Approach 1", icon="🌳")
24
+ appr2 = st.Page("views/run_approach_2.py", title="Approach 2", icon="πŸ”¬")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+ # Hidden default nav — we render our own links so we control the order.
27
+ pg = st.navigation([viewer, base, appr1, appr2], position="hidden")
 
 
 
 
 
 
 
 
28
 
29
+ # ── Sidebar TOP: branding + Demo View link ──────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  with st.sidebar:
31
  st.title("🌿 Hierarchy Explorer")
32
  st.caption("TFM 2026 β€” Metadata hierarchy construction")
33
  st.markdown("---")
34
+ st.page_link(viewer, label="Demo View", icon="πŸ“Š")
35
 
36
+ # ── The selected page renders here (its own sidebar controls included) ───────
37
+ pg.run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ # ── Sidebar BOTTOM: collapsible "Build hierarchy" group ─────────────────────
40
+ with st.sidebar:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  st.markdown("---")
42
+ with st.expander("πŸ› οΈ Build hierarchy", expanded=False):
43
+ st.caption("Upload your own CSV and run an algorithm live.")
44
+ st.page_link(base, label="Baseline", icon="🟒")
45
+ st.page_link(appr1, label="Approach 1", icon="🌳")
46
+ st.page_link(appr2, label="Approach 2", icon="πŸ”¬")
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- ο»Ώstreamlit>=1.30
2
  pandas>=2.0
3
  numpy>=1.24
4
  scikit-learn>=1.3
 
1
+ ο»Ώstreamlit>=1.43
2
  pandas>=2.0
3
  numpy>=1.24
4
  scikit-learn>=1.3
pages/2_Approach_1.py → views/run_approach_1.py RENAMED
@@ -57,7 +57,7 @@ except Exception:
57
 
58
  warnings.filterwarnings('ignore')
59
 
60
- st.set_page_config(page_title='Metadata Hierarchy β€” Approach 1', page_icon='🌳', layout='wide')
61
  st.title('Metadata Hierarchy Builder β€” Approach 1')
62
  st.caption(
63
  'Automatic concept-label extraction from metadata text + HiExpan refinement + Castanet facets. '
@@ -3876,7 +3876,7 @@ if uploads:
3876
  if warn:
3877
  st.warning('Looked like raw data β€” columns converted to metadata rows.')
3878
  st.write(f'Rows: **{len(df):,}**, Columns: **{len(df.columns)}**')
3879
- st.dataframe(df.head(10), use_container_width=True)
3880
  except Exception as e:
3881
  st.error(f'Failed to load {p.name}: {e}')
3882
 
@@ -3902,7 +3902,7 @@ if uploads:
3902
  key=f'meta_{name}')
3903
  prev = list(dict.fromkeys(leaf + group + text + meta))
3904
  if prev:
3905
- st.dataframe(df[prev].head(6), use_container_width=True)
3906
  configs[name] = {'leaf_cols': leaf, 'group_cols': group,
3907
  'text_cols': text, 'metadata_cols': meta}
3908
 
@@ -4202,12 +4202,12 @@ with tabs[0]:
4202
 
4203
  if viz_mode == 'Sunburst (drill-down)':
4204
  st.caption('Hover for concept provenance (confidence, source, alternatives). Click to drill down.')
4205
- st.plotly_chart(plot_sunburst(nodes, depth), use_container_width=True)
4206
  elif viz_mode == 'Treemap':
4207
- st.plotly_chart(plot_treemap(nodes), use_container_width=True)
4208
  else:
4209
  st.plotly_chart(plot_node_link(nodes, depth, show_hidden, show_leaf_labels),
4210
- use_container_width=True)
4211
  pr = path_rows(nodes)
4212
  max_d = max((r['depth'] for r in pr), default=0)
4213
  c1, c2, c3 = st.columns(3)
@@ -4225,7 +4225,7 @@ with tabs[0]:
4225
  exp_rows = [{'Segment': seg, 'Expansion': v['expansion'],
4226
  'Evidence': ', '.join(v['evidence'])}
4227
  for seg, v in code_exp.items()]
4228
- st.dataframe(pd.DataFrame(exp_rows), use_container_width=True)
4229
 
4230
  # Concept label provenance for internal nodes
4231
  prov_rows = []
@@ -4241,7 +4241,7 @@ with tabs[0]:
4241
  })
4242
  if prov_rows:
4243
  with st.expander('Concept label provenance for internal nodes', expanded=False):
4244
- st.dataframe(pd.DataFrame(prov_rows), use_container_width=True)
4245
 
4246
  # ── Tab 1: Faceted view ───────────────────────────────────────────────────────
4247
  with tabs[1]:
@@ -4251,11 +4251,11 @@ with tabs[1]:
4251
  'Concept facet uses automatically assigned labels from embedding alignment.'
4252
  )
4253
  if facet_trees:
4254
- st.plotly_chart(plot_facets_parallel(facet_trees), use_container_width=True)
4255
  st.markdown('### Per-facet detail')
4256
  sel_facet = st.selectbox('Inspect facet tree', list(facet_trees.keys()))
4257
  ft = facet_trees[sel_facet]
4258
- st.plotly_chart(plot_sunburst(ft, max_depth=3), use_container_width=True)
4259
  n_groups = len([n for n in ft if n.get('type') == 'aggregation'])
4260
  st.info(f'Facet **{sel_facet}**: {n_groups} groups, '
4261
  f'{len([n for n in ft if n.get("type")=="attribute"])} variables')
@@ -4273,11 +4273,11 @@ with tabs[2]:
4273
  st.markdown('### Sibling coherence β€” before refinement (worst first)')
4274
  before = hiexpan_report.get('coherence_before', [])
4275
  if before:
4276
- st.dataframe(pd.DataFrame(before), use_container_width=True)
4277
  st.markdown('### Sibling coherence β€” after refinement')
4278
  after = hiexpan_report.get('coherence_after', [])
4279
  if after:
4280
- st.dataframe(pd.DataFrame(after), use_container_width=True)
4281
  b_mean = np.mean([r['coherence_score'] for r in before]) if before else float('nan')
4282
  a_mean = np.mean([r['coherence_score'] for r in after])
4283
  st.metric('Mean coherence improvement',
@@ -4324,7 +4324,7 @@ with tabs[3]:
4324
  if can is not None:
4325
  conflict_df = compute_conflict_table(can, nodes)
4326
  if len(conflict_df):
4327
- st.dataframe(conflict_df, use_container_width=True)
4328
  else:
4329
  st.success('No low-confidence placements detected.')
4330
  else:
@@ -4393,7 +4393,7 @@ with tabs[4]:
4393
  'type': c.get('type'),
4394
  'relation': c.get('info', {}).get('relation_label', ''),
4395
  'desc': str(c.get('desc', ''))[:120]}
4396
- for c in cns if c]), use_container_width=True)
4397
 
4398
  # ── Tab 5: Search ─────────────────────────────────────────────────────────────
4399
  with tabs[5]:
@@ -4407,14 +4407,14 @@ with tabs[5]:
4407
  'relation': n.get('info', {}).get('relation_label', ''),
4408
  'n_children': len(n.get('related', [])),
4409
  'desc': str(n.get('desc', ''))[:200]})
4410
- st.dataframe(pd.DataFrame(out_), use_container_width=True)
4411
 
4412
  # ── Tab 6: Semantic map ───────────────────────────────────────────────────────
4413
  with tabs[6]:
4414
  if can is None or len(can) < 3:
4415
  st.info('Semantic map available after build.')
4416
  else:
4417
- st.plotly_chart(semantic_map(can), use_container_width=True)
4418
 
4419
  # ── Tab 7: Metadata ───────────────────────────────────────────────────────────
4420
  with tabs[7]:
@@ -4422,7 +4422,7 @@ with tabs[7]:
4422
  st.info('Available after build.')
4423
  else:
4424
  show_cols = [c for c in can.columns if c != '_raw']
4425
- st.dataframe(can[show_cols], use_container_width=True)
4426
 
4427
  # ── Tab 8: Export ─────────────────────────────────────────────────────────────
4428
  with tabs[8]:
@@ -4438,7 +4438,7 @@ with tabs[8]:
4438
  data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
4439
  file_name=f'{_base}_approach1_hierarchy.json',
4440
  mime='application/json',
4441
- use_container_width=True,
4442
  )
4443
  with col2:
4444
  if facet_trees:
@@ -4447,7 +4447,7 @@ with tabs[8]:
4447
  data=json.dumps(facet_trees, indent=2, ensure_ascii=False).encode('utf-8'),
4448
  file_name=f'{_base}_approach1_facets.json',
4449
  mime='application/json',
4450
- use_container_width=True,
4451
  )
4452
 
4453
  col3, col4 = st.columns(2)
@@ -4458,7 +4458,7 @@ with tabs[8]:
4458
  data=can.drop(columns=['_raw'], errors='ignore').to_csv(index=False).encode('utf-8'),
4459
  file_name=f'{_base}_approach1_canonical.csv',
4460
  mime='text/csv',
4461
- use_container_width=True,
4462
  )
4463
  with col4:
4464
  _prov_df = st.session_state.get('prov_df', pd.DataFrame())
@@ -4468,7 +4468,7 @@ with tabs[8]:
4468
  data=_prov_df.to_csv(index=False).encode('utf-8'),
4469
  file_name=f'{_base}_approach1_concept_labels.csv',
4470
  mime='text/csv',
4471
- use_container_width=True,
4472
  )
4473
 
4474
  st.divider()
@@ -4481,7 +4481,7 @@ with tabs[8]:
4481
  'dataset name β€” convenient for `evaluate_all.py`.'
4482
  )
4483
  if st.button('πŸ’Ύ Save all to outputs/approach_1/', type='primary',
4484
- use_container_width=True):
4485
  try:
4486
  _out_dir.mkdir(parents=True, exist_ok=True)
4487
  saved = []
 
57
 
58
  warnings.filterwarnings('ignore')
59
 
60
+ # set_page_config handled by the navigation router (demo.py)
61
  st.title('Metadata Hierarchy Builder β€” Approach 1')
62
  st.caption(
63
  'Automatic concept-label extraction from metadata text + HiExpan refinement + Castanet facets. '
 
3876
  if warn:
3877
  st.warning('Looked like raw data β€” columns converted to metadata rows.')
3878
  st.write(f'Rows: **{len(df):,}**, Columns: **{len(df.columns)}**')
3879
+ st.dataframe(df.head(10), width='stretch')
3880
  except Exception as e:
3881
  st.error(f'Failed to load {p.name}: {e}')
3882
 
 
3902
  key=f'meta_{name}')
3903
  prev = list(dict.fromkeys(leaf + group + text + meta))
3904
  if prev:
3905
+ st.dataframe(df[prev].head(6), width='stretch')
3906
  configs[name] = {'leaf_cols': leaf, 'group_cols': group,
3907
  'text_cols': text, 'metadata_cols': meta}
3908
 
 
4202
 
4203
  if viz_mode == 'Sunburst (drill-down)':
4204
  st.caption('Hover for concept provenance (confidence, source, alternatives). Click to drill down.')
4205
+ st.plotly_chart(plot_sunburst(nodes, depth), width='stretch')
4206
  elif viz_mode == 'Treemap':
4207
+ st.plotly_chart(plot_treemap(nodes), width='stretch')
4208
  else:
4209
  st.plotly_chart(plot_node_link(nodes, depth, show_hidden, show_leaf_labels),
4210
+ width='stretch')
4211
  pr = path_rows(nodes)
4212
  max_d = max((r['depth'] for r in pr), default=0)
4213
  c1, c2, c3 = st.columns(3)
 
4225
  exp_rows = [{'Segment': seg, 'Expansion': v['expansion'],
4226
  'Evidence': ', '.join(v['evidence'])}
4227
  for seg, v in code_exp.items()]
4228
+ st.dataframe(pd.DataFrame(exp_rows), width='stretch')
4229
 
4230
  # Concept label provenance for internal nodes
4231
  prov_rows = []
 
4241
  })
4242
  if prov_rows:
4243
  with st.expander('Concept label provenance for internal nodes', expanded=False):
4244
+ st.dataframe(pd.DataFrame(prov_rows), width='stretch')
4245
 
4246
  # ── Tab 1: Faceted view ───────────────────────────────────────────────────────
4247
  with tabs[1]:
 
4251
  'Concept facet uses automatically assigned labels from embedding alignment.'
4252
  )
4253
  if facet_trees:
4254
+ st.plotly_chart(plot_facets_parallel(facet_trees), width='stretch')
4255
  st.markdown('### Per-facet detail')
4256
  sel_facet = st.selectbox('Inspect facet tree', list(facet_trees.keys()))
4257
  ft = facet_trees[sel_facet]
4258
+ st.plotly_chart(plot_sunburst(ft, max_depth=3), width='stretch')
4259
  n_groups = len([n for n in ft if n.get('type') == 'aggregation'])
4260
  st.info(f'Facet **{sel_facet}**: {n_groups} groups, '
4261
  f'{len([n for n in ft if n.get("type")=="attribute"])} variables')
 
4273
  st.markdown('### Sibling coherence β€” before refinement (worst first)')
4274
  before = hiexpan_report.get('coherence_before', [])
4275
  if before:
4276
+ st.dataframe(pd.DataFrame(before), width='stretch')
4277
  st.markdown('### Sibling coherence β€” after refinement')
4278
  after = hiexpan_report.get('coherence_after', [])
4279
  if after:
4280
+ st.dataframe(pd.DataFrame(after), width='stretch')
4281
  b_mean = np.mean([r['coherence_score'] for r in before]) if before else float('nan')
4282
  a_mean = np.mean([r['coherence_score'] for r in after])
4283
  st.metric('Mean coherence improvement',
 
4324
  if can is not None:
4325
  conflict_df = compute_conflict_table(can, nodes)
4326
  if len(conflict_df):
4327
+ st.dataframe(conflict_df, width='stretch')
4328
  else:
4329
  st.success('No low-confidence placements detected.')
4330
  else:
 
4393
  'type': c.get('type'),
4394
  'relation': c.get('info', {}).get('relation_label', ''),
4395
  'desc': str(c.get('desc', ''))[:120]}
4396
+ for c in cns if c]), width='stretch')
4397
 
4398
  # ── Tab 5: Search ─────────────────────────────────────────────────────────────
4399
  with tabs[5]:
 
4407
  'relation': n.get('info', {}).get('relation_label', ''),
4408
  'n_children': len(n.get('related', [])),
4409
  'desc': str(n.get('desc', ''))[:200]})
4410
+ st.dataframe(pd.DataFrame(out_), width='stretch')
4411
 
4412
  # ── Tab 6: Semantic map ───────────────────────────────────────────────────────
4413
  with tabs[6]:
4414
  if can is None or len(can) < 3:
4415
  st.info('Semantic map available after build.')
4416
  else:
4417
+ st.plotly_chart(semantic_map(can), width='stretch')
4418
 
4419
  # ── Tab 7: Metadata ───────────────────────────────────────────────────────────
4420
  with tabs[7]:
 
4422
  st.info('Available after build.')
4423
  else:
4424
  show_cols = [c for c in can.columns if c != '_raw']
4425
+ st.dataframe(can[show_cols], width='stretch')
4426
 
4427
  # ── Tab 8: Export ─────────────────────────────────────────────────────────────
4428
  with tabs[8]:
 
4438
  data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
4439
  file_name=f'{_base}_approach1_hierarchy.json',
4440
  mime='application/json',
4441
+ width='stretch',
4442
  )
4443
  with col2:
4444
  if facet_trees:
 
4447
  data=json.dumps(facet_trees, indent=2, ensure_ascii=False).encode('utf-8'),
4448
  file_name=f'{_base}_approach1_facets.json',
4449
  mime='application/json',
4450
+ width='stretch',
4451
  )
4452
 
4453
  col3, col4 = st.columns(2)
 
4458
  data=can.drop(columns=['_raw'], errors='ignore').to_csv(index=False).encode('utf-8'),
4459
  file_name=f'{_base}_approach1_canonical.csv',
4460
  mime='text/csv',
4461
+ width='stretch',
4462
  )
4463
  with col4:
4464
  _prov_df = st.session_state.get('prov_df', pd.DataFrame())
 
4468
  data=_prov_df.to_csv(index=False).encode('utf-8'),
4469
  file_name=f'{_base}_approach1_concept_labels.csv',
4470
  mime='text/csv',
4471
+ width='stretch',
4472
  )
4473
 
4474
  st.divider()
 
4481
  'dataset name β€” convenient for `evaluate_all.py`.'
4482
  )
4483
  if st.button('πŸ’Ύ Save all to outputs/approach_1/', type='primary',
4484
+ width='stretch'):
4485
  try:
4486
  _out_dir.mkdir(parents=True, exist_ok=True)
4487
  saved = []
pages/3_Approach_2.py β†’ views/run_approach_2.py RENAMED
@@ -3467,8 +3467,7 @@ def plot_node_link(nodes: list, max_depth: int = 4,
3467
  # ──────────────────────────────────────────────────────────────────────────────
3468
  # STREAMLIT APP
3469
  # ──────────────────────────────────────────────────────────────────────────────
3470
- st.set_page_config(page_title='Approach 2 β€” Multi-Aspect Hierarchy', page_icon='πŸ”¬',
3471
- layout='wide')
3472
  st.title('πŸ”¬ Approach 2 β€” Role-Decomposed Metadata Hierarchy')
3473
  st.caption('Group anchoring β†’ LLM role extraction β†’ role-nested LoD tree. '
3474
  'Full method details and citations in the Method tab.')
@@ -3613,7 +3612,7 @@ if uploads:
3613
  cfg_by[f.name] = detect_roles(df)
3614
  with st.expander(f'πŸ“„ {f.name}', expanded=False):
3615
  st.write(f'Rows: **{len(df):,}** Columns: **{len(df.columns)}**')
3616
- st.dataframe(df.head(8), use_container_width=True)
3617
  except Exception as e:
3618
  st.error(f'Could not load {f.name}: {e}')
3619
 
@@ -3813,13 +3812,13 @@ with tabs[0]:
3813
 
3814
  if viz_mode == 'Sunburst (drill-down)':
3815
  st.plotly_chart(plot_sunburst(display_nodes, max_depth=depth_display),
3816
- use_container_width=True)
3817
  elif viz_mode == 'Treemap':
3818
- st.plotly_chart(plot_treemap(display_nodes), use_container_width=True)
3819
  else:
3820
  st.plotly_chart(plot_node_link(display_nodes, depth_display,
3821
  show_hidden, show_leaf_labels),
3822
- use_container_width=True)
3823
 
3824
  n_l = len([n for n in nodes if n.get('type') == 'attribute'])
3825
  n_i = len([n for n in nodes if n.get('type') == 'aggregation'])
@@ -3912,7 +3911,7 @@ with tabs[1]:
3912
  W_df = pd.DataFrame(
3913
  W, columns=[f'Aspect {k+1}: {alabs[k][:30]}' for k in range(W.shape[1])])
3914
  W_df.insert(0, 'Variable', can['_label'].tolist())
3915
- st.dataframe(W_df.round(4), use_container_width=True)
3916
 
3917
  with tabs[2]:
3918
  st.markdown('### Role decomposition')
@@ -3935,7 +3934,7 @@ with tabs[2]:
3935
  if reg_rows:
3936
  reg_df = pd.DataFrame(reg_rows).sort_values(
3937
  'Regularity', ascending=False, na_position='last')
3938
- st.dataframe(reg_df, use_container_width=True, hide_index=True)
3939
 
3940
  # ── Per-variable role table ───────────────────────────────────────────
3941
  st.markdown('#### Per-variable role table')
@@ -3996,7 +3995,7 @@ with tabs[2]:
3996
 
3997
  if role_rows:
3998
  role_df = pd.DataFrame(role_rows)
3999
- st.dataframe(role_df, use_container_width=True, hide_index=True)
4000
  st.download_button(
4001
  '⬇️ Download per-variable role CSV',
4002
  data=role_df.to_csv(index=False).encode('utf-8'),
@@ -4021,7 +4020,7 @@ with tabs[2]:
4021
  'Reasons': ', '.join(f'{k}:{v}' for k, v in
4022
  (a.get('summary', {}) or {}).items()),
4023
  })
4024
- st.dataframe(pd.DataFrame(sum_rows), use_container_width=True,
4025
  hide_index=True)
4026
 
4027
  # Drill-down per group
@@ -4050,7 +4049,7 @@ with tabs[2]:
4050
  })
4051
  if row_rows:
4052
  st.dataframe(pd.DataFrame(row_rows),
4053
- use_container_width=True, hide_index=True)
4054
  # Download as CSV for offline analysis
4055
  csv_bytes = pd.DataFrame(row_rows).to_csv(index=False).encode('utf-8')
4056
  st.download_button(
@@ -4129,16 +4128,16 @@ with tabs[3]:
4129
  & (prov_df['LLM proposed'].astype(str).str.len() > 0)]
4130
  if len(rej):
4131
  st.dataframe(rej[['Node', 'LLM proposed', 'LLM reason']],
4132
- use_container_width=True, hide_index=True)
4133
 
4134
  # ── Full provenance table ─────────────────────────────────────────────
4135
  st.write('**Full per-node provenance**')
4136
- st.dataframe(prov_df, use_container_width=True, hide_index=True)
4137
 
4138
  with tabs[4]:
4139
  if can is not None:
4140
  st.dataframe(can.drop(columns=['_row'], errors='ignore'),
4141
- use_container_width=True)
4142
 
4143
  with tabs[5]:
4144
  # ── derive a per-CSV base name from the uploaded files ────────────────────
@@ -4169,7 +4168,7 @@ with tabs[5]:
4169
  data=json.dumps(nodes, indent=2, ensure_ascii=False).encode(),
4170
  file_name=f'{csv_basis}_approach2_lod.json',
4171
  mime='application/json',
4172
- use_container_width=True,
4173
  )
4174
  with col2:
4175
  if can is not None:
@@ -4178,7 +4177,7 @@ with tabs[5]:
4178
  data=can.to_csv(index=False).encode('utf-8'),
4179
  file_name=f'{csv_basis}_approach2_canonical.csv',
4180
  mime='text/csv',
4181
- use_container_width=True,
4182
  )
4183
 
4184
  st.divider()
@@ -4191,7 +4190,7 @@ with tabs[5]:
4191
  'dataset name β€” convenient for `evaluate_all.py`.'
4192
  )
4193
  if st.button('πŸ’Ύ Save all to outputs/approach_2/', type='primary',
4194
- use_container_width=True):
4195
  try:
4196
  _out_dir.mkdir(parents=True, exist_ok=True)
4197
  saved = []
 
3467
  # ──────────────────────────────────────────────────────────────────────────────
3468
  # STREAMLIT APP
3469
  # ──────────────────────────────────────────────────────────────────────────────
3470
+ # set_page_config handled by the navigation router (demo.py)
 
3471
  st.title('πŸ”¬ Approach 2 β€” Role-Decomposed Metadata Hierarchy')
3472
  st.caption('Group anchoring β†’ LLM role extraction β†’ role-nested LoD tree. '
3473
  'Full method details and citations in the Method tab.')
 
3612
  cfg_by[f.name] = detect_roles(df)
3613
  with st.expander(f'πŸ“„ {f.name}', expanded=False):
3614
  st.write(f'Rows: **{len(df):,}** Columns: **{len(df.columns)}**')
3615
+ st.dataframe(df.head(8), width='stretch')
3616
  except Exception as e:
3617
  st.error(f'Could not load {f.name}: {e}')
3618
 
 
3812
 
3813
  if viz_mode == 'Sunburst (drill-down)':
3814
  st.plotly_chart(plot_sunburst(display_nodes, max_depth=depth_display),
3815
+ width='stretch')
3816
  elif viz_mode == 'Treemap':
3817
+ st.plotly_chart(plot_treemap(display_nodes), width='stretch')
3818
  else:
3819
  st.plotly_chart(plot_node_link(display_nodes, depth_display,
3820
  show_hidden, show_leaf_labels),
3821
+ width='stretch')
3822
 
3823
  n_l = len([n for n in nodes if n.get('type') == 'attribute'])
3824
  n_i = len([n for n in nodes if n.get('type') == 'aggregation'])
 
3911
  W_df = pd.DataFrame(
3912
  W, columns=[f'Aspect {k+1}: {alabs[k][:30]}' for k in range(W.shape[1])])
3913
  W_df.insert(0, 'Variable', can['_label'].tolist())
3914
+ st.dataframe(W_df.round(4), width='stretch')
3915
 
3916
  with tabs[2]:
3917
  st.markdown('### Role decomposition')
 
3934
  if reg_rows:
3935
  reg_df = pd.DataFrame(reg_rows).sort_values(
3936
  'Regularity', ascending=False, na_position='last')
3937
+ st.dataframe(reg_df, width='stretch', hide_index=True)
3938
 
3939
  # ── Per-variable role table ───────────────────────────────────────────
3940
  st.markdown('#### Per-variable role table')
 
3995
 
3996
  if role_rows:
3997
  role_df = pd.DataFrame(role_rows)
3998
+ st.dataframe(role_df, width='stretch', hide_index=True)
3999
  st.download_button(
4000
  '⬇️ Download per-variable role CSV',
4001
  data=role_df.to_csv(index=False).encode('utf-8'),
 
4020
  'Reasons': ', '.join(f'{k}:{v}' for k, v in
4021
  (a.get('summary', {}) or {}).items()),
4022
  })
4023
+ st.dataframe(pd.DataFrame(sum_rows), width='stretch',
4024
  hide_index=True)
4025
 
4026
  # Drill-down per group
 
4049
  })
4050
  if row_rows:
4051
  st.dataframe(pd.DataFrame(row_rows),
4052
+ width='stretch', hide_index=True)
4053
  # Download as CSV for offline analysis
4054
  csv_bytes = pd.DataFrame(row_rows).to_csv(index=False).encode('utf-8')
4055
  st.download_button(
 
4128
  & (prov_df['LLM proposed'].astype(str).str.len() > 0)]
4129
  if len(rej):
4130
  st.dataframe(rej[['Node', 'LLM proposed', 'LLM reason']],
4131
+ width='stretch', hide_index=True)
4132
 
4133
  # ── Full provenance table ─────────────────────────────────────────────
4134
  st.write('**Full per-node provenance**')
4135
+ st.dataframe(prov_df, width='stretch', hide_index=True)
4136
 
4137
  with tabs[4]:
4138
  if can is not None:
4139
  st.dataframe(can.drop(columns=['_row'], errors='ignore'),
4140
+ width='stretch')
4141
 
4142
  with tabs[5]:
4143
  # ── derive a per-CSV base name from the uploaded files ────────────────────
 
4168
  data=json.dumps(nodes, indent=2, ensure_ascii=False).encode(),
4169
  file_name=f'{csv_basis}_approach2_lod.json',
4170
  mime='application/json',
4171
+ width='stretch',
4172
  )
4173
  with col2:
4174
  if can is not None:
 
4177
  data=can.to_csv(index=False).encode('utf-8'),
4178
  file_name=f'{csv_basis}_approach2_canonical.csv',
4179
  mime='text/csv',
4180
+ width='stretch',
4181
  )
4182
 
4183
  st.divider()
 
4190
  'dataset name β€” convenient for `evaluate_all.py`.'
4191
  )
4192
  if st.button('πŸ’Ύ Save all to outputs/approach_2/', type='primary',
4193
+ width='stretch'):
4194
  try:
4195
  _out_dir.mkdir(parents=True, exist_ok=True)
4196
  saved = []
pages/1_Baseline.py β†’ views/run_baseline.py RENAMED
@@ -40,7 +40,7 @@ from sklearn.preprocessing import LabelEncoder
40
 
41
  warnings.filterwarnings('ignore')
42
 
43
- st.set_page_config(page_title='Metadata Hierarchy β€” Baseline', page_icon='🌿', layout='wide')
44
  st.title('Metadata Hierarchy Builder β€” Baseline (Taxonomizer)')
45
  st.caption(
46
  'Pure Taxonomizer baseline: TF-IDF text objects + recursive agglomerative '
@@ -562,11 +562,11 @@ with st.spinner('Loading file…'):
562
  st.subheader('Step 1 β€” File preview')
563
  with st.expander(f'πŸ“„ {uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
564
  expanded=False):
565
- st.dataframe(df.head(10), use_container_width=True)
566
  score_cols = [c for c in ['column', 'leaf_score', 'group_score', 'text_score', 'metadata_score']
567
  if c in prof.columns]
568
  st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
569
- use_container_width=True)
570
 
571
  st.subheader('Step 2 β€” Confirm column roles')
572
  cols = list(df.columns)
@@ -639,11 +639,11 @@ c4.metric('Avg branching', _sm['avg_branching_factor'])
639
  tabs = st.tabs(['Sunburst', 'Treemap', 'Node detail', 'Canonical table', 'Export', 'πŸ“Š Evaluation'])
640
 
641
  with tabs[0]:
642
- st.plotly_chart(plot_sunburst(nodes, max_depth=display_depth), use_container_width=True)
643
  st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')
644
 
645
  with tabs[1]:
646
- st.plotly_chart(plot_treemap(nodes), use_container_width=True)
647
 
648
  with tabs[2]:
649
  nm = _nmap(nodes)
@@ -661,10 +661,10 @@ with tabs[2]:
661
  sub = can[can['_leaf_id'].isin(leaf_ids_set)]
662
  st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
663
  st.dataframe(sub[['_leaf_label', '_group_path', '_text']].reset_index(drop=True),
664
- use_container_width=True)
665
 
666
  with tabs[3]:
667
- st.dataframe(can, use_container_width=True)
668
 
669
  with tabs[4]:
670
  _base = safe_name(project_name)
@@ -675,7 +675,7 @@ with tabs[4]:
675
  data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
676
  file_name=f'{_base}_baseline_hierarchy.json',
677
  mime='application/json',
678
- use_container_width=True,
679
  )
680
  with col2:
681
  st.download_button(
@@ -683,7 +683,7 @@ with tabs[4]:
683
  data=can.to_csv(index=False).encode('utf-8'),
684
  file_name=f'{_base}_baseline_canonical.csv',
685
  mime='text/csv',
686
- use_container_width=True,
687
  )
688
 
689
  st.divider()
@@ -696,7 +696,7 @@ with tabs[4]:
696
  'dataset name β€” convenient for `evaluate_all.py`.'
697
  )
698
  if st.button('πŸ’Ύ Save all to outputs/baseline/', type='primary',
699
- use_container_width=True):
700
  try:
701
  _out_dir.mkdir(parents=True, exist_ok=True)
702
  (_out_dir / f'{_base}_baseline_hierarchy.json').write_text(
 
40
 
41
  warnings.filterwarnings('ignore')
42
 
43
+ # set_page_config handled by the navigation router (demo.py)
44
  st.title('Metadata Hierarchy Builder β€” Baseline (Taxonomizer)')
45
  st.caption(
46
  'Pure Taxonomizer baseline: TF-IDF text objects + recursive agglomerative '
 
562
  st.subheader('Step 1 β€” File preview')
563
  with st.expander(f'πŸ“„ {uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
564
  expanded=False):
565
+ st.dataframe(df.head(10), width='stretch')
566
  score_cols = [c for c in ['column', 'leaf_score', 'group_score', 'text_score', 'metadata_score']
567
  if c in prof.columns]
568
  st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
569
+ width='stretch')
570
 
571
  st.subheader('Step 2 β€” Confirm column roles')
572
  cols = list(df.columns)
 
639
  tabs = st.tabs(['Sunburst', 'Treemap', 'Node detail', 'Canonical table', 'Export', 'πŸ“Š Evaluation'])
640
 
641
  with tabs[0]:
642
+ st.plotly_chart(plot_sunburst(nodes, max_depth=display_depth), width='stretch')
643
  st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')
644
 
645
  with tabs[1]:
646
+ st.plotly_chart(plot_treemap(nodes), width='stretch')
647
 
648
  with tabs[2]:
649
  nm = _nmap(nodes)
 
661
  sub = can[can['_leaf_id'].isin(leaf_ids_set)]
662
  st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
663
  st.dataframe(sub[['_leaf_label', '_group_path', '_text']].reset_index(drop=True),
664
+ width='stretch')
665
 
666
  with tabs[3]:
667
+ st.dataframe(can, width='stretch')
668
 
669
  with tabs[4]:
670
  _base = safe_name(project_name)
 
675
  data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
676
  file_name=f'{_base}_baseline_hierarchy.json',
677
  mime='application/json',
678
+ width='stretch',
679
  )
680
  with col2:
681
  st.download_button(
 
683
  data=can.to_csv(index=False).encode('utf-8'),
684
  file_name=f'{_base}_baseline_canonical.csv',
685
  mime='text/csv',
686
+ width='stretch',
687
  )
688
 
689
  st.divider()
 
696
  'dataset name β€” convenient for `evaluate_all.py`.'
697
  )
698
  if st.button('πŸ’Ύ Save all to outputs/baseline/', type='primary',
699
+ width='stretch'):
700
  try:
701
  _out_dir.mkdir(parents=True, exist_ok=True)
702
  (_out_dir / f'{_base}_baseline_hierarchy.json').write_text(
views/viewer.py ADDED
@@ -0,0 +1,562 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metadata Hierarchy Explorer — TFM 2026
3
+ Pre-built results viewer for Baseline, Approach 1, and Approach 2.
4
+
5
+ Rendering faithfully replicates each app's display pipeline:
6
+ - Baseline : raw tree, Greens, Sunburst + Treemap
7
+ - Approach 1 : raw tree, Blues, Sunburst + Treemap + Node-link + Facets
8
+ - Approach 2 : compress one-child chains, Viridis, Sunburst + Treemap + Node-link
9
+
10
+ Level-of-Detail controls (depth, leaf labels, hidden nodes, compress chains)
11
+ match the controls in the individual apps.
12
+ """
13
+ from __future__ import annotations
14
+ import json
15
+ from collections import defaultdict
16
+ from pathlib import Path
17
+
18
+ import numpy as np
19
+ import plotly.graph_objects as go
20
+ import streamlit as st
21
+
22
+ # Page config is set by the navigation router (demo.py).
23
# Directory holding the pre-built outputs: two levels above this file, in "outputs".
ROOT = Path(__file__).resolve().parent.parent / "outputs"

# Default value for the max_depth parameter of the plotting helpers.
DEFAULT_DEPTH = 7

# -----------------------------------------------------------------------------
# PRE-BUILT OUTPUT PATHS
# -----------------------------------------------------------------------------
# File-name stem of each dataset (the AI-MIND stem ends in "_", so the joined
# file names contain a double underscore).
_DATASET_STEMS = {
    "AI-MIND": "ai-mind-variable-descriptions_in_",
    "HCP": "HCP_S1200_DataDictionary_Oct_30_2023",
}

# approach -> (sub-folder under ROOT, {artefact kind: file-name suffix})
_APPROACH_ARTEFACTS = {
    "Baseline": ("baseline", {"hierarchy": "baseline_hierarchy"}),
    "Approach 1": ("approach_1", {"hierarchy": "approach1_hierarchy",
                                  "facets": "approach1_facets"}),
    "Approach 2": ("approach_2", {"hierarchy": "approach2_lod"}),
}

# approach -> dataset -> artefact kind -> path of the pre-built JSON file
PREBUILT = {
    approach: {
        dataset: {
            kind: ROOT / folder / f"{stem}_{suffix}.json"
            for kind, suffix in artefacts.items()
        }
        for dataset, stem in _DATASET_STEMS.items()
    }
    for approach, (folder, artefacts) in _APPROACH_ARTEFACTS.items()
}

# Per-approach rendering settings: colour scale name, whether one-child chains
# are compressed, and whether a node-link view is offered.
CONFIG = {
    "Baseline": dict(color="Greens", compress=False, node_link=False),
    "Approach 1": dict(color="Blues", compress=False, node_link=True),
    "Approach 2": dict(color="Viridis", compress=True, node_link=True),
}
57
+
58
# One-paragraph description of each approach, keyed by approach name.
# The text is user-facing, so it must contain real Unicode punctuation
# (em dash, multiplication sign, arrows, c-cedilla) rather than the
# mis-decoded byte sequences that UTF-8 text turns into when read with a
# single-byte code page.
APPROACH_DESC = {
    "Baseline": (
        "Pure clustering baseline — TF-IDF representation + recursive agglomerative "
        "(cosine) clustering, number of clusters chosen by silhouette. No external APIs, "
        "no neural embeddings. Node labels are the most discriminative terms per cluster."
    ),
    "Approach 1": (
        "Global embedding pipeline — SBERT + N×M concept-table alignment (Gonçalves 2019) "
        "+ HiExpan refinement (Shen et al. KDD 2018) + Castanet parallel facets. Optionally "
        "retrieves concept context from Wikidata / Wikipedia / WordNet / BioPortal."
    ),
    "Approach 2": (
        "Dataset-constrained multi-aspect hierarchy — group-anchored L1/L2 → phrase-slot "
        "mining → FASTopic semantic aspect discovery (Wu et al. NeurIPS 2024) → GMM/KMeans "
        "clustering → deterministic 5-stage label generation. Optional local-LLM refinement."
    ),
}
75
+
76
+ # ─────────────────────────────────────────────────────────────────────────────
77
+ # TREE TRANSFORMS (copied from approach_2.py — display-only, exact behaviour)
78
+ # ─────────────────────────────────────────────────────────────────────────────
79
def _filter_dissolved(nodes: list) -> list:
    """Drop dissolved / explicitly hidden nodes and unlink them from their parents.

    A node is dropped when its ``type`` is ``"dissolved"`` or its ``isShown``
    field is exactly ``False``. When nothing needs dropping the input list is
    returned unchanged (same object). Otherwise a new list of shallow node
    copies is returned, each with a ``related`` list of int ids that no longer
    mentions any dropped node.
    """
    removed = set()
    for node in nodes:
        if node.get("type") == "dissolved" or node.get("isShown") is False:
            removed.add(int(node["id"]))
    if not removed:
        return nodes

    def _without_removed_children(node: dict) -> dict:
        # Shallow copy so the caller's node dicts are never modified.
        clone = dict(node)
        clone["related"] = [int(child) for child in node.get("related", [])
                            if int(child) not in removed]
        return clone

    return [_without_removed_children(node)
            for node in nodes
            if int(node["id"]) not in removed]
92
+
93
def compress_one_child_chains(nodes: list) -> list:
    """Merge every aggregation node with its only child when that child is also an aggregation.

    Example: 'DMS -> DMS Recommended Standard' becomes the single node
    'DMS / DMS Recommended Standard'. The merged node is a copy of the child
    that keeps the parent's id; its name and description are the parent's and
    the child's joined with ' / ' and ' | ' respectively. Dissolved / hidden
    nodes are removed first. Returns a new list of node dicts.
    """
    nodes = _filter_dissolved(nodes)
    by_id = {int(node["id"]): dict(node) for node in nodes}

    def _only_aggregation_child(node: dict):
        """Return the id of node's single aggregation child, or None if it is not a chain link."""
        if node.get("type") != "aggregation":
            return None
        kids = node.get("related", [])
        if len(kids) != 1:
            return None
        if by_id.get(int(kids[0]), {}).get("type") != "aggregation":
            return None
        return int(kids[0])

    while True:
        # Find the first chain link in insertion order; stop when none is left.
        link = None
        for parent_id, parent in by_id.items():
            child_id = _only_aggregation_child(parent)
            if child_id is not None:
                link = (parent_id, parent, child_id)
                break
        if link is None:
            return list(by_id.values())

        parent_id, parent, child_id = link
        child = by_id[child_id]
        merged = dict(child)
        merged["id"] = parent_id
        merged["name"] = f"{parent['name']} / {child['name']}"
        merged["desc"] = f"{parent.get('desc', '')} | {child.get('desc', '')}"
        by_id[parent_id] = merged
        by_id.pop(child_id, None)
        # Any remaining reference to the absorbed child now points at the merged node.
        for other in by_id.values():
            other["related"] = [parent_id if int(ref) == child_id else int(ref)
                                for ref in other.get("related", [])]
126
+
127
+ # ─────────────────────────────────────────────────────────────────────────────
128
+ # RENDER HELPERS (DAG-safe value map — copied from approach_2.py)
129
+ # ─────────────────────────────────────────────────────────────────────────────
130
def _leaf_ids(nodes: list, nid: int) -> list:
    """Return the ids of the distinct attribute (leaf) nodes reachable from ``nid``.

    Args:
        nodes: node dicts; each has an ``id`` and optionally ``type`` and
            ``related`` (child ids). Ids are normalised with ``int()``.
        nid: id of the node to start from.

    Returns:
        Leaf ids as ints, in depth-first order of first discovery, without
        duplicates. Empty when ``nid`` is unknown. A node of type
        ``"attribute"`` is treated as a leaf and is not descended into.

    The walk is iterative and remembers visited ids, so it terminates on
    cyclic ``related`` links, does not re-walk sub-trees shared by several
    parents, and is not limited by the interpreter's recursion depth.
    """
    by_id = {int(n["id"]): n for n in nodes}
    leaves: list = []
    visited: set = set()
    stack = [int(nid)]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        node = by_id.get(current)
        if not node:
            continue
        if node.get("type") == "attribute":
            leaves.append(current)
            continue
        # Push children reversed so they are popped in their original order,
        # which reproduces a recursive pre-order walk.
        children = [int(c) for c in node.get("related", [])]
        stack.extend(children[::-1])
    return leaves
143
+
144
def _parent_map(nodes: list) -> dict:
    """Map each child id to the id of the first node that lists it in ``related``.

    Ids are normalised with ``int()``. When several nodes list the same child,
    the one appearing earliest in ``nodes`` wins, so every child ends up with
    exactly one parent.
    """
    first_parent = {}
    for parent in nodes:
        for raw_child in parent.get("related", []):
            child_id = int(raw_child)
            if child_id in first_parent:
                continue
            first_parent[child_id] = int(parent["id"])
    return first_parent
151
+
152
def _tree_value_map(nodes: list, pm: dict) -> dict:
    """Compute a size for every node from the child -> parent map ``pm``.

    An ``"attribute"`` node has size 1. Any other node has the sum of its
    children's sizes (children as given by ``pm``), with a minimum of 1; a node
    without children has size 1. Returns ``{node id: size}`` for every node in
    ``nodes``.
    """
    children_of = defaultdict(list)
    for child_id, parent_id in pm.items():
        children_of[int(parent_id)].append(int(child_id))
    by_id = {int(node["id"]): node for node in nodes}
    sizes = {}

    def _size(node_id: int) -> int:
        known = sizes.get(node_id)
        if known is not None:
            return known
        # Provisional value: a node reached again while it is still being
        # computed (a cycle in pm) reads 1 instead of recursing forever.
        sizes[node_id] = 1
        node = by_id.get(node_id)
        is_leaf = node is not None and node.get("type") == "attribute"
        if not is_leaf:
            below = children_of.get(node_id)
            if below:
                sizes[node_id] = max(1, sum(_size(kid) for kid in below))
        return sizes[node_id]

    return {node_id: _size(node_id) for node_id in by_id}
171
+
172
def _wrap_hover(text: str, width: int = 80) -> str:
    """Word-wrap ``text`` to ``width`` columns, joining the lines with ``<br>``.

    Existing newlines are kept as line breaks (an empty line stays an empty
    line). Falsy input such as ``None`` or ``""`` gives an empty string.
    """
    import textwrap

    content = str(text or "")
    if not content:
        return ""
    return "<br>".join(
        piece
        for source_line in content.split("\n")
        # textwrap.wrap returns [] for an empty line; keep it as one empty piece.
        for piece in (textwrap.wrap(source_line, width=width) or [""])
    )
181
+
182
def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
    """Build a Plotly sunburst figure of the hierarchy.

    Args:
        nodes: node dicts (``id``, ``name``, ``type``, ``desc``, ``related``).
            Dissolved / hidden nodes are filtered out before plotting.
        color: colour-scale name passed to ``marker.colorscale``.
        max_depth: number of levels rendered at once (Plotly ``maxdepth``).

    Returns:
        A ``plotly.graph_objects.Figure`` holding one Sunburst trace.
    """
    nodes = _filter_dissolved(nodes)
    pm = _parent_map(nodes)
    vm = _tree_value_map(nodes, pm)
    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n["id"])
        lc = len(_leaf_ids(nodes, nid))
        ids.append(str(nid))
        # Sector labels are cut to 40 characters; the hover text has the full name.
        labels.append(str(n.get("name", ""))[:40])
        # Node 0 is the root (empty parent); a node without a recorded parent
        # is attached to node 0.
        parents.append("" if nid == 0 else str(pm.get(nid, 0)))
        values.append(vm.get(nid, 1))
        hover.append(f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
                     f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc', ''))}")
    # NOTE(review): Plotly documents marker.colorscale as taking effect only
    # when marker.colors is a numerical array; no colors are passed here, so
    # confirm the palette is actually applied.
    fig = go.Figure(go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues="total", hovertext=hover, hoverinfo="text",
        maxdepth=max_depth, insidetextorientation="radial",
        marker=dict(colorscale=color, line=dict(width=1, color="white"))))
    # The title uses a real em dash (the previous text was a mis-decoded byte sequence).
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
                      title=dict(text="Click sector to drill down — click centre to go back",
                                 font=dict(size=13), x=0.5))
    return fig
205
+
206
def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
    """Build a Plotly treemap figure of the hierarchy.

    Dissolved / hidden nodes are filtered out first. ``color`` is passed to
    ``marker.colorscale`` and ``max_depth`` to the trace's ``maxdepth``.
    Returns a ``plotly.graph_objects.Figure`` holding one Treemap trace.
    """
    nodes = _filter_dissolved(nodes)
    pm = _parent_map(nodes)
    vm = _tree_value_map(nodes, pm)

    node_ids = [int(node["id"]) for node in nodes]
    # Tile labels are cut to 40 characters; the hover text has the full name.
    tile_labels = [str(node.get("name", ""))[:40] for node in nodes]
    # Node 0 is the root (empty parent); a node without a recorded parent is
    # attached to node 0.
    tile_parents = ["" if node_id == 0 else str(pm.get(node_id, 0))
                    for node_id in node_ids]
    tile_values = [vm.get(node_id, 1) for node_id in node_ids]
    hover_texts = [
        f"<b>{node.get('name', '')}</b><br>"
        f"Variables: {len(_leaf_ids(nodes, node_id))}<br>"
        f"{_wrap_hover(node.get('desc', ''))}"
        for node_id, node in zip(node_ids, nodes)
    ]

    trace = go.Treemap(
        ids=[str(node_id) for node_id in node_ids],
        labels=tile_labels,
        parents=tile_parents,
        values=tile_values,
        branchvalues="total",
        hovertext=hover_texts,
        hoverinfo="text",
        textinfo="label+value",
        maxdepth=max_depth,
        marker=dict(colorscale=color, line=dict(width=1, color="white")),
    )
    fig = go.Figure(trace)
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
    return fig
227
+
228
+ # ─────────────────────────────────────────────────────────────────────────────
229
+ # NODE-LINK TREE (Reingold-Tilford layout — copied from approach_2.py)
230
+ # ─────────────────────────────────────────────────────────────────────────────
231
+ def _node_color(n: dict) -> str:
232
+ t = n.get("type", "")
233
+ if t == "root": return "#c44e52"
234
+ if t == "attribute": return "#4C72B0"
235
+ if t == "collapsed": return "#bbbbbb"
236
+ return "#8C8C8C"
237
+
238
+ def _display_graph(nodes: list, max_depth: int, show_hidden: bool):
239
+ m = {int(n["id"]): n for n in nodes}
240
+ dnodes: dict = {}
241
+ edges: list = []
242
+ counter = 10 ** 9
243
+
244
+ def rec(nid, depth):
245
+ nonlocal counter
246
+ n = m.get(int(nid))
247
+ if not n:
248
+ return
249
+ if not show_hidden and n.get("isShown") is False and depth > 0:
250
+ return
251
+ dnodes[int(nid)] = n
252
+ if depth >= max_depth and n.get("related"):
253
+ counter += 1
254
+ cid = counter
255
+ n_leaves = len(_leaf_ids(nodes, nid))
256
+ dnodes[cid] = {"id": cid, "name": f"… {n_leaves} variables",
257
+ "type": "collapsed", "related": [],
258
+ "desc": f"Collapsed: {n.get('name')}"}
259
+ edges.append((int(nid), cid))
260
+ return
261
+ for c in n.get("related", []):
262
+ ch = m.get(int(c))
263
+ if not ch:
264
+ continue
265
+ if not show_hidden and ch.get("isShown") is False:
266
+ continue
267
+ edges.append((int(nid), int(c)))
268
+ rec(int(c), depth + 1)
269
+
270
+ rec(0, 0)
271
+ return list(dnodes.values()), edges
272
+
273
+ def _positions(edges: list):
274
+ H_SCALE, V_SPACE = 3.0, 1.8
275
+ children: dict = defaultdict(list)
276
+ for p, c in edges:
277
+ children[p].append(c)
278
+ pos: dict = {}
279
+ counter = {"v": 0}
280
+
281
+ def rec(nid, depth):
282
+ ch = children.get(nid, [])
283
+ if not ch:
284
+ y_pos = counter["v"] * V_SPACE
285
+ counter["v"] += 1
286
+ pos[nid] = (depth * H_SCALE, y_pos)
287
+ return y_pos
288
+ child_ys = [rec(c, depth + 1) for c in ch]
289
+ y_pos = float(np.mean(child_ys))
290
+ pos[nid] = (depth * H_SCALE, y_pos)
291
+ return y_pos
292
+
293
+ rec(0, 0)
294
+ return pos
295
+
296
def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_labels: bool):
    """Build the node-link tree figure.

    Args:
        nodes: Hierarchy node dicts (passed through ``_filter_dissolved`` first).
        max_depth: Depth at which ``_display_graph`` collapses subtrees.
        show_hidden: Forwarded to ``_display_graph``.
        show_leaf_labels: When false, ``attribute`` nodes are drawn as bare
            markers without text.

    Returns:
        A ``go.Figure`` with three scatter traces, in this order: connectors,
        attribute (leaf) markers, all other node markers.
    """
    nodes = _filter_dissolved(nodes)
    dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
    pos = _positions(edges)

    # Elbow connectors: horizontal to the midpoint, vertical, horizontal again.
    # A None entry breaks the polyline between consecutive edges.
    edge_x, edge_y = [], []
    for parent, child in edges:
        if parent in pos and child in pos:
            (px, py), (cx, cy) = pos[parent], pos[child]
            mid_x = (px + cx) / 2
            edge_x.extend([px, mid_x, mid_x, cx, None])
            edge_y.extend([py, py, cy, cy, None])
    connector_trace = go.Scatter(x=edge_x, y=edge_y, mode="lines",
                                 line=dict(width=1, color="#c8c8c8"),
                                 hoverinfo="skip", showlegend=False)

    # Attribute nodes and all other nodes go into separate marker groups so
    # they can be sized and labelled differently.
    leaf_pts = {"x": [], "y": [], "text": [], "color": [], "hover": []}
    inner_pts = {"x": [], "y": [], "text": [], "color": [], "hover": []}
    for node in dnodes:
        nid = int(node["id"])
        if nid not in pos:
            continue
        x, y = pos[nid]
        leaf_total = len(_leaf_ids(nodes, nid))
        short_name = str(node.get("name", ""))[:32]
        hover_text = (f"<b>{node.get('name', '')}</b><br>Type: {node.get('type', '')}<br>"
                      f"Variables: {leaf_total}")
        is_leaf = node.get("type") == "attribute"
        group = leaf_pts if is_leaf else inner_pts
        group["x"].append(x)
        group["y"].append(y)
        group["color"].append(_node_color(node))
        group["text"].append(short_name if (show_leaf_labels or not is_leaf) else "")
        group["hover"].append(hover_text)

    leaf_trace = go.Scatter(
        x=leaf_pts["x"], y=leaf_pts["y"],
        mode="markers+text" if show_leaf_labels else "markers",
        text=leaf_pts["text"], textposition="middle right", textfont=dict(size=9),
        marker=dict(size=7, color=leaf_pts["color"], line=dict(width=0.5, color="white")),
        hovertext=leaf_pts["hover"], hoverinfo="text", showlegend=False)
    inner_trace = go.Scatter(
        x=inner_pts["x"], y=inner_pts["y"], mode="markers+text", text=inner_pts["text"],
        textposition="middle right", textfont=dict(size=10),
        marker=dict(size=13, color=inner_pts["color"], line=dict(width=1, color="white")),
        hovertext=inner_pts["hover"], hoverinfo="text", showlegend=False)

    # Figure height scales with the larger marker group, with a 600 px floor.
    row_count = max(len(leaf_pts["y"]), len(inner_pts["y"]), 1)
    fig = go.Figure([connector_trace, leaf_trace, inner_trace])
    fig.update_layout(
        height=max(600, row_count * 16),
        margin=dict(l=10, r=140, t=10, b=10),
        xaxis=dict(visible=False), yaxis=dict(visible=False),
        plot_bgcolor="white",
    )
    return fig
353
+
354
+ # ─────────────────────────────────────────────────────────────────────────────
355
+ # STATS / SAFE RENDERING
356
+ # ─────────────────────────────────────────────────────────────────────────────
357
def _tree_depth(nodes: list) -> int:
    """Return the maximum depth of the tree rooted at node id 0 (root = 0).

    Only nodes that survive ``_filter_dissolved`` are considered; ``related``
    entries pointing at ids that are not present are ignored.
    """
    by_id = {int(node["id"]): node for node in _filter_dissolved(nodes)}

    def height(node_id):
        kids = [int(child) for child in by_id.get(int(node_id), {}).get("related", [])
                if int(child) in by_id]
        if not kids:
            return 0
        return 1 + max(map(height, kids))

    return height(0)
369
+
370
def safe_render_depth(nodes: list, requested: int) -> int:
    """Cap the initial render depth for large hierarchies.

    Plotly sunburst/treemap charts can come up blank when asked to draw too
    many sectors at once (large hierarchies such as HCP). Limiting only the
    *initial* depth keeps the chart fully drillable by clicking, so no data
    is lost.

    Args:
        nodes: Hierarchy node dicts; counted after ``_filter_dissolved``.
        requested: Depth the caller would like to render.

    Returns:
        ``requested`` capped at 3 for more than 400 nodes, at 4 for more than
        150 nodes, and unchanged otherwise.
    """
    node_total = len(_filter_dissolved(nodes))
    for threshold, cap in ((400, 3), (150, 4)):
        if node_total > threshold:
            return min(requested, cap)
    return requested
380
+
381
+ # ─────────────────────────────────────────────────────────────────────────────
382
+ # IO
383
+ # ─────────────────────────────────────────────────────────────────────────────
384
@st.cache_data(show_spinner=False)
def _load_json(path_str: str):
    """Read a UTF-8 JSON file and return the parsed object.

    The result is memoised by Streamlit's ``st.cache_data``; the path is taken
    as a string.
    """
    return json.loads(Path(path_str).read_text(encoding="utf-8"))
388
+
389
+ def _read_bytes(path_str: str) -> bytes:
390
+ with open(path_str, "rb") as f:
391
+ return f.read()
392
+
393
@st.cache_data(show_spinner=False)
def _outputs_zip(root_str: str) -> bytes:
    """Return a deflate-compressed ZIP of every file under ``root_str``.

    Files are added in sorted path order. Archive names are POSIX-style paths
    relative to the *parent* of the root, so the root folder's own name is the
    top-level directory inside the archive. The result is memoised by
    ``st.cache_data``.
    """
    import io
    import zipfile

    base_dir = Path(root_str)
    archive_base = base_dir.parent
    payload = io.BytesIO()
    with zipfile.ZipFile(payload, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in sorted(base_dir.rglob("*")):
            if not entry.is_file():
                continue
            archive.write(entry, arcname=entry.relative_to(archive_base).as_posix())
    return payload.getvalue()
404
+
405
def count_nodes(nodes: list) -> tuple[int, int]:
    """Count leaf and aggregation nodes after ``_filter_dissolved``.

    Returns:
        ``(leaves, aggs)``: the number of nodes whose ``type`` is
        ``"attribute"`` and ``"aggregation"`` respectively.
    """
    kinds = [node.get("type") for node in _filter_dissolved(nodes)]
    return kinds.count("attribute"), kinds.count("aggregation")
410
+
411
def concept_aligned_pct(nodes: list) -> float | None:
    """Percentage of aggregation nodes carrying a concept/provenance label.

    A node counts as aligned when any of its ``provenance``, ``concept`` or
    ``source_evidence`` fields is truthy.

    Returns:
        The percentage (0-100), or ``None`` when there are no aggregation
        nodes or when none of them is aligned.
    """
    label_keys = ("provenance", "concept", "source_evidence")
    aggregations = [node for node in _filter_dissolved(nodes)
                    if node.get("type") == "aggregation"]
    if not aggregations:
        return None
    labelled = sum(1 for node in aggregations
                   if any(node.get(key) for key in label_keys))
    if not labelled:
        return None
    return 100.0 * labelled / len(aggregations)
419
+
420
+ # ─────────────────────────────────────────────────────────────────────────────
421
+ # SIDEBAR
422
+ # ─────────────────────────────────────────────────────────────────────────────
423
# Sidebar: choose which approach and dataset to display, plus a pointer to the repo.
approach = st.sidebar.radio(
    "**Select Approach**",
    ["Baseline", "Approach 1", "Approach 2"],
    index=0,
)
dataset = st.sidebar.radio("**Select Dataset**", ["AI-MIND", "HCP"], index=0)

st.sidebar.markdown("---")
st.sidebar.caption(
    "Results are pre-built from the thesis experiments. To run on your "
    "own data, clone the repository and run the individual apps."
)
st.sidebar.markdown(
    "[πŸ“¦ GitHub Repository]"
    "(https://github.com/RoophaSharon/tfm_metadata_hierarchy_2026)"
)
433
+
434
+ # ─────────────────────────────────────────────────────────────────────────────
435
+ # MAIN
436
+ # ─────────────────────────────────────────────────────────────────────────────
437
# Display configuration for the selected approach.
cfg = CONFIG[approach]
color = cfg["color"]

st.title(f"πŸ“Š {approach} β€” {dataset} Dataset")
st.markdown(f"> {APPROACH_DESC[approach]}")

# Locate the pre-built hierarchy file; stop the script if it is not bundled.
paths = PREBUILT[approach][dataset]
hier_path = paths.get("hierarchy")
hierarchy_available = hier_path is not None and hier_path.exists()
if not hierarchy_available:
    st.error(f"Pre-built result not found: `{hier_path}`")
    st.stop()

raw_nodes = _load_json(str(hier_path))

# Headline metrics, one per column.
leaves, aggs = count_nodes(raw_nodes)
headline_metrics = (
    ("Leaf Variables", leaves),
    ("Aggregation Nodes", aggs),
    ("Total Nodes", leaves + aggs),
)
for metric_col, (metric_label, metric_value) in zip(st.columns(3), headline_metrics):
    metric_col.metric(metric_label, metric_value)
456
+
457
+ # ── Build summary (collapsed) ────────────────────────────────────────────────
458
# Facet count is best-effort: any failure to read the facets file leaves it unknown.
facet_path = paths.get("facets")
n_facets = None
if facet_path is not None and facet_path.exists():
    try:
        n_facets = len(_load_json(str(facet_path)))
    except Exception:
        n_facets = None

with st.expander("ℹ️ Build summary", expanded=False):
    summary_metrics = (
        ("Variables", leaves),
        ("Internal nodes", aggs),
        ("Tree depth", _tree_depth(raw_nodes)),
        ("Facets", "β€”" if n_facets is None else n_facets),
    )
    for summary_col, (summary_label, summary_value) in zip(st.columns(4), summary_metrics):
        summary_col.metric(summary_label, summary_value)

    # The alignment caption is shown only when a percentage is available.
    pct = concept_aligned_pct(raw_nodes)
    if pct is not None:
        st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")

    st.caption(
        f"Source file: `{hier_path.name}` Β· "
        f"Approach: **{approach}** Β· Dataset: **{dataset}**. "
        "Tree topology and labels are reproduced exactly from the pre-built "
        "thesis output (the algorithms are not re-run in this viewer)."
    )
481
+
482
+ # ── Downloads ────────────────────────────────────────────────────────────────
483
# Download row: hierarchy JSON, facets JSON (disabled placeholder when absent), full ZIP.
hier_dl_col, facet_dl_col, zip_dl_col = st.columns(3)

hier_dl_col.download_button(
    "⬇️ Hierarchy JSON",
    data=_read_bytes(str(hier_path)),
    file_name=hier_path.name,
    mime="application/json",
    width='stretch',
)

facets_bundled = facet_path is not None and facet_path.exists()
if not facets_bundled:
    facet_dl_col.button(
        "⬇️ Facets JSON",
        disabled=True,
        width='stretch',
        help="This approach/dataset has no facet tree.",
    )
else:
    facet_dl_col.download_button(
        "⬇️ Facets JSON",
        data=_read_bytes(str(facet_path)),
        file_name=facet_path.name,
        mime="application/json",
        width='stretch',
    )

zip_dl_col.download_button(
    "⬇️ All outputs (ZIP)",
    data=_outputs_zip(str(ROOT)),
    file_name="metadata_hierarchy_outputs.zip",
    mime="application/zip",
    width='stretch',
)

st.markdown("---")
502
+
503
+ # ── Level-of-Detail controls (above chart — matches the apps) ────────────────
504
# The node-link view is offered only for approaches configured with it.
view_options = ["Sunburst (drill-down)", "Treemap"]
if cfg["node_link"]:
    view_options = view_options + ["Node-link tree"]

# A fifth column (the "Compress chains" toggle) exists only when the approach enables it.
control_widths = [2.4, 2, 1, 1]
if cfg["compress"]:
    control_widths.append(1.2)
control_cols = st.columns(control_widths)

viz_mode = control_cols[0].radio(
    "View mode",
    view_options,
    horizontal=True,
    index=0,
    help="Sunburst best for large hierarchies [Taxonomizer]. "
         "Node-link best for moderate-depth structure inspection.",
)
depth = control_cols[1].slider(
    "Depth (Level of Detail)",
    min_value=1,
    max_value=9,
    value=DEFAULT_DEPTH,
    step=1,
    help="Maximum tree levels shown. Set high to see the whole "
         "hierarchy, lower to peel back to the interior.",
)
show_leaf_labels = control_cols[2].checkbox("Leaf labels", value=False)
show_hidden = control_cols[3].checkbox("Hidden nodes", value=False)

compress_chains = False
if len(control_cols) > 4:
    compress_chains = control_cols[4].checkbox(
        "Compress chains",
        value=True,
        help="Merge one-child aggregation chains "
             '(e.g. "DMS β†’ DMS Recommended Standard") for '
             "display. Export JSON keeps original structure.",
    )
534
+
535
st.divider()

# Chain compression is applied to the displayed nodes only; raw_nodes is left untouched.
if compress_chains:
    display_nodes = compress_one_child_chains(raw_nodes)
else:
    display_nodes = raw_nodes

# Build the figure for the selected view, then render it once.
if viz_mode == "Sunburst (drill-down)":
    main_fig = plot_sunburst(display_nodes, color, depth)
elif viz_mode == "Treemap":
    main_fig = plot_treemap(display_nodes, color, depth)
else:
    main_fig = plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels)
st.plotly_chart(main_fig, width='stretch')
546
+
547
+ # ── Facets (Approach 1 only) ─────────────────────────────────────────────────
548
# Facet viewer: rendered only when a facets file is bundled for this
# approach/dataset. The file is used as a mapping of facet name -> node list
# (it is indexed by the selected name below).
if facet_path is not None and facet_path.exists():
    st.markdown("---")
    st.subheader("πŸ”€ Parallel facets")
    facets = _load_json(str(facet_path))
    names = list(facets.keys())
    if not names:
        st.info("No facets available for this dataset.")
    else:
        sel = st.selectbox("Select facet", names)
        fnodes = facets[sel]
        ft1, ft2 = st.tabs(["Sunburst", "Treemap"])
        with ft1:
            st.plotly_chart(plot_sunburst(fnodes, color, depth), width='stretch')
        with ft2:
            # Pass the slider depth here too, so both facet tabs follow the
            # Level-of-Detail control instead of the treemap silently using
            # the function's default depth.
            st.plotly_chart(plot_treemap(fnodes, color, depth), width='stretch')