Spaces:

rufasharon
/

metadata_hierarchy_tfm2026

Sleeping

metadata_hierarchy_tfm2026 / version2 /baseline.py

RoophaSharon

UX v2: move configuration to main area, deploy version2 to the Space

dd46f48 3 days ago

53.1 kB

	# baseline.py — Metadata Hierarchy Builder — Baseline (Taxonomizer)
	#
	# Baseline = Taxonomizer (Mahmood & Mueller, IEEE TVCG 2019), semantic-space
	# pipeline, adapted to a metadata-only setting. No hardcoded domain patterns.
	#
	# Pipeline:
	# 1. Load metadata file (CSV / TSV / XLSX / JSON)
	# 2. Detect column roles (leaf / context / text / meta) — same as Approach 1 / 2
	# 3. Build canonical schema (incl. _semantic_text = description values only)
	# 4. Embed each variable's short attribute name (the name clause of its
	# description) by averaging pre-trained word vectors, forming the semantic
	# space [TAX §3.2]
	# 5. Recursively cluster (agglomerative, Ward linkage on the L2-normalised
	# vectors) into the dendrogram taxonomy;
	# internal-node labels = data-driven contrastive terms of each cluster
	# 6. Visualise (Sunburst / Treemap / Node-link)
	# 7. Export visualization-ready JSON + canonical CSV
	#
	# Paper & justified adaptations (metadata/schema setting, fully automatic):
	# [TAX] Mahmood & Mueller — Taxonomizer, IEEE TVCG 2019.
	# Builds a SEMANTIC space (cosine over word2vec skip-gram embeddings of
	# attribute names; gensim, Wikipedia, window=5, dim=128) merged with a
	# DATA space (correlation over raw values), clustered into a dendrogram;
	# inner nodes labelled semi-automatically by distributional degree-of-
	# entailment + WordNet synonyms.
	# Adaptations (all documented):
	# 1. No DATA space — a schema/dictionary has no raw values, so we use the
	# semantic space alone (Taxonomizer with semantic weight = 1.0).
	# 2. Embed the attribute's short NAME (the description's name clause), since
	# the bare code goes out-of-vocabulary (a limitation the paper flags,
	# e.g. "BP"). Taxonomizer embeds the NAME ("a few words"), not a
	# paragraph; using the short name (not the full description prose) keeps
	# domain-specific words from being diluted by shared explanatory text.
	# 3. Fully-automatic labels — the paper's labelling is semi-automatic
	# (human picks from suggestions); a baseline must be non-interactive, so
	# we use data-driven contrastive terms from each cluster's members.
	#
	# Dependencies: gensim
	# pip install gensim

	from __future__ import annotations
	import csv, json, re, warnings
	from collections import Counter, defaultdict
	from pathlib import Path
	import tempfile

	import numpy as np
	import pandas as pd
	import plotly.graph_objects as go
	import streamlit as st
	from sklearn.cluster import AgglomerativeClustering
	from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, silhouette_score
	from sklearn.preprocessing import LabelEncoder

	# Suppress every warning category process-wide so library warnings do not
	# clutter the app output.
	warnings.filterwarnings('ignore')

	# Page-level Streamlit configuration followed by the static title and caption.
	st.set_page_config(page_title='Metadata Hierarchy — Baseline', layout='wide')
	st.title('Metadata Hierarchy Builder — Baseline (Taxonomizer)')
	st.caption(
	'Taxonomizer baseline [Mahmood & Mueller, IEEE TVCG 2019]: Word2Vec skip-gram '
	'semantic space (short attribute names) + balanced Ward agglomerative clustering '
	'into the dendrogram taxonomy; nodes labelled by data-driven contrastive terms. '
	'Semantic space only (no raw data values); no hardcoded patterns, no external APIs.'
	)

	# ─────────────────────────────────────────────────────────────────────────────
	# CONSTANTS
	# ─────────────────────────────────────────────────────────────────────────────
	# Keyword vocabularies matched (as substrings) against normalised column names
	# when scoring which role a column plays.
	LEAF_KEYS = [
	    'variable', 'var', 'field', 'column', 'attribute', 'name', 'code', 'id',
	    'item', 'indicator', 'question', 'measure', 'concept',
	]
	GROUP_KEYS = [
	    'task', 'category', 'domain', 'module', 'section', 'table', 'dataset',
	    'assessment', 'test', 'variant', 'group', 'topic', 'instrument', 'form',
	    'subscale', 'construct',
	]
	TEXT_KEYS = [
	    'description', 'definition', 'desc', 'label', 'title', 'question',
	    'meaning', 'note', 'notes', 'text', 'display', 'full', 'details',
	    'explanation', 'comment',
	]
	META_KEYS = [
	    'type', 'dtype', 'data_type', 'datatype', 'unit', 'units', 'format',
	    'decimal', 'precision', 'values', 'value', 'coding', 'codebook', 'range',
	    'min', 'max', 'scale',
	]

	# URL pattern — strip embedded links (e.g. HCP FreeSurfer NeuroLex URLs) so web
	# tokens cannot dominate the embedding or the cluster label. [shared with A1]
	_URL_RE = re.compile(r'(https?://\S+\|www\.\S+\|\b\w+\.(?:org\|com\|net\|gov\|edu)\b/?\S*)',
	re.IGNORECASE)

	# ─────────────────────────────────────────────────────────────────────────────
	# FILE LOADING
	# ─────────────────────────────────────────────────────────────────────────────
	def safe_name(name: str) -> str:
	    """Return *name* with every character that is not alphanumeric or one of
	    ``-``, ``_``, ``.`` replaced by an underscore."""
	    allowed_punct = '-_.'
	    cleaned = []
	    for ch in name:
	        keep = ch.isalnum() or ch in allowed_punct
	        cleaned.append(ch if keep else '_')
	    return ''.join(cleaned)

	def try_read_csv(path: Path) -> pd.DataFrame:
	    """Read a delimited text file by trying several encodings and separators.

	    Every (encoding, separator) combination is attempted; the parse with the
	    highest score (10 per column, minus the overall null fraction) wins.
	    Column names are stripped and have ';' removed. If most first-column
	    values still contain commas while the other columns are mostly empty,
	    the file is re-split line by line on commas.

	    Raises:
	        ValueError: if no combination produced a usable DataFrame.
	    """
	    best, best_score = None, -1
	    for enc in ['utf-8-sig', 'utf-8', 'latin1']:
	        # sep=None lets pandas sniff the delimiter; '|' is a plain pipe.
	        for sep in [None, ',', '\t', ';', '|']:
	            try:
	                df = pd.read_csv(path, sep=sep, engine='python', encoding=enc)
	                score = df.shape[1] * 10 - float(df.isna().mean().mean())
	            except Exception:
	                # Deliberate best-effort: a failing combination is skipped.
	                continue
	            if score > best_score:
	                best, best_score = df, score
	    if best is None:
	        raise ValueError(f'Could not read {path.name}')
	    best.columns = [str(c).strip().replace(';', '') for c in best.columns]

	    # Repair comma-packed rows (AI-Mind format)
	    if len(best) > 0:
	        first = best.iloc[:, 0].astype(str)
	        other_null = best.iloc[:, 1:].isna().mean().mean() if best.shape[1] > 1 else 1.0
	        if first.str.contains(',').mean() > 0.50 and other_null > 0.70:
	            lines = path.read_text(encoding='utf-8-sig', errors='replace').splitlines()
	            if lines:
	                header = [h.strip().replace(';', '') for h in lines[0].split(',')]
	                rows = []
	                for line in lines[1:]:
	                    line = line.strip().rstrip(';')
	                    if not line:
	                        continue
	                    # Whole row wrapped in one pair of quotes: unwrap it first.
	                    if line.startswith('"') and line.endswith('"'):
	                        line = line[1:-1]
	                    try:
	                        parts = next(csv.reader([line], quotechar='"'))
	                    except Exception:
	                        # Unparseable row: drop it rather than abort the repair.
	                        continue
	                    if len(parts) >= len(header):
	                        rows.append(parts[:len(header)])
	                if rows:
	                    best = pd.DataFrame(rows, columns=header)
	    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
	    return best

	def load_any(path: Path) -> pd.DataFrame:
	    """Load a metadata file into a DataFrame, dispatching on the file suffix.

	    CSV/TSV/TXT go through ``try_read_csv``; XLSX/XLS through
	    ``pd.read_excel``. JSON must be a list of records, or an object whose
	    first list-valued entry holds the records.

	    Raises:
	        ValueError: for an unsupported suffix, or for a JSON document that
	            contains no list of records.
	    """
	    suffix = path.suffix.lower()
	    if suffix in ('.csv', '.tsv', '.txt'):
	        return try_read_csv(path)
	    if suffix in ('.xlsx', '.xls'):
	        return pd.read_excel(path)
	    if suffix == '.json':
	        obj = json.loads(path.read_text(encoding='utf-8', errors='replace'))
	        if isinstance(obj, list):
	            return pd.json_normalize(obj)
	        if isinstance(obj, dict):
	            for value in obj.values():
	                if isinstance(value, list):
	                    return pd.json_normalize(value)
	        # The suffix is supported, so report the real problem instead of
	        # falling through to the "unsupported file type" error.
	        raise ValueError(f'No list of records found in JSON file: {path.name}')
	    raise ValueError(f'Unsupported file type: {suffix}')

	def save_upload(f) -> Path:
	    """Write an uploaded file object to a fresh temporary directory.

	    Reads ``f.name`` (sanitised with ``safe_name``) and ``f.getbuffer()``;
	    returns the path of the written file.
	    """
	    upload_dir = tempfile.mkdtemp(prefix='baseline_')
	    target = Path(upload_dir).joinpath(safe_name(f.name))
	    target.write_bytes(f.getbuffer())
	    return target

	# ─────────────────────────────────────────────────────────────────────────────
	# ROLE DETECTION [GON]
	# ─────────────────────────────────────────────────────────────────────────────
	def norm(c: str) -> str:
	    """Normalise a column name: lower-case it, collapse every run of
	    characters outside ``a-z0-9`` to one underscore, and trim underscores
	    from both ends."""
	    lowered = str(c).strip().lower()
	    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered)
	    return collapsed.strip('_')

	def kscore(c: str, keys: list) -> int:
	    """Count how many entries of *keys* occur as substrings of the
	    normalised column name."""
	    normalised = norm(c)
	    hits = [key for key in keys if key in normalised]
	    return len(hits)

	def profile_columns(df: pd.DataFrame) -> pd.DataFrame:
	    """Profile every column and score it for each candidate role.

	    Returns one row per column with its non-null share, distinct-value
	    count, unique ratio and average string length, plus four heuristic
	    scores (leaf / group / text / metadata). Each score combines keyword
	    hits in the column name with simple cardinality and length cues.
	    """
	    out = []
	    n = max(len(df), 1)  # avoid division by zero on an empty frame
	    for col in df.columns:
	        s = df[col]
	        non = float(s.notna().mean())
	        nun = int(s.nunique(dropna=True))
	        ur = nun / n
	        avg = float(s.dropna().astype(str).map(len).mean()) if s.notna().any() else 0
	        out.append({
	            'column': str(col),
	            'non_null': round(non, 3),
	            'unique_values': nun,
	            'unique_ratio': round(ur, 3),
	            'avg_length': round(avg, 1),
	            # Leaves: mostly-unique, short values.
	            'leaf_score': 4 * kscore(col, LEAF_KEYS) + (3 if 0.5 <= ur <= 1 else 0) + (1 if avg < 80 else 0),
	            # Groups: a few repeated values (fewer than half the rows, capped at 80).
	            'group_score': 4 * kscore(col, GROUP_KEYS) + (3 if 1 < nun < min(n * 0.5, 80) else 0) + (1 if avg < 60 else 0),
	            # Text: long, mostly-filled values.
	            'text_score': 5 * kscore(col, TEXT_KEYS) + (4 if avg > 50 else 0) + (1 if non > 0.5 else 0),
	            # Metadata: low-to-moderate cardinality (below 80% of rows, capped at 100).
	            'metadata_score': 4 * kscore(col, META_KEYS) + (2 if 1 < nun < min(n * 0.8, 100) else 0),
	        })
	    return pd.DataFrame(out)

	def detect_roles(df: pd.DataFrame) -> tuple:
	    """Auto-detect column roles. Identical logic to Approach 1 / 2 so the
	    preprocessing up to the canonical table is comparable across all apps.

	    Returns:
	        ``(cfg, prof)`` where ``cfg`` maps ``leaf_cols`` / ``group_cols`` /
	        ``text_cols`` / ``metadata_cols`` to lists of column names and
	        ``prof`` is the per-column profile from ``profile_columns``.
	    """
	    prof = profile_columns(df)
	    # Single best leaf column; ties broken by the higher unique ratio.
	    leaf = prof.sort_values(['leaf_score', 'unique_ratio'], ascending=False).head(1)['column'].tolist()
	    # Text columns: a good text score OR long values; fall back to the leaf.
	    text = (prof[(prof['text_score'] >= 4) | (prof['avg_length'] > 80)]
	            .sort_values('text_score', ascending=False)['column'].tolist()) or leaf.copy()
	    group = (prof[(prof['group_score'] >= 4) & (~prof['column'].isin(leaf)) & (prof['unique_values'] > 1)]
	             .sort_values('group_score', ascending=False)['column'].head(3).tolist())
	    meta = (prof[(prof['metadata_score'] >= 4) & (~prof['column'].isin(text + leaf + group))]
	            .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
	    # Representation columns (decimal/precision/unit/type/format/…) must never
	    # become structural levels; prefer them as metadata. [GON][TAX]
	    _META_SUBSTR_BLOCK = {
	        'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
	        'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
	    }

	    def _is_repr(col_name):
	        nc = re.sub(r'[^a-z0-9]', '', str(col_name).lower())
	        return any(sub in nc for sub in _META_SUBSTR_BLOCK)

	    meta_extra = [c for c in prof['column'].tolist()
	                  if _is_repr(c) and c not in text and c not in leaf and c not in meta]
	    group = [c for c in group if not _is_repr(c)]
	    # Order-preserving de-duplication, capped at eight metadata columns.
	    meta = list(dict.fromkeys(meta + meta_extra))[:8]
	    return {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'metadata_cols': meta}, prof

	# ─────────────────────────────────────────────────────────────────────────────
	# CANONICAL SCHEMA [GON]
	# ─────────────────────────────────────────────────────────────────────────────
	def sv(x) -> str:
	    """Return *x* as a stripped string, or ``''`` when pandas treats it as
	    missing."""
	    if pd.isna(x):
	        return ''
	    return str(x).strip()

	def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
	    """Build the canonical one-row-per-variable table.

	    Args:
	        df: the loaded metadata table.
	        cfg: role configuration with ``leaf_cols``, ``group_cols``,
	            ``text_cols`` and ``metadata_cols`` lists (missing keys are
	            treated as empty).
	        source: value stored in ``_source_file`` for every row.

	    Returns:
	        DataFrame with ``_source_file``, ``_row_index``, ``_leaf_label``,
	        ``_leaf_id``, ``_group_path``, ``_text`` and ``_semantic_text``.
	        Duplicate leaf ids get a ``__<n>`` suffix from the second occurrence
	        on. An empty input yields an empty frame with those columns.
	    """
	    leaf_cols = cfg.get('leaf_cols', [])
	    group_cols = cfg.get('group_cols', [])
	    text_cols = cfg.get('text_cols', [])
	    meta_cols = cfg.get('metadata_cols', [])
	    all_cols = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))
	    rows = []
	    for i, row in df.iterrows():
	        leaf_parts = [sv(row.get(c, '')) for c in leaf_cols]
	        leaf_parts = [p for p in leaf_parts if p]
	        label = ' / '.join(leaf_parts) if leaf_parts else f'variable_{i+1}'
	        group_parts = [sv(row.get(c, '')) for c in group_cols]
	        group_parts = [p for p in group_parts if p and p.lower() not in ['nan', 'none']]
	        gpath = ' > '.join(group_parts) if group_parts else 'Ungrouped'
	        parts = []
	        for c in all_cols:
	            v = sv(row.get(c, ''))
	            if v:
	                parts.append(f'{c}: {v}')
	        text = ' | '.join(parts) if parts else label
	        # _semantic_text: description VALUES only — no "fieldname:" prefixes, no
	        # other fields, URLs stripped. Falls back to the leaf values when the
	        # text columns are empty.
	        sem_parts = [sv(row.get(c, '')) for c in text_cols]
	        sem_parts = [p for p in sem_parts if p]
	        if not sem_parts:
	            sem_parts = list(leaf_parts)
	        semantic = _URL_RE.sub(' ', ' '.join(sem_parts)) if sem_parts else label
	        rows.append({
	            '_source_file': source,
	            '_row_index': int(i),
	            '_leaf_label': label,
	            '_leaf_id': f'{gpath}.{label}' if gpath != 'Ungrouped' else label,
	            '_group_path': gpath,
	            '_text': text,
	            '_semantic_text': semantic,
	        })
	    # Naming the columns keeps the schema stable even when there are no rows.
	    can = pd.DataFrame(rows, columns=[
	        '_source_file', '_row_index', '_leaf_label', '_leaf_id',
	        '_group_path', '_text', '_semantic_text',
	    ])
	    if can['_leaf_id'].duplicated().any():
	        cnt: dict = defaultdict(int)
	        ids = []
	        for lid in can['_leaf_id']:
	            cnt[lid] += 1
	            ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
	        can['_leaf_id'] = ids
	    return can

	# ─────────────────────────────────────────────────────────────────────────────
	# TAXONOMIZER CORE [TAX — Mahmood & Mueller, IEEE TVCG 2019]
	#
	# Taxonomizer builds the taxonomy from a SEMANTIC SPACE (cosine distance between
	# word2vec skip-gram embeddings of attribute names) merged with a DATA SPACE
	# (correlation over the raw values). In a metadata/schema setting we have no
	# raw data values, so we use the semantic space alone (= Taxonomizer with
	# semantic weight 1.0). Because attribute names here are opaque codes that go
	# out-of-vocabulary — a limitation the paper explicitly flags (e.g. "BP") — we
	# embed the short attribute name taken from the description so real words carry
	# the meaning (OOV tokens are skipped during averaging). Internal-node labels:
	# the paper uses semi-automatic
	# distributional degree-of-entailment + WordNet synonyms; a baseline must be
	# fully automatic, so we use data-driven contrastive terms drawn from the data.
	# ─────────────────────────────────────────────────────────────────────────────

	_W2V_STOP = frozenset(
	'a an the and or but if in on at to of for with by is are was were be '
	'been being have has had do does did will would could should may might '
	'shall can this that these those i you he she it we they me him her us '
	'them my your his her its our their what which who whom when where why '
	'how all each every few more most other some such no not only same so '
	'than too very just because as until while'.split()
	)

	@st.cache_resource(show_spinner=False)
	def _load_w2v():
	    """Load the pre-trained ``glove-wiki-gigaword-100`` vectors via the
	    gensim downloader.

	    Returns the loaded model, or ``None`` after showing a Streamlit error
	    when the import or the load fails.
	    """
	    model = None
	    try:
	        from gensim import downloader
	        model = downloader.load('glove-wiki-gigaword-100')
	    except Exception as e:
	        st.error(
	            f'Could not load Word2Vec model: {e}\n\n'
	            'Run: pip install gensim and restart the app.\n'
	            'The model (~66 MB) is downloaded automatically on first use.'
	        )
	    return model

	def _tokenize(label: str) -> list[str]:
	    """Split *label* into lower-case alphabetic tokens, dropping tokens of
	    two characters or fewer and stop words."""
	    words = re.sub(r'[^a-zA-Z]+', ' ', label).lower().split()
	    kept = []
	    for word in words:
	        if len(word) <= 2 or word in _W2V_STOP:
	            continue
	        kept.append(word)
	    return kept

	def attribute_name(text: str) -> str:
	    """Return the attribute's short NAME — what Taxonomizer embeds [TAX §3.2].

	    The text is split on ':' and newlines. The first non-empty clause that is
	    not a pure marker (every word upper-case and 2–6 characters long, e.g.
	    'KEY') is returned. If every clause is empty or a marker, the stripped
	    input is returned unchanged.
	    """
	    text = str(text)

	    def is_marker(clause: str) -> bool:
	        return all(2 <= len(word) <= 6 and word.isupper() for word in clause.split())

	    for raw in re.split(r'[:\n]', text):
	        candidate = raw.strip()
	        if not candidate:
	            continue
	        if is_marker(candidate):
	            continue
	        return candidate
	    return text.strip()

	def embed_labels_w2v(labels: list[str], model) -> np.ndarray:
	    """Embed each label as the mean of its in-vocabulary token vectors.

	    Labels with no token in the model stay as a zero row. Rows are then
	    L2-normalised (zero rows are left untouched), so cosine distance equals
	    1 - dot product.
	    """
	    matrix = np.zeros((len(labels), model.vector_size), dtype=np.float32)
	    for row, label in enumerate(labels):
	        known = [model[tok] for tok in _tokenize(label) if tok in model]
	        if not known:
	            continue
	        matrix[row] = np.mean(known, axis=0)
	    lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
	    lengths[lengths == 0] = 1.0  # keep all-zero rows from dividing by zero
	    return matrix / lengths

	def _cluster(X: np.ndarray, k: int) -> np.ndarray:
	    """Cut a Ward-linkage agglomerative clustering of *X* into *k* clusters
	    and return the per-row cluster labels.

	    Ward minimises within-cluster variance, which avoids the chaining of
	    average/single linkage that peels off tiny clusters and leaves one
	    giant residual.
	    """
	    model = AgglomerativeClustering(n_clusters=k, linkage='ward')
	    return model.fit_predict(X)

	def best_k(X: np.ndarray, n: int, k_min: int = 2, k_max: int = 8) -> int:
	    """Return the cluster count in ``[k_min, min(k_max, n - 1)]`` with the
	    highest silhouette score.

	    Returns 1 when that range is empty, or when no candidate gives at least
	    two clusters and a computable score.
	    """
	    upper = min(k_max, n - 1)
	    if upper < k_min:
	        return 1
	    chosen = 1
	    top_score = -1.0
	    for candidate in range(k_min, upper + 1):
	        assignment = _cluster(X, candidate)
	        if len(set(assignment)) < 2:
	            continue
	        try:
	            score = silhouette_score(X, assignment)
	        except Exception:
	            continue
	        if score > top_score:
	            chosen, top_score = candidate, score
	    return chosen

	def _doc_freq(texts: list[str]) -> Counter:
	    """Document frequency: the number of texts each content token occurs in
	    (a token counts at most once per text)."""
	    freq: Counter = Counter()
	    for text in texts:
	        freq.update(set(_tokenize(text)))
	    return freq

	def cluster_term_label(member_texts: list[str], sibling_texts: list[str],
	                       used: set, vocab=None, top_n: int = 2) -> str:
	    """Label a cluster with the words most characteristic of its members.

	    Each candidate word is scored by its document-frequency share inside the
	    cluster minus its share among the siblings. Up to *top_n* words with a
	    positive score are joined with ' / ' in title case.

	    Words in *used*, words of two characters or fewer and, when *vocab* is
	    given, words not in *vocab* are not eligible. If nothing scores
	    positively, the most frequent eligible member word is used, first
	    requiring vocabulary membership and then without it. With no candidate
	    at all the label is 'Group'.
	    """
	    def eligible(word: str, need_vocab: bool = True) -> bool:
	        if word in used or len(word) <= 2:
	            return False
	        return (not need_vocab) or vocab is None or word in vocab

	    inside_total = max(len(member_texts), 1)
	    outside_total = max(len(sibling_texts), 1)
	    inside = _doc_freq(member_texts)
	    outside = _doc_freq(sibling_texts)

	    contrast: dict[str, float] = {}
	    for word, freq in inside.items():
	        if not eligible(word):
	            continue
	        share_in = freq / inside_total
	        share_out = outside.get(word, 0) / outside_total
	        # ignore single-occurrence noise unless the term is widely shared
	        if freq < 2 and share_in < 0.5:
	            continue
	        contrast[word] = share_in - share_out

	    ranked = sorted(contrast.items(), key=lambda item: -item[1])[:top_n]
	    picks = [word for word, gap in ranked if gap > 0]
	    if not picks:
	        # fallback: most frequent shared real word, then any shared token
	        for require_vocab in (True, False):
	            fallback = next(
	                (w for w, _ in inside.most_common() if eligible(w, require_vocab)),
	                None,
	            )
	            if fallback is not None:
	                picks = [fallback]
	                break
	    if not picks:
	        return 'Group'
	    return ' / '.join(p.title() for p in picks)

	# ─────────────────────────────────────────────────────────────────────────────
	# HIERARCHY CONSTRUCTION [TAX + GON]
	# ─────────────────────────────────────────────────────────────────────────────
	def _nmap(nodes: list) -> dict:
	return {int(n['id']): n for n in nodes}

	def _next_id(nodes: list) -> int:
	return max((int(n['id']) for n in nodes), default=0) + 1

	def _add_child(nodes: list, parent_id: int, child_id: int):
	    """Append *child_id* to the parent's 'related' list unless it is already
	    there; do nothing when the parent id is unknown."""
	    parent = _nmap(nodes).get(int(parent_id))
	    if parent is None:
	        return
	    child = int(child_id)
	    children = list(parent.get('related', []))
	    if child not in children:
	        children.append(child)
	    parent['related'] = children

	def _make_agg(nid: int, name: str, desc: str = '') -> dict:
	return {'id': int(nid), 'name': str(name), 'related': [],
	'type': 'aggregation', 'isShown': True, 'desc': desc, 'dtype': 'determine'}

	def _leaf_ids(nodes: list, nid: int) -> list:
	    """Collect the ids of all 'attribute' nodes reachable from *nid* through
	    'related' links, in depth-first order and without duplicates."""
	    lookup = _nmap(nodes)
	    found: list = []

	    def descend(current):
	        current = int(current)
	        node = lookup.get(current)
	        if not node:
	            return
	        if node.get('type') == 'attribute':
	            found.append(current)
	            return
	        for child in node.get('related', []):
	            descend(child)

	    descend(nid)
	    return list(dict.fromkeys(found))

	def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
	                    max_depth: int = 3, min_cluster_size: int = 6,
	                    branch_max: int = 8) -> list:
	    """Taxonomizer semantic-space construction [TAX].

	    Each variable is embedded from its short attribute NAME (the name clause
	    of ``_semantic_text``, falling back to ``_leaf_label``). The variables
	    are then clustered recursively with Ward linkage. Every internal node is
	    labelled with contrastive terms of its members; terms already used by an
	    ancestor are excluded from descendant labels.

	    Args:
	        can: canonical table from ``build_canonical``.
	        w2v_model: word-vector model passed to ``embed_labels_w2v``.
	        project: name of the root node.
	        max_depth: maximum number of internal levels below the root.
	        min_cluster_size: nodes with this many leaves or fewer are not split.
	        branch_max: upper bound on clusters per split.

	    Returns:
	        List of node dicts: the root (id 0), the leaves (ids 1..N) and the
	        generated aggregation nodes, linked through 'related'.
	    """
	    # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
	    nodes: list = [{'id': 0, 'name': project, 'type': 'root',
	                    'dtype': 'root', 'isShown': True, 'related': [], 'desc': 'Root node'}]
	    row_to_node: list = []
	    embed_list: list[str] = []  # short attribute name → embedding input + labels
	    for i, (_, r) in enumerate(can.iterrows(), start=1):
	        sem = str(r.get('_semantic_text', '') or r['_leaf_label'])
	        name = attribute_name(sem) or str(r['_leaf_label'])
	        nodes.append({'id': i, 'name': r['_leaf_label'], 'dtype': 'determine',
	                      'related': [], 'isShown': True, 'type': 'attribute',
	                      'desc': r['_text'],
	                      'metadata': {'leaf_id': r['_leaf_id'], 'group_path': r['_group_path']}})
	        row_to_node.append(i)
	        embed_list.append(name)
	    label_list = embed_list
	    row_to_node = np.array(row_to_node)

	    # ── semantic-space embeddings [TAX §3.2] ─────────────────────────────────
	    emb = embed_labels_w2v(embed_list, w2v_model)  # (N, dim), L2-normalised

	    # ── recursive clustering down the Ward dendrogram ────────────────────────
	    def attach_leaves(parent_id: int, idx: np.ndarray):
	        for i in idx:
	            _add_child(nodes, parent_id, int(row_to_node[i]))

	    def recurse(parent_id: int, idx: np.ndarray, depth: int, used: set):
	        n = len(idx)
	        if n <= min_cluster_size or depth >= max_depth:
	            attach_leaves(parent_id, idx)
	            return

	        sub = emb[idx]
	        k_cap = min(branch_max, n - 1)
	        # Branching floor: a node with n leaves and `remaining` levels left must
	        # fan out enough to fit all its leaves into buckets of ~min_cluster_size
	        # by the depth cap, i.e. k >= (n / min_cluster_size) ** (1/remaining).
	        # Without this, silhouette keeps picking k=2 on overlapping data,
	        # giving a near-binary tree with very large bottom nodes.
	        remaining = max(1, max_depth - depth)
	        k_floor = int(np.ceil((n / max(min_cluster_size, 1)) ** (1.0 / remaining)))
	        k_floor = max(2, min(k_floor, k_cap))
	        k = best_k(sub, n, k_min=k_floor, k_max=k_cap)
	        if k <= 1:
	            k = min(k_floor, k_cap) if n > min_cluster_size else 1
	        if k <= 1:
	            attach_leaves(parent_id, idx)
	            return

	        cluster_labels = _cluster(sub, k)
	        for c in range(k):
	            mask = cluster_labels == c
	            members = idx[mask]
	            if len(members) == 0:
	                continue
	            if len(members) == 1:  # don't create singleton internal nodes
	                _add_child(nodes, parent_id, int(row_to_node[members[0]]))
	                continue
	            mset = set(members.tolist())
	            member_texts = [label_list[i] for i in members]
	            sibling_texts = [label_list[i] for i in idx if i not in mset]
	            # data-driven contrastive-term labelling
	            label = cluster_term_label(member_texts, sibling_texts, used)
	            nid = _next_id(nodes)
	            nodes.append(_make_agg(nid, label,
	                                   desc=f'Cluster of {len(members)} variables — '
	                                        f'label terms: {label}'))
	            _add_child(nodes, parent_id, nid)
	            # The labeller checks single words against `used`, so record each
	            # term of a joined label ("Foo / Bar") separately; the whole
	            # joined string would never match a word.
	            label_terms = {t.strip() for t in label.lower().split('/') if t.strip()}
	            recurse(nid, members, depth + 1, used | label_terms)

	    recurse(0, np.arange(len(can)), 0, set())

	    # Normalise child lists: integer ids, order-preserving de-duplication.
	    for n in nodes:
	        n['related'] = list(dict.fromkeys(int(x) for x in n.get('related', [])))
	    return nodes

	# ─────────────────────────────────────────────────────────────────────────────
	# VISUALISATION
	# ─────────────────────────────────────────────────────────────────────────────
	def _parent_map(nodes: list) -> dict:
	pm: dict = {}
	for n in nodes:
	for c in n.get('related', []):
	if int(c) not in pm:
	pm[int(c)] = int(n['id'])
	return pm

	# ─────────────────────────────────────────────────────────────────────────────
	# EVALUATION HELPERS
	# ─────────────────────────────────────────────────────────────────────────────
	def _eval_cluster_assignments(nodes: list, can: pd.DataFrame) -> list[int]:
	    """Return, for each row of *can*, the id of its top-level ancestor (the
	    node directly below the root), or -1 when the row's ``_leaf_id`` has no
	    attribute node."""
	    parents = _parent_map(nodes)

	    def top_ancestor(nid: int) -> int:
	        # Climb until the parent is the root (0) or there is no parent (-1).
	        parent = parents.get(nid, -1)
	        while parent != -1 and parent != 0:
	            nid = parent
	            parent = parents.get(nid, -1)
	        return nid

	    node_for_leaf = {}
	    for node in nodes:
	        if node.get('type') == 'attribute' and 'metadata' in node:
	            node_for_leaf[node['metadata']['leaf_id']] = int(node['id'])

	    assignments = []
	    for leaf_id in can['_leaf_id']:
	        if leaf_id in node_for_leaf:
	            assignments.append(top_ancestor(node_for_leaf[leaf_id]))
	        else:
	            assignments.append(-1)
	    return assignments

	def _purity(y_true, y_pred) -> float:
	from collections import Counter
	clusters: dict = {}
	for t, p in zip(y_true, y_pred):
	clusters.setdefault(p, []).append(t)
	correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values())
	return correct / max(len(y_true), 1)

	def _structural_stats(nodes: list) -> dict:
	    """Summarise the tree shape: number of aggregation nodes, maximum and
	    mean leaf depth, mean branching factor, and the percentage of
	    aggregation nodes with exactly one child."""
	    parents = _parent_map(nodes)

	    def hops_to_top(nid: int) -> int:
	        hops = 0
	        while nid in parents:
	            nid = parents[nid]
	            hops += 1
	        return hops

	    groups, leaves = [], []
	    for node in nodes:
	        kind = node.get('type')
	        if kind == 'aggregation':
	            groups.append(node)
	        elif kind == 'attribute':
	            leaves.append(node)

	    leaf_depths = [hops_to_top(int(node['id'])) for node in leaves]
	    fanouts = [len(node.get('related', [])) for node in groups]
	    single_child = fanouts.count(1)
	    return {
	        'n_aggregation_nodes': len(groups),
	        'max_depth': int(max(leaf_depths, default=0)),
	        'avg_leaf_depth': round(float(np.mean(leaf_depths)), 2) if leaf_depths else 0.0,
	        'avg_branching_factor': round(float(np.mean(fanouts)), 2) if fanouts else 0.0,
	        'singleton_nodes_%': round(100.0 * single_child / max(len(groups), 1), 1),
	    }

	def _wrap(text: str, width: int = 70) -> str:
	"""Wrap long hover text onto multiple <br> lines so it never runs off-screen."""
	import textwrap
	text = str(text).replace('<', '<')
	lines: list = []
	for para in text.split('\n'):
	wrapped = textwrap.wrap(para, width=width) or ['']
	lines.extend(wrapped)
	return '<br>'.join(lines)

	def plot_sunburst(nodes: list, max_depth: int = 4) -> go.Figure:
	    """Build a Plotly sunburst of the hierarchy.

	    Each node becomes one sector, sized by the number of attribute leaves
	    beneath it (at least 1) and labelled with the first 40 characters of its
	    name; *max_depth* limits the number of rings shown at once.
	    """
	    parent_of = _parent_map(nodes)
	    sectors = []
	    for node in nodes:
	        nid = int(node['id'])
	        n_vars = len(_leaf_ids(nodes, nid))
	        desc = _wrap(node.get('desc', ''))
	        tooltip = (f'<b>{_wrap(node.get("name",""))}</b><br>Type: {node.get("type","")}'
	                   f'<br>Variables: {n_vars}<br><br>{desc}')
	        sectors.append({
	            'id': str(nid),
	            'label': str(node.get('name', ''))[:40],
	            'parent': '' if nid == 0 else str(parent_of.get(nid, 0)),
	            'value': max(1, n_vars),
	            'hover': tooltip,
	        })

	    trace = go.Sunburst(
	        ids=[s['id'] for s in sectors],
	        labels=[s['label'] for s in sectors],
	        parents=[s['parent'] for s in sectors],
	        values=[s['value'] for s in sectors],
	        branchvalues='total',
	        hovertext=[s['hover'] for s in sectors],
	        hoverinfo='text',
	        maxdepth=max_depth,
	        insidetextorientation='radial',
	        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
	    )
	    fig = go.Figure(trace)
	    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
	                      title='Click a sector to drill down — click centre to go back')
	    return fig

	def plot_treemap(nodes: list) -> go.Figure:
	    """Build a Plotly treemap of the hierarchy.

	    Each node becomes one tile, sized by the number of attribute leaves
	    beneath it (at least 1) and labelled with the first 40 characters of its
	    name.
	    """
	    parent_of = _parent_map(nodes)
	    tiles = []
	    for node in nodes:
	        nid = int(node['id'])
	        n_vars = len(_leaf_ids(nodes, nid))
	        desc = _wrap(node.get('desc', ''))
	        tooltip = f'<b>{_wrap(node.get("name",""))}</b><br>Variables: {n_vars}<br>{desc}'
	        tiles.append({
	            'id': str(nid),
	            'label': str(node.get('name', ''))[:40],
	            'parent': '' if nid == 0 else str(parent_of.get(nid, 0)),
	            'value': max(1, n_vars),
	            'hover': tooltip,
	        })

	    trace = go.Treemap(
	        ids=[t['id'] for t in tiles],
	        labels=[t['label'] for t in tiles],
	        parents=[t['parent'] for t in tiles],
	        values=[t['value'] for t in tiles],
	        branchvalues='total',
	        hovertext=[t['hover'] for t in tiles],
	        hoverinfo='text',
	        textinfo='label+value',
	        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
	    )
	    fig = go.Figure(trace)
	    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
	    return fig

	# ─────────────────────────────────────────────────────────────────────────────
	# NODE-LINK TREE (Reingold–Tilford layout — matches Approach 1 / 2 interface)
	# ─────────────────────────────────────────────────────────────────────────────
	def _bl_node_color(n: dict) -> str:
	t = n.get('type', '')
	if t == 'root': return '#2a7d2a'
	if t == 'attribute': return '#74c476'
	if t == 'collapsed': return '#bbbbbb'
	return '#238b45'

	def _display_graph(nodes: list, max_depth: int = 4):
	    """Walk the tree from the root down to *max_depth*.

	    A node at the depth limit that still has children gets one synthetic
	    'collapsed' placeholder child (ids counted up from 10**9) in place of
	    its real children. Returns ``(display_nodes, edges)`` where edges are
	    ``(parent_id, child_id)`` pairs.
	    """
	    lookup = _nmap(nodes)
	    visible: dict = {}
	    links: list = []
	    placeholder = {'last': 10 ** 9}

	    def walk(nid, depth):
	        node = lookup.get(int(nid))
	        if not node:
	            return
	        visible[int(nid)] = node
	        if depth >= max_depth and node.get('related'):
	            placeholder['last'] += 1
	            cid = placeholder['last']
	            hidden = len(_leaf_ids(nodes, nid))
	            visible[cid] = {'id': cid, 'name': f'… {hidden} variables',
	                            'type': 'collapsed', 'related': [],
	                            'desc': f"Collapsed: {node.get('name')}", 'isShown': True}
	            links.append((int(nid), cid))
	            return
	        for child in node.get('related', []):
	            if int(child) not in lookup:
	                continue
	            links.append((int(nid), int(child)))
	            walk(int(child), depth + 1)

	    walk(0, 0)
	    return list(visible.values()), links

	def _positions(edges: list):
	"""Reingold–Tilford style positions: x = depth, y = subtree-aware vertical."""
	H_SCALE, V_SPACE = 3.0, 1.8
	children: dict = defaultdict(list)
	for p, c in edges:
	children[p].append(c)
	pos: dict = {}
	counter = {'v': 0}

	def rec(nid, depth):
	ch = children.get(nid, [])
	if not ch:
	y = counter['v'] * V_SPACE
	counter['v'] += 1
	pos[nid] = (depth * H_SCALE, y)
	return y
	y = float(np.mean([rec(c, depth + 1) for c in ch]))
	pos[nid] = (depth * H_SCALE, y)
	return y

	rec(0, 0)
	return pos

	def plot_node_link(nodes: list, max_depth: int = 4, show_leaf_labels: bool = False) -> go.Figure:
	    """Node-link tree with elbow edges.

	    Draws three traces: the edges, the root/aggregation/collapsed nodes
	    (always labelled, with a leaf count when non-zero), and the remaining
	    nodes (labelled only when *show_leaf_labels* is true). The figure
	    height grows with the number of nodes in the last trace.
	    """
	    shown, links = _display_graph(nodes, max_depth)
	    coords = _positions(links)

	    # Elbow edges: horizontal to the midpoint, vertical, then horizontal again.
	    edge_x, edge_y = [], []
	    for parent, child in links:
	        if parent not in coords or child not in coords:
	            continue
	        (px, py), (cx, cy) = coords[parent], coords[child]
	        mid = (px + cx) / 2
	        edge_x.extend([px, mid, mid, cx, None])
	        edge_y.extend([py, py, cy, cy, None])
	    traces = [go.Scatter(x=edge_x, y=edge_y, mode='lines',
	                         line=dict(width=1, color='#c8c8c8'),
	                         hoverinfo='skip', showlegend=False)]

	    inner = {'x': [], 'y': [], 'text': [], 'color': [], 'hover': []}
	    outer = {'x': [], 'y': [], 'text': [], 'color': [], 'hover': []}
	    for node in shown:
	        nid = int(node['id'])
	        if nid not in coords:
	            continue
	        x, y = coords[nid]
	        lc = len(_leaf_ids(nodes, nid))
	        caption = str(node.get('name', nid))
	        htxt = (f"<b>{_wrap(node.get('name',''))}</b><br>Type: {node.get('type','')}"
	                f"<br>Variables: {lc}<br><br>{_wrap(node.get('desc',''))}")
	        colour = _bl_node_color(node)
	        if node.get('type') in ('root', 'aggregation', 'collapsed'):
	            bucket = inner
	            text = (caption + (f' ({lc})' if lc else ''))[:50]
	        else:
	            bucket = outer
	            text = caption[:40] if show_leaf_labels else ''
	        bucket['x'].append(x)
	        bucket['y'].append(y)
	        bucket['text'].append(text)
	        bucket['color'].append(colour)
	        bucket['hover'].append(htxt)

	    if inner['x']:
	        traces.append(go.Scatter(
	            x=inner['x'], y=inner['y'], mode='markers+text', text=inner['text'],
	            textposition='middle right', hovertext=inner['hover'], hoverinfo='text',
	            marker=dict(size=16, color=inner['color'], line=dict(color='white', width=2)),
	            showlegend=False))
	    if outer['x']:
	        traces.append(go.Scatter(
	            x=outer['x'], y=outer['y'], mode='markers+text', text=outer['text'],
	            textposition='middle right', hovertext=outer['hover'], hoverinfo='text',
	            marker=dict(size=7, color=outer['color'], symbol='circle', opacity=0.75,
	                        line=dict(color='white', width=1)),
	            showlegend=False))

	    n_leaves = max(12, len(outer['x']))
	    fig = go.Figure(traces)
	    fig.update_layout(
	        height=max(700, min(4000, int(n_leaves * 32))),
	        margin=dict(l=20, r=220, t=30, b=20),
	        plot_bgcolor='white', paper_bgcolor='white',
	        xaxis=dict(visible=False, fixedrange=False),
	        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
	        dragmode='pan')
	    return fig

	# ─────────────────────────────────────────────────────────────────────────────
	# SIDEBAR
	# ─────────────────────────────────────────────────────────────────────────────
	# Sidebar widgets: file upload, clustering parameters and display limits.
	with st.sidebar:
	    st.header('1. Upload')
	    uploaded = st.file_uploader(
	        'Upload a metadata file',
	        type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
	        accept_multiple_files=False,
	    )

	    st.header('2. Taxonomizer settings')
	    tx_max_depth = st.slider(
	        'Max taxonomy depth', min_value=2, max_value=6, value=3, step=1,
	        help='How many abstract-to-concrete levels to build',
	    )
	    tx_min_size = st.slider(
	        'Min cluster size', min_value=3, max_value=20, value=6, step=1,
	        help='Clusters smaller than this stop splitting (leaves attach directly)',
	    )
	    tx_branch = st.slider(
	        'Max branches per node', min_value=3, max_value=12, value=8, step=1,
	        help='Upper bound on clusters per split; the actual number is chosen by silhouette',
	    )

	    st.header('3. Display')
	    max_items = st.slider(
	        'Maximum variables', min_value=25, max_value=1200, value=900, step=25,
	        help='Cap on variables included (lower only to speed up very large files). '
	             'Default keeps full datasets like HCP (813).',
	    )
	    group_filter = st.text_input(
	        'Row filter (optional)', value='',
	        help='Filter rows by contextual path text before building',
	    )

	# ─────────────────────────────────────────────────────────────────────────────
	# MAIN
	# ─────────────────────────────────────────────────────────────────────────────
	if not uploaded:
	    # Landing page shown until a file is uploaded: a prompt plus a short
	    # description of the method, then halt the script run.
	    st.info('Upload a metadata CSV / XLSX / JSON file to begin.')
	    # Table rows use plain "|" delimiters. A backslash-escaped pipe is an
	    # invalid Python string escape and, in Markdown, an escaped literal pipe,
	    # so the rows would not be parsed as a table.
	    st.markdown("""
	### Baseline algorithm — Taxonomizer (semantic space)

	Based on Mahmood & Mueller, IEEE TVCG 2019 (Taxonomizer), adapted to a
	metadata-only setting. No hardcoded domain patterns, no external APIs.

	| Step | Method | Paper |
	|------|--------|-------|
	| Variable representation | short attribute name (description's name clause; codes are OOV) | Taxonomizer §3.2 / §4.1 |
	| Embedding | Word2Vec skip-gram — average of word vectors (`glove-wiki-gigaword-100`) | Taxonomizer §3.2 |
	| Semantic space | Cosine-distance matrix (no data space — schema has no raw values) | Taxonomizer §3.2 (adapted) |
	| Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
	| Internal node labelling | Data-driven contrastive terms (paper's labelling is semi-automatic) | Taxonomizer §4.3 (adapted) |

	This page is the pure Taxonomizer-style semantic-space reference method:
	variable meanings are embedded and recursively clustered into a hierarchy,
	with node labels generated from contrastive terms.

	Approach 1 adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.

	Approach 2 adds NMF/FASTopic aspect discovery + GMM clustering + optional LLM labels.
	""")
	    st.stop()

	# Persist the upload via save_upload() so it can be re-read by path.
	path = save_upload(uploaded)


	@st.cache_data(show_spinner=False)
	def _load_profile(path_str: str):
	    """Load the file at *path_str* and run role detection on it.

	    Wrapped in ``st.cache_data``; the only argument is the path string.

	    Returns a 3-tuple: the object returned by ``load_any`` followed by the
	    two values returned by ``detect_roles``.
	    """
	    frame = load_any(Path(path_str))
	    role_cfg, role_profile = detect_roles(frame)
	    return frame, role_cfg, role_profile


	with st.spinner('Loading file…'):
	    _loaded = _load_profile(str(path))
	df, auto_cfg, prof = _loaded

	# Step 1: collapsed preview of the uploaded table plus the per-column role
	# scores held in `prof`.
	st.subheader('Step 1 — File preview')
	with st.expander(f'{uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
	                 expanded=False):
	    st.dataframe(df.head(10), use_container_width=True)
	    # Only show score columns that actually exist in the profile.
	    score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
	                  if c in prof.columns]
	    if score_cols:
	        score_view = prof[score_cols]
	        # Sorting by 'leaf_score' is only valid when that column survived the
	        # filter above; otherwise sort_values would raise KeyError.
	        if 'leaf_score' in score_cols:
	            score_view = score_view.sort_values('leaf_score', ascending=False)
	        st.dataframe(score_view, use_container_width=True)

	st.subheader('Step 2 — Confirm column roles')
	cols = list(df.columns)
	# Widget keys carry the uploaded file's name so that switching files shows
	# the new file's auto-detected defaults instead of the selections Streamlit
	# would otherwise retain under a fixed key.
	_fk = safe_name(uploaded.name)


	def _role_defaults(role):
	    """Auto-detected columns for *role*, limited to columns present in the file."""
	    return [c for c in auto_cfg.get(role, []) if c in cols]


	with st.expander('Column configuration', expanded=True):
	    left, right = st.columns(2)
	    leaf_cols = left.multiselect(
	        'Leaf variable column(s)', cols,
	        default=_role_defaults('leaf_cols'), key=f'leaf_{_fk}')
	    group_cols = left.multiselect(
	        'Context column(s) (optional)', cols,
	        default=_role_defaults('group_cols'), key=f'group_{_fk}',
	        help='Optional contextual columns for display/filtering.')
	    text_cols = right.multiselect(
	        'Text/description column(s)', cols,
	        default=_role_defaults('text_cols'), key=f'text_{_fk}')
	    meta_cols = right.multiselect(
	        'Metadata/type column(s)', cols,
	        default=_role_defaults('metadata_cols'), key=f'meta_{_fk}')

	# At least one leaf column is required to continue.
	if not leaf_cols:
	    st.error('Choose at least one leaf variable column.')
	    st.stop()

	cfg = dict(leaf_cols=leaf_cols, group_cols=group_cols,
	           text_cols=text_cols, metadata_cols=meta_cols)

	if st.button('Build baseline hierarchy', type='primary'):
	    # Runs only on the click: build the canonical table, apply the optional
	    # row filter and the size cap, build the hierarchy, and store the results
	    # in session state so they remain available on later reruns.

	    # ── load Word2Vec model (cached after first call) ──────────────────────
	    with st.spinner('Loading Word2Vec model (first run downloads ~66 MB)…'):
	        _w2v = _load_w2v()
	    if _w2v is None:
	        st.stop()

	    with st.spinner('Building hierarchy…'):
	        _can = build_canonical(df, cfg, source=Path(uploaded.name).stem)

	        _needle = group_filter.strip()
	        if _needle:
	            # The filter is documented to the user as plain text, so match it
	            # literally. With the pandas default (regex=True) an input such
	            # as "(" or "[" raises re.error, and "." or "+" match unintended
	            # rows.
	            _can = _can[_can['_group_path'].str.contains(
	                _needle, case=False, na=False, regex=False)].copy()

	        # Enforce the sidebar cap by keeping the first `max_items` rows.
	        if len(_can) > max_items:
	            _can = _can.head(max_items).copy()

	        _can = _can.reset_index(drop=True)

	        if len(_can) < 2:
	            st.error('Need at least 2 variables after filtering.')
	            st.stop()

	        _pname = Path(uploaded.name).stem
	        _nodes = build_hierarchy(_can, _w2v, project=_pname,
	                                 max_depth=tx_max_depth,
	                                 min_cluster_size=tx_min_size,
	                                 branch_max=tx_branch)

	    st.session_state['_bl_nodes'] = _nodes
	    st.session_state['_bl_can'] = _can
	    st.session_state['_bl_project'] = _pname

	# Nothing has been built in this session yet: prompt and halt.
	if '_bl_nodes' not in st.session_state:
	    st.info('Configure columns above then click Build baseline hierarchy.')
	    st.stop()

	# Pull the stored build results back out of session state.
	nodes, can, project_name = (
	    st.session_state[_key] for _key in ('_bl_nodes', '_bl_can', '_bl_project')
	)

	_sm = _structural_stats(nodes)
	n_leaves = sum(1 for n in nodes if n['type'] == 'attribute')
	n_internal = sum(1 for n in nodes if n['type'] == 'aggregation')

	st.divider()
	# Headline numbers, one metric per column, left to right.
	_headline = [
	    ('Variables', n_leaves),
	    ('Aggregation nodes', n_internal),
	    ('Max depth', _sm['max_depth']),
	    ('Avg branching', _sm['avg_branching_factor']),
	]
	for _slot, (_label, _value) in zip(st.columns(4), _headline):
	    _slot.metric(_label, _value)

	tabs = st.tabs(['Visualization', 'Node detail', 'Canonical table', 'Export', 'Evaluation'])

	with tabs[0]:
	    # ── Visualization controls (above chart — matches Approach 1 / 2) ─────────
	    vc1, vc2, vc3 = st.columns([3, 2, 1])
	    viz_mode = vc1.radio(
	        'View mode',
	        ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
	        horizontal=True, index=0,
	        help='Sunburst best for large hierarchies [Taxonomizer]. '
	             'Node-link best for inspecting structure at moderate depth.')
	    display_depth = vc2.slider(
	        'Depth (Level of Detail)', 1, 8, 4, 1,
	        help='How many levels to reveal at once.')
	    show_leaf_labels = vc3.checkbox(
	        'Leaf labels', value=False,
	        help='Show variable names on the node-link tree.')
	    st.divider()

	    # Build the figure for the chosen view, then render it in one place.
	    _is_sunburst = viz_mode == 'Sunburst (drill-down)'
	    if _is_sunburst:
	        _fig = plot_sunburst(nodes, max_depth=display_depth)
	    elif viz_mode == 'Treemap':
	        _fig = plot_treemap(nodes)
	    else:
	        _fig = plot_node_link(nodes, max_depth=display_depth,
	                              show_leaf_labels=show_leaf_labels)
	    st.plotly_chart(_fig, use_container_width=True)
	    if _is_sunburst:
	        st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')

	with tabs[1]:
	    # Node detail: pick a root/aggregation node and list the variables
	    # (canonical rows) found underneath it.
	    nm = _nmap(nodes)
	    agg_nodes = [n for n in nodes if n['type'] in ('aggregation', 'root')]
	    # Resolve each node's leaves once; reused for the option label and for
	    # the detail table of the selected node.
	    agg_leaf_ids = [_leaf_ids(nodes, int(n['id'])) for n in agg_nodes]
	    options = [f'{n["name"]} [{len(lids)} vars]'
	               for n, lids in zip(agg_nodes, agg_leaf_ids)]
	    if options:
	        # Select by position, not by parsing the display string back into a
	        # name. Node names are not guaranteed to be unique, and a name may
	        # itself contain " [", so a name lookup could pick the wrong node.
	        sel_idx = st.selectbox('Select a node', list(range(len(options))),
	                               format_func=lambda i: options[i])
	        if sel_idx is not None:
	            sel_node = agg_nodes[sel_idx]
	            lids = agg_leaf_ids[sel_idx]
	            leaf_ids_set = {nm[i]['metadata']['leaf_id']
	                            for i in lids if i in nm and 'metadata' in nm[i]}
	            sub = can[can['_leaf_id'].isin(leaf_ids_set)]
	            st.write(f'{len(lids)} variables under "{sel_node["name"]}"')
	            st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
	                         use_container_width=True)

	with tabs[2]:
	    # Canonical table without the '_group_path' column (ignored if absent).
	    _canonical_view = can.drop(columns=['_group_path'], errors='ignore')
	    st.dataframe(_canonical_view, use_container_width=True)

	with tabs[3]:
	    # Export: browser downloads of the hierarchy JSON and canonical CSV, plus
	    # a button that writes the same two files next to this script.
	    _base = safe_name(project_name)
	    _json_name = f'{_base}_baseline_hierarchy.json'
	    _csv_name = f'{_base}_baseline_canonical.csv'

	    col1, col2 = st.columns(2)
	    col1.download_button(
	        'Hierarchy JSON',
	        data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
	        file_name=_json_name,
	        mime='application/json',
	        use_container_width=True,
	    )
	    col2.download_button(
	        'Canonical CSV',
	        data=can.to_csv(index=False).encode('utf-8'),
	        file_name=_csv_name,
	        mime='text/csv',
	        use_container_width=True,
	    )

	    st.divider()
	    # ── Save directly into the project's outputs/baseline/ folder ──────────────
	    _out_dir = Path(__file__).resolve().parent / 'outputs' / 'baseline'
	    st.markdown('### Save to project folder')
	    st.caption(
	        "The download buttons above go to your browser's Downloads folder (a browser "
	        f'restriction). This button instead writes the files into `{_out_dir}` with the '
	        'dataset name — convenient for `evaluate_all.py`.'
	    )
	    _save_clicked = st.button('Save all to outputs/baseline/', type='primary',
	                              use_container_width=True)
	    if _save_clicked:
	        try:
	            _out_dir.mkdir(parents=True, exist_ok=True)
	            _json_path = _out_dir / _json_name
	            _json_path.write_text(
	                json.dumps(nodes, indent=2, ensure_ascii=False), encoding='utf-8')
	            can.to_csv(_out_dir / _csv_name, index=False)
	            st.success(f'Saved to `{_out_dir}`:\n\n- {_json_name}\n- {_csv_name}')
	        except Exception as _e:
	            # Any failure is reported in the UI instead of aborting the run.
	            st.error(f'Could not save: {_e}')

	with tabs[4]:
	    # Evaluation: reference-free metrics from the hierarchy_eval module,
	    # rendered as three rows of metric tiles.
	    import hierarchy_eval as he

	    st.subheader('Hierarchy Quality Evaluation')
	    st.caption(
	        'No manually curated reference taxonomy is available for this experiment. '
	        'The metrics below are reference-free: they assess hierarchy structure, '
	        'label coherence and interpretability directly.'
	    )

	    with st.spinner('Computing reference-free metrics…'):
	        traco = he.traco_metrics(nodes)
	        npmi_score = he.npmi_coherence(nodes, can['_text'].tolist())

	    # ── PRIMARY: reference-free hierarchy quality ─────────────────────────────
	    st.markdown('#### Primary — reference-free hierarchy quality')
	    primary_tiles = [
	        ('Parent–child coherence', traco['pc_coherence'],
	         'TraCo (Wu et al., AAAI 2024). Mean similarity of each node to its parent. '
	         'Higher = children correctly nest under their parent theme.'),
	        ('Sibling diversity', traco['sibling_diversity'],
	         'TraCo (Wu et al., AAAI 2024). Mean distance between sibling nodes. '
	         'Higher = siblings are distinct (LOW = redundant/repeated siblings).'),
	        ('NPMI label coherence', npmi_score,
	         'Lau et al., EACL 2014. Whether node-label terms genuinely co-occur in the '
	         'data. Higher = meaningful labels, not arbitrary term salads.'),
	    ]
	    for tile, (title, value, tip) in zip(st.columns(3), primary_tiles):
	        tile.metric(title, value, help=tip)
	    st.caption(f'Embedding backend: {traco["encoder"]}. '
	               'Coherence & diversity ∈ [−1, 1]; NPMI ∈ ≈[−1, 1].')

	    # ── Label-quality proxies (interpretability) ──────────────────────────────
	    st.markdown('#### Label quality (interpretability — reference-free)')
	    label_q = he.label_quality(nodes)
	    label_tiles = [
	        ('Concept-valid labels', f"{label_q['concept_label_pct']}%",
	         '% of internal labels that read as a real concept (short noun '
	         'phrase, WordNet head) rather than a "/"-joined term fragment.'),
	        ('Sibling label redundancy', f"{label_q['redundancy_pct']}%",
	         '% of internal labels duplicating a sibling label (lower is better).'),
	        ('Avg label words', label_q['avg_label_words'],
	         'Mean label length in words (shorter = more name-like).'),
	    ]
	    for tile, (title, value, tip) in zip(st.columns(3), label_tiles):
	        tile.metric(title, value, help=tip)

	    # ── Structural metrics ────────────────────────────────────────────────────
	    st.markdown('#### Structural statistics')
	    struct = he.structural_stats(nodes)
	    # Only the last tile carries a tooltip; None is st.metric's default help.
	    struct_tiles = [
	        ('Aggregation nodes', struct['n_aggregation_nodes'], None),
	        ('Max leaf depth', struct['max_depth'], None),
	        ('Avg leaf depth', struct['avg_leaf_depth'], None),
	        ('Avg branching', struct['avg_branching_factor'], None),
	        ('Singleton nodes', f"{struct['singleton_nodes_%']}%",
	         'Aggregation nodes with a single child (sparse-hierarchy indicator)'),
	    ]
	    for tile, (title, value, tip) in zip(st.columns(5), struct_tiles):
	        tile.metric(title, value, help=tip)