# baseline.py — Metadata Hierarchy Builder — Baseline (Taxonomizer) # # Baseline = Taxonomizer (Mahmood & Mueller, IEEE TVCG 2019), semantic-space # pipeline, adapted to a metadata-only setting. No hardcoded domain patterns. # # Pipeline: # 1. Load metadata file (CSV / TSV / XLSX / JSON) # 2. Detect column roles (leaf / context / text / meta) — same as Approach 1 / 2 # 3. Build canonical schema (incl. _semantic_text = description values only) # 4. Embed each variable (code + description) via Word2Vec skip-gram and build # the cosine-distance semantic space [TAX §3.2] # 5. Recursively cluster (agglomerative, cosine) into the dendrogram taxonomy; # internal-node labels = data-driven contrastive terms of each cluster # 6. Visualise (Sunburst / Treemap / Node-link) # 7. Export visualization-ready JSON + canonical CSV # # Paper & justified adaptations (metadata/schema setting, fully automatic): # [TAX] Mahmood & Mueller — Taxonomizer, IEEE TVCG 2019. # Builds a SEMANTIC space (cosine over word2vec skip-gram embeddings of # attribute names; gensim, Wikipedia, window=5, dim=128) merged with a # DATA space (correlation over raw values), clustered into a dendrogram; # inner nodes labelled semi-automatically by distributional degree-of- # entailment + WordNet synonyms. # Adaptations (all documented): # 1. No DATA space — a schema/dictionary has no raw values, so we use the # semantic space alone (Taxonomizer with semantic weight = 1.0). # 2. Embed the attribute's short NAME (the description's name clause), since # the bare code goes out-of-vocabulary (a limitation the paper flags, # e.g. "BP"). Taxonomizer embeds the NAME ("a few words"), not a # paragraph; using the short name (not the full description prose) keeps # domain-specific words from being diluted by shared explanatory text. # 3. 
Fully-automatic labels — the paper's labelling is semi-automatic # (human picks from suggestions); a baseline must be non-interactive, so # we use data-driven contrastive terms from each cluster's members. # # Dependencies: gensim # pip install gensim from __future__ import annotations import csv, json, re, warnings from collections import Counter, defaultdict from pathlib import Path import tempfile import numpy as np import pandas as pd import plotly.graph_objects as go import streamlit as st from sklearn.cluster import AgglomerativeClustering from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, silhouette_score from sklearn.preprocessing import LabelEncoder warnings.filterwarnings('ignore') st.set_page_config(page_title='Metadata Hierarchy — Baseline', layout='wide') st.title('Metadata Hierarchy Builder — Baseline (Taxonomizer)') st.caption( 'Taxonomizer baseline [Mahmood & Mueller, IEEE TVCG 2019]: Word2Vec skip-gram ' 'semantic space (short attribute names) + balanced Ward agglomerative clustering ' 'into the dendrogram taxonomy; nodes labelled by data-driven contrastive terms. ' 'Semantic space only (no raw data values); no hardcoded patterns, no external APIs.' ) # ───────────────────────────────────────────────────────────────────────────── # CONSTANTS # ───────────────────────────────────────────────────────────────────────────── LEAF_KEYS = 'variable var field column attribute name code id item indicator question measure concept'.split() GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split() TEXT_KEYS = 'description definition desc label title question meaning note notes text display full details explanation comment'.split() META_KEYS = 'type dtype data_type datatype unit units format decimal precision values value coding codebook range min max scale'.split() # URL pattern — strip embedded links (e.g. 
# HCP FreeSurfer NeuroLex URLs) so web tokens cannot dominate the embedding or
# the cluster label. [shared with A1]
_URL_RE = re.compile(
    r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
    re.IGNORECASE,
)


# ─────────────────────────────────────────────────────────────────────────────
# FILE LOADING
# ─────────────────────────────────────────────────────────────────────────────

def safe_name(name: str) -> str:
    """Return *name* with every character outside ``[alnum-_.]`` replaced by ``_``."""
    return ''.join(ch if ch.isalnum() or ch in '-_.' else '_' for ch in name)


def try_read_csv(path: Path) -> pd.DataFrame:
    """Read a delimited text file by probing encodings and separators.

    Every (encoding, separator) pair is tried; the parse with the most columns
    (ties broken by the lower overall null fraction) wins. If the winning parse
    looks "comma-packed" (whole rows stuffed into the first column while the
    other columns are mostly empty), the rows are re-split with ``csv.reader``.

    Raises:
        ValueError: if no combination could be parsed.
    """
    best, best_score = None, -1
    for enc in ['utf-8-sig', 'utf-8', 'latin1']:
        for sep in [None, ',', '\t', ';', '|']:
            try:
                df = pd.read_csv(path, sep=sep, engine='python', encoding=enc)
            except Exception:
                # Deliberate best-effort probe: a failing combination is simply
                # not a candidate (decode errors, parser errors, sniff failures).
                continue
            null_frac = float(df.isna().mean().mean())
            if null_frac != null_frac:
                # NaN for a header-only file (no rows); without this the score
                # is NaN, never beats best_score, and the file is rejected.
                null_frac = 0.0
            score = df.shape[1] * 10 - null_frac
            if score > best_score:
                best, best_score = df, score
    if best is None:
        raise ValueError(f'Could not read {path.name}')
    best.columns = [str(c).strip().replace(';', '') for c in best.columns]

    # Repair comma-packed rows (AI-Mind format)
    if len(best) > 0:
        first = best.iloc[:, 0].astype(str)
        other_null = best.iloc[:, 1:].isna().mean().mean() if best.shape[1] > 1 else 1.0
        if first.str.contains(',').mean() > 0.50 and other_null > 0.70:
            lines = path.read_text(encoding='utf-8-sig', errors='replace').splitlines()
            if lines:
                header = [h.strip().replace(';', '') for h in lines[0].split(',')]
                rows = []
                for line in lines[1:]:
                    line = line.strip().rstrip(';')
                    if not line:
                        continue
                    # Whole row wrapped in one pair of quotes → unwrap first.
                    if line.startswith('"') and line.endswith('"'):
                        line = line[1:-1]
                    try:
                        parts = next(csv.reader([line], quotechar='"'))
                    except (csv.Error, StopIteration):
                        continue
                    if len(parts) >= len(header):
                        rows.append(parts[:len(header)])
                if rows:
                    best = pd.DataFrame(rows, columns=header)
    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
    return best


def load_any(path: Path) -> pd.DataFrame:
    """Load a metadata file into a DataFrame, dispatching on the file suffix.

    ``.csv/.tsv/.txt`` go through :func:`try_read_csv`, ``.xlsx/.xls`` through
    ``pd.read_excel``. For ``.json`` a top-level list is normalised directly;
    for a top-level object the first list-valued entry is normalised.

    Raises:
        ValueError: for an unknown suffix, or a JSON document that contains no
            list of records.
    """
    s = path.suffix.lower()
    if s in ['.csv', '.tsv', '.txt']:
        return try_read_csv(path)
    if s in ['.xlsx', '.xls']:
        return pd.read_excel(path)
    if s == '.json':
        obj = json.loads(path.read_text(encoding='utf-8', errors='replace'))
        if isinstance(obj, list):
            return pd.json_normalize(obj)
        if isinstance(obj, dict):
            for v in obj.values():
                if isinstance(v, list):
                    return pd.json_normalize(v)
        # Previously fell through to "Unsupported file type: .json", which
        # misreported a supported type whose *content* has no record list.
        raise ValueError(
            f'Unsupported JSON structure in {path.name}: expected a list of '
            'records or an object containing one')
    raise ValueError(f'Unsupported file type: {s}')


def save_upload(f) -> Path:
    """Persist an uploaded file to a temp directory and return its path.

    *f* must expose ``.name`` and ``.getbuffer()`` (as Streamlit's uploaded-file
    object does). The directory name is derived from a hash of the content, so
    re-saving the same upload (Streamlit re-runs the script on every widget
    interaction) returns the SAME path instead of leaking a fresh temp
    directory each time — which also lets path-keyed caches hit.
    """
    import hashlib

    data = bytes(f.getbuffer())
    digest = hashlib.sha256(data).hexdigest()[:16]
    tmp = Path(tempfile.gettempdir()) / f'baseline_{digest}'
    tmp.mkdir(parents=True, exist_ok=True)
    p = tmp / safe_name(f.name)
    if not p.exists() or p.stat().st_size != len(data):
        p.write_bytes(data)
    return p


# ─────────────────────────────────────────────────────────────────────────────
# ROLE DETECTION [GON]
# ─────────────────────────────────────────────────────────────────────────────

def norm(c: str) -> str:
    """Normalise a column name: lower-case, non-alphanumerics → single ``_``."""
    return re.sub(r'[^a-z0-9]+', '_', str(c).strip().lower()).strip('_')


def kscore(c: str, keys: list) -> int:
    """Count how many of *keys* occur as substrings of the normalised name."""
    nc = norm(c)
    return sum(1 for k in keys if k in nc)


def profile_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Compute per-column statistics and heuristic role scores.

    Returns one row per column with fill rate, cardinality, mean string length
    and the leaf / group / text / metadata scores (keyword hits plus
    cardinality and length bonuses).
    """
    fields = ['column', 'non_null', 'unique_values', 'unique_ratio', 'avg_length',
              'leaf_score', 'group_score', 'text_score', 'metadata_score']
    out = []
    n = max(len(df), 1)
    for col in df.columns:
        s = df[col]
        non = float(s.notna().mean())
        try:
            nun = int(s.nunique(dropna=True))
        except TypeError:
            # Unhashable cells (lists/dicts from nested JSON): count distinct
            # string renderings instead of crashing.
            nun = int(s.dropna().astype(str).nunique())
        ur = nun / n
        avg = float(s.dropna().astype(str).map(len).mean()) if s.notna().any() else 0
        out.append({
            'column': str(col),
            'non_null': round(non, 3),
            'unique_values': nun,
            'unique_ratio': round(ur, 3),
            'avg_length': round(avg, 1),
            'leaf_score': 4*kscore(col, LEAF_KEYS) + (3 if 0.5 <= ur <= 1 else 0) + (1 if avg < 80 else 0),
            'group_score': 4*kscore(col, GROUP_KEYS) + (3 if 1 < nun < min(n*0.5, 80) else 0) + (1 if avg < 60 else 0),
            'text_score': 5*kscore(col, TEXT_KEYS) + (4 if avg > 50 else 0) + (1 if non > 0.5 else 0),
            'metadata_score': 4*kscore(col, META_KEYS) + (2 if 1 < nun < min(n*0.8, 100) else 0),
        })
    # Explicit columns keep the frame usable (sortable/filterable) even when
    # df has no columns at all.
    return pd.DataFrame(out, columns=fields)


def detect_roles(df: pd.DataFrame) -> tuple:
    """Auto-detect column roles.

    Identical logic to Approach 1 / 2 so the preprocessing up to the canonical
    table is comparable across all apps.

    Returns:
        ``(cfg, prof)`` where ``cfg`` has the keys ``leaf_cols``,
        ``group_cols``, ``text_cols`` and ``metadata_cols`` and ``prof`` is the
        :func:`profile_columns` table.
    """
    prof = profile_columns(df)
    leaf = prof.sort_values(['leaf_score', 'unique_ratio'], ascending=False).head(1)['column'].tolist()
    # Fall back to the leaf column when nothing looks like free text.
    text = (prof[(prof.text_score >= 4) | (prof.avg_length > 80)]
            .sort_values('text_score', ascending=False)['column'].tolist()) or leaf.copy()
    group = (prof[(prof.group_score >= 4) & (~prof.column.isin(leaf)) & (prof.unique_values > 1)]
             .sort_values('group_score', ascending=False)['column'].head(3).tolist())
    meta = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
            .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())

    # Representation columns (decimal/precision/unit/type/format/…) must never
    # become structural levels; prefer them as metadata. [GON][TAX]
    _META_SUBSTR_BLOCK = {
        'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
        'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
    }

    def _is_repr(col_name):
        nc = re.sub(r'[^a-z0-9]', '', str(col_name).lower())
        return any(sub in nc for sub in _META_SUBSTR_BLOCK)

    meta_extra = [c for c in prof['column'].tolist()
                  if _is_repr(c) and c not in text and c not in leaf and c not in meta]
    group = [c for c in group if not _is_repr(c)]
    meta = list(dict.fromkeys(meta + meta_extra))[:8]
    return {'leaf_cols': leaf, 'group_cols': group,
            'text_cols': text, 'metadata_cols': meta}, prof


# ─────────────────────────────────────────────────────────────────────────────
# CANONICAL SCHEMA [GON]
# ─────────────────────────────────────────────────────────────────────────────

def sv(x) -> str:
    """Return *x* as a stripped string, mapping missing scalars to ``''``.

    ``pd.isna`` on a list/array returns an array, whose truth value is
    ambiguous; such non-scalar cells (possible after ``json_normalize``) are
    therefore stringified instead of being tested for missingness.
    """
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return ''
    return str(x).strip()


def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
    """Build the canonical one-row-per-variable table.

    Args:
        df: the loaded metadata table.
        cfg: role config with ``leaf_cols``, ``group_cols``, ``text_cols`` and
            ``metadata_cols`` (missing keys are treated as empty).
        source: value stored in ``_source_file``.

    Returns:
        DataFrame with ``_source_file``, ``_row_index``, ``_leaf_label``,
        ``_leaf_id`` (made unique), ``_group_path``, ``_text`` and
        ``_semantic_text``. An empty *df* yields an empty frame that still has
        these columns.
    """
    fields = ['_source_file', '_row_index', '_leaf_label', '_leaf_id',
              '_group_path', '_text', '_semantic_text']
    leaf_cols = cfg.get('leaf_cols', [])
    group_cols = cfg.get('group_cols', [])
    text_cols = cfg.get('text_cols', [])
    meta_cols = cfg.get('metadata_cols', [])
    all_cols = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))

    rows = []
    for pos, (i, row) in enumerate(df.iterrows()):
        try:
            row_index = int(i)
        except (TypeError, ValueError):
            # Non-integer index labels: fall back to the positional index.
            row_index = pos

        leaf_parts = [sv(row.get(c, '')) for c in leaf_cols]
        leaf_parts = [p for p in leaf_parts if p]
        label = ' / '.join(leaf_parts) if leaf_parts else f'variable_{row_index + 1}'

        group_parts = [sv(row.get(c, '')) for c in group_cols]
        group_parts = [p for p in group_parts if p and p.lower() not in ['nan', 'none']]
        gpath = ' > '.join(group_parts) if group_parts else 'Ungrouped'

        parts = []
        for c in all_cols:
            v = sv(row.get(c, ''))
            if v:
                parts.append(f'{c}: {v}')
        text = ' | '.join(parts) if parts else label

        # _semantic_text: description VALUES only — no "fieldname:" prefixes, no
        # other fields, URLs stripped. This is the clean text Taxonomizer embeds
        # (the attribute's meaning), identical in spirit to Approach 1's column.
        sem_parts = [sv(row.get(c, '')) for c in text_cols]
        sem_parts = [p for p in sem_parts if p]
        if not sem_parts:
            sem_parts = list(leaf_parts)
        semantic = _URL_RE.sub(' ', ' '.join(sem_parts)) if sem_parts else label

        rows.append({
            '_source_file': source,
            '_row_index': row_index,
            '_leaf_label': label,
            '_leaf_id': f'{gpath}.{label}' if gpath != 'Ungrouped' else label,
            '_group_path': gpath,
            '_text': text,
            '_semantic_text': semantic,
        })

    # Explicit columns: an empty input must not make can['_leaf_id'] a KeyError.
    can = pd.DataFrame(rows, columns=fields)
    if can['_leaf_id'].duplicated().any():
        cnt: dict = defaultdict(int)
        seen: set = set()
        ids = []
        for lid in can['_leaf_id']:
            cnt[lid] += 1
            cand = lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}'
            # A suffixed id may coincide with a genuine id (e.g. 'x__2');
            # keep bumping the counter until the id is really unique.
            while cand in seen:
                cnt[lid] += 1
                cand = f'{lid}__{cnt[lid]}'
            seen.add(cand)
            ids.append(cand)
        can['_leaf_id'] = ids
    return can


# ─────────────────────────────────────────────────────────────────────────────
# TAXONOMIZER CORE [TAX — Mahmood & Mueller, IEEE TVCG 2019]
#
# Taxonomizer builds the taxonomy from a SEMANTIC SPACE (cosine distance between
# word2vec skip-gram embeddings of attribute names) merged with a DATA SPACE
# (correlation over the raw values). In a metadata/schema setting we have no
# raw data values, so we use the semantic space alone (= Taxonomizer with
# semantic weight 1.0).
Because attribute *names* here are opaque codes that go # out-of-vocabulary — a limitation the paper explicitly flags (e.g. "BP") — we # embed code + description so real words carry the meaning (OOV code tokens are # skipped during averaging). Internal-node labels: the paper uses semi-automatic # distributional degree-of-entailment + WordNet synonyms; a baseline must be # fully automatic, so we use data-driven contrastive terms drawn from the data. # ───────────────────────────────────────────────────────────────────────────── _W2V_STOP = frozenset( 'a an the and or but if in on at to of for with by is are was were be ' 'been being have has had do does did will would could should may might ' 'shall can this that these those i you he she it we they me him her us ' 'them my your his her its our their what which who whom when where why ' 'how all each every few more most other some such no not only same so ' 'than too very just because as until while'.split() ) @st.cache_resource(show_spinner=False) def _load_w2v(): """Load pre-trained Word2Vec / GloVe model via gensim downloader. We prefer glove-wiki-gigaword-100 (~66 MB) because its Wikipedia training corpus and skip-gram-style objective most closely match Taxonomizer's described word2vec-Wikipedia-dim128 model. """ try: import gensim.downloader as api return api.load('glove-wiki-gigaword-100') except Exception as e: st.error( f'Could not load Word2Vec model: {e}\n\n' 'Run: pip install gensim and restart the app.\n' 'The model (~66 MB) is downloaded automatically on first use.' ) return None def _tokenize(label: str) -> list[str]: return [t for t in re.sub(r'[^a-zA-Z]+', ' ', label).lower().split() if len(t) > 2 and t not in _W2V_STOP] def attribute_name(text: str) -> str: """The attribute's short NAME — what Taxonomizer actually embeds [TAX §3.2]. The paper embeds the attribute name ("not more than a few words long"), not a paragraph. 
Descriptions here are formatted ': ' (some prefixed with a marker like 'KEY: : …'), so we take the first clause that is not a pure all-caps marker. Embedding this short name — rather than the full description prose — keeps the domain-specific words from being diluted by shared explanatory text, so the taxonomy clusters more by theme (e.g. DMS / PAL / SWM). """ text = str(text) for clause in re.split(r'[:\n]', text): clause = clause.strip() if clause and not all(2 <= len(w) <= 6 and w.isupper() for w in clause.split()): return clause return text.strip() def embed_labels_w2v(labels: list[str], model) -> np.ndarray: """Average Word2Vec vectors for each label's tokens [TAX §4.1]. Falls back to a zero vector for labels where none of the tokens are in the model vocabulary (rare for standard English attribute names). """ dim = model.vector_size out = np.zeros((len(labels), dim), dtype=np.float32) for i, label in enumerate(labels): toks = _tokenize(label) vecs = [model[t] for t in toks if t in model] if vecs: out[i] = np.mean(vecs, axis=0) # L2-normalise so cosine distance = 1 - dot norms = np.linalg.norm(out, axis=1, keepdims=True) norms[norms == 0] = 1.0 return out / norms def _cluster(X: np.ndarray, k: int) -> np.ndarray: """Ward-linkage agglomerative cut into k clusters. Ward (on the L2-normalised embedding vectors, where Euclidean ∝ √cosine) minimises within-cluster variance and so produces *balanced* clusters. This avoids the average/single-linkage chaining pathology that otherwise peels off tiny clusters and leaves one giant residual (i.e. no real hierarchy forms). """ return AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(X) def best_k(X: np.ndarray, n: int, k_min: int = 2, k_max: int = 8) -> int: """Pick the number of clusters that maximises the silhouette score. Fully data-driven — no fixed cluster count. Returns 1 only when the node is too small to split (n <= k_min). 
""" k_hi = min(k_max, n - 1) if k_hi < k_min: return 1 best, best_s = 1, -1.0 for k in range(k_min, k_hi + 1): labels = _cluster(X, k) if len(set(labels)) < 2: continue try: s = silhouette_score(X, labels) except Exception: continue if s > best_s: best_s, best = s, k return best def _doc_freq(texts: list[str]) -> Counter: """Document frequency: how many member texts each content word appears in.""" c: Counter = Counter() for t in texts: for w in set(_tokenize(t)): c[w] += 1 return c def cluster_term_label(member_texts: list[str], sibling_texts: list[str], used: set, vocab=None, top_n: int = 2) -> str: """Label a node with the content words most characteristic of its members. Data-driven labelling: each candidate word is scored by how much more frequent it is *inside* the cluster than in the sibling pool (contrastive document frequency), so labels are domain terms drawn from the dataset itself — not external ontology words. This replaces Taxonomizer's WordNet degree-of-entailment, which produces over-general, off-domain abstractions on specialised scientific metadata. If `vocab` is given (the Word2Vec model), only real dictionary words are eligible, so opaque attribute codes (e.g. 'dms', 'motml') are filtered out of labels. Codes are used only as a last-resort fallback. 
""" def in_vocab(w: str) -> bool: return vocab is None or w in vocab n_in = max(len(member_texts), 1) n_out = max(len(sibling_texts), 1) cin = _doc_freq(member_texts) cout = _doc_freq(sibling_texts) scores: dict[str, float] = {} for w, f in cin.items(): if w in used or len(w) <= 2 or not in_vocab(w): continue p_in = f / n_in p_out = cout.get(w, 0) / n_out # ignore single-occurrence noise unless the term is widely shared if f < 2 and p_in < 0.5: continue scores[w] = p_in - p_out picks = [w for w, _ in sorted(scores.items(), key=lambda x: -x[1])[:top_n] if scores[w] > 0] if not picks: # fallback: most frequent shared real word, then any shared token for require_vocab in (True, False): for w, _ in cin.most_common(): if w not in used and len(w) > 2 and (not require_vocab or in_vocab(w)): picks = [w] break if picks: break return ' / '.join(p.title() for p in picks) if picks else 'Group' # ───────────────────────────────────────────────────────────────────────────── # HIERARCHY CONSTRUCTION [TAX + GON] # ───────────────────────────────────────────────────────────────────────────── def _nmap(nodes: list) -> dict: return {int(n['id']): n for n in nodes} def _next_id(nodes: list) -> int: return max((int(n['id']) for n in nodes), default=0) + 1 def _add_child(nodes: list, parent_id: int, child_id: int): m = _nmap(nodes) p = m.get(int(parent_id)) if p is None: return rel = list(p.get('related', [])) if int(child_id) not in rel: rel.append(int(child_id)) p['related'] = rel def _make_agg(nid: int, name: str, desc: str = '') -> dict: return {'id': int(nid), 'name': str(name), 'related': [], 'type': 'aggregation', 'isShown': True, 'desc': desc, 'dtype': 'determine'} def _leaf_ids(nodes: list, nid: int) -> list: m = _nmap(nodes) out: list = [] def rec(x): n = m.get(int(x)) if not n: return if n.get('type') == 'attribute': out.append(int(x)) return for c in n.get('related', []): rec(int(c)) rec(nid) return list(dict.fromkeys(out)) def build_hierarchy(can: pd.DataFrame, w2v_model, 
project: str = 'project', max_depth: int = 3, min_cluster_size: int = 6, branch_max: int = 8) -> list: """Taxonomizer semantic-space construction [TAX]. Embeds each variable from its short attribute NAME (Word2Vec skip-gram average) — the name clause of the description, as Taxonomizer specifies. Recursively clusters via balanced Ward linkage — the semantic-space dendrogram. Labels each internal node with the contrastive content terms of its members (data-driven, fully automatic). No hardcoding. """ # ── leaf attribute nodes (ids 1..N) ────────────────────────────────────── nodes: list = [{'id': 0, 'name': project, 'type': 'root', 'dtype': 'root', 'isShown': True, 'related': [], 'desc': 'Root node'}] row_to_node: list = [] embed_list: list[str] = [] # short attribute name → embedding input + labels for i, (_, r) in enumerate(can.iterrows(), start=1): sem = str(r.get('_semantic_text', '') or r['_leaf_label']) name = attribute_name(sem) or str(r['_leaf_label']) nodes.append({'id': i, 'name': r['_leaf_label'], 'dtype': 'determine', 'related': [], 'isShown': True, 'type': 'attribute', 'desc': r['_text'], 'metadata': {'leaf_id': r['_leaf_id'], 'group_path': r['_group_path']}}) row_to_node.append(i) embed_list.append(name) label_list = embed_list row_to_node = np.array(row_to_node) # ── Word2Vec semantic-space embeddings [TAX §3.2] ───────────────────────── emb = embed_labels_w2v(embed_list, w2v_model) # (N, dim), L2-normalised # ── recursive clustering down the Ward dendrogram ───────────────────────── def attach_leaves(parent_id: int, idx: np.ndarray): for i in idx: _add_child(nodes, parent_id, int(row_to_node[i])) def recurse(parent_id: int, idx: np.ndarray, depth: int, used: set): n = len(idx) if n <= min_cluster_size or depth >= max_depth: attach_leaves(parent_id, idx) return sub = emb[idx] k_cap = min(branch_max, n - 1) # Branching floor: a node with n leaves and `remaining` levels left must # fan out enough to fit all its leaves into buckets of ~min_cluster_size # 
by the depth cap, i.e. k >= (n / min_cluster_size) ** (1/remaining). # Without this, silhouette keeps picking k=2 on overlapping data (e.g. # HCP), giving a near-binary tree that dumps ~100 leaves per bottom node. remaining = max(1, max_depth - depth) k_floor = int(np.ceil((n / max(min_cluster_size, 1)) ** (1.0 / remaining))) k_floor = max(2, min(k_floor, k_cap)) k = best_k(sub, n, k_min=k_floor, k_max=k_cap) if k <= 1: k = min(k_floor, k_cap) if n > min_cluster_size else 1 if k <= 1: attach_leaves(parent_id, idx) return cluster_labels = _cluster(sub, k) for c in range(k): mask = cluster_labels == c members = idx[mask] if len(members) == 0: continue if len(members) == 1: # don't create singleton internal nodes _add_child(nodes, parent_id, int(row_to_node[members[0]])) continue mset = set(members.tolist()) member_texts = [label_list[i] for i in members] sibling_texts = [label_list[i] for i in idx if i not in mset] # data-driven contrastive-term labelling label = cluster_term_label(member_texts, sibling_texts, used) nid = _next_id(nodes) nodes.append(_make_agg(nid, label, desc=f'Cluster of {len(members)} variables — ' f'label terms: {label}')) _add_child(nodes, parent_id, nid) recurse(nid, members, depth + 1, used | {label.lower()}) recurse(0, np.arange(len(can)), 0, set()) for n in nodes: n['related'] = list(dict.fromkeys(int(x) for x in n.get('related', []))) return nodes # ───────────────────────────────────────────────────────────────────────────── # VISUALISATION # ───────────────────────────────────────────────────────────────────────────── def _parent_map(nodes: list) -> dict: pm: dict = {} for n in nodes: for c in n.get('related', []): if int(c) not in pm: pm[int(c)] = int(n['id']) return pm # ───────────────────────────────────────────────────────────────────────────── # EVALUATION HELPERS # ───────────────────────────────────────────────────────────────────────────── def _eval_cluster_assignments(nodes: list, can: pd.DataFrame) -> list[int]: """Return 
predicted cluster id (depth-1 aggregation ancestor) for each row in can.""" pm = _parent_map(nodes) def depth1(nid: int) -> int: # Walk up until our parent is root (id==0) or we have no parent while pm.get(nid, -1) not in (-1, 0): nid = pm[nid] return nid lid_to_nid = {n['metadata']['leaf_id']: int(n['id']) for n in nodes if n.get('type') == 'attribute' and 'metadata' in n} return [depth1(lid_to_nid[lid]) if lid in lid_to_nid else -1 for lid in can['_leaf_id']] def _purity(y_true, y_pred) -> float: from collections import Counter clusters: dict = {} for t, p in zip(y_true, y_pred): clusters.setdefault(p, []).append(t) correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values()) return correct / max(len(y_true), 1) def _structural_stats(nodes: list) -> dict: pm = _parent_map(nodes) def depth_of(nid: int) -> int: d = 0 while nid in pm: nid = pm[nid]; d += 1 return d agg = [n for n in nodes if n.get('type') == 'aggregation'] leafs = [n for n in nodes if n.get('type') == 'attribute'] depths = [depth_of(int(n['id'])) for n in leafs] branches = [len(n.get('related', [])) for n in agg] singletons = sum(1 for b in branches if b == 1) return { 'n_aggregation_nodes': len(agg), 'max_depth': int(max(depths, default=0)), 'avg_leaf_depth': round(float(np.mean(depths)), 2) if depths else 0.0, 'avg_branching_factor': round(float(np.mean(branches)), 2) if branches else 0.0, 'singleton_nodes_%': round(100.0 * singletons / max(len(agg), 1), 1), } def _wrap(text: str, width: int = 70) -> str: """Wrap long hover text onto multiple
lines so it never runs off-screen.""" import textwrap text = str(text).replace('<', '<') lines: list = [] for para in text.split('\n'): wrapped = textwrap.wrap(para, width=width) or [''] lines.extend(wrapped) return '
'.join(lines) def plot_sunburst(nodes: list, max_depth: int = 4) -> go.Figure: pm = _parent_map(nodes) ids, labels, parents, values, hover = [], [], [], [], [] for n in nodes: nid = int(n['id']) lc = len(_leaf_ids(nodes, nid)) ids.append(str(nid)) labels.append(str(n.get('name', ''))[:40]) parents.append('' if nid == 0 else str(pm.get(nid, 0))) values.append(max(1, lc)) desc = _wrap(n.get('desc', '')) hover.append(f'{_wrap(n.get("name",""))}
Type: {n.get("type","")}' f'
Variables: {lc}

{desc}') fig = go.Figure(go.Sunburst( ids=ids, labels=labels, parents=parents, values=values, branchvalues='total', hovertext=hover, hoverinfo='text', maxdepth=max_depth, insidetextorientation='radial', marker=dict(colorscale='Greens', line=dict(width=1, color='white')), )) fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10), title='Click a sector to drill down — click centre to go back') return fig def plot_treemap(nodes: list) -> go.Figure: pm = _parent_map(nodes) ids, labels, parents, values, hover = [], [], [], [], [] for n in nodes: nid = int(n['id']) lc = len(_leaf_ids(nodes, nid)) ids.append(str(nid)) labels.append(str(n.get('name', ''))[:40]) parents.append('' if nid == 0 else str(pm.get(nid, 0))) values.append(max(1, lc)) desc = _wrap(n.get('desc', '')) hover.append(f'{_wrap(n.get("name",""))}
Variables: {lc}
{desc}') fig = go.Figure(go.Treemap( ids=ids, labels=labels, parents=parents, values=values, branchvalues='total', hovertext=hover, hoverinfo='text', textinfo='label+value', marker=dict(colorscale='Greens', line=dict(width=1, color='white')), )) fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10)) return fig # ───────────────────────────────────────────────────────────────────────────── # NODE-LINK TREE (Reingold–Tilford layout — matches Approach 1 / 2 interface) # ───────────────────────────────────────────────────────────────────────────── def _bl_node_color(n: dict) -> str: t = n.get('type', '') if t == 'root': return '#2a7d2a' if t == 'attribute': return '#74c476' if t == 'collapsed': return '#bbbbbb' return '#238b45' def _display_graph(nodes: list, max_depth: int = 4): """Walk the tree to the chosen depth, inserting 'collapsed' placeholders for branches cut off below max_depth (the Level-of-Detail control).""" m = _nmap(nodes) dnodes: dict = {} edges: list = [] counter = 10 ** 9 def rec(nid, depth): nonlocal counter n = m.get(int(nid)) if not n: return dnodes[int(nid)] = n if depth >= max_depth and n.get('related'): counter += 1 cid = counter n_leaves = len(_leaf_ids(nodes, nid)) dnodes[cid] = {'id': cid, 'name': f'… {n_leaves} variables', 'type': 'collapsed', 'related': [], 'desc': f"Collapsed: {n.get('name')}", 'isShown': True} edges.append((int(nid), cid)) return for c in n.get('related', []): if int(c) not in m: continue edges.append((int(nid), int(c))) rec(int(c), depth + 1) rec(0, 0) return list(dnodes.values()), edges def _positions(edges: list): """Reingold–Tilford style positions: x = depth, y = subtree-aware vertical.""" H_SCALE, V_SPACE = 3.0, 1.8 children: dict = defaultdict(list) for p, c in edges: children[p].append(c) pos: dict = {} counter = {'v': 0} def rec(nid, depth): ch = children.get(nid, []) if not ch: y = counter['v'] * V_SPACE counter['v'] += 1 pos[nid] = (depth * H_SCALE, y) return y y = float(np.mean([rec(c, depth + 1) 
for c in ch])) pos[nid] = (depth * H_SCALE, y) return y rec(0, 0) return pos def plot_node_link(nodes: list, max_depth: int = 4, show_leaf_labels: bool = False) -> go.Figure: """Node-link tree with elbow edges. Best for inspecting structure at moderate depth; Sunburst is recommended for large hierarchies (Taxonomizer).""" dnodes, edges = _display_graph(nodes, max_depth) pos = _positions(edges) ex, ey = [], [] for p, c in edges: if p not in pos or c not in pos: continue x0, y0 = pos[p]; x1, y1 = pos[c] xm = (x0 + x1) / 2 ex += [x0, xm, xm, x1, None] ey += [y0, y0, y1, y1, None] traces = [go.Scatter(x=ex, y=ey, mode='lines', line=dict(width=1, color='#c8c8c8'), hoverinfo='skip', showlegend=False)] agg_x, agg_y, agg_l, agg_c, agg_h = [], [], [], [], [] lf_x, lf_y, lf_l, lf_c, lf_h = [], [], [], [], [] for n in dnodes: nid = int(n['id']) if nid not in pos: continue x, y = pos[nid] lc = len(_leaf_ids(nodes, nid)) lab = str(n.get('name', nid)) htxt = (f"{_wrap(n.get('name',''))}
Type: {n.get('type','')}" f"
Variables: {lc}

{_wrap(n.get('desc',''))}") col = _bl_node_color(n) if n.get('type') in ('root', 'aggregation', 'collapsed'): agg_x.append(x); agg_y.append(y) agg_l.append((lab + (f' ({lc})' if lc else ''))[:50]) agg_c.append(col); agg_h.append(htxt) else: lf_x.append(x); lf_y.append(y) lf_l.append(lab[:40] if show_leaf_labels else '') lf_c.append(col); lf_h.append(htxt) if agg_x: traces.append(go.Scatter( x=agg_x, y=agg_y, mode='markers+text', text=agg_l, textposition='middle right', hovertext=agg_h, hoverinfo='text', marker=dict(size=16, color=agg_c, line=dict(color='white', width=2)), showlegend=False)) if lf_x: traces.append(go.Scatter( x=lf_x, y=lf_y, mode='markers+text', text=lf_l, textposition='middle right', hovertext=lf_h, hoverinfo='text', marker=dict(size=7, color=lf_c, symbol='circle', opacity=0.75, line=dict(color='white', width=1)), showlegend=False)) n_leaves = max(12, len(lf_x)) fig = go.Figure(traces) fig.update_layout( height=max(700, min(4000, int(n_leaves * 32))), margin=dict(l=20, r=220, t=30, b=20), plot_bgcolor='white', paper_bgcolor='white', xaxis=dict(visible=False, fixedrange=False), yaxis=dict(visible=False, autorange='reversed', fixedrange=False), dragmode='pan') return fig # ───────────────────────────────────────────────────────────────────────────── # SIDEBAR # ───────────────────────────────────────────────────────────────────────────── with st.sidebar: st.header('1. Upload') uploaded = st.file_uploader( 'Upload a metadata file', type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'], accept_multiple_files=False, ) st.header('2. Taxonomizer settings') tx_max_depth = st.slider('Max taxonomy depth', 2, 6, 3, 1, help='How many abstract-to-concrete levels to build') tx_min_size = st.slider('Min cluster size', 3, 20, 6, 1, help='Clusters smaller than this stop splitting (leaves attach directly)') tx_branch = st.slider('Max branches per node', 3, 12, 8, 1, help='Upper bound on clusters per split; the actual number is chosen by silhouette') st.header('3. 
Display') max_items = st.slider('Maximum variables', 25, 1200, 900, 25, help='Cap on variables included (lower only to speed up very large files). ' 'Default keeps full datasets like HCP (813).') group_filter = st.text_input('Row filter (optional)', value='', help='Filter rows by contextual path text before building') # ───────────────────────────────────────────────────────────────────────────── # MAIN # ───────────────────────────────────────────────────────────────────────────── if not uploaded: st.info('Upload a metadata CSV / XLSX / JSON file to begin.') st.markdown(""" ### Baseline algorithm — Taxonomizer (semantic space) Based on **Mahmood & Mueller, IEEE TVCG 2019** (Taxonomizer), adapted to a metadata-only setting. No hardcoded domain patterns, no external APIs. | Step | Method | Paper | |------|--------|-------| | Variable representation | **short attribute name** (description's name clause; codes are OOV) | Taxonomizer §3.2 / §4.1 | | Embedding | Word2Vec skip-gram — average of word vectors (`glove-wiki-gigaword-100`) | Taxonomizer §3.2 | | Semantic space | Cosine-distance matrix (no data space — schema has no raw values) | Taxonomizer §3.2 *(adapted)* | | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 | | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* | This page is the pure Taxonomizer-style semantic-space reference method: variable meanings are embedded and recursively clustered into a hierarchy, with node labels generated from contrastive terms. **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement. **Approach 2** adds NMF/FASTopic aspect discovery + GMM clustering + optional LLM labels. 
""") st.stop() path = save_upload(uploaded) @st.cache_data(show_spinner=False) def _load_profile(path_str: str): df = load_any(Path(path_str)) cfg, prof = detect_roles(df) return df, cfg, prof with st.spinner('Loading file…'): df, auto_cfg, prof = _load_profile(str(path)) st.subheader('Step 1 — File preview') with st.expander(f'{uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)', expanded=False): st.dataframe(df.head(10), use_container_width=True) score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score'] if c in prof.columns] st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False), use_container_width=True) st.subheader('Step 2 — Confirm column roles') cols = list(df.columns) # Scope widget keys to the uploaded file so a NEW file always shows its own # auto-detected defaults (Streamlit otherwise keeps the previous file's # selections under a fixed key, which silently overrides the new defaults). _fk = safe_name(uploaded.name) with st.expander('Column configuration', expanded=True): left, right = st.columns(2) with left: leaf_cols = st.multiselect('Leaf variable column(s)', cols, default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}') group_cols = st.multiselect('Context column(s) (optional)', cols, default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}', help='Optional contextual columns for display/filtering.') with right: text_cols = st.multiselect('Text/description column(s)', cols, default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}') meta_cols = st.multiselect('Metadata/type column(s)', cols, default=[c for c in auto_cfg.get('metadata_cols', []) if c in cols], key=f'meta_{_fk}') if not leaf_cols: st.error('Choose at least one leaf variable column.') st.stop() cfg = {'leaf_cols': leaf_cols, 'group_cols': group_cols, 'text_cols': text_cols, 'metadata_cols': meta_cols} if st.button('Build baseline hierarchy', type='primary'): 
# ── load Word2Vec model (cached after first call) ────────────────────── with st.spinner('Loading Word2Vec model (first run downloads ~66 MB)…'): _w2v = _load_w2v() if _w2v is None: st.stop() with st.spinner('Building hierarchy…'): _can = build_canonical(df, cfg, source=Path(uploaded.name).stem) if group_filter.strip(): _can = _can[_can['_group_path'].str.contains( group_filter.strip(), case=False, na=False)].copy() if len(_can) > max_items: _can = _can.head(max_items).copy() _can = _can.reset_index(drop=True) if len(_can) < 2: st.error('Need at least 2 variables after filtering.') st.stop() _pname = Path(uploaded.name).stem _nodes = build_hierarchy(_can, _w2v, project=_pname, max_depth=tx_max_depth, min_cluster_size=tx_min_size, branch_max=tx_branch) st.session_state['_bl_nodes'] = _nodes st.session_state['_bl_can'] = _can st.session_state['_bl_project'] = _pname if '_bl_nodes' not in st.session_state: st.info('Configure columns above then click **Build baseline hierarchy**.') st.stop() nodes = st.session_state['_bl_nodes'] can = st.session_state['_bl_can'] project_name = st.session_state['_bl_project'] _sm = _structural_stats(nodes) n_leaves = len([n for n in nodes if n['type'] == 'attribute']) n_internal = len([n for n in nodes if n['type'] == 'aggregation']) st.divider() c1, c2, c3, c4 = st.columns(4) c1.metric('Variables', n_leaves) c2.metric('Aggregation nodes', n_internal) c3.metric('Max depth', _sm['max_depth']) c4.metric('Avg branching', _sm['avg_branching_factor']) tabs = st.tabs(['Visualization', 'Node detail', 'Canonical table', 'Export', 'Evaluation']) with tabs[0]: # ── Visualization controls (above chart — matches Approach 1 / 2) ───────── vc1, vc2, vc3 = st.columns([3, 2, 1]) with vc1: viz_mode = st.radio( 'View mode', ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'], horizontal=True, index=0, help='Sunburst best for large hierarchies [Taxonomizer]. 
' 'Node-link best for inspecting structure at moderate depth.') with vc2: display_depth = st.slider('Depth (Level of Detail)', 1, 8, 4, 1, help='How many levels to reveal at once.') with vc3: show_leaf_labels = st.checkbox('Leaf labels', value=False, help='Show variable names on the node-link tree.') st.divider() if viz_mode == 'Sunburst (drill-down)': st.plotly_chart(plot_sunburst(nodes, max_depth=display_depth), use_container_width=True) st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.') elif viz_mode == 'Treemap': st.plotly_chart(plot_treemap(nodes), use_container_width=True) else: st.plotly_chart(plot_node_link(nodes, max_depth=display_depth, show_leaf_labels=show_leaf_labels), use_container_width=True) with tabs[1]: nm = _nmap(nodes) agg_nodes = [n for n in nodes if n['type'] in ('aggregation', 'root')] options = [f'{n["name"]} [{len(_leaf_ids(nodes, int(n["id"])))} vars]' for n in agg_nodes] if options: sel = st.selectbox('Select a node', options) sel_name = sel.split(' [')[0] sel_node = next((n for n in agg_nodes if n['name'] == sel_name), None) if sel_node: lids = _leaf_ids(nodes, int(sel_node['id'])) leaf_ids_set = {nm[i]['metadata']['leaf_id'] for i in lids if i in nm and 'metadata' in nm[i]} sub = can[can['_leaf_id'].isin(leaf_ids_set)] st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"') st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True), use_container_width=True) with tabs[2]: st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), use_container_width=True) with tabs[3]: _base = safe_name(project_name) col1, col2 = st.columns(2) with col1: st.download_button( 'Hierarchy JSON', data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'), file_name=f'{_base}_baseline_hierarchy.json', mime='application/json', use_container_width=True, ) with col2: st.download_button( 'Canonical CSV', data=can.to_csv(index=False).encode('utf-8'), 
file_name=f'{_base}_baseline_canonical.csv', mime='text/csv', use_container_width=True, ) st.divider() # ── Save directly into the project's outputs/baseline/ folder ────────────── _out_dir = Path(__file__).resolve().parent / 'outputs' / 'baseline' st.markdown('### Save to project folder') st.caption( "The download buttons above go to your browser's Downloads folder (a browser " f'restriction). This button instead writes the files into `{_out_dir}` with the ' 'dataset name — convenient for `evaluate_all.py`.' ) if st.button('Save all to outputs/baseline/', type='primary', use_container_width=True): try: _out_dir.mkdir(parents=True, exist_ok=True) (_out_dir / f'{_base}_baseline_hierarchy.json').write_text( json.dumps(nodes, indent=2, ensure_ascii=False), encoding='utf-8') can.to_csv(_out_dir / f'{_base}_baseline_canonical.csv', index=False) st.success(f'Saved to `{_out_dir}`:\n\n' f'- {_base}_baseline_hierarchy.json\n' f'- {_base}_baseline_canonical.csv') except Exception as _e: st.error(f'Could not save: {_e}') with tabs[4]: import hierarchy_eval as he st.subheader('Hierarchy Quality Evaluation') st.caption( 'No manually curated reference taxonomy is available for this experiment. ' 'The metrics below are reference-free: they assess hierarchy structure, ' 'label coherence and interpretability directly.' ) with st.spinner('Computing reference-free metrics…'): tm = he.traco_metrics(nodes) npmi = he.npmi_coherence(nodes, can['_text'].tolist()) # ── PRIMARY: reference-free hierarchy quality ───────────────────────────── st.markdown('#### Primary — reference-free hierarchy quality') p1, p2, p3 = st.columns(3) p1.metric('Parent–child coherence', tm['pc_coherence'], help='TraCo (Wu et al., AAAI 2024). Mean similarity of each node to its parent. ' 'Higher = children correctly nest under their parent theme.') p2.metric('Sibling diversity', tm['sibling_diversity'], help='TraCo (Wu et al., AAAI 2024). Mean distance between sibling nodes. 
' 'Higher = siblings are distinct (LOW = redundant/repeated siblings).') p3.metric('NPMI label coherence', npmi, help='Lau et al., EACL 2014. Whether node-label terms genuinely co-occur in the ' 'data. Higher = meaningful labels, not arbitrary term salads.') st.caption(f'Embedding backend: **{tm["encoder"]}**. ' 'Coherence & diversity ∈ [−1, 1]; NPMI ∈ ≈[−1, 1].') # ── Label-quality proxies (interpretability) ────────────────────────────── st.markdown('#### Label quality *(interpretability — reference-free)*') lq = he.label_quality(nodes) l1, l2, l3 = st.columns(3) l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%", help='% of internal labels that read as a real concept (short noun ' 'phrase, WordNet head) rather than a "/"-joined term fragment.') l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%", help='% of internal labels duplicating a sibling label (lower is better).') l3.metric('Avg label words', lq['avg_label_words'], help='Mean label length in words (shorter = more name-like).') # ── Structural metrics ──────────────────────────────────────────────────── st.markdown('#### Structural statistics') sm = he.structural_stats(nodes) s1, s2, s3, s4, s5 = st.columns(5) s1.metric('Aggregation nodes', sm['n_aggregation_nodes']) s2.metric('Max leaf depth', sm['max_depth']) s3.metric('Avg leaf depth', sm['avg_leaf_depth']) s4.metric('Avg branching', sm['avg_branching_factor']) s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%", help='Aggregation nodes with a single child (sparse-hierarchy indicator)')