diff --git "a/version2/approach_2.py" "b/version2/approach_2.py"
new file mode 100644--- /dev/null
+++ "b/version2/approach_2.py"
@@ -0,0 +1,4405 @@
+# approach_2.py — Approach 2: Dataset-Constrained Multi-Aspect Hierarchy
+#
+# ALGORITHM (no domain hardcoding):
+#
+#   Step 1  Build metadata text objects                                  [GON §3]
+#   Step 2  Group-anchored L1/L2 from detected _group metadata          [BISE-26]
+#   Step 3  Per terminal group — routing in priority order:
+#           (a) Phrase-slot mining               [IE / slot induction]
+#               decomposes one variable into multiple alternative-phrase
+#               signals before clustering — fixes the document-level ceiling
+#               that NMF/BERTopic/FASTopic share.
+#           (b) FASTopic semantic aspect discovery [Wu et al. NeurIPS 2024]
+#               transformer-based Dual Semantic-relation Reconstruction with
+#               optimal transport — recent SOTA replacement for NMF.
+#           (c) NMF lexical fallback             [ZHU §3.1 adapted]
+#               retained for small groups or when FASTopic is unavailable.
+#   Step 4  Per-aspect variable representations                          [ZHU §3.1]
+#   Step 5  Independent per-aspect clustering: GMM+BIC small / KMeans large [ZHU §3.2]
+#   Step 6  Top-down LoD tree (simplified silhouette best-aspect split)  [ZHU §3.3 adapted]
+#   Step 7  Node labeling — deterministic by default:
+#           description-prefix → group anchor → IDF + FIELD_NAME filter →
+#           bigram-preferred discriminative TF-IDF suffix
+#           OPTIONAL: constrained LLM re-phrasing                         [TopicTag, DocEng 2024]
+#                     — every label word must appear in evidence (grounding check)
+#                     — provenance stored per node (audit trail)
+#   Step 8  Reference-free diagnostics: parent-child coherence, sibling diversity,
+#           label coherence, label quality, structure/provenance audit
+#                                                                        [TraCo, AAAI 2024]
+#
+# Facet trees (Castanet, 2007) removed in this version — a single coherent LoD tree.
+#
+# PAPERS:
+#   [ZHU]      Zhu et al. (2025). EMNLP 2025.   Main scaffold (adapted)
+#   [FASTopic] Wu et al. (2024). NeurIPS 2024.  Semantic aspect discovery
+#   [GON]      Gonçalves et al. (2019). ESWC.   Canonical text objects
+#   [TopicTag] Eren et al. (2024). DocEng.      Constrained LLM label refinement
+#   [TraCo]    Wu et al. (2024). AAAI.          Affinity + diversity metrics
+#   [TICL]     Kejriwal et al. (2022). EAAI.    Optional clustering diagnostics
+#   [BISE-26]  Motamedi et al. (2026). BISE.    Group-anchored entry validation
+#   [IE-Slot]  IE / slot-induction literature (surveyed Xu et al., FCS 2024).
+
+from __future__ import annotations
+import json
+import os
+import warnings
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Callable, Optional
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+import streamlit as st
+from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
+from sklearn.decomposition import NMF, TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.mixture import GaussianMixture
+from sklearn.preprocessing import LabelEncoder
+
+warnings.filterwarnings('ignore')
+
+try:
+    from sentence_transformers import SentenceTransformer
+    _SBERT_AVAILABLE = True
+except Exception:   # broken torch raises OSError (WinError 127), not ImportError
+    _SBERT_AVAILABLE = False
+
+try:
+    from openai import OpenAI as _OpenAIClient
+    _LLM_CLIENT_AVAILABLE = True
+except ImportError:
+    _LLM_CLIENT_AVAILABLE = False
+
+# Default endpoints and model names for the two OpenAI-compatible providers
+# used in this module: a local Ollama server and the hosted Groq API.
+# NOTE(review): the original comment says the Ollama values are overridable via
+# env vars OLLAMA_URL / OLLAMA_MODEL; that lookup is not performed here —
+# presumably it happens where these defaults are consumed. Confirm.
+OLLAMA_URL_DEFAULT   = 'http://localhost:11434/v1'
+OLLAMA_MODEL_DEFAULT = 'qwen2.5:3b-instruct'
+GROQ_URL_DEFAULT     = 'https://api.groq.com/openai/v1'
+GROQ_MODEL_DEFAULT   = 'qwen/qwen3-32b'
+
+def _ping_ollama(base_url: str = OLLAMA_URL_DEFAULT, timeout: float = 1.5) -> bool:
+    """
+    Quick reachability check for the local Ollama server.
+
+    Probes Ollama's native ``/api/tags`` endpoint, whose URL is derived from
+    `base_url` by dropping a trailing ``/v1`` path segment (the
+    OpenAI-compatible prefix).
+
+    Returns True only when the endpoint answers HTTP 200 within `timeout`
+    seconds.  Any error, or a missing openai client package, yields False.
+    """
+    if not _LLM_CLIENT_AVAILABLE:
+        return False
+    try:
+        import urllib.request as _urlreq
+        # Remove the '/v1' suffix as a whole.  str.rstrip('/v1') strips the
+        # *characters* '/', 'v' and '1', which corrupts hosts or ports ending
+        # in them (e.g. 'http://host:8081/v1' -> 'http://host:808').
+        root = base_url.rstrip('/')
+        if root.endswith('/v1'):
+            root = root[:-len('/v1')]
+        with _urlreq.urlopen(root + '/api/tags', timeout=timeout) as r:
+            return r.status == 200
+    except Exception:
+        # Best-effort probe: unreachable server, bad URL, timeout -> not available
+        return False
+
+def _make_llm_client(provider: str, base_url: str, api_key: str = '') -> Optional[object]:
+    """
+    Return an OpenAI-compatible client for cloud Groq or local Ollama.
+
+    Both back-ends speak the OpenAI REST dialect, so one client class serves
+    both; only `base_url` and the credentials differ.
+
+    Returns None when the openai package is missing, when `provider` is
+    'groq' but no `api_key` was supplied, or when the Ollama server at
+    `base_url` does not answer the reachability probe.  Any provider value
+    other than 'groq' is handled as Ollama.
+    """
+    if not _LLM_CLIENT_AVAILABLE:
+        return None
+
+    if provider == 'groq':
+        # Groq needs a real key; without one there is nothing to build
+        return _OpenAIClient(base_url=base_url, api_key=api_key) if api_key else None
+
+    if _ping_ollama(base_url):
+        # Ollama ignores the key, but the SDK refuses an empty string
+        return _OpenAIClient(base_url=base_url, api_key='ollama-local')
+    return None
+
+def _parse_json_response(raw: str) -> dict:
+    """
+    Pull the JSON object out of an LLM reply and parse it.
+
+    Reply shapes handled:
+      • bare JSON
+      • JSON wrapped in a markdown code fence
+      • JSON preceded by one or more <think>...</think> reasoning traces
+      • JSON surrounded by free prose
+
+    The reasoning traces are removed first; the text between the first '{'
+    and the last '}' of what remains is then handed to json.loads.
+
+    Raises ValueError for an empty reply or when no '{...}' span exists
+    (json.loads errors are ValueError subclasses as well).
+    """
+    text = (raw or '').strip()
+    if not text:
+        raise ValueError('empty response')
+
+    open_tag, close_tag = '<think>', '</think>'
+    open_at = text.find(open_tag)
+    while open_at != -1:
+        close_at = text.find(close_tag, open_at)
+        if close_at == -1:
+            # Trace never closes: keep only what came before it
+            text = text[:open_at]
+            break
+        text = text[:open_at] + text[close_at + len(close_tag):]
+        open_at = text.find(open_tag)
+    text = text.strip()
+
+    # Outermost brace span — skips fences and prose on either side
+    first, last = text.find('{'), text.rfind('}')
+    if first < 0 or last <= first:
+        raise ValueError('no JSON object in response')
+    return json.loads(text[first:last + 1])
+
+def _safe_chat_completion(client, model: str, prompt: str,
+                            max_tokens: int = 200, temperature: float = 0.1):
+    """
+    Run a single-message chat completion, preferring JSON mode.
+
+    The request is first sent with `response_format={"type": "json_object"}`.
+    If that attempt raises and the exception looks like a bad-request /
+    unsupported-feature rejection — class name contains 'BadRequest', or the
+    message contains '400' or 'response_format' — the same request is sent
+    once more without JSON mode.  Any other exception is re-raised as is.
+
+    Returns whatever `client.chat.completions.create` returns.
+    """
+    request = {
+        'model':       model,
+        'messages':    [{'role': 'user', 'content': prompt}],
+        'temperature': temperature,
+        'max_tokens':  max_tokens,
+    }
+    try:
+        return client.chat.completions.create(
+            **request, response_format={'type': 'json_object'})
+    except Exception as exc:
+        message = str(exc)
+        json_mode_rejected = (
+            'BadRequest' in type(exc).__name__
+            or '400' in message
+            or 'response_format' in message
+        )
+        if not json_mode_rejected:
+            raise
+        # Provider refused JSON mode — retry as a plain completion
+        return client.chat.completions.create(**request)
+
+try:
+    from fastopic import FASTopic                # type: ignore[import-not-found]
+    _FASTOPIC_AVAILABLE = True
+except Exception:   # fastopic pulls in torch; broken torch raises OSError, not ImportError
+    _FASTOPIC_AVAILABLE = False
+
+# ──────────────────────────────────────────────────────────────────────────────
+# ROLE KEYS  (domain-agnostic)
+# ──────────────────────────────────────────────────────────────────────────────
+# Keyword vocabularies, one list per column role.  detect_roles() scores a
+# column by how many of these keywords occur as substrings of its normalised
+# (lower-cased, punctuation-blanked) name.
+#   LEAF_KEYS  — names that identify the variable itself
+#   GROUP_KEYS — names that identify a grouping / category column
+#   TEXT_KEYS  — names that identify free-text descriptions
+#   META_KEYS  — names that identify type / unit / coding metadata
+LEAF_KEYS  = 'variable var field column attribute name code id item indicator question measure concept'.split()
+GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split()
+TEXT_KEYS  = 'description definition desc label title question meaning note notes text display full details explanation'.split()
+META_KEYS  = 'type dtype data_type datatype unit units format decimal precision values value coding range min max scale'.split()
+
+# ──────────────────────────────────────────────────────────────────────────────
+# FILE LOADING
+# ──────────────────────────────────────────────────────────────────────────────
+def safe_name(n: str) -> str:
+    """Return `n` with every character that is not alphanumeric, '-', '_' or '.' replaced by '_'."""
+    allowed_punct = '-_.'
+
+    def _keep_or_mask(ch: str) -> str:
+        if ch.isalnum() or ch in allowed_punct:
+            return ch
+        return '_'
+
+    return ''.join(map(_keep_or_mask, n))
+
+def try_read_csv(path: Path) -> pd.DataFrame:
+    """
+    Read a delimited text file by trying encoding/separator combinations.
+
+    Every pairing of three encodings with the separators ',', tab, ';', '|'
+    and None (pandas' delimiter sniffing) is attempted.  A parse is kept as
+    a candidate only if it yields at least two columns — a single column
+    usually means the wrong separator.  Candidates are ranked by
+    `10 * n_columns - mean NaN fraction`; the first candidate reaching the
+    highest score wins.  Attempts that raise are skipped.
+
+    Column names of the winner are stripped and cleared of ';'.
+
+    Raises ValueError when no attempt produced a usable frame.
+    """
+    encodings  = ('utf-8-sig', 'utf-8', 'latin1')
+    separators = (',', '\t', ';', '|', None)
+    attempts   = [(enc, sep) for enc in encodings for sep in separators]
+
+    winner, winner_score = None, -1
+    for encoding, delimiter in attempts:
+        try:
+            frame = pd.read_csv(path, sep=delimiter, engine='python',
+                                encoding=encoding, on_bad_lines='skip')
+            n_cols = frame.shape[1]
+            if n_cols < 2:
+                continue
+            score = n_cols * 10 - float(frame.isna().mean().mean())
+            if score > winner_score:
+                winner, winner_score = frame, score
+        except Exception:
+            pass
+
+    if winner is None:
+        raise ValueError(f'Could not read {path.name}')
+    winner.columns = [str(col).strip().replace(';', '') for col in winner.columns]
+    return winner
+
+def load_any(path: Path) -> pd.DataFrame:
+    """
+    Load a tabular file into a DataFrame, dispatching on the file suffix.
+
+      • .csv / .tsv / .txt → try_read_csv
+      • .xlsx / .xls       → pandas.read_excel
+      • .json              → a top-level list is normalised directly; for a
+                             top-level object, its first list-valued entry is
+                             normalised
+
+    Raises ValueError('Unsupported: <suffix>') for any other suffix, and also
+    for JSON whose top level is neither a list nor an object holding a list.
+    """
+    ext = path.suffix.lower()
+
+    if ext in ('.csv', '.tsv', '.txt'):
+        return try_read_csv(path)
+
+    if ext in ('.xlsx', '.xls'):
+        return pd.read_excel(path)
+
+    if ext == '.json':
+        payload = json.loads(path.read_text(encoding='utf-8', errors='replace'))
+        if isinstance(payload, list):
+            return pd.json_normalize(payload)
+        if isinstance(payload, dict):
+            first_list = next(
+                (val for val in payload.values() if isinstance(val, list)), None)
+            if first_list is not None:
+                return pd.json_normalize(first_list)
+
+    raise ValueError(f'Unsupported: {ext}')
+
+# ──────────────────────────────────────────────────────────────────────────────
+# COLUMN ROLE DETECTION  (domain-agnostic heuristic scoring)
+# ──────────────────────────────────────────────────────────────────────────────
+def _norm(c: str) -> str:
+    """Lower-case str(`c`) and replace every non-alphanumeric character with a space."""
+    lowered = str(c).lower()
+    pieces = [ch if ch.isalnum() else ' ' for ch in lowered]
+    return ''.join(pieces)
+
+def _ks(c: str, keys: list) -> int:
+    """Count the entries of `keys` that occur as substrings of the normalised name `c`."""
+    haystack = _norm(c)   # normalise once, then test each keyword against it
+    return len([key for key in keys if key in haystack])
+
+def detect_roles(df: pd.DataFrame) -> dict:
+    """
+    Guess which columns of `df` act as leaf / group / text / meta columns.
+
+    Each column receives four heuristic scores: keyword hits in its name
+    (see LEAF_KEYS, GROUP_KEYS, TEXT_KEYS, META_KEYS) plus a bonus derived
+    from its value statistics:
+      • leaf  — +3 when the distinct-value ratio lies in [0.5, 1]
+      • group — +3 when 1 < distinct < min(0.5 * rows, 300)
+      • text  — +4 when the mean string length exceeds 50
+      • meta  — +2 when 1 < distinct < min(0.8 * rows, 100)
+
+    Returns a dict with keys 'leaf_cols' (single best column), 'group_cols'
+    (up to 3, score >= 4, excluding the leaf), 'text_cols' (all with score
+    >= 4, else a copy of the leaf list) and 'meta_cols' (up to 4, score >= 4,
+    excluding everything already assigned).
+    """
+    n_rows = max(len(df), 1)
+    # Group ceiling is 300 (raised from 80) so large datasets with 100+
+    # categories (e.g. HCP assessments) are not excluded.
+    group_ceiling = min(n_rows * 0.5, 300)
+    meta_ceiling  = min(n_rows * 0.8, 100)
+
+    profile_rows = []
+    for col in df.columns:
+        series     = df[col]
+        distinct   = int(series.nunique(dropna=True))
+        uniq_ratio = distinct / n_rows
+        if series.notna().any():
+            mean_len = float(series.dropna().astype(str).map(len).mean())
+        else:
+            mean_len = 0
+
+        leaf_bonus  = 3 if 0.5 <= uniq_ratio <= 1 else 0
+        group_bonus = 3 if 1 < distinct < group_ceiling else 0
+        text_bonus  = 4 if mean_len > 50 else 0
+        meta_bonus  = 2 if 1 < distinct < meta_ceiling else 0
+
+        profile_rows.append({
+            'column':      str(col),
+            'leaf_score':  4 * _ks(col, LEAF_KEYS)  + leaf_bonus,
+            'group_score': 4 * _ks(col, GROUP_KEYS) + group_bonus,
+            'text_score':  5 * _ks(col, TEXT_KEYS)  + text_bonus,
+            'meta_score':  4 * _ks(col, META_KEYS)  + meta_bonus,
+        })
+    prof = pd.DataFrame(profile_rows)
+
+    def _ranked(mask, score_col):
+        # Filter first, then sort descending by the given score
+        return prof[mask].sort_values(score_col, ascending=False)
+
+    leaf = prof.sort_values('leaf_score', ascending=False).head(1)['column'].tolist()
+
+    text = _ranked(prof.text_score >= 4, 'text_score')['column'].tolist()
+    if not text:
+        text = leaf[:]
+
+    group_mask = (prof.group_score >= 4) & (~prof.column.isin(leaf))
+    group = _ranked(group_mask, 'group_score').head(3)['column'].tolist()
+
+    meta_mask = (prof.meta_score >= 4) & (~prof.column.isin(text + leaf + group))
+    meta = _ranked(meta_mask, 'meta_score').head(4)['column'].tolist()
+
+    return {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'meta_cols': meta}
+
+def sv(x) -> str:
+    """Return '' for a missing scalar (per pandas.isna), otherwise str(x) stripped of outer whitespace."""
+    if pd.isna(x):
+        return ''
+    return str(x).strip()
+
+def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
+    """
+    Build normalised per-variable rows with a unified _text field [GON §3].
+
+    Parameters
+    ----------
+    df     : raw metadata table, one row per variable.
+    cfg    : role assignment with optional keys 'leaf_cols', 'group_cols',
+             'text_cols', 'meta_cols' (lists of column names; missing keys
+             default to empty lists).
+    source : identifier stored verbatim in the '_source' column.
+
+    Returns a DataFrame with columns
+    ['_source', '_row', '_label', '_id', '_group', '_text']:
+      • _label — non-empty leaf values joined by ' / ' (or 'var_<index>')
+      • _group — non-empty group values joined by ' > ' (or 'Ungrouped')
+      • _text  — 'col: value' pairs joined by ' | ' over all role columns
+      • _id    — '<group>.<label>', with '__2', '__3', … appended to repeats
+    An empty `df` yields an empty frame with the same columns.
+    """
+    leaf_cols  = cfg.get('leaf_cols', [])
+    group_cols = cfg.get('group_cols', [])
+    text_cols  = cfg.get('text_cols', [])
+    meta_cols  = cfg.get('meta_cols', [])
+    # Loop-invariant: de-duplicated, order-preserving list of every role column
+    all_cols = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))
+    out_columns = ['_source', '_row', '_label', '_id', '_group', '_text']
+
+    rows = []
+    for i, row in df.iterrows():
+        label = (' / '.join(p for p in [sv(row.get(c, '')) for c in leaf_cols] if p)
+                 or f'var_{i}')
+        group = (' > '.join(p for p in [sv(row.get(c, '')) for c in group_cols]
+                             if p and p.lower() not in ['nan', 'none'])
+                 or 'Ungrouped')
+        pairs = ((c, sv(row.get(c, ''))) for c in all_cols)
+        text = ' | '.join(f'{c}: {v}' for c, v in pairs if v)
+        rows.append({'_source': source, '_row': int(i), '_label': label,
+                     '_id': f'{group}.{label}', '_group': group, '_text': text})
+
+    # Passing `columns` keeps the schema stable even when there are no rows;
+    # without it an empty input raised KeyError on the '_id' access below.
+    can = pd.DataFrame(rows, columns=out_columns)
+
+    # Disambiguate repeated ids: first occurrence unchanged, later ones suffixed
+    cnt: dict = defaultdict(int)
+    ids = []
+    for lid in can['_id']:
+        cnt[lid] += 1
+        ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
+    can['_id'] = ids
+    return can
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 3  — ASPECT DISCOVERY VIA NMF  [ZHU §3.1 adapted]
+# ──────────────────────────────────────────────────────────────────────────────
+def _elbow_k(errors: list, k_range: range) -> int:
+    """
+    Pick K at the elbow of a reconstruction-error curve.
+
+    `errors[i]` is the error obtained for the i-th value of `k_range`.  The
+    elbow is the position with the largest second difference.  With two or
+    fewer errors the first candidate is returned; when the second
+    differences are (numerically) constant, max(2, floor(sqrt(len(errors))))
+    is returned instead.
+    """
+    candidates = list(k_range)
+    if len(errors) <= 2:
+        return candidates[0]
+
+    curvature = np.diff(errors, n=2)
+    spread = curvature.max() - curvature.min()
+    if spread < 1e-8:
+        # No distinguishable elbow
+        return max(2, int(np.sqrt(len(errors))))
+
+    position = int(np.argmax(curvature)) + 1
+    position = min(position, len(candidates) - 1)
+    return candidates[position]
+
+def discover_aspects(texts: list, max_aspects: int = 10):
+    """
+    Find K latent aspects by factorising a TF-IDF matrix with NMF
+    [ZHU §3.1 adapted].
+
+    NMF stands in for the LLM aspect generation of Zhu et al. (deterministic,
+    no hallucination).  Candidate K values run from 2 up to
+    min(max_aspects, n_docs // 2, 15); K is chosen by the elbow of the
+    reconstruction error and the model is then refitted at that K.
+
+    Returns (tfidf, X, nmf, W, H, K, labels) where `labels[k]` joins the four
+    highest-weighted terms of component k with ' / '.
+    """
+    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
+                            max_features=3000, min_df=1)
+    X = tfidf.fit_transform(texts)
+    vocab = np.array(tfidf.get_feature_names_out())
+    n_docs = X.shape[0]
+
+    upper = min(max_aspects + 1, n_docs // 2 + 1, 16)
+    candidates = range(2, upper)
+    if len(candidates) < 2:
+        # Too few documents for a real sweep — evaluate K=2 only
+        candidates = range(2, 3)
+
+    def _new_nmf(n_components: int):
+        return NMF(n_components=n_components, random_state=42,
+                   max_iter=400, init='nndsvda')
+
+    recon_errors = []
+    for k in candidates:
+        probe = _new_nmf(k)
+        probe.fit_transform(X)
+        recon_errors.append(probe.reconstruction_err_)
+
+    K   = _elbow_k(recon_errors, candidates)
+    nmf = _new_nmf(K)
+    W   = nmf.fit_transform(X)
+    H   = nmf.components_
+
+    # Aspect label = the component's top-4 terms, strongest first
+    labels = [' / '.join(vocab[np.argsort(H[k])[-4:][::-1]]) for k in range(K)]
+
+    return tfidf, X, nmf, W, H, K, labels
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 3 (FASTopic variant)  — semantic aspect discovery  [Wu et al. NeurIPS 2024]
+# ──────────────────────────────────────────────────────────────────────────────
+def discover_aspects_fastopic(texts: list,
+                               max_aspects: int = 10,
+                               fallback_tfidf: bool = True):
+    """
+    Semantic aspect discovery via FASTopic [Wu et al. NeurIPS 2024,
+    arXiv:2405.17978].
+
+    FASTopic embeds documents with a pretrained Transformer and learns
+    topic/word embeddings via Dual Semantic-relation Reconstruction with
+    optimal transport.
+
+    The return shape matches `discover_aspects`, so this is a drop-in
+    replacement:  (tfidf, X, model, W, H_proxy, K, labels)
+      • tfidf, X — TF-IDF vectoriser and matrix, still produced because
+                   downstream code needs them
+      • W        — FASTopic doc-topic matrix, shape (n_docs, K)
+      • H_proxy  — (K, n_terms) topic-term scores: each top word found in the
+                   TF-IDF vocabulary gets weight 1 / (rank + 1)
+      • labels   — first four top words per topic joined by ' / '
+
+    Falls back to `discover_aspects` (NMF) when FASTopic is not installed,
+    the group has fewer than 6 documents, FASTopic raises, or it returns a
+    doc-topic matrix whose shape does not match the input.
+
+    Raises
+    ------
+    RuntimeError      if a fallback is needed but `fallback_tfidf` is False
+                      (unavailable/too small, or malformed doc-topic output).
+    Exception (any)   re-raised from FASTopic when `fallback_tfidf` is False.
+    """
+    n_vars = len(texts)
+
+    # Decide on the fallback before fitting anything: the NMF path builds its
+    # own TF-IDF, so fitting one here first would be wasted work.
+    if not _FASTOPIC_AVAILABLE or n_vars < 6:
+        if fallback_tfidf:
+            return discover_aspects(texts, max_aspects)
+        raise RuntimeError('FASTopic unavailable and TF-IDF fallback disabled')
+
+    # Keep a TF-IDF matrix available for downstream code paths
+    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
+                             max_features=3000, min_df=1)
+    X = tfidf.fit_transform(texts)
+    terms = np.array(tfidf.get_feature_names_out())
+
+    # K from corpus size (about sqrt(n) + 1), capped by max_aspects and n - 1,
+    # to avoid overfitting tiny groups.
+    K = max(2, min(max_aspects, int(np.ceil(np.sqrt(n_vars))) + 1, n_vars - 1))
+
+    try:
+        model = FASTopic(num_topics=K,
+                          doc_embed_model='all-MiniLM-L6-v2',
+                          verbose=False)
+        # fit_transform returns (top_words_per_topic, doc_topic_dist)
+        result = model.fit_transform(texts)
+        if isinstance(result, tuple) and len(result) == 2:
+            top_words, doc_topic = result
+        else:
+            # Some FASTopic versions return only doc_topic; pull top words via API
+            doc_topic = result
+            top_words = [model.get_top_words(topic_id=k, num_top_words=10)
+                         for k in range(K)]
+    except Exception:
+        # Robust fallback if FASTopic fails (small corpus, OOM, etc.)
+        if fallback_tfidf:
+            return discover_aspects(texts, max_aspects)
+        raise
+
+    W = np.asarray(doc_topic, dtype=np.float32)
+    if W.ndim != 2 or W.shape[0] != n_vars:
+        if fallback_tfidf:
+            return discover_aspects(texts, max_aspects)
+        # Previously this fell through and returned a W that did not line up
+        # with the documents; fail loudly instead.
+        raise RuntimeError(
+            f'FASTopic returned a doc-topic matrix of shape {W.shape} '
+            f'for {n_vars} documents and TF-IDF fallback is disabled')
+
+    # Build H_proxy: K × n_terms with weight = position-decay of each top word
+    term_to_idx = {t: i for i, t in enumerate(terms)}
+    H_proxy = np.zeros((K, len(terms)), dtype=np.float32)
+    labels  = []
+    for k in range(K):
+        words_k = top_words[k] if k < len(top_words) else []
+        # FASTopic reports a topic's top words as ONE space-joined string;
+        # split it, otherwise the loop below would walk single characters.
+        if isinstance(words_k, str):
+            words_k = words_k.split()
+        # Each entry may be 'word', or 'word score', or (word, score)
+        clean: list = []
+        for w in words_k:
+            if isinstance(w, (list, tuple)):
+                w = w[0]
+            w = str(w).split(' ')[0].strip().lower()
+            if w:
+                clean.append(w)
+        for rank, w in enumerate(clean):
+            if w in term_to_idx:
+                H_proxy[k, term_to_idx[w]] += 1.0 / (rank + 1)
+        labels.append(' / '.join(clean[:4]) if clean else f'aspect {k+1}')
+
+    return tfidf, X, model, W, H_proxy, K, labels
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 4  — PER-ASPECT VARIABLE REPRESENTATIONS  [ZHU §3.1]
+# ──────────────────────────────────────────────────────────────────────────────
+def per_aspect_representations(texts: list, H: np.ndarray,
+                                tfidf: TfidfVectorizer,
+                                sbert_model=None) -> list:
+    """
+    Build K independent representation matrices — one per aspect  [ZHU §3.1].
+
+    For each aspect k (row k of `H`):
+      • with `sbert_model`: keep only the whitespace tokens of each text that
+        are among the aspect's top-T terms (T = min(30, vocabulary size)) and
+        encode the result; a text with no surviving token is encoded whole
+      • without it: weight the dense TF-IDF rows by H[k] and L2-normalise
+        (all-zero rows are left as zeros)
+
+    Parameters
+    ----------
+    texts       : variable texts, one per variable.
+    H           : (K, n_terms) aspect-term weights aligned with `tfidf`.
+    tfidf       : fitted vectoriser providing the vocabulary / transform.
+    sbert_model : optional encoder exposing `.encode(list_of_str, ...)`.
+
+    Returns a list of K float32 arrays, each with one row per variable.
+    """
+    terms = np.array(tfidf.get_feature_names_out())
+    K     = H.shape[0]
+    T     = min(30, len(terms))
+    reprs = []
+
+    if sbert_model is not None:
+        # Tokenisation does not depend on the aspect — do it once
+        tokenised = [txt.lower().split() for txt in texts]
+        X_arr = None
+    else:
+        # The dense TF-IDF matrix is only needed on this path; building it
+        # unconditionally wasted n_vars × n_terms floats whenever SBERT is used.
+        tokenised = None
+        X_arr = tfidf.transform(texts).toarray()
+
+    for k in range(K):
+        if sbert_model is not None:
+            top_terms = set(terms[np.argsort(H[k])[-T:]])
+            filtered = []
+            for txt, tokens in zip(texts, tokenised):
+                kept = ' '.join(t for t in tokens if t in top_terms)
+                filtered.append(kept if kept.strip() else txt)
+            emb = sbert_model.encode(filtered, show_progress_bar=False,
+                                     batch_size=64, normalize_embeddings=True)
+        else:
+            emb   = X_arr * H[k][np.newaxis, :]
+            norms = np.linalg.norm(emb, axis=1, keepdims=True)
+            norms[norms == 0] = 1.0   # avoid 0/0 for rows with no aspect terms
+            emb = emb / norms
+
+        reprs.append(emb.astype(np.float32))
+
+    return reprs
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 5  — INDEPENDENT PER-ASPECT CLUSTERING  [ZHU §3.2]
+# ──────────────────────────────────────────────────────────────────────────────
+def cluster_aspect_gmm(emb: np.ndarray, max_k: int = 8, fast_threshold: int = 30):
+    """
+    Cluster variables within one aspect space  [ZHU §3.2].
+
+    Hybrid strategy:
+      • n ≤ fast_threshold → diagonal-covariance GMM, K chosen by lowest BIC
+      • n  > fast_threshold → MiniBatchKMeans, K chosen by best silhouette
+        (engineering adaptation for large groups)
+
+    Embeddings wider than min(20, n - 1) dimensions are first reduced with
+    TruncatedSVD for numerical stability.  Candidate K runs from 2 to
+    min(max_k, n - 1); candidates whose fit raises are skipped.
+
+    Returns
+    -------
+    (labels, k, silhouette)
+      labels     : int array of length n
+      k          : the selected number of components; 1 when n ≤ 2 or when no
+                   candidate could be fitted (labels are then all zero)
+      silhouette : silhouette of the returned labelling, 0.0 when it has a
+                   single cluster or cannot be computed
+    """
+    n = emb.shape[0]
+    if n <= 2:
+        return np.zeros(n, dtype=int), 1, 0.0
+
+    d_target = min(20, emb.shape[1], n - 1)
+    if emb.shape[1] > d_target:
+        svd   = TruncatedSVD(n_components=d_target, random_state=42)
+        emb_r = svd.fit_transform(emb)
+    else:
+        emb_r = emb.copy()
+
+    best_labels, best_k, sil = None, 1, 0.0
+    k_candidates = range(2, min(max_k + 1, n))
+
+    if n > fast_threshold:
+        # Fast path: MiniBatchKMeans + silhouette  (large groups)
+        best_sil = -np.inf
+        for k in k_candidates:
+            try:
+                km     = MiniBatchKMeans(n_clusters=k, random_state=42,
+                                          n_init=3, batch_size=min(256, n),
+                                          max_iter=100)
+                labels = km.fit_predict(emb_r)
+                if len(set(labels)) < 2:
+                    continue
+                score = float(silhouette_score(emb_r, labels))
+                if score > best_sil:
+                    best_sil, best_labels, best_k = score, labels, k
+            except Exception:
+                continue
+        if best_labels is not None:
+            # Selection score already is the silhouette — no need to recompute
+            sil = best_sil
+    else:
+        # Accurate path: GMM + BIC  (small groups)
+        best_bic = np.inf
+        for k in k_candidates:
+            try:
+                gmm = GaussianMixture(n_components=k, random_state=42,
+                                      covariance_type='diag', reg_covar=1e-3,
+                                      max_iter=80, n_init=1,
+                                      init_params='random_from_data')
+                gmm.fit(emb_r)
+                bic    = gmm.bic(emb_r)
+                labels = gmm.predict(emb_r)
+                if bic < best_bic:
+                    best_bic, best_labels, best_k = bic, labels, k
+            except Exception:
+                continue
+        if best_labels is not None and len(set(best_labels)) > 1:
+            try:
+                sil = float(silhouette_score(emb_r, best_labels))
+            except Exception:
+                pass
+
+    if best_labels is None:
+        # Nothing could be fitted: one cluster, and report it as such
+        # (this used to return k == 2 alongside all-zero labels).
+        return np.zeros(n, dtype=int), 1, 0.0
+
+    return best_labels.astype(int), best_k, sil
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 7a  — OPTIONAL CONSTRAINED LLM LABEL REFINEMENT  [TopicTag, DocEng 2024]
+# ──────────────────────────────────────────────────────────────────────────────
+def _light_stem(w: str) -> str:
+    """
+    Minimal English morphological normalisation — no NLTK dependency.
+
+    Used by the LLM grounding check so that 'latencies' matches 'latency',
+    'errors' matches 'error', 'completion' matches 'completed', etc.
+    Avoids rejecting plurals and common tense variants while still requiring
+    every label word to derive from evidence vocabulary.
+
+    The word is lower-cased and stripped, then the first matching rule in a
+    longest-first (suffix, replacement) table is applied — provided more than
+    two characters remain in front of the suffix.  Words matching no rule
+    are returned unchanged.
+
+    Based on Porter-stemmer-style suffix stripping (Porter 1980, adapted).
+    """
+    w = w.lower().strip()
+    # (suffix, replacement).  'ies'/'ied'/'ities'/'iness' restore the base
+    # form's 'y' / 'ity' so plural and singular stem identically; 'ion(s)'
+    # and 'ment' bring nominalisations in line with their '-ed' / '-ments'
+    # variants.
+    rules = (
+        ('ization', ''), ('isation', ''), ('ousness', ''), ('iveness', ''),
+        ('ities', 'ity'), ('iness', 'y'), ('ation', ''), ('ments', ''),
+        ('ness', ''), ('ment', ''), ('ions', ''),
+        ('ies', 'y'), ('ied', 'y'), ('ing', ''), ('ers', ''), ('ion', ''),
+        ('ed', ''), ('es', ''), ('er', ''), ('ly', ''), ('s', ''),
+    )
+    for suffix, replacement in rules:
+        if w.endswith(suffix) and len(w) > len(suffix) + 2:
+            return w[:-len(suffix)] + replacement
+    return w
+
+def make_llm_label_fn(base_url: str = OLLAMA_URL_DEFAULT,
+                       model: str = OLLAMA_MODEL_DEFAULT,
+                       provider: str = 'ollama',
+                       api_key: str = '') -> Optional[Callable]:
+    """
+    Build a TopicTag-style constrained LLM label refinement function backed
+    by an OpenAI-compatible endpoint (local Ollama by default, or Groq when
+    `provider='groq'` and an `api_key` is given).
+
+    [TopicTag] Eren et al. (2024) run NMF to discover topics, then use an LLM
+    to generate human-readable concept labels from the NMF topic terms.
+    Here the LLM receives ONLY the evidence extracted for the cluster — it
+    cannot alter the tree, and its proposal must pass a grounding check:
+    every label word of three or more characters must share a stem with a
+    word in the evidence (top terms, the LLM's cited evidence terms, sample
+    descriptions, parent path).
+
+    The default model (Qwen 2.5 3B Instruct via Ollama) is a deliberate
+    choice: zero cost, no external data transmission, reproducible, and
+    in line with TopicTag's evaluation on open models.
+
+    Returns
+    -------
+    A callable `(candidate, top_terms, parent_path, sample_texts)
+    → (label, metadata_dict)`, or None when no client can be created
+    (provider unreachable, missing key, or openai package not installed).
+
+    The callable never raises for LLM/parsing failures: it returns
+    `candidate` unchanged and records the cause in `metadata_dict['reason']`
+    ('empty', 'ungrounded_words: [...]', 'exception: <Type>'); on success
+    the reason is 'accepted'.  `metadata_dict` also carries 'confidence',
+    'evidence_terms' and 'raw_label' as reported by the model.
+    """
+    client = _make_llm_client(provider, base_url, api_key)
+    if client is None:
+        return None
+
+    def _stems(text: str) -> set:
+        # Blank out punctuation before splitting, so 'time,' or '(latency)'
+        # in the evidence still yields the bare words; whitespace-only
+        # splitting made such tokens unmatchable and rejected valid labels.
+        cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower())
+        return {_light_stem(tok) for tok in cleaned.split() if len(tok) >= 3}
+
+    def _refine(candidate: str, top_terms: list, parent_path: str,
+                sample_texts: list):
+        meta = {'confidence': 0.0, 'evidence_terms': [],
+                'reason': '', 'raw_label': ''}
+        evidence = [str(t) for t in top_terms[:10]]
+        samples  = [str(t) for t in sample_texts[:4]]
+        prompt = (
+            'You are labeling a cluster in a metadata variable hierarchy.\n'
+            'The label MUST be derived strictly from the evidence terms and '
+            'sample variable descriptions provided. Do not introduce concepts '
+            'or vocabulary that are not visible in the evidence.\n\n'
+            f'Parent path: {parent_path}\n'
+            f'Evidence terms (from NMF/TF-IDF over the cluster): {", ".join(evidence)}\n'
+            f'Sample variable descriptions:\n'
+            + '\n'.join(f'  - {t[:160]}' for t in samples) + '\n'
+            f'\nCurrent candidate label: {candidate}\n\n'
+            'Task: Return a concise 2–5 word concept label that PARAPHRASES '
+            'the evidence into a cleaner concept name.\n'
+            'Rules:\n'
+            '1. Every word in the label must appear in (or be an obvious '
+            'morphological variant of) the evidence terms or sample descriptions.\n'
+            '2. Do not invent domain concepts that are not in the evidence.\n'
+            '3. Prefer multi-word noun phrases over single keywords.\n'
+            '4. Avoid generic words: data, score, variable, assessment, total, '
+            'description, value, decimal.\n'
+            '5. Use base forms — singular nouns (Latency not Latencies), and '
+            'avoid -ing / -ed verb suffixes unless required.\n'
+            '6. Output strict JSON only — no prose, no markdown.\n\n'
+            'Output: {"label": "...", "evidence_terms": ["...", "..."], "confidence": 0.0}'
+        )
+        try:
+            # max_tokens generous: reasoning models (Qwen3) emit <think> traces
+            # that consume budget before the JSON appears.
+            resp = _safe_chat_completion(client, model, prompt,
+                                           max_tokens=1200, temperature=0.2)
+            raw = (resp.choices[0].message.content or '').strip()
+            result = _parse_json_response(raw)
+
+            # `or ''` so a JSON null does not become the literal label 'None'
+            label = str(result.get('label') or '').strip()
+
+            # Confidence is informational only (see NOTE below); a missing or
+            # non-numeric value must not discard an otherwise valid label.
+            try:
+                conf = float(result.get('confidence') or 0)
+            except (TypeError, ValueError):
+                conf = 0.0
+
+            # Models sometimes return a bare string instead of a list
+            evid = result.get('evidence_terms') or []
+            if isinstance(evid, str):
+                evid = [evid]
+            elif not isinstance(evid, (list, tuple)):
+                evid = []
+            evid = [str(e) for e in evid]
+
+            meta['confidence'], meta['evidence_terms'] = conf, evid
+            meta['raw_label'] = label
+            if not label:
+                meta['reason'] = 'empty'
+                return candidate, meta
+            # NOTE: we IGNORE the LLM's self-reported confidence.  Qwen 3B
+            # routinely returns conf ≈ 0.5 on perfectly good labels — using it
+            # as a gate rejected useful refinements.  Grounding (below) is the
+            # real anti-hallucination check; if every label word stems back to
+            # the corpus, the label is accepted regardless of self-confidence.
+            #
+            # Stem-aware grounding accepts morphological variants; words
+            # shorter than three characters are not checked.
+            haystack_stems = _stems(
+                ' '.join(evidence + evid + samples + [str(parent_path)]))
+            ungrounded = _stems(label) - haystack_stems
+            if ungrounded:
+                meta['reason'] = f'ungrounded_words: {sorted(ungrounded)}'
+                return candidate, meta
+            meta['reason'] = 'accepted'
+            return label, meta
+        except Exception as e:
+            # Deliberate best-effort: any LLM/parse failure keeps the
+            # deterministic candidate and records why.
+            meta['reason'] = f'exception: {type(e).__name__}'
+            return candidate, meta
+
+    return _refine
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 5a — UPSTREAM LLM PHRASE-ROLE CLASSIFIER  [TopicGPT, NAACL 2024 adapted]
+#
+# Used to discover semantic roles (Measure / Statistic / Condition / Subtype
+# style dimensions) from the corpus and ASSIGN each mined phrase to a role.
+# This is fundamentally different from TopicTag-style label refinement:
+#   • TopicTag (and the make_llm_label_fn above) uses LLMs to RENAME clusters
+#     after the tree structure is already decided.  LLM is downstream.
+#   • This function uses an LLM to CREATE the role schema before the tree is
+#     built.  Slot ordering and parent-child placement are driven by the
+#     LLM-derived roles.  LLM is upstream — closer to TopicGPT (Pham et al.,
+#     NAACL 2024), where LLMs perform topic discovery, not topic labeling.
+#
+# Anti-hallucination contract (preserved):
+#   • Every phrase in the role assignment MUST come from the corpus verbatim.
+#     Outputs are validated against the input phrase list — anything new is
+#     dropped.
+#   • Role NAMES are proposed by the LLM but kept to 1–2 generic English
+#     words; long or fanciful role names are rejected.
+#   • If validation fails, the caller falls back to deterministic mutual-
+#     exclusion clustering (current slot mining behaviour).  No silent
+#     failure modes.
+# ──────────────────────────────────────────────────────────────────────────────
+def make_llm_role_classifier_fn(base_url: str = OLLAMA_URL_DEFAULT,
+                                  model: str = OLLAMA_MODEL_DEFAULT,
+                                  provider: str = 'ollama',
+                                  api_key: str = '') -> Optional[Callable]:
+    """
+    Build `_classify(phrases, sample_descs, group_name='')` → (roles | None, meta): an LLM
+    (Ollama or Groq) sorts repeated corpus phrases into semantic roles.  None if no client.
+    """
+    client = _make_llm_client(provider, base_url, api_key)
+    if client is None:
+        return None
+
+    def _classify(phrases: list, sample_descs: list, group_name: str = ''):
+        meta = {'reason': '', 'raw': '', 'role_count': 0}  # returned alongside every result
+        if not phrases:
+            meta['reason'] = 'no_phrases'
+            return None, meta
+        # De-duplicate (order kept), then truncate to 60 to keep the prompt small
+        phrases_list = list(dict.fromkeys(str(p) for p in phrases))[:60]
+        prompt = (
+            'You are analysing a list of repeated phrases mined from a '
+            'data-dictionary corpus.\n\n'
+            f'Group context: {group_name or "(unknown)"}\n\n'
+            'Repeated phrases (verbatim from the corpus):\n'
+            + '\n'.join(f'  - {p}' for p in phrases_list) + '\n\n'
+            'Sample variable descriptions for context:\n'
+            + '\n'.join(f'  - {str(s)[:180]}' for s in sample_descs[:4]) + '\n\n'
+            'TASK: Group these phrases into 2–5 SEMANTIC ROLES. Each role '
+            'represents one ORTHOGONAL DIMENSION of what the variable measures '
+            '(for example: what is measured, what statistic is used, under '
+            'what condition).\n\n'
+            'STRICT RULES:\n'
+            '1. Use 2 to 5 roles. Fewer is better when phrases do not really '
+            'belong to different dimensions.\n'
+            '2. Role names: ONE OR TWO generic English words, lowercase. '
+            'Examples of good role names: "measure", "statistic", "condition", '
+            '"outcome", "subtype", "modifier". Do not invent fancy or domain-'
+            'specific role names.\n'
+            '3. Every phrase from the input list MUST appear EXACTLY ONCE in '
+            'exactly one role. Copy phrases VERBATIM. Do not rephrase, '
+            'normalise, plural-strip, or invent new phrases.\n'
+            '4. Phrases that are alternatives (rarely co-occur in the same '
+            'variable) should go in the SAME role.\n'
+            '5. Phrases that describe DIFFERENT dimensions of the same '
+            'variable should go in DIFFERENT roles.\n'
+            '6. Output strict JSON only — no prose, no markdown fences.\n\n'
+            'Output schema:\n'
+            '{\n'
+            '  "roles": {\n'
+            '    "role_name_1": ["phrase a", "phrase b", ...],\n'
+            '    "role_name_2": ["phrase c", "phrase d", ...]\n'
+            '  }\n'
+            '}'
+        )
+        try:
+            resp = _safe_chat_completion(client, model, prompt,
+                                           max_tokens=2000, temperature=0.1)
+            raw = (resp.choices[0].message.content or '').strip()
+            meta['raw'] = raw[:400]  # first 400 chars of the reply, kept for auditing
+            result = _parse_json_response(raw)
+            roles_raw = result.get('roles', {}) or {}
+            if not isinstance(roles_raw, dict):
+                meta['reason'] = 'roles_not_dict'
+                return None, meta
+
+            # ── Validation ────────────────────────────────────────────────────
+            input_set = {p.lower(): p for p in phrases_list}  # lowercase → original spelling
+            roles_clean: dict = {}
+            seen_phrases: set = set()  # a phrase stays with the first role that lists it
+            for role_name, items in roles_raw.items():
+                # Role name must be non-empty, at most 2 words and 24 characters
+                rn = str(role_name).strip().lower()
+                if not rn or len(rn.split()) > 2 or len(rn) > 24:
+                    continue
+                kept: list = []
+                if not isinstance(items, list):
+                    continue
+                for it in items:
+                    s = str(it).strip().lower()
+                    if not s or s in seen_phrases:
+                        continue
+                    if s not in input_set:
+                        # Phrase invented by LLM — drop (anti-hallucination)
+                        continue
+                    kept.append(input_set[s])   # original casing
+                    seen_phrases.add(s)
+                if len(kept) >= 2:  # roles left with fewer than 2 valid phrases are discarded
+                    roles_clean[rn] = kept
+            if len(roles_clean) < 2:  # fewer than two usable roles → report failure
+                meta['reason'] = 'too_few_valid_roles'
+                return None, meta
+            meta['reason']     = 'accepted'
+            meta['role_count'] = len(roles_clean)
+            return roles_clean, meta
+        except Exception as e:  # any failure inside the try → (None, reason)
+            meta['reason'] = f'exception: {type(e).__name__}'
+            return None, meta
+
+    return _classify
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 8b–d  — NODE LABELING  [ZHU §4.3 / TopicTag]
+# ──────────────────────────────────────────────────────────────────────────────
+# Generic metadata field-name boilerplate that appears in essentially any
+# data-dictionary header (description, value, name, ...).  Dataset-specific
+# column tokens (e.g. 'fullDisplayName' for HCP, 'Decimal Places' for AI-MIND)
+# are added at runtime by build_field_noise() — derived from the user-confirmed
+# column names, never enumerated by hand.
+_GENERIC_FIELD_NOISE = frozenset({
+    'description', 'definition', 'value', 'metadata', 'desc',
+    'name', 'item', 'variable', 'field', 'attribute',
+    'code', 'type', 'dtype',
+})
+
+def build_field_noise(configs: dict) -> frozenset:
+    """
+    Build the field-name boilerplate set entirely from the user-confirmed
+    column roles.  Each column name is split on non-alphanumeric characters
+    and its lowercased tokens of length ≥ 2 are added to the generic set.
+
+    A role value is normally an iterable of column names.  A bare string is
+    treated as ONE column name (iterating it would yield single characters,
+    which the length filter discards), and None is skipped.
+
+    Examples (derived, NOT hardcoded):
+      HCP   'fullDisplayName' → {fulldisplayname}
+      AI-MIND 'Decimal Places' → {decimal, places}
+    """
+    noise = set(_GENERIC_FIELD_NOISE)
+    for cfg in (configs or {}).values():
+        for cols in cfg.values():
+            if cols is None or isinstance(cols, str):
+                cols = [cols] if cols else []
+            for col in cols:
+                tokens = ''.join(c if c.isalnum() else ' ' for c in str(col)).split()
+                noise.update(t.lower() for t in tokens if len(t) >= 2)
+    return frozenset(noise)
+
+# Module-level noise set that _bigram_preferred_terms looks up at call time.  Starts as
+# the generic set; per the original note, the Streamlit pipeline below replaces it.
+FIELD_NAME_NOISE: frozenset = _GENERIC_FIELD_NOISE
+
+def _extract_common_prefix_phrase(cluster_texts: list,
+                                    min_coverage: float = 0.6) -> str:
+    """
+    Many data dictionaries write 'Concept Name: definition...' in the
+    description.  If at least `min_coverage` of ALL cluster members share a
+    concept-name prefix (2–6 words), that prefix IS the concept label.
+    Coverage is measured against len(cluster_texts) — not only against rows
+    that yielded a parsable prefix — so one lucky row cannot name the cluster.
+    Fully data-driven: works on any 'name: definition' dictionary.
+
+    Returns a Title-cased phrase, or '' if no shared prefix is strong enough.
+    """
+    # _text is "col1: val1 | col2: val2 | ...".  Use the first description-like
+    # field of each row and take its prefix before the inner colon.
+    prefixes = []
+    for t in cluster_texts:
+        for chunk in str(t).split(' | '):
+            key, sep, val = chunk.partition(':')
+            if not sep:
+                continue
+            key_l = key.lower()
+            if 'descrip' in key_l or 'def' in key_l or 'full' in key_l:
+                tokens = val.split(':')[0].split()
+                if 2 <= len(tokens) <= 6:
+                    prefixes.append(tokens)
+                break
+    if not prefixes:
+        return ''
+
+    total   = len(cluster_texts)
+    max_len = max(len(p) for p in prefixes)
+    for length in range(min(6, max_len), 1, -1):
+        starts = Counter(tuple(p[:length]) for p in prefixes if len(p) >= length)
+        if not starts:
+            continue
+        top, cnt = starts.most_common(1)[0]
+        if cnt / total >= min_coverage:
+            return ' '.join(top).title()
+    return ''
+
+def _bigram_preferred_terms(diff: np.ndarray, terms: np.ndarray,
+                             boilerplate: set, prefix_lower: str,
+                             n_terms: int) -> list:
+    """
+    Pick top-n discriminative terms, preferring bigrams and removing redundancy.
+
+    Rules (terms are visited in descending `diff` order):
+    1. Skip short tokens (<3 chars), boilerplate, field-name noise, and any
+       term already in the prefix as whole word(s) ('age' is not in 'language').
+    2. When a bigram is selected, drop any previously-picked unigram that is
+       a substring of it.
+    3. Skip unigrams that are substrings of any already-picked bigram.
+    """
+    order  = np.argsort(diff)[::-1]
+    words_only = ''.join(c if c.isalnum() else ' ' for c in prefix_lower)
+    padded_prefix = f" {' '.join(words_only.split())} "   # word-bounded Rule 1
+    picked, picked_lower = [], []
+    for i in order:
+        t  = terms[i]; tl = t.lower()
+        if (len(t) < 3 or tl in boilerplate or tl in FIELD_NAME_NOISE
+                or f' {tl} ' in padded_prefix):
+            continue
+        is_unigram = ' ' not in t
+        if is_unigram and any(tl in pl for pl in picked_lower if ' ' in pl):
+            continue    # Rule 3: already covered by a picked bigram
+        if not is_unigram:   # Rule 2: replace picked unigrams it subsumes
+            keep = [(p, pl) for p, pl in zip(picked, picked_lower)
+                    if not (' ' not in pl and pl in tl)]
+            picked, picked_lower = [k[0] for k in keep], [k[1] for k in keep]
+        picked.append(t); picked_lower.append(tl)
+        if len(picked) >= n_terms:
+            break
+    return picked
+
+def label_cluster(cluster_texts: list, all_texts: list,
+                  tfidf: TfidfVectorizer, n_terms: int = 3,
+                  cluster_groups: Optional[list] = None,
+                  parent_path: str = '',
+                  llm_label_fn: Optional[Callable] = None,
+                  return_provenance: bool = False):
+    """
+    Label a cluster node: four deterministic stages plus an optional LLM pass.
+
+    A) Description-prefix candidate: extract concept phrase from 'Name: def...'
+       pattern shared by ≥60% of cluster members.
+    B) Group-purity prefix: if ≥70% share a top-level _group value, prepend it.
+    C) Boilerplate filter: drop terms with TF-IDF IDF ≤ 1.7 plus FIELD_NAME_NOISE.
+    D) Bigram-preferred discriminative suffix [ZHU §4.3].
+    E) Optional constrained LLM refinement [TopicTag, DocEng 2024].
+
+    With return_provenance=True returns (label, provenance_dict); provenance
+    records the producing stage (description_prefix / tfidf_bigram /
+    group_anchor / llm / fallback), evidence terms, the LLM confidence and
+    grounding reason when used, and `tfidf_error` if stage D raised.
+    """
+    prov: dict = {'label_source': 'fallback',
+                  'evidence_terms': [],
+                  'confidence': 1.0,
+                  'llm_used': False,
+                  'llm_rejected': False}
+
+    if not cluster_texts:
+        return ('Group', prov) if return_provenance else 'Group'
+
+    terms    = np.array(tfidf.get_feature_names_out())
+    idf_vals = tfidf.idf_
+    boilerplate = set(terms[idf_vals <= 1.7].tolist())
+
+    # ── B) group-purity prefix ───────────────────────────────────────────────
+    prefix = ''
+    if cluster_groups:
+        top_lvl = [str(g).split(' > ')[0].strip() for g in cluster_groups]
+        top_grp, cnt = Counter(top_lvl).most_common(1)[0]
+        if cnt / len(top_lvl) >= 0.70 and top_grp not in ('', 'Ungrouped'):
+            prefix = top_grp
+
+    # ── A) description-prefix candidate phrase ───────────────────────────────
+    phrase = _extract_common_prefix_phrase(cluster_texts)
+    if phrase and prefix and phrase.lower().startswith(prefix.lower()):
+        rest = phrase[len(prefix):]
+        if not rest[:1].isalnum():   # cut only at a word boundary, so prefix
+            phrase = rest.strip()    # 'Study' leaves 'Studying Habits' intact
+
+    # ── D) bigram-preferred discriminative terms ─────────────────────────────
+    top_terms_raw, words = [], []
+    try:
+        X_all  = tfidf.transform(all_texts).toarray()
+        X_clus = tfidf.transform(cluster_texts).toarray()
+        diff   = X_clus.mean(axis=0) - X_all.mean(axis=0)
+        top_terms_raw = [terms[i] for i in np.argsort(diff)[::-1][:20]]
+        words = _bigram_preferred_terms(diff, terms, boilerplate,
+                                          prefix.lower(), n_terms)
+    except Exception as exc:
+        prov['tfidf_error'] = type(exc).__name__   # best-effort stage: keep going
+
+    # ── compose deterministic candidate (records which stage produced it) ────
+    if phrase:
+        candidate = f'{prefix} — {phrase}' if prefix else phrase
+        prov['label_source']   = 'description_prefix'
+        prov['evidence_terms'] = [phrase] + ([prefix] if prefix else [])
+    elif words:
+        suffix    = ' / '.join(w.title() for w in words)
+        candidate = f'{prefix} — {suffix}' if prefix else suffix
+        prov['label_source']   = 'tfidf_bigram'
+        prov['evidence_terms'] = list(words) + ([prefix] if prefix else [])
+    elif prefix:
+        candidate = prefix
+        prov['label_source']   = 'group_anchor'
+        prov['evidence_terms'] = [prefix]
+    else:
+        candidate = 'Group'
+
+    # ── E) optional constrained LLM refinement [TopicTag] ────────────────────
+    # LLM is used ONLY as a re-phraser of evidence already present in the
+    # cluster.  Refinement is rejected if the returned label is not grounded
+    # in the evidence terms — keeping the user's contract that "labels should
+    # come from the csv itself".
+    if llm_label_fn and candidate != 'Group':
+        prov['llm_used'] = True
+        try:
+            refined, llm_meta = llm_label_fn(
+                candidate, top_terms_raw[:10],
+                parent_path, cluster_texts[:4])
+            # Always record the raw LLM proposal and the grounding-check reason
+            # — even when rejected — so the Provenance tab can show "what did
+            # the LLM suggest and why was it dropped?"
+            prov['llm_raw_label'] = llm_meta.get('raw_label', '')
+            prov['llm_reason']    = llm_meta.get('reason', '')
+            if refined and refined != candidate:
+                candidate = refined
+                prov['label_source']   = 'llm'
+                prov['confidence']     = llm_meta.get('confidence', 0.0)
+                prov['evidence_terms'] = llm_meta.get('evidence_terms',
+                                                       prov['evidence_terms'])
+            else:
+                prov['llm_rejected'] = True
+        except Exception as _e:
+            prov['llm_rejected'] = True
+            prov['llm_reason']   = f'exception: {type(_e).__name__}'
+
+    return (candidate, prov) if return_provenance else candidate
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 5b  — PHRASE-SLOT MINING  (data-driven IE-style slot induction)
+#
+# Information-extraction adaptation for structured metadata dictionaries.
+# Many data dictionaries (e.g. AI-MIND cognitive tests, HCP Study Completion)
+# express each variable as a regular phrase combining several semantic
+# dimensions (measure type, statistic, condition, etc.).  Document-level
+# methods (NMF, BERTopic, CTM) cannot separate these because all dimensions
+# collapse into one vector.  Phrase-slot mining decomposes each description
+# into multiple phrases, identifies which phrases are mutually-exclusive
+# alternatives across the corpus, and uses each alternative-set as a slot.
+#
+# No domain hardcoding: slot phrases are discovered from n-gram co-occurrence
+# patterns in the actual descriptions.  Slot names are derived from the
+# phrases themselves; if structure is too weak, the algorithm reports
+# `valid=False` and the caller falls back to NMF.
+# ──────────────────────────────────────────────────────────────────────────────
+# ── Generic dictionary markers stripped before slot mining ────────────────────
+# These are universal data-dictionary conventions (`KEY: ...`, `Note: ...`,
+# `Question: ...`) — not domain knowledge.  Without stripping, they become
+# false slot phrases (e.g. AI-MIND output contained `Key` nodes promoted from
+# `KEY: DMS Percent Correct ...`).
+_LEADING_MARKERS = frozenset((
+    'key', 'note', 'notes', 'definition', 'description', 'desc',
+    'question', 'q', 'item', 'value', 'meaning', 'label',
+))
+
+def _strip_leading_markers(text: str) -> str:
+    """
+    Peel up to four stacked dictionary markers ('KEY:', 'Note:', ...) off the
+    front of `text`; text without a leading marker is returned unchanged.
+    """
+    remaining = text
+    for _attempt in range(4):   # hard cap on how many stacked markers are peeled
+        marker, colon, tail = remaining.partition(':')
+        if not colon or marker.strip().lower() not in _LEADING_MARKERS:
+            break
+        remaining = tail.strip()
+    return remaining
+
+def _split_concept_and_body(text: str) -> tuple:
+    """
+    Break a data-dictionary description into (concept, body).
+
+    The usual layout is
+        <short concept phrase> : <longer explanatory definition>
+    e.g.  'DMS Correct Latency Standard Deviation (SD) (0 second delay):
+           The standard deviation of response latencies for trials ...'
+    where the concept phrase is the clean label and the body is explanatory
+    prose full of boilerplate ('the number of times', ...).
+
+    Leading markers ('KEY:', 'Note:') are removed first.  The text before
+    the first remaining colon is the concept only if it is 1–14 words long;
+    everything after that colon is the body.  Both parts are trimmed.
+
+    Without a colon, or with a head outside that word range, returns
+    (marker-stripped text, '').  Generic — no domain knowledge.
+    """
+    cleaned = _strip_leading_markers(str(text).strip())
+    concept, colon, body = cleaned.partition(':')
+    concept = concept.strip()
+    word_count = len(concept.split())
+    # A genuine concept name is phrase-length; an empty or sentence-length
+    # head means there is no usable concept/body boundary.
+    has_concept = bool(colon) and 1 <= word_count <= 14
+    if not has_concept:
+        return cleaned, ''
+    return concept, body.strip()
+
+# Generic data-dictionary definition-prose openers.  Role values starting with
+# these are explanatory fragments, not concept labels — reject them.  Generic
+# English, not domain vocabulary.
+_BOILERPLATE_VALUE_PREFIXES = (
+    'the number of', 'number of', 'the subject', 'a subject', 'the participant',
+    'a participant', 'this measure', 'this variable', 'calculated across',
+    'calculated as', 'the percentage of', 'the proportion of', 'the total number',
+    'the mean of', 'the median of', 'the standard deviation of', 'the amount of',
+    'the time', 'the length of', 'expressed as', 'defined as', 'measured as',
+)
+
+def _is_boilerplate_value(val: str) -> bool:
+    """
+    Decide whether a role value reads like definition prose, not a label.
+
+    True for blank values, values opening with a generic definition phrase,
+    and values longer than six words.
+    """
+    text = str(val).strip().lower()
+    opens_like_prose = text.startswith(_BOILERPLATE_VALUE_PREFIXES)
+    too_long = len(text.split()) > 6   # sentence-length → a definition
+    return not text or opens_like_prose or too_long
+
+# ── English stopwords for phrase-quality filtering ────────────────────────────
+# Reject phrases like 'and', 'them', 'to be', 'have a lot of' from becoming
+# slot nodes.  This is general English filtering, not domain knowledge.
+_STOPWORDS = frozenset(
+    'a an the and or but of in on at by for with about into during '
+    'through over under above below from to as is are was were be being '
+    'been have has had do does did this that these those they them their '
+    'it its he she his her you your we our i my me us not no nor '
+    'how often when where which who why what '
+    'lot lots much many more most some any all none '
+    'very also too just so such only even still even also '
+    'one two three first second '   # 'second' as standalone — careful: kept in n-grams via context
+    .split()
+)
+
+def _phrase_is_meaningful(phrase: str, group_name_lower: str = '') -> bool:
+    """
+    Gatekeeper for phrases that may become hierarchy nodes.
+
+    After trimming and lowercasing, a phrase is rejected when it
+      • is empty,
+      • equals the parent group name ('DMS' inside DMS branch),
+      • is a bare dictionary marker ('key', 'note'),
+      • starts or ends with a stopword ('the response time'),
+      • has no content token, i.e. no non-stopword of length ≥ 2 ('to be'),
+      • is a single all-digit token.
+    Every rule is generic English filtering — no domain knowledge.
+    """
+    cleaned = phrase.strip().lower()
+    # Whole-phrase rejections: blank, the group's own name, a marker word.
+    if not cleaned:
+        return False
+    if cleaned == group_name_lower or cleaned in _LEADING_MARKERS:
+        return False
+
+    words = cleaned.split()   # non-empty: `cleaned` has a non-space character
+    first, last = words[0], words[-1]
+    # Boundary stopwords
+    if first in _STOPWORDS or last in _STOPWORDS:
+        return False
+    # At least one content token is required
+    has_content = any(len(w) >= 2 and w not in _STOPWORDS for w in words)
+    if not has_content:
+        return False
+    # A lone token must be a word, not a bare number
+    lone_number = len(words) == 1 and first.isdigit()
+    return not lone_number
+
+# ──────────────────────────────────────────────────────────────────────────────
+# OPTION D — SBERT PHRASE CLUSTERING + CONSTRAINED LLM ROLE NAMING
+#
+# Following the EDC pattern (Zhang & Soh, EMNLP 2024) and ZOES (arXiv
+# 2506.04458, 2025):
+#   Extract   → mine repeated phrases from each variable description
+#   Define    → semantically cluster phrases via SBERT + AgglomerativeClustering
+#   Canonicalize → constrained-vocabulary LLM names each cluster with a role
+#                  (measure / statistic / condition / subtype / outcome / modifier)
+#
+# Anti-hallucination contract:
+#   • Phrases are extracted verbatim from the corpus, never invented
+#   • LLM only NAMES discovered clusters — cannot move phrases
+#   • Role names are constrained to a fixed generic English vocabulary
+#     (configurable via constrained=True/False)
+#   • Anonymous fallback when LLM is unavailable or invalid (cluster_0, ...)
+#
+# Different from Approach 1: Approach 1 embeds the WHOLE description as one
+# vector; this embeds each EXTRACTED PHRASE as its own vector.  Approach 1
+# clusters variables; this clusters phrases.  Approach 1 produces a single
+# similarity tree; this produces a multi-role decomposition + nested tree.
+# ──────────────────────────────────────────────────────────────────────────────
+_ALLOWED_ROLES_CONSTRAINED = frozenset({   # closed vocabulary checked in constrained mode
+    'measure', 'statistic', 'condition', 'subtype', 'outcome', 'modifier',
+})
+
+# Generic role priority for hierarchy nesting (no domain knowledge).
+# Higher priority = outermost (closest to root) level inside the group.
+_ROLE_PRIORITY = {   # one entry per role in _ALLOWED_ROLES_CONSTRAINED
+    'measure':   6,
+    'outcome':   5,
+    'statistic': 4,
+    'condition': 3,
+    'modifier':  2,
+    'subtype':   1,
+}
+
+def make_llm_role_namer_fn(base_url: str = OLLAMA_URL_DEFAULT,
+                            model: str = OLLAMA_MODEL_DEFAULT,
+                            constrained: bool = True,
+                            provider: str = 'ollama',
+                            api_key: str = '') -> Optional[Callable]:
+    """
+    Returns a callable (phrases, sample_descs=None) → (role_name | None, meta), or
+    None when no LLM client is built.  Works with Ollama (local) or Groq (cloud).
+    """
+    client = _make_llm_client(provider, base_url, api_key)
+    if client is None:
+        return None
+
+    def _name_cluster(phrases: list, sample_descs: list = None):
+        meta = {'reason': '', 'raw': '', 'constrained': constrained}  # returned with every outcome
+        if not phrases:
+            meta['reason'] = 'empty_phrases'
+            return None, meta
+        sample_descs = sample_descs or []  # tolerate the None default
+
+        head = (
+            'Below is a cluster of semantically related phrases mined from a '
+            'data-dictionary corpus. The phrases were grouped by sentence-'
+            'transformer embedding similarity.\n\n'
+            'Phrases in cluster:\n'
+            + '\n'.join(f'  - {p}' for p in phrases[:20]) + '\n\n'
+            'Sample variable descriptions for context:\n'
+            + '\n'.join(f'  - {str(s)[:160]}' for s in sample_descs[:3]) + '\n\n'
+        )
+        if constrained:
+            prompt = head + (
+                'Choose EXACTLY ONE semantic role from this fixed list that '
+                'best names what these phrases share. Pick the SINGLE best fit:\n'
+                '  - measure   (the base concept being measured)\n'
+                '  - statistic (mean, median, standard deviation, total, ...)\n'
+                '  - condition (when/where/under what circumstances)\n'
+                '  - outcome   (result/output type)\n'
+                '  - subtype   (a specific kind of measure)\n'
+                '  - modifier  (a qualifier)\n\n'
+                'Output strict JSON only — no markdown:\n'
+                '{"role": "measure"}'
+            )
+        else:
+            prompt = head + (
+                'Propose ONE generic 1–2 word lowercase English noun that names '
+                'the semantic role these phrases share. Examples of good roles: '
+                'measure, statistic, condition, subtype, location, time, '
+                'identifier, frequency, severity, quality.\n\n'
+                'Output strict JSON only:\n'
+                '{"role": "..."}'
+            )
+
+        try:
+            # max_tokens must cover reasoning-model <think> traces (Qwen3)
+            # before the tiny JSON answer appears.
+            resp = _safe_chat_completion(client, model, prompt,
+                                           max_tokens=800, temperature=0.1)
+            raw = (resp.choices[0].message.content or '').strip()
+            meta['raw'] = raw[:200]  # first 200 chars of the reply, kept for auditing
+            result = _parse_json_response(raw)
+            role = str(result.get('role', '')).strip().lower()
+            if not role:
+                meta['reason'] = 'empty_role'
+                return None, meta
+            if constrained:  # closed vocabulary: role must be one of the allowed names
+                if role not in _ALLOWED_ROLES_CONSTRAINED:
+                    meta['reason'] = f'role_not_allowed: {role}'
+                    return None, meta
+            else:  # open vocabulary: at most 2 lowercase alphabetic words, ≤ 24 chars
+                toks = role.split()
+                if len(toks) > 2 or len(role) > 24:
+                    meta['reason'] = 'role_too_long'
+                    return None, meta
+                if not all(t.isalpha() and t.islower() for t in toks):
+                    meta['reason'] = 'role_not_alpha_lower'
+                    return None, meta
+            meta['reason'] = 'accepted'
+            return role, meta
+        except Exception as e:  # any failure inside the try → (None, reason)
+            meta['reason'] = f'exception: {type(e).__name__}'
+            return None, meta
+
+    return _name_cluster
+
+def _extract_phrases_for_role_clustering(texts: list,
+                                           text_col_names: Optional[list],
+                                           group_name: str,
+                                           min_phrase_count: int = 2
+                                           ) -> tuple:
+    """
+    Shared helper: extract repeated meaningful phrases from a group of
+    descriptions. Returns (phrases_list, per_row_phrase_sets, regularity).
+    Regularity = fraction of rows that contain ≥2 mined phrases.
+
+    texts            rows formatted as "col1: val1 | col2: val2 | ...".
+    text_col_names   columns allowed to supply the concept prefix; when
+                     falsy, the first 'key: value' chunk of a row is used.
+    group_name       a phrase equal to it (case-insensitive) is not returned.
+    min_phrase_count minimum number of rows a phrase must occur in.
+    """
+    text_keys = ({c.strip().lower() for c in text_col_names}
+                 if text_col_names else None)
+    n = len(texts)
+
+    # 1. extract description-prefix per row
+    prefixes = []
+    for t in texts:
+        prefix = ''
+        for chunk in str(t).split(' | '):
+            key, sep, val = chunk.partition(':')
+            if not sep:
+                continue
+            if text_keys is not None and key.strip().lower() not in text_keys:
+                continue
+            # Markers ('KEY:', 'Note:') must be stripped BEFORE cutting at the
+            # inner colon — afterwards no colon is left for the stripper to
+            # find, and the marker itself would become the row's prefix.
+            prefix = _strip_leading_markers(val.strip()).split(':')[0].strip()
+            break
+        if not prefix:
+            prefix = _strip_leading_markers(str(t).strip())
+        prefixes.append(prefix)
+
+    # 2. normalise: lowercase, non-alphanumerics → space, collapse whitespace
+    norm = [' '.join(''.join(ch if ch.isalnum() or ch == ' ' else ' '
+                             for ch in str(p).lower()).split())
+            for p in prefixes]
+
+    # 3. mine n-grams 1..4 ≥3 chars
+    row_phrases: list = []
+    phrase_count: Counter = Counter()
+    for text in norm:
+        tokens = text.split()
+        grams = (' '.join(tokens[i:i + size])
+                 for size in range(1, 5)
+                 for i in range(len(tokens) - size + 1))
+        row_set = {g for g in grams if len(g) >= 3}
+        row_phrases.append(row_set)
+        phrase_count.update(row_set)
+
+    # 4. keep repeated, retain maximal per row
+    keep = {p for p, c in phrase_count.items() if c >= min_phrase_count}
+    refined = []
+    for row in row_phrases:
+        kept: list = []
+        for p in sorted(row & keep, key=lambda ph: -len(ph.split())):
+            # Whole-word containment: 'rate' is not part of 'accurate score'.
+            if not any(p != q and f' {p} ' in f' {q} ' for q in kept):
+                kept.append(p)
+        refined.append(set(kept))
+
+    # Recount on the maximal phrases only.
+    phrase_count = Counter(p for r in refined for p in r)
+
+    group_lower = (group_name or '').strip().lower()
+    phrases = sorted(p for p, c in phrase_count.items()
+                     if c >= min_phrase_count
+                     and _phrase_is_meaningful(p, group_lower))
+
+    regularity = sum(1 for r in refined if len(r) >= 2) / max(1, n)
+    return phrases, refined, round(regularity, 4)
+
+# ──────────────────────────────────────────────────────────────────────────────
+# PER-ROW LLM ROLE EXTRACTOR  [Zhu et al. EMNLP 2025 — proper implementation]
+#
+# For each variable description, ONE LLM call extracts role values directly:
+#   measure   = base concept being measured (e.g. "Correct Latency")
+#   statistic = statistical aggregation (e.g. "Standard Deviation")
+#   condition = experimental condition (e.g. "0 second delay")
+#   subtype   = specific error / outcome subtype (e.g. "Incorrect Colour")
+#
+# Anti-hallucination contract:
+#   • Each returned value MUST be a literal substring of the input description
+#     (grounding check rejects anything else).
+#   • LLM cannot invent new roles — only the four canonical roles are output.
+#   • Empty string is a valid output ("this row has no statistic" etc.).
+#
+# Cost: ONE call per variable.  AI-MIND ≈ 108 calls, HCP ≈ 813.
+# ──────────────────────────────────────────────────────────────────────────────
+# Canonical role keys.  The extractor validates roles in this order and the
+# per-row discovery route iterates the same tuple when selecting final roles.
+_PER_ROW_ROLES = ('measure', 'statistic', 'condition', 'subtype')
+
+def make_per_row_role_extractor_fn(base_url: str = OLLAMA_URL_DEFAULT,
+                                     model: str = OLLAMA_MODEL_DEFAULT,
+                                     provider: str = 'ollama',
+                                     api_key: str = '') -> Optional[Callable]:
+    """
+    Build the per-row LLM role extractor.
+
+    Implements the Zhu et al. (2025, EMNLP) multi-aspect encoding pattern:
+    instead of clustering phrases and naming clusters, ask the LLM to extract
+    each role value directly from each row's description.
+
+    Args:
+        base_url, provider, api_key: forwarded to `_make_llm_client`.
+        model: forwarded to `_safe_chat_completion` on every call.
+
+    Returns:
+        None when `_make_llm_client` yields no client; otherwise a callable
+
+            _extract(concept, sample_concepts=None, full_text=None)
+                -> (roles_dict, meta)
+
+        `roles_dict` maps a subset of `_PER_ROW_ROLES` to accepted values.
+        `meta` always carries 'reason'; after a completed LLM call it also
+        carries 'raw' (first 300 chars of the reply), 'rejected'
+        (list of (role, value, missing_stems)), 'n_extracted', 'n_rejected'.
+
+    Grounding (anti-hallucination): a value is accepted when it is a
+    case-insensitive substring of the grounding text, or when every one of
+    its content-token stems occurs in the grounding text.
+    """
+    client = _make_llm_client(provider, base_url, api_key)
+    if client is None:
+        return None
+
+    def _extract(concept: str, sample_concepts: list = None,
+                 full_text: str = None):
+        meta = {'reason': '', 'raw': ''}
+        if not concept or len(str(concept).strip()) < 3:
+            return {}, {'reason': 'empty_description'}
+
+        # `concept` = the clean concept-name phrase (preferred extraction source)
+        # `full_text` = concept + definition body (used for subtype + grounding)
+        concept = str(concept)[:300]
+        full    = str(full_text or concept)[:600]
+        sample_concepts = sample_concepts or []
+        prompt = (
+            'You are extracting semantic role values from one variable in a '
+            'data dictionary.\n\n'
+            f'CONCEPT NAME (use this for measure / statistic / condition):\n'
+            f'  {concept}\n\n'
+            f'FULL DEFINITION (use ONLY for subtype, and only if needed):\n'
+            f'  {full}\n\n'
+            'Similar concept names in the same group (context):\n'
+            + '\n'.join(f'  - {str(s)[:120]}' for s in sample_concepts[:3]) + '\n\n'
+            'TASK: Extract values for these four semantic roles. Take '
+            'measure, statistic and condition from the CONCEPT NAME wherever '
+            'possible. Every value MUST be a verbatim substring of the CONCEPT '
+            'NAME or FULL DEFINITION. Empty string if a role does not apply.\n\n'
+            '  measure   = the base quantity being measured\n'
+            '              (e.g. "reaction time", "accuracy", "score")\n'
+            '  statistic = a statistical aggregation operator\n'
+            '              (e.g. "mean", "median", "standard deviation", "total")\n'
+            '  condition = an experimental condition or scope\n'
+            '              (e.g. "baseline", "follow-up", "task condition")\n'
+            '  subtype   = a specific subtype / kind / error type\n'
+            '              (e.g. "error type", "response type", "trial type")\n\n'
+            'STRICT RULES:\n'
+            '1. Each value COPIED VERBATIM — do not invent, summarise, paraphrase.\n'
+            '2. Prefer short concept phrases over long definition fragments.\n'
+            '3. Do NOT return a value that is a sentence or starts with "the '
+            'number of", "the subject", "calculated across" — those are '
+            'definition prose, not labels.\n'
+            '4. Empty string "" for roles that do not apply.\n'
+            '5. Output strict JSON only:\n\n'
+            '{"measure": "...", "statistic": "...", "condition": "...", "subtype": ""}'
+        )
+
+        try:
+            resp = _safe_chat_completion(client, model, prompt,
+                                           max_tokens=1500, temperature=0.1)
+            raw = (resp.choices[0].message.content or '').strip()
+            meta['raw'] = raw[:300]
+            result = _parse_json_response(raw)
+        except Exception as e:
+            meta['reason'] = f'exception: {type(e).__name__}: {str(e)[:80]}'
+            return {}, meta
+
+        # A reply that did not parse to a JSON object (None, list, string…)
+        # cannot be validated role by role.  Report it explicitly instead of
+        # letting `.get` raise below, which would discard the raw reply that
+        # the audit trail needs.
+        if not isinstance(result, dict):
+            meta['reason'] = 'invalid_json'
+            return {}, meta
+
+        # Grounding is checked against the FULL text (concept + body) so that
+        # subtype values living in the definition body still pass.
+        ground_lower  = full.lower()
+        ground_tokens = [w.strip(',.()[]{}"\'') for w in ground_lower.split()]
+        ground_stems  = {_light_stem(w) for w in ground_tokens
+                          if len(w) >= 3 and w not in _STOPWORDS}
+
+        roles: dict = {}
+        rejected: list = []
+        for role in _PER_ROW_ROLES:
+            val = result.get(role, '')
+            if not isinstance(val, str):
+                continue
+            val_clean = val.strip().strip('"').strip("'")
+            if not val_clean:
+                continue
+            # P2: reject definition-prose fragments before grounding
+            if _is_boilerplate_value(val_clean):
+                rejected.append((role, val_clean, ['boilerplate']))
+                continue
+            # Strict substring (cheapest, most common)
+            if val_clean.lower() in ground_lower:
+                roles[role] = val_clean
+                continue
+            # Token-stem grounding fallback (morphological variants)
+            val_tokens = [w.strip(',.()[]{}"\'') for w in val_clean.lower().split()]
+            val_stems  = {_light_stem(w) for w in val_tokens
+                           if len(w) >= 3 and w not in _STOPWORDS}
+            if val_stems and val_stems.issubset(ground_stems):
+                roles[role] = val_clean
+            else:
+                missing = sorted(val_stems - ground_stems) if val_stems else ['(no content tokens)']
+                rejected.append((role, val_clean, missing))
+
+        meta['reason']   = 'accepted' if roles else 'all_rejected'
+        meta['rejected'] = rejected
+        meta['n_extracted'] = len(roles)
+        meta['n_rejected']  = len(rejected)
+        return roles, meta
+
+    return _extract
+
+def discover_roles_via_per_row_extraction(
+        texts: list,
+        text_col_names: Optional[list],
+        per_row_extractor_fn: Callable,
+        group_name: str = '',
+        regularity_threshold: float = 0.40) -> dict:
+    """
+    Per-row role discovery (Zhu et al., EMNLP 2025 style).
+
+    Each row's description is split into a concept name and a definition
+    body, then `per_row_extractor_fn` is called once per row to obtain
+    measure / statistic / condition / subtype values.
+
+    A role survives only if it has at least two distinct values across the
+    group.  Coverage (reported as regularity too) is the share of rows with
+    a value for at least one surviving role.  The result is valid when two
+    or more roles survive and coverage reaches `regularity_threshold`.
+
+    The returned dict has the same shape as the one produced by
+    discover_roles_via_sbert_phrase_clustering(), so it can be handed to
+    build_role_hierarchy().
+    """
+    n = len(texts)
+    if n < 4:
+        return {'valid': False, 'reason': 'too_few_rows',
+                'regularity': 0.0, 'roles': {}, 'row_assignments': []}
+
+    # None means "no column filter": the first 'key: value' chunk wins.
+    if text_col_names:
+        text_keys = {c.strip().lower() for c in text_col_names}
+    else:
+        text_keys = None
+
+    def _row_desc_value(t: str) -> str:
+        """Value of the first accepted 'key: value' chunk, else the whole text."""
+        whole = str(t)
+        for chunk in whole.split(' | '):
+            key, colon, val = chunk.partition(':')
+            if not colon:
+                continue
+            if text_keys is None or key.strip().lower() in text_keys:
+                return val.strip()
+        return whole.strip()
+
+    # Concept name drives measure/statistic/condition; concept + body is the
+    # text used for subtype and for the grounding check.
+    row_concepts: list = []
+    row_fulls:    list = []
+    for t in texts:
+        concept, body = _split_concept_and_body(_row_desc_value(t))
+        row_concepts.append(concept)
+        row_fulls.append(' '.join((concept, body)).strip() if body else concept)
+
+    sample_descs = row_concepts[:3]
+
+    # Best-effort Streamlit progress bar; silently absent outside Streamlit.
+    try:
+        import streamlit as _st_p
+        pbar = _st_p.progress(0.0, text=f'Extracting roles for "{group_name}"…')
+    except Exception:
+        pbar = None
+
+    per_row_roles: list = []
+    per_row_audit: list = []   # proposed + rejected values, one entry per row
+    values_by_role: dict = {}
+    reason_counts: dict = {}
+
+    for i, (concept, full) in enumerate(zip(row_concepts, row_fulls)):
+        if pbar is not None:
+            try:
+                pbar.progress((i + 1) / max(1, n),
+                               text=f'[{group_name}] row {i+1}/{n}')
+            except Exception:
+                pass
+
+        try:
+            roles, meta = per_row_extractor_fn(
+                concept, sample_descs, full_text=full)
+        except Exception as e:
+            roles, meta = {}, {'reason': f'exception: {type(e).__name__}'}
+
+        per_row_roles.append(roles)
+        per_row_audit.append({
+            'row_idx':    i,
+            'description_snippet': str(concept)[:120],
+            'accepted_roles':      dict(roles),
+            'rejected':            meta.get('rejected', []),
+            'reason':              meta.get('reason', ''),
+            'raw':                 meta.get('raw', ''),
+        })
+
+        reason_key = meta.get('reason', 'unknown')
+        reason_counts[reason_key] = reason_counts.get(reason_key, 0) + 1
+
+        for role, val in roles.items():
+            if not val:
+                continue
+            seen_vals = values_by_role.setdefault(role, [])
+            if val not in seen_vals:
+                seen_vals.append(val)
+
+    if pbar is not None:
+        try:
+            pbar.empty()
+        except Exception:
+            pass
+
+    # A role needs at least two distinct values to be worth splitting on.
+    roles_final: dict = {
+        role: values_by_role[role]
+        for role in _PER_ROW_ROLES
+        if len(values_by_role.get(role, [])) >= 2
+    }
+
+    # Rows carrying a value for at least one surviving role.
+    covered = 0
+    for assigned in per_row_roles:
+        if any(assigned.get(role) for role in roles_final):
+            covered += 1
+    coverage = covered / max(1, n)
+    regularity = coverage   # for per-row extractor, coverage is regularity
+
+    valid = coverage >= regularity_threshold and len(roles_final) >= 2
+
+    return {
+        'roles':             roles_final,
+        'row_assignments':   per_row_roles,
+        'coverage':          round(coverage, 4),
+        'regularity':        round(regularity, 4),
+        'valid':             valid,
+        'role_source':       'per_row_llm_extraction',
+        'extractor_summary': dict(reason_counts),
+        'per_row_audit':     per_row_audit,
+        'group_name':        group_name,
+    }
+
+def discover_roles_via_sbert_phrase_clustering(
+        texts: list,
+        text_col_names: Optional[list],
+        sbert_model,
+        llm_role_namer_fn: Optional[Callable] = None,
+        min_phrase_count: int = 2,
+        min_role_size: int = 2,
+        n_clusters_range: tuple = (2, 6),
+        group_name: str = '',
+        regularity_threshold: float = 0.40) -> dict:
+    """
+    Option D core: discover semantic-role schema for a group via
+        SBERT phrase clustering  +  constrained-vocab LLM cluster naming.
+
+    Pipeline (EDC / ZOES style):
+      1. Extract repeated phrases per row (shared helper).
+      2. Compute group regularity = fraction of rows with ≥2 mined phrases.
+         If < regularity_threshold → return invalid (caller falls back to
+         existing slot mining / FASTopic / NMF path).
+      3. SBERT-embed each unique phrase.
+      4. Agglomerative-cluster phrases by cosine similarity; select K by
+         silhouette score over n_clusters_range (capped at len(phrases) − 1).
+      5. Name each cluster via LLM (constrained vocab).  Anonymous
+         'cluster_<id>' fallback when the LLM is off, rejects, or raises.
+      6. Per-row: assign one phrase per role.  The longest phrase (in
+         characters) wins; equal-length ties go to the lexicographically
+         smallest phrase so the result is reproducible across runs.
+      7. Return roles + row_assignments + diagnostics.
+
+    Returns a dict with keys 'roles', 'row_assignments', 'coverage',
+    'regularity', 'valid', 'role_source', 'phrase_silhouette', 'n_clusters'
+    and 'naming_meta'.  Early exits return a reduced dict with
+    'valid': False and a 'reason'.  The final result is valid when at least
+    half of the rows received an assignment and at least two roles exist.
+    """
+    n = len(texts)
+    if n < 4 or sbert_model is None:
+        return {'valid': False, 'reason': 'too_few_rows_or_no_sbert',
+                'regularity': 0.0, 'roles': {}, 'row_assignments': []}
+
+    # 1. extract phrases
+    phrases, refined, regularity = _extract_phrases_for_role_clustering(
+        texts, text_col_names, group_name, min_phrase_count)
+
+    if regularity < regularity_threshold:
+        return {'valid': False, 'reason': f'low_regularity ({regularity:.2f})',
+                'regularity': regularity, 'roles': {}, 'row_assignments': []}
+    if len(phrases) < 4:
+        return {'valid': False, 'reason': 'too_few_phrases',
+                'regularity': regularity, 'roles': {}, 'row_assignments': []}
+
+    # 2. SBERT embed
+    try:
+        embs = sbert_model.encode(phrases, normalize_embeddings=True,
+                                    show_progress_bar=False, batch_size=64)
+    except Exception as e:
+        return {'valid': False, 'reason': f'sbert_failed: {type(e).__name__}',
+                'regularity': regularity, 'roles': {}, 'row_assignments': []}
+
+    # 3. agglomerative + silhouette K selection
+    best_score, best_labels, best_k = -1.0, None, 2
+    for k in range(n_clusters_range[0],
+                    min(n_clusters_range[1] + 1, len(phrases))):
+        try:
+            ac = AgglomerativeClustering(n_clusters=k, metric='cosine',
+                                          linkage='average')
+            labels = ac.fit_predict(embs)
+            if len(set(labels)) < 2:
+                continue
+            sil = float(silhouette_score(embs, labels, metric='cosine'))
+            if sil > best_score:
+                best_score, best_labels, best_k = sil, labels, k
+        except Exception:
+            # A failing K is skipped; the other candidates are still tried.
+            continue
+
+    if best_labels is None:
+        return {'valid': False, 'reason': 'no_clusters',
+                'regularity': regularity, 'roles': {}, 'row_assignments': []}
+
+    # 4. group phrases by cluster id
+    clusters_by_id: dict = defaultdict(list)
+    for p, lbl in zip(phrases, best_labels):
+        clusters_by_id[int(lbl)].append(p)
+    valid_clusters = {cid: ps for cid, ps in clusters_by_id.items()
+                      if len(ps) >= min_role_size}
+    if len(valid_clusters) < 2:
+        return {'valid': False, 'reason': 'too_few_valid_clusters',
+                'regularity': regularity, 'roles': {}, 'row_assignments': []}
+
+    # 5. name each cluster
+    sample_descs = [str(t)[:200] for t in texts[:3]]
+    roles: dict = {}
+    naming_meta: dict = {}
+    used_names: set = set()
+    for cid, cluster_phrases in valid_clusters.items():
+        role_name = None
+        if llm_role_namer_fn:
+            # The namer talks to an LLM; a transport or parsing failure must
+            # degrade to the anonymous name rather than abort the whole
+            # route (same best-effort policy as the other LLM call sites).
+            try:
+                role_name, name_meta = llm_role_namer_fn(cluster_phrases,
+                                                          sample_descs)
+            except Exception as e:
+                role_name = None
+                name_meta = {'reason': f'exception: {type(e).__name__}'}
+            naming_meta[cid] = name_meta
+        if not role_name:
+            role_name = f'cluster_{cid}'
+        # Disambiguate if LLM gave the same name to two clusters
+        original = role_name
+        suffix = 2
+        while role_name in used_names:
+            role_name = f'{original}_{suffix}'
+            suffix += 1
+        used_names.add(role_name)
+        roles[role_name] = cluster_phrases
+
+    # 6. per-row role assignment (longest phrase per role per row)
+    phrase_to_role = {p: r for r, ps in roles.items() for p in ps}
+    row_assignments: list = []
+    covered = 0
+    for r in refined:
+        assignment: dict = {}
+        # `r` is a set of strings whose iteration order varies with hash
+        # randomisation; sorting makes equal-length ties deterministic.
+        for p in sorted(r):
+            role = phrase_to_role.get(p)
+            if not role:
+                continue
+            if role not in assignment or len(p) > len(assignment[role]):
+                assignment[role] = p
+        if assignment:
+            covered += 1
+        row_assignments.append(assignment)
+
+    coverage = covered / n
+    valid = (coverage >= 0.50 and len(roles) >= 2)
+
+    return {
+        'roles':             roles,
+        'row_assignments':   row_assignments,
+        'coverage':          round(coverage, 4),
+        'regularity':        regularity,
+        'valid':             valid,
+        'role_source':       'sbert_phrase_clustering',
+        'phrase_silhouette': round(float(best_score), 4),
+        'n_clusters':        best_k,
+        'naming_meta':       naming_meta,
+    }
+
+def build_role_hierarchy(vi_list: list,
+                          role_result: dict,
+                          can: pd.DataFrame,
+                          parent_id: int,
+                          parent_path: str,
+                          nodes: list,
+                          node_map: dict,
+                          var_nodes: dict,
+                          max_depth_remaining: int = 4,
+                          post_split_fn: Optional[Callable] = None,
+                          min_post_split_size: int = 4) -> bool:
+    """
+    Build a role-nested hierarchy from the Option D role decomposition.
+    Roles ordered by generic priority:
+        measure > outcome > statistic > condition > modifier > subtype
+    Variables without a value for a given role skip that level.
+
+    Args:
+        vi_list: variable indices, aligned one-to-one with
+            role_result['row_assignments'].
+        role_result: output of one of the discover_roles_via_* functions.
+        can, parent_path: accepted for interface compatibility; not read here.
+        parent_id: node id under which the hierarchy is attached.
+        nodes, node_map: node list / id→node map, both mutated in place.
+        var_nodes: maps a variable index to its leaf node id.
+        max_depth_remaining: maximum number of nested role levels.
+        post_split_fn: optional callback (vi_grp, parent, depth_left) → number
+            of aggregation nodes it added; tried on terminal groups of at
+            least `min_post_split_size` variables.
+
+    Returns True on success (≥ 2 aggregation nodes added), else False.
+
+    NOTE(review): when the result is invalid or misaligned nothing is
+    touched, but a run that ends with fewer than 2 aggregations has already
+    attached its nodes and is not rolled back — confirm callers that fall
+    back to another route tolerate that.
+    """
+    if not role_result.get('valid'):
+        return False
+    roles = role_result['roles']
+    row_assigns = role_result['row_assignments']
+    if len(vi_list) != len(row_assigns):
+        return False
+
+    role_names = sorted(roles.keys(),
+                         key=lambda r: _ROLE_PRIORITY.get(r, 0),
+                         reverse=True)
+    if not role_names:
+        return False
+
+    aggregations_made = [0]
+    vi_to_local = {vi: idx for idx, vi in enumerate(vi_list)}
+
+    # Provenance is constant for the whole build, so compute it once.
+    # role_source is read from the result so the same builder correctly
+    # labels per-row LLM extraction nodes vs SBERT clustering nodes.
+    route_src = role_result.get('role_source', 'sbert_phrase_clustering')
+    is_per_row = (route_src == 'per_row_llm_extraction')
+    label_src = 'per_row_llm_role' if is_per_row else 'sbert_phrase_role'
+    node_source_str = ('per-row LLM extraction (Zhu et al. 2025)'
+                        if is_per_row
+                        else 'SBERT phrase cluster + LLM role naming')
+    # The per-row route always goes through the LLM.  The SBERT route only
+    # does so when a namer ran, which is what populates 'naming_meta';
+    # reporting True unconditionally would misstate the audit trail.
+    llm_used = is_per_row or bool(role_result.get('naming_meta'))
+
+    def _terminal_attach(vi_grp: list, parent: int, depth_left: int):
+        """Attach a group that will not be split by role any further."""
+        if (post_split_fn is not None
+                and len(vi_grp) >= min_post_split_size
+                and depth_left > 0):
+            try:
+                added = post_split_fn(vi_grp, parent, depth_left)
+                if added > 0:
+                    aggregations_made[0] += added
+                    return
+            except Exception:
+                # Best effort: fall through to the flat attachment below.
+                pass
+        for vi in vi_grp:
+            _add_child(node_map, parent, var_nodes[vi])
+
+    def _split(vi_subset: list, role_idx: int, current_parent: int,
+                depth_remaining: int):
+        """Partition `vi_subset` by the role at `role_idx`, then recurse."""
+        if (depth_remaining <= 0 or role_idx >= len(role_names)
+                or len(vi_subset) <= 1):
+            _terminal_attach(vi_subset, current_parent, depth_remaining)
+            return
+
+        role = role_names[role_idx]
+        groups: dict = defaultdict(list)
+        unassigned: list = []
+        for vi in vi_subset:
+            local = vi_to_local.get(vi)
+            val = row_assigns[local].get(role) if local is not None else None
+            if val:
+                groups[val].append(vi)
+            else:
+                unassigned.append(vi)
+
+        # A role that does not discriminate adds no level; try the next one.
+        if len(groups) <= 1:
+            _split(vi_subset, role_idx + 1, current_parent, depth_remaining)
+            return
+
+        # Largest groups first; singletons hang directly off the parent.
+        for val, vi_grp in sorted(groups.items(), key=lambda x: -len(x[1])):
+            if len(vi_grp) == 1:
+                _add_child(node_map, current_parent, var_nodes[vi_grp[0]])
+                continue
+            nid = _next_id(nodes)
+            nd  = _make_node(nid, val.title(),
+                              desc=(f'Role: {role} | Value: "{val}" | '
+                                    f'Variables: {len(vi_grp)} | '
+                                    f'Source: {node_source_str}'))
+            nd['label_provenance'] = {
+                'label_source':  label_src,
+                'evidence_terms': [val],
+                'confidence':    1.0,
+                'llm_used':      llm_used,
+                'llm_rejected':  False,
+                'role':          role,
+            }
+            nd['structure_provenance'] = {
+                'route':            route_src,
+                'aspect_method':    route_src,
+                'slot_role':        role,
+                'phrase_silhouette': role_result.get('phrase_silhouette'),
+                'regularity':       role_result.get('regularity'),
+                'n_clusters':       role_result.get('n_clusters'),
+            }
+            nodes.append(nd)
+            node_map[nid] = nd
+            _add_child(node_map, current_parent, nid)
+            aggregations_made[0] += 1
+            _split(vi_grp, role_idx + 1, nid, depth_remaining - 1)
+
+        if unassigned:
+            _terminal_attach(unassigned, current_parent, depth_remaining)
+
+    _split(vi_list, 0, parent_id, max_depth_remaining)
+    return aggregations_made[0] >= 2
+
+def strip_group_prefix_from_labels(nodes: list) -> int:
+    """
+    Post-build pass: for every aggregation node, if its name starts with the
+    parent group's name (case-insensitive) followed by a separator, strip
+    the prefix.
+
+    Effect: 'DMS — Total Errors' under DMS becomes 'Total Errors'.
+            'Pal Total Errors' under PAL becomes 'Total Errors'.
+
+    Recognised separators: space, em dash, en dash, hyphen, colon, slash.
+    A label is only rewritten when at least two characters remain.  The
+    walk starts at node id 0 and follows 'related' links; node names are
+    modified in place.
+
+    Returns the number of labels modified.
+    """
+    node_map = {int(n['id']): n for n in nodes}
+    separators = (' ', '—', '–', '-', ':', '/')
+    separator_chars = ''.join(separators)
+    modified = 0
+
+    def _walk(nid: int, parent_name: str):
+        nonlocal modified
+        n = node_map.get(int(nid))
+        if not n:
+            return
+        if n.get('type') == 'aggregation' and parent_name:
+            current = str(n.get('name', '')).strip()
+            # Match and slice with the SAME stripped parent string: slicing
+            # by the raw length would cut into the label whenever the parent
+            # name carries surrounding whitespace.
+            parent_clean = parent_name.strip()
+            head = current[:len(parent_clean)]
+            tail = current[len(parent_clean):]
+            if (parent_clean
+                    and head.lower() == parent_clean.lower()
+                    and tail.startswith(separators)):
+                stripped = tail.lstrip(separator_chars).strip()
+                if len(stripped) >= 2:
+                    n['name'] = stripped
+                    modified += 1
+        # Children are compared against this node's (possibly shortened)
+        # name; non-grouping nodes pass the inherited name through.
+        new_parent = (n.get('name', '') if n.get('type') in ('aggregation', 'root')
+                       else parent_name)
+        for cid in n.get('related') or []:
+            _walk(int(cid), new_parent)
+
+    _walk(0, '')
+    return modified
+
+def enforce_single_parent(nodes: list) -> int:
+    """
+    POST-BUILD PASS 4 — guarantee the result is a tree (each node has exactly
+    one parent).
+
+    The role builder can attach a variable both directly to a group and again
+    under a sub-aggregation of that same group — e.g. a leaf under '3 Targets'
+    *and* under '3 Targets > False Alarm Sequences'.  That makes the branch a
+    DAG, which (a) diverges from the single-parent tree format, (b) fragments
+    the branch, and (c) breaks proportional ('total') sunburst/treemap sizing.
+
+    Two kinds of redundant link are removed, in place:
+      1. A child listed more than once under the SAME parent — only the
+         first occurrence is kept.
+      2. A child listed under SEVERAL parents — the deepest parent is kept
+         and the child is detached from the others.  Depth is the shortest
+         distance from the root (id 0); parents unreachable from the root
+         count as depth 0; ties go to the parent seen first.
+
+    Keeping the deepest parent removes the redundant direct attachment while
+    preserving the finer sub-grouping the role extractor discovered.  Fully
+    generic — no domain knowledge, no hardcoded labels.
+
+    Returns the number of redundant parent links removed.
+    """
+    from collections import defaultdict, deque
+
+    node_map = {int(n['id']): n for n in nodes}
+    removed = 0
+
+    # 1. Collapse repeated links inside a single parent.  The multi-parent
+    #    pass below cannot see these: both entries name the parent that is
+    #    kept, so neither would ever be dropped.
+    for n in nodes:
+        related = n.get('related') or []
+        seen_ids = set()
+        unique = []
+        for c in related:
+            cid = int(c)
+            if cid not in seen_ids:
+                seen_ids.add(cid)
+                unique.append(c)
+        if len(unique) != len(related):
+            removed += len(related) - len(unique)
+            n['related'] = unique
+
+    # depth = shortest distance from root (id 0) along child edges
+    depth = {0: 0}
+    dq = deque([0])
+    while dq:
+        cur = dq.popleft()
+        for c in node_map.get(cur, {}).get('related') or []:
+            c = int(c)
+            if c not in depth:
+                depth[c] = depth[cur] + 1
+                dq.append(c)
+
+    parents = defaultdict(list)
+    for n in nodes:
+        for c in n.get('related') or []:
+            parents[int(c)].append(int(n['id']))
+
+    # 2. Keep only the deepest parent of every multi-parent child.
+    for child, ps in parents.items():
+        if len(ps) <= 1:
+            continue
+        keep = max(ps, key=lambda p: depth.get(p, 0))  # deepest = most specific
+        for p in ps:
+            if p == keep:
+                continue
+            par = node_map.get(p)
+            if not par:
+                continue
+            # Compare as ints on both sides so ids stored as strings are
+            # detached too.
+            remaining = [x for x in par['related'] if int(x) != child]
+            if len(remaining) != len(par['related']):
+                par['related'] = remaining
+                removed += 1
+    return removed
+
+def prune_empty_aggregations(nodes: list) -> int:
+    """
+    POST-BUILD PASS 5 — drop aggregation nodes that ended up with no children.
+
+    `enforce_single_parent` can empty a shallow aggregation when all of its
+    variables were kept under a deeper/sibling parent (e.g. 'RVP Response
+    Latency' losing every leaf to a more specific group).  An empty category
+    node is export noise — it renders as a blank sector and has no members.
+
+    Repeats until no childless aggregation is left, because removing one can
+    empty its parent in turn.  `nodes` is modified in place (the list object
+    is preserved) and removed ids are dropped from every surviving node's
+    'related' list.  Only nodes whose type is 'aggregation' are ever
+    removed.  Returns the number of nodes removed.
+    """
+    removed = 0
+    while True:
+        empties = {int(n['id']) for n in nodes
+                   if n.get('type') == 'aggregation' and not n.get('related')}
+        if not empties:
+            break
+        # Slice-assign so callers holding a reference to `nodes` see the
+        # pruned list.
+        nodes[:] = [n for n in nodes if int(n['id']) not in empties]
+        for n in nodes:
+            if any(int(c) in empties for c in n.get('related', [])):
+                n['related'] = [int(c) for c in n['related'] if int(c) not in empties]
+        removed += len(empties)
+    return removed
+
+def mine_phrase_slots(texts: list,
+                       text_col_names: Optional[list] = None,
+                       min_phrase_count: int = 2,
+                       min_slot_size: int = 2,
+                       coverage_threshold: float = 0.55,
+                       excl_threshold: float = 0.75,
+                       group_name: str = '',
+                       llm_role_classifier_fn: Optional[Callable] = None) -> dict:
+    """
+    Discover phrase slots in a group of variable descriptions.
+
+    Algorithm:
+      1. Extract concept-prefix from each description (text before ':' in a
+         description-like column; full text if no such column).
+      2. Tokenise + lowercase; generate n-grams (1–4 tokens, ≥3 chars).
+      3. Keep n-grams that appear in ≥ min_phrase_count rows.
+      4. For each row, retain only maximal phrases (drop sub-phrases of
+         longer phrases present in the same row).
+      5. Compute mutual-exclusion score per phrase pair:
+            M[a,b] = 1 − cooc[a,b] / min(count[a], count[b])
+         Phrases with M[a,b] ≥ excl_threshold are 'alternatives' (rarely
+         appear together → likely fill the same slot in different rows).
+      6. Cluster phrases into slots via greedy mutual-exclusion BFS, starting
+         from the most-frequent phrase.
+      7. A slot is valid if it has ≥ min_slot_size distinct phrases.
+      8. Coverage: fraction of rows that contain ≥1 phrase from ≥1 slot.
+         If coverage < threshold or < 2 slots survive → valid=False.
+
+    Returns:
+      {
+        'slots':            list[ {phrases: set[str], best: str} ],
+        'row_assignments':  list[ dict[slot_id → phrase] ] for each row,
+        'coverage':         float in [0,1],
+        'valid':            bool — True if slot structure is strong enough.
+      }
+    """
+    n = len(texts)
+    if n < 4:
+        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}
+
+    text_keys = ({c.strip().lower() for c in (text_col_names or [])}
+                 if text_col_names else None)
+
+    # ── 1. extract concept prefixes ──────────────────────────────────────────
+    prefixes = []
+    for t in texts:
+        prefix = ''
+        for chunk in str(t).split(' | '):
+            if ':' not in chunk:
+                continue
+            key, val = chunk.split(':', 1)
+            key_l = key.strip().lower()
+            if text_keys is not None and key_l not in text_keys:
+                continue
+            prefix = val.split(':')[0].strip()
+            break
+        if not prefix:
+            prefix = str(t).strip()
+        # Strip 'KEY:', 'Note:', etc. before tokenising
+        prefix = _strip_leading_markers(prefix)
+        prefixes.append(prefix)
+
+    # ── 2. normalise: alphanumerics + parens preserved, others → spaces ──────
+    def _normalize(p: str) -> str:
+        s = ''.join(ch if ch.isalnum() or ch == ' ' else ' '
+                    for ch in str(p).lower())
+        return ' '.join(s.split())
+
+    norm = [_normalize(p) for p in prefixes]
+
+    # ── 3. extract n-grams (1..4) per row, keep ≥3 chars ─────────────────────
+    row_phrases: list = []
+    phrase_count: Counter = Counter()
+    for text in norm:
+        tokens = text.split()
+        row_set = set()
+        for ngram_n in range(1, 5):
+            for i in range(len(tokens) - ngram_n + 1):
+                phrase = ' '.join(tokens[i:i + ngram_n])
+                if len(phrase) >= 3:
+                    row_set.add(phrase)
+        row_phrases.append(row_set)
+        for p in row_set:
+            phrase_count[p] += 1
+
+    # ── 4. keep repeated phrases, retain only maximal phrases per row ────────
+    keep = {p for p, c in phrase_count.items() if c >= min_phrase_count}
+    refined = []
+    for row in row_phrases:
+        sorted_by_len = sorted(row & keep, key=lambda p: -len(p.split()))
+        kept: list = []
+        for p in sorted_by_len:
+            if not any(p != q and p in q for q in kept):
+                kept.append(p)
+        refined.append(set(kept))
+
+    # Recount after refinement, re-filter
+    phrase_count = Counter()
+    for r in refined:
+        for p in r:
+            phrase_count[p] += 1
+    # NEW: filter out non-meaningful phrases (stopwords, group-name echoes,
+    # dictionary markers) before they enter mutual-exclusion clustering.
+    group_lower = (group_name or '').strip().lower()
+    phrases = sorted([p for p, c in phrase_count.items()
+                       if c >= min_phrase_count
+                       and _phrase_is_meaningful(p, group_lower)])
+    if len(phrases) < 2 * min_slot_size:
+        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}
+
+    p_idx  = {p: i for i, p in enumerate(phrases)}
+    n_p    = len(phrases)
+    counts = np.array([phrase_count[p] for p in phrases])
+
+    # ── 5. co-occurrence + mutual-exclusion matrix ───────────────────────────
+    cooc = np.zeros((n_p, n_p), dtype=int)
+    for r in refined:
+        idxs = [p_idx[p] for p in r if p in p_idx]
+        for i in idxs:
+            for j in idxs:
+                if i != j:
+                    cooc[i, j] += 1
+    min_counts = np.minimum.outer(counts, counts).astype(float)
+    min_counts[min_counts == 0] = 1.0
+    mut_excl   = 1.0 - cooc / min_counts
+    np.fill_diagonal(mut_excl, 0)
+
+    # ── 6a. UPSTREAM ROUTE: ask the LLM to classify phrases into roles ───────
+    # The LLM proposes a role schema (e.g. {measure: [...], statistic: [...],
+    # condition: [...]}) — phrases are assigned to roles, role names provide
+    # semantic ordering for the hierarchy.  Anti-hallucination: every phrase
+    # must come back verbatim, otherwise rejected by the validator inside
+    # make_llm_role_classifier_fn.
+    slot_source = 'mutual_exclusion'
+    role_names: list = []
+    slots: list = []  # list[set[int]]  — phrase indices per slot
+
+    if llm_role_classifier_fn is not None:
+        try:
+            classified, classifier_meta = llm_role_classifier_fn(
+                phrases, texts, group_name)
+        except Exception:
+            classified, classifier_meta = None, {'reason': 'exception'}
+        if classified:
+            # Build slots in the order the LLM proposed them.  Each role is
+            # one slot containing the phrase-index set.
+            for role_name, role_phrases in classified.items():
+                idx_set = {p_idx[p] for p in role_phrases if p in p_idx}
+                if len(idx_set) >= min_slot_size:
+                    slots.append(idx_set)
+                    role_names.append(role_name)
+            if len(slots) >= 2:
+                slot_source = 'llm_role_classification'
+
+    # ── 6b. FALLBACK: greedy mutual-exclusion BFS ────────────────────────────
+    if slot_source == 'mutual_exclusion':
+        visited: set = set()
+        slots = []
+        order   = np.argsort(-counts)
+        for seed in order:
+            if seed in visited:
+                continue
+            slot = {int(seed)}
+            queue = [int(seed)]
+            while queue:
+                cur = queue.pop()
+                for j in np.where(mut_excl[cur] >= excl_threshold)[0]:
+                    j = int(j)
+                    if j in slot:
+                        continue
+                    if all(mut_excl[j, k] >= excl_threshold - 0.15 for k in slot):
+                        slot.add(j)
+                        queue.append(j)
+            if len(slot) >= min_slot_size:
+                slots.append(slot)
+                visited |= slot
+        # Synthesise anonymous role names from the most-frequent phrase in each
+        # slot — these become the visible slot tags in provenance.
+        role_names = [phrases[max(s, key=lambda i: counts[i])] for s in slots]
+
+    if len(slots) < 2:
+        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}
+
+    # ── 7. assign per-row phrase per slot ────────────────────────────────────
+    row_assignments: list = []
+    covered = 0
+    for r in refined:
+        row_idx = {p_idx[p] for p in r if p in p_idx}
+        assignment: dict = {}
+        any_match = False
+        for slot_id, slot in enumerate(slots):
+            matched = row_idx & slot
+            if matched:
+                best = max(matched, key=lambda i: counts[i])
+                assignment[slot_id] = phrases[best]
+                any_match = True
+        if any_match:
+            covered += 1
+        row_assignments.append(assignment)
+
+    coverage = covered / n
+    valid    = (coverage >= coverage_threshold and len(slots) >= 2)
+
+    return {
+        'slots': [{'phrases':  {phrases[i] for i in s},
+                    'best':     phrases[max(s, key=lambda i: counts[i])],
+                    'role_name': role_names[idx] if idx < len(role_names) else ''}
+                   for idx, s in enumerate(slots)],
+        'row_assignments': row_assignments,
+        'coverage':        round(coverage, 4),
+        'valid':           valid,
+        'slot_source':     slot_source,    # 'llm_role_classification' or 'mutual_exclusion'
+    }
+
+
+def build_slot_hierarchy(vi_list: list,
+                          slot_result: dict,
+                          can: pd.DataFrame,
+                          parent_id: int,
+                          parent_path: str,
+                          nodes: list,
+                          node_map: dict,
+                          var_nodes: dict,
+                          max_depth_remaining: int = 4,
+                          post_slot_split_fn: Optional[Callable] = None,
+                          min_post_slot_size: int = 4) -> bool:
+    """
+    Build a hierarchy for `vi_list` using inferred phrase slots.
+
+    Slots are ordered by partition quality (more distinct values + fewer
+    singletons = higher priority); when the slots came from the LLM role
+    classifier, the role-name priority is the primary sort key and the
+    data-driven score breaks ties.  Each usable slot becomes one tree level.
+    Variables that have no phrase for the slot being split are attached at
+    the current parent (or handed to `post_slot_split_fn`); they are not
+    tested against deeper slots.
+
+    Parameters
+      vi_list             variable indices; position i must correspond to
+                          slot_result['row_assignments'][i] (lengths are checked).
+      slot_result         dict with 'valid', 'slots' and 'row_assignments';
+                          'slot_source' and 'coverage' are read when present.
+      can, parent_path    accepted for interface compatibility; not read here.
+      parent_id           id of the node the new sub-tree hangs from.
+      nodes, node_map     node list / id → node dict; both are mutated in place.
+      var_nodes           variable index → leaf node id.
+      max_depth_remaining maximum number of slot levels to create.
+      post_slot_split_fn  optional callback `(vi_grp, parent, depth_left)`; its
+                          return value is treated as the number of aggregation
+                          nodes it added (> 0 means it handled the group).
+      min_post_slot_size  smallest group passed to `post_slot_split_fn`.
+
+    Returns True when at least two aggregation nodes were created (including
+    those reported by `post_slot_split_fn`); False if the result is too
+    shallow to be useful (caller should then fall back to NMF/GMM).
+
+    NOTE(review): the validity checks at the top return before anything is
+    mutated, but the final "too shallow" check runs AFTER the tree has been
+    edited — nodes and links created here are not rolled back on that False
+    return.  Presumably the caller resets them; verify against the call site.
+    """
+    import logging   # function-scope import keeps this block self-contained
+
+    if not slot_result.get('valid'):
+        return False
+
+    slots       = slot_result['slots']
+    row_assigns = slot_result['row_assignments']
+    if len(vi_list) != len(row_assigns):
+        return False
+
+    # Index map: global vi → local position in row_assigns
+    vi_to_local = {vi: idx for idx, vi in enumerate(vi_list)}
+
+    # Order slots by partition-quality + semantic-shape heuristics.
+    # Higher score → used at a shallower level in the hierarchy.
+    #
+    # Heuristics (all generic, no domain knowledge):
+    #   + many distinct values, low singleton fraction
+    #   + average phrase token-length (multi-word noun phrases preferred)
+    #   + total row coverage of the slot
+    #   − slots whose phrases look like pure conditions
+    #     (numeric / 'all' / 'none' token + temporal/quantity word)
+    _CONDITION_HINTS = {'second', 'seconds', 'minute', 'minutes', 'hour',
+                        'hours', 'day', 'days', 'month', 'months', 'year',
+                        'years', 'week', 'weeks', 'box', 'boxes', 'token',
+                        'tokens', 'pattern', 'patterns', 'trial', 'trials'}
+
+    def _looks_like_condition(phrase: str) -> bool:
+        toks = phrase.split()
+        if not toks:
+            return False
+        has_num = any(t.isdigit() or t in {'all', 'none', 'simultaneous'} for t in toks)
+        has_hint = any(t in _CONDITION_HINTS for t in toks)
+        return has_num and has_hint
+
+    def _slot_score(slot_id: int) -> float:
+        vals = Counter()
+        for a in row_assigns:
+            v = a.get(slot_id)
+            if v:
+                vals[v] += 1
+        if not vals:
+            return -1.0
+        n_distinct   = len(vals)
+        n_singletons = sum(1 for c in vals.values() if c == 1)
+        coverage     = sum(vals.values()) / max(1, len(row_assigns))
+        avg_tokens   = float(np.mean([len(p.split()) for p in vals]))
+        condition_frac = sum(1 for p in vals if _looks_like_condition(p)) / n_distinct
+
+        base = n_distinct - 0.6 * n_singletons
+        base += 0.4 * coverage
+        base += 0.3 * (avg_tokens - 1)
+        base -= 0.8 * condition_frac
+        return base
+
+    # When the LLM produced the role schema, trust its role ordering for slots
+    # whose role name is broadly "measure-like" (base concept) over
+    # "statistic/condition/subtype" (modifiers).  This is generic English
+    # vocabulary, not domain knowledge — same heuristic used by IE slot-
+    # induction work (cf. Xu et al., FCS 2024 IE survey).
+    slot_source = slot_result.get('slot_source', 'mutual_exclusion')
+    _MEASURE_LIKE = {'measure', 'outcome', 'metric', 'variable', 'quantity'}
+    _STATISTIC_LIKE = {'statistic', 'stat', 'aggregate', 'summary'}
+    _CONDITION_LIKE = {'condition', 'modifier', 'context', 'setting'}
+    _SUBTYPE_LIKE   = {'subtype', 'type', 'kind', 'category'}
+
+    def _role_priority(role: str) -> int:
+        # Substring match, checked in priority order — first family wins.
+        r = (role or '').strip().lower()
+        if any(k in r for k in _MEASURE_LIKE):   return 4
+        if any(k in r for k in _STATISTIC_LIKE): return 3
+        if any(k in r for k in _CONDITION_LIKE): return 2
+        if any(k in r for k in _SUBTYPE_LIKE):   return 1
+        return 0   # unknown role — fall back to data-driven score
+
+    # Each score is a full pass over row_assigns; compute it once per slot
+    # and reuse it for both the ordering and the positivity filter.
+    slot_scores = [_slot_score(i) for i in range(len(slots))]
+
+    if slot_source == 'llm_role_classification':
+        ordered_slots = sorted(
+            range(len(slots)),
+            key=lambda i: (_role_priority(slots[i].get('role_name', '')),
+                            slot_scores[i]),
+            reverse=True)
+    else:
+        ordered_slots = sorted(range(len(slots)),
+                               key=lambda i: slot_scores[i], reverse=True)
+    ordered_slots = [s for s in ordered_slots if slot_scores[s] > 0]
+    if not ordered_slots:
+        return False
+
+    aggregations_made = 0   # drives the accept/reject decision at the end
+
+    def _attach_or_sub_recurse(vi_grp: list, parent: int, depth_left: int):
+        """
+        Terminal-leaf attach point inside slot mining.  When slots are
+        exhausted but the cluster still has enough variables AND we have a
+        post-slot callback (NMF/FASTopic + GMM splitter), recurse further to
+        deepen the tree.  Otherwise attach leaves directly.
+        """
+        nonlocal aggregations_made
+        if (post_slot_split_fn is not None
+                and len(vi_grp) >= min_post_slot_size
+                and depth_left > 0):
+            try:
+                added = post_slot_split_fn(vi_grp, parent, depth_left)
+                if added > 0:
+                    aggregations_made += added
+                    return
+            except Exception:
+                # Best-effort deepening: keep going with a flat attach, but
+                # leave a trace instead of silently hiding the failure.
+                logging.getLogger(__name__).warning(
+                    'post_slot_split_fn failed for %d variables under node %s; '
+                    'attaching leaves directly', len(vi_grp), parent,
+                    exc_info=True)
+        for vi in vi_grp:
+            _add_child(node_map, parent, var_nodes[vi])
+
+    def _split(vi_subset: list, slot_ord_pos: int, current_parent: int,
+               depth_remaining: int):
+        nonlocal aggregations_made
+        if (depth_remaining <= 0 or slot_ord_pos >= len(ordered_slots)
+                or len(vi_subset) <= 1):
+            _attach_or_sub_recurse(vi_subset, current_parent, depth_remaining)
+            return
+
+        slot_id = ordered_slots[slot_ord_pos]
+        groups: dict = defaultdict(list)
+        unassigned: list = []
+        for vi in vi_subset:
+            local = vi_to_local.get(vi)
+            val = row_assigns[local].get(slot_id) if local is not None else None
+            if val:
+                groups[val].append(vi)
+            else:
+                unassigned.append(vi)
+
+        # If this slot doesn't partition the subset, move to next slot
+        if len(groups) <= 1:
+            _split(vi_subset, slot_ord_pos + 1, current_parent, depth_remaining)
+            return
+
+        # The slot's role name (from LLM classification, when applicable)
+        slot_role = slots[slot_id].get('role_name', '') if slot_id < len(slots) else ''
+        # Largest groups first so the biggest branches get the lowest ids.
+        for val, vi_grp in sorted(groups.items(), key=lambda x: -len(x[1])):
+            if len(vi_grp) == 1:
+                # Singleton prevention: no aggregation wrapper for one variable.
+                _add_child(node_map, current_parent, var_nodes[vi_grp[0]])
+                continue
+            nid = _next_id(nodes)
+            nd  = _make_node(nid, val.title(),
+                             desc=(f'Role: {slot_role or "—"} | '
+                                   f'Slot phrase: "{val}" | '
+                                   f'Variables: {len(vi_grp)} | '
+                                   f'Source: phrase-slot mining ({slot_source})'))
+            nd['label_provenance'] = {
+                'label_source': 'phrase_slot',
+                'evidence_terms': [val],
+                'confidence': 1.0,
+                'llm_used': slot_source == 'llm_role_classification',
+                'llm_rejected': False,
+                'role':        slot_role,
+            }
+            nd['structure_provenance'] = {
+                'route':           'slot_mining',
+                'aspect_method':   slot_source,
+                'silhouette':      None,
+                'slot_coverage':   round(float(slot_result.get('coverage', 0)), 3),
+                'slot_role':       slot_role,
+            }
+            nodes.append(nd); node_map[nid] = nd
+            _add_child(node_map, current_parent, nid)
+            aggregations_made += 1
+            _split(vi_grp, slot_ord_pos + 1, nid, depth_remaining - 1)
+
+        if unassigned:
+            _attach_or_sub_recurse(unassigned, current_parent, depth_remaining)
+
+    _split(vi_list, 0, parent_id, max_depth_remaining)
+
+    # Reject the slot-built tree if it added almost no structure (likely the
+    # slots were not actually useful for this group).
+    return aggregations_made >= 2
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 6  — DYNAMIC TOP-DOWN LOD TREE  [ZHU §3.3 adapted]
+# ──────────────────────────────────────────────────────────────────────────────
+def _next_id(nodes: list) -> int:
+    """Return a fresh node id: one past the largest `id` in `nodes` (1 when empty)."""
+    existing = [int(node['id']) for node in nodes]
+    if not existing:
+        return 1
+    return max(existing) + 1
+
+def _add_child(node_map: dict, parent_id: int, child_id: int):
+    """
+    Link `child_id` under `parent_id` by appending it to the parent's
+    'related' list.  Does nothing when the parent is not in `node_map`
+    or the child is already listed.
+    """
+    parent = node_map.get(int(parent_id))
+    if not parent:
+        return
+    child = int(child_id)
+    if child in parent['related']:
+        return
+    parent['related'].append(child)
+
+def _make_node(nid, name, ntype='aggregation', desc='', dtype='determine') -> dict:
+    """
+    Create a tree-node dict with no children yet.  `nid` is coerced to int,
+    `name` and `desc` to str; the node starts visible (isShown=True).
+    """
+    node: dict = {}
+    node['id'] = int(nid)
+    node['name'] = str(name)
+    node['related'] = []
+    node['type'] = ntype
+    node['desc'] = str(desc)
+    node['dtype'] = dtype
+    node['isShown'] = True
+    return node
+
+# ──────────────────────────────────────────────────────────────────────────────
+# POST-BUILD PASS 1  — SIBLING COMMON-PREFIX FACTORING
+# ──────────────────────────────────────────────────────────────────────────────
+def factor_sibling_common_prefixes(nodes: list,
+                                     min_siblings: int = 3,
+                                     min_prefix_tokens: int = 2) -> int:
+    """
+    For each parent whose ≥`min_siblings` aggregation children share a
+    common multi-token title prefix, insert a new intermediate parent named
+    by that prefix and re-attach the matching siblings under it (with the
+    prefix stripped from each name).
+
+    Matching is case-insensitive on whitespace-separated tokens; the longest
+    qualifying prefix wins.  A sibling only counts if it has at least one
+    token left after the prefix.  Prefixes that start or end with a stopword
+    are rejected.  `nodes` is mutated in place: new nodes are appended,
+    children are renamed and 'related' lists are rewired.
+
+    Generic, no domain knowledge.  Inspired by sibling-label factoring
+    common in faceted-classification systems (Stoica & Hearst, NAACL 2007,
+    'Castanet') — collapsing redundant repeated tokens in sibling names.
+
+    Returns the number of factor-parents inserted.
+    """
+    from collections import deque   # O(1) pops from the left end of the worklist
+
+    node_map = {int(n['id']): n for n in nodes}
+    inserted = 0
+    # Worklist seeded with a snapshot of the current aggregation/root nodes;
+    # parents are re-queued after each factoring step.
+    queue = deque(int(n['id']) for n in nodes
+                  if n.get('type') in ('aggregation', 'root'))
+    while queue:
+        parent_id = queue.popleft()
+        parent = node_map.get(parent_id)
+        if not parent:
+            continue
+        # Gather aggregation children with their tokenised names
+        agg_children = []
+        for cid in parent.get('related', []):
+            child = node_map.get(int(cid))
+            if not child or child.get('type') != 'aggregation':
+                continue
+            toks = str(child.get('name', '')).split()
+            if len(toks) >= min_prefix_tokens:
+                agg_children.append((int(cid), toks))
+        # The emptiness guard protects max() below when min_siblings <= 0.
+        if not agg_children or len(agg_children) < min_siblings:
+            continue
+
+        # Greedy: find the longest prefix shared by ≥ min_siblings children
+        best_prefix: tuple = ()
+        best_group: list = []
+        # Sort children by name tokens for stable grouping
+        agg_children.sort(key=lambda x: x[1])
+        max_len = max(len(toks) for _, toks in agg_children)
+        # A prefix as long as the longest name leaves nothing after it, so
+        # start one token shorter; never consider an empty prefix.
+        shortest = max(1, min_prefix_tokens)
+        for length in range(max_len - 1, shortest - 1, -1):
+            by_prefix: dict = {}
+            for cid, toks in agg_children:
+                if len(toks) > length:   # must have something AFTER the prefix
+                    key = tuple(t.lower() for t in toks[:length])
+                    by_prefix.setdefault(key, []).append((cid, toks))
+            # Largest groups first; the stable sort keeps first-seen order on ties.
+            candidates = sorted(by_prefix.items(),
+                                key=lambda kv: len(kv[1]), reverse=True)
+            for pfx, group in candidates:
+                if len(group) < min_siblings:
+                    break   # every remaining group is smaller still
+                # Reject low-quality prefixes: starting/ending with a
+                # stopword (e.g. "the number of", "the") — this also covers
+                # all-stopword prefixes.  A good factored parent is a real
+                # concept phrase.
+                if pfx[0] in _STOPWORDS or pfx[-1] in _STOPWORDS:
+                    continue
+                best_prefix, best_group = pfx, group
+                break
+            if best_prefix:
+                break
+        if not best_prefix:
+            continue
+
+        # Build the new intermediate parent
+        new_id = max(node_map) + 1
+        # Keep the original capitalisation of the prefix
+        # (taken from the first matched child's tokens)
+        orig_tokens = best_group[0][1][:len(best_prefix)]
+        prefix_name = ' '.join(orig_tokens)
+        moved = [cid for cid, _ in best_group]
+        # Sibling titles as they were BEFORE stripping — the textual evidence
+        # that grounds the new label (the ids go to 'factored_from').
+        original_names = [str(node_map[cid].get('name', '')) for cid in moved]
+        new_node = _make_node(new_id, prefix_name, ntype='aggregation',
+                               desc=(f'Factored common prefix: "{prefix_name}" | '
+                                     f'Siblings: {len(best_group)} | '
+                                     f'Source: sibling factoring [Castanet 2007]'))
+        new_node['label_provenance'] = {
+            'label_source':     'factored_common_prefix',
+            'evidence_terms':   original_names,
+            'confidence':       1.0,
+            'llm_used':         False,
+            'llm_rejected':     False,
+        }
+        new_node['structure_provenance'] = {
+            'route':            'sibling_factoring',
+            'aspect_method':    None,
+            'silhouette':       None,
+            'slot_coverage':    None,
+            'factored_from':    list(moved),
+            'common_prefix_tokens': len(best_prefix),
+        }
+        # Rename the factored siblings (strip the prefix from their names)
+        for cid, toks in best_group:
+            new_name = ' '.join(toks[len(best_prefix):]).strip()
+            if new_name:
+                node_map[cid]['name'] = new_name
+        # Rewire parent → new_node → factored siblings
+        moved_ids = set(moved)
+        parent['related'] = [c for c in parent['related']
+                             if int(c) not in moved_ids]
+        parent['related'].append(new_id)
+        new_node['related'] = list(moved)
+        nodes.append(new_node)
+        node_map[new_id] = new_node
+        inserted += 1
+        # Re-examine this parent in case multiple prefix groups exist
+        queue.append(parent_id)
+        # Also examine the new parent for further nesting
+        queue.append(new_id)
+    return inserted
+
+# ──────────────────────────────────────────────────────────────────────────────
+# POST-BUILD PASS 2  — TRACO-INSPIRED LOW-QUALITY NODE PRUNING
+# ──────────────────────────────────────────────────────────────────────────────
+def prune_low_quality_aggregations(nodes: list,
+                                     tfidf: TfidfVectorizer = None,
+                                     min_coherence: float = 0.0,
+                                     max_child_ratio: float = 1.0) -> int:
+    """
+    Conservative noise pruning.
+
+    ONLY rule applied: dissolve aggregation nodes whose name is pure noise —
+    empty titles, single-word stopword titles ('And', 'Them'), dictionary
+    markers ('Key', 'Note'), or all-stopword titles ('To Be', 'Have A Lot Of').
+    Single-character tokens count as noise unless they are digits (numeric
+    labels like '2' / '4' may be meaningful).
+
+    Group anchors (route='group_anchor') and sibling-factoring nodes
+    (route='sibling_factoring') are NEVER pruned — they are structural and
+    legitimately have short names.  Nodes with no parent are left alone too.
+
+    Rules B (parent-child coherence) and C (lopsided split) are intentionally
+    NOT implemented here — both metrics punish good hierarchies where children
+    are legitimately more specific than parents or where a slot mining branch
+    is dominated by one large subgroup.  `tfidf`, `min_coherence` and
+    `max_child_ratio` are accepted for interface compatibility but are not
+    read by this function.
+
+    Inspired by TraCo (Wu et al., AAAI 2024): affinity / rationality /
+    diversity failures are common in hierarchical topic models.  Here we
+    enforce the most conservative form of that — only obviously-noise names.
+
+    Dissolving promotes the node's children to its parent (appended to the
+    parent's 'related'), marks the node type='dissolved' / isShown=False and
+    empties its 'related' list so the promoted children have exactly one
+    parent; the former child ids are kept in
+    structure_provenance['dissolved_children'].
+
+    Returns the number of nodes dissolved.
+    """
+    node_map = {int(n['id']): n for n in nodes}
+    # First listed parent wins.  Already-dissolved nodes are skipped so a
+    # stale link can never route promoted children into a hidden node.
+    parent_of: dict = {}
+    for n in nodes:
+        if n.get('type') == 'dissolved':
+            continue
+        for c in n.get('related', []):
+            parent_of.setdefault(int(c), int(n['id']))
+
+    FORBIDDEN = frozenset(_STOPWORDS) | frozenset({
+        'key', 'note', 'item', 'label', 'group', 'cluster',
+    })
+    # Routes that are ALWAYS structural — never prune these even if the name
+    # is short / stopword-like (e.g. 'MOT' is a group anchor, not noise).
+    PROTECTED_ROUTES = frozenset({'group_anchor', 'sibling_factoring'})
+
+    def _is_noise_tok(t: str) -> bool:
+        # Noise = stopword/marker, or a single non-digit character.
+        if t in FORBIDDEN:
+            return True
+        return len(t) < 2 and not t.isdigit()
+
+    to_dissolve: list = []
+    for n in nodes:
+        if n.get('type') != 'aggregation':
+            continue
+        nid = int(n['id'])
+        if nid not in parent_of:
+            continue
+        sp = n.get('structure_provenance') or {}
+        if sp.get('route') in PROTECTED_ROUTES:
+            continue
+
+        name_lower = str(n.get('name', '')).strip().lower()
+        toks = name_lower.split()
+        # Empty title, whole-title marker, or every token is noise.
+        if (not toks or name_lower in FORBIDDEN
+                or all(_is_noise_tok(t) for t in toks)):
+            to_dissolve.append(nid)
+
+    # Dissolve: promote children to grandparent
+    dissolved = 0
+    for nid in to_dissolve:
+        n = node_map.get(nid)
+        if not n:
+            continue
+        gp_id = parent_of.get(nid)
+        if gp_id is None:
+            continue
+        gp = node_map.get(gp_id)
+        if not gp:
+            continue
+        promoted = [int(c) for c in n.get('related', [])]
+        gp['related'] = [c for c in gp['related'] if int(c) != nid]
+        for cid in promoted:
+            if cid not in gp['related']:
+                gp['related'].append(cid)
+            # Keep the map current so nested noise nodes dissolve correctly.
+            parent_of[cid] = gp_id
+        n['related'] = []
+        n['isShown'] = False
+        n['type']    = 'dissolved'
+        prov = n.setdefault('structure_provenance', {})
+        prov['dissolved_reason'] = 'noise_label_only'
+        prov['dissolved_children'] = promoted
+        dissolved += 1
+    return dissolved
+
+def build_dynamic_lod_tree(can: pd.DataFrame,
+                            aspect_reprs: list,
+                            aspect_labels: list,
+                            tfidf: TfidfVectorizer,
+                            max_depth: int = 6,
+                            min_cluster_size: int = 2,
+                            sil_threshold: float = 0.04,
+                            max_clusters_per_split: int = 8,
+                            project: str = 'project',
+                            local_nmf: bool = True,
+                            min_local_nmf_size: int = 8,
+                            max_aspects: int = 10,
+                            sbert_model=None,
+                            llm_label_fn: Optional[Callable] = None,
+                            use_slot_mining: bool = True,
+                            text_col_names: Optional[list] = None,
+                            use_fastopic: bool = True,
+                            fastopic_min_size: int = 8,
+                            llm_role_classifier_fn: Optional[Callable] = None,
+                            use_role_decomposition: bool = True,
+                            llm_role_namer_fn: Optional[Callable] = None,
+                            role_regularity_threshold: float = 0.40,
+                            per_row_role_extractor_fn: Optional[Callable] = None,
+                            use_per_row_role_extraction: bool = True) -> list:
+    """
+    Build a dynamic top-down LoD tree.
+
+    Entry strategy (new — fully data-driven, no hardcoding):
+      • If detected _group metadata provides L1/L2 structure, materialise those
+        path segments as aggregation nodes first.
+      • Then apply NMF aspect discovery and GMM clustering *locally* inside each
+        terminal group (not globally across all variables).
+      • Falls back to global NMF from root when no group structure is found.
+
+    Within each recursive split [ZHU §3.3 adapted]:
+      • Evaluate K aspects by silhouette — select the highest  (simplified
+        best-aspect split; not the full probabilistic search of Zhu Eq.6/7).
+      • Single-variable clusters are attached directly without an aggregation
+        wrapper (singleton prevention).
+    """
+    texts  = can['_text'].fillna('').astype(str).tolist()
+    n_vars = len(can)
+
+    # ── build leaf attribute nodes (ids 1..n_vars) ───────────────────────────
+    nodes: list    = [_make_node(0, project, ntype='root', desc='Root node')]
+    var_nodes: dict = {}   # can positional index → node id
+    for i, (_, row) in enumerate(can.iterrows(), start=1):
+        nd = _make_node(i, row['_label'], ntype='attribute',
+                        desc=row['_text'], dtype='determine')
+        nd['metadata'] = {
+            'leaf_id': str(row['_id']),
+            'row_index': int(row['_row']),
+            'group': row['_group'],
+        }
+        nodes.append(nd)
+        var_nodes[int(row.name)] = i
+    node_map: dict = {int(n['id']): n for n in nodes}
+
+    # ── recursive splitter ────────────────────────────────────────────────────
+    def _recurse(vi_global: list,
+                 cur_reprs: list,
+                 cur_labels: list,
+                 cur_tfidf: TfidfVectorizer,
+                 parent_id: int,
+                 depth: int,
+                 parent_path: str,
+                 aspect_method_tag: str = 'nmf'):
+        """
+        vi_global  : global positional indices into `can` for this node's variables.
+        cur_reprs  : list of K arrays, each shape (len(vi_global), d).
+                     Rows correspond positionally to vi_global — no global indexing.
+        cur_labels : NMF aspect labels for cur_reprs.
+        cur_tfidf  : TF-IDF vectorizer fitted on this scope's texts.
+        """
+        if depth >= max_depth or len(vi_global) < min_cluster_size:
+            for vi in vi_global:
+                _add_child(node_map, parent_id, var_nodes[vi])
+            return
+
+        # NEW: shortcut for tiny homogeneous clusters — same _group, ≤3 vars.
+        # Avoids spending GMM/silhouette evaluation on already-meaningful leaves.
+        if len(vi_global) <= 3:
+            groups_here = {str(can.iloc[vi]['_group']) for vi in vi_global}
+            if len(groups_here) == 1:
+                for vi in vi_global:
+                    _add_child(node_map, parent_id, var_nodes[vi])
+                return
+
+        sub_texts = [texts[vi] for vi in vi_global]
+
+        # Evaluate every aspect — pick the one with highest silhouette [ZHU §3.3]
+        best_sil, best_k_idx, best_lbls = -1.0, -1, None
+        for k_idx, rep in enumerate(cur_reprs):
+            if rep.shape[0] < 3:
+                continue
+            lbls, _, sil = cluster_aspect_gmm(
+                rep, max_k=min(max_clusters_per_split, len(vi_global) // 2))
+            if sil > best_sil:
+                best_sil, best_k_idx, best_lbls = sil, k_idx, lbls
+
+        if best_k_idx == -1 or best_sil < sil_threshold or best_lbls is None:
+            for vi in vi_global:
+                _add_child(node_map, parent_id, var_nodes[vi])
+            return
+
+        # NEW: split-quality rejection (TraCo-inspired structural check).
+        # Reject splits that are extremely imbalanced or mostly singletons —
+        # silhouette can be high even when one cluster swallows most variables.
+        cluster_sizes = list(Counter(best_lbls).values())
+        max_ratio     = max(cluster_sizes) / sum(cluster_sizes)
+        n_singletons  = sum(1 for s in cluster_sizes if s == 1)
+        if max_ratio > 0.85 or n_singletons > len(cluster_sizes) // 2:
+            for vi in vi_global:
+                _add_child(node_map, parent_id, var_nodes[vi])
+            return
+
+        # Group variables by cluster assignment
+        cluster_to_global: dict = defaultdict(list)
+        for local_pos, (vi, cl) in enumerate(zip(vi_global, best_lbls)):
+            cluster_to_global[int(cl)].append(vi)
+
+        aspect_name     = cur_labels[best_k_idx]
+        global_to_local = {vi: idx for idx, vi in enumerate(vi_global)}
+
+        for cl_id, cl_global in cluster_to_global.items():
+            if not cl_global:
+                continue
+
+            # Singleton prevention: attach single-variable clusters directly [NEW]
+            if len(cl_global) == 1:
+                _add_child(node_map, parent_id, var_nodes[cl_global[0]])
+                continue
+
+            cl_texts  = [texts[vi] for vi in cl_global]
+            cl_groups = [str(can.iloc[vi]['_group']) for vi in cl_global]
+            lbl, prov = label_cluster(cl_texts, sub_texts, cur_tfidf,
+                                       cluster_groups=cl_groups,
+                                       parent_path=parent_path,
+                                       llm_label_fn=llm_label_fn,
+                                       return_provenance=True)
+            desc = (f'Aspect: {aspect_name} | '
+                    f'Silhouette: {best_sil:.3f} | '
+                    f'Variables: {len(cl_global)}')
+            nid = _next_id(nodes)
+            nd  = _make_node(nid, lbl, desc=desc)
+            nd['label_provenance'] = prov
+            nd['structure_provenance'] = {
+                'route':           'aspect_clustering',
+                'aspect_method':   aspect_method_tag,
+                'silhouette':      round(float(best_sil), 4),
+                'slot_coverage':   None,
+            }
+            nodes.append(nd)
+            node_map[nid] = nd
+            _add_child(node_map, parent_id, nid)
+
+            # Slice embeddings to this cluster's local positions and recurse
+            cl_pos     = [global_to_local[vi] for vi in cl_global]
+            sub_reprs  = [rep[cl_pos] for rep in cur_reprs]
+            _recurse(cl_global, sub_reprs, cur_labels, cur_tfidf,
+                     nid, depth + 1, f'{parent_path} > {lbl}',
+                     aspect_method_tag=aspect_method_tag)
+
+    # ── group-anchored entry  ─────────────────────────────────────────────────
+    # The _group column is built from user-confirmed group_cols — fully data-driven.
+    # We materialise each path segment as an aggregation node, then run NMF/GMM
+    # only inside each terminal group.
+
+    group_node_ids: dict = {}   # path string → node id
+
+    def _get_or_create_group_node(path: str, parent_id: int) -> int:
+        if path in group_node_ids:
+            return group_node_ids[path]
+        seg_name = path.split(' > ')[-1].strip()
+        nid = _next_id(nodes)
+        nd  = _make_node(nid, seg_name, ntype='aggregation', desc=f'Group: {path}')
+        nd['structure_provenance'] = {
+            'route':           'group_anchor',
+            'aspect_method':   None,
+            'silhouette':      None,
+            'slot_coverage':   None,
+        }
+        nodes.append(nd)
+        node_map[nid] = nd
+        _add_child(node_map, parent_id, nid)
+        group_node_ids[path] = nid
+        return nid
+
+    # Collect global variable positions per terminal group path
+    terminal_groups: dict = defaultdict(list)
+    for vi, (_, row) in enumerate(can.iterrows()):
+        terminal_groups[str(row['_group'])].append(vi)
+
+    non_ungrouped = [g for g in terminal_groups if g.strip().lower() != 'ungrouped']
+
+    if not non_ungrouped:
+        # No group structure detected → fall back to global NMF from root
+        full_reprs = [rep[list(range(n_vars))] for rep in aspect_reprs]
+        _recurse(list(range(n_vars)), full_reprs, aspect_labels, tfidf,
+                 0, 0, project)
+    else:
+        # Optional Streamlit progress bar — visible feedback for large datasets
+        # like HCP where the per-group loop dominates runtime.
+        try:
+            import streamlit as _st_progress
+            _pbar = _st_progress.progress(0.0, text='Building groups…')
+            _show_progress = True
+        except Exception:
+            _pbar = None
+            _show_progress = False
+
+        sorted_groups = sorted(terminal_groups.items())
+        n_groups_total = len(sorted_groups)
+
+        for _g_idx, (group_path, vi_list) in enumerate(sorted_groups):
+            if _show_progress and _pbar is not None:
+                try:
+                    short_path = group_path[:60] + ('…' if len(group_path) > 60 else '')
+                    _pbar.progress((_g_idx + 1) / max(1, n_groups_total),
+                                    text=f'[{_g_idx + 1}/{n_groups_total}] '
+                                         f'{short_path}  ({len(vi_list)} vars)')
+                except Exception:
+                    pass
+            # Build L1/L2 path nodes from detected group metadata
+            segments = [s.strip() for s in group_path.split(' > ') if s.strip()]
+            pid = 0
+            for depth_seg in range(len(segments)):
+                cumpath = ' > '.join(segments[:depth_seg + 1])
+                pid = _get_or_create_group_node(cumpath, pid)
+            existing_depth = len(segments)
+
+            if len(vi_list) < min_cluster_size:
+                for vi in vi_list:
+                    _add_child(node_map, pid, var_nodes[vi])
+                continue
+
+            # ── Aspect-discovery callback for this group ──
+            # Top-level: optionally uses FASTopic (slow, transformer-based,
+            # semantic).  Sub-recursion: NMF only (fast, lexical).  This split
+            # gives the best of both: rich top-level structure + quick deeper
+            # splits.  Critical for HCP performance (~10× speedup vs running
+            # FASTopic in every sub-recursion).
+            def _aspect_recurse_for_group(vi_sub: list, parent: int,
+                                            depth_left: int = 99,
+                                            is_top_level: bool = False) -> int:
+                local_texts = [texts[vi] for vi in vi_sub]
+                use_reprs, use_labels, use_tfidf = None, None, None
+                aspect_method_tag = 'sliced_global'
+
+                if (is_top_level and use_fastopic and _FASTOPIC_AVAILABLE
+                        and len(vi_sub) >= fastopic_min_size):
+                    try:
+                        f_tfidf, _, _, _, f_H, _, f_labels = discover_aspects_fastopic(
+                            local_texts, max_aspects=max_aspects)
+                        f_reprs = per_aspect_representations(
+                            local_texts, f_H, f_tfidf, sbert_model)
+                        use_reprs, use_labels, use_tfidf = f_reprs, f_labels, f_tfidf
+                        aspect_method_tag = 'fastopic'
+                    except Exception:
+                        use_reprs = None
+                if use_reprs is None and local_nmf and len(vi_sub) >= min_local_nmf_size:
+                    try:
+                        l_tfidf, _, _, _, l_H, _, l_labels = discover_aspects(
+                            local_texts, max_aspects)
+                        l_reprs = per_aspect_representations(
+                            local_texts, l_H, l_tfidf, sbert_model)
+                        use_reprs, use_labels, use_tfidf = l_reprs, l_labels, l_tfidf
+                        aspect_method_tag = 'nmf'
+                    except Exception:
+                        use_reprs = None
+                if use_reprs is None:
+                    use_reprs  = [rep[vi_sub] for rep in aspect_reprs]
+                    use_labels, use_tfidf = aspect_labels, tfidf
+
+                before = sum(1 for n in nodes if n.get('type') == 'aggregation')
+                effective_depth = max(0, min(depth_left, max_depth))
+                _recurse(vi_sub, use_reprs, use_labels, use_tfidf,
+                         parent, max(0, max_depth - effective_depth), group_path,
+                         aspect_method_tag=aspect_method_tag)
+                after = sum(1 for n in nodes if n.get('type') == 'aggregation')
+                return max(0, after - before)
+
+            local_texts = [texts[vi] for vi in vi_list]
+            _top_grp = group_path.split(' > ')[0].strip()
+
+            role_built = False
+            role_regularity = None
+            route_label = None
+
+            # ── ROUTING 1A: PER-ROW LLM ROLE EXTRACTION (Zhu et al. EMNLP 2025) ──
+            # The cleanest semantic route: one LLM call per row extracts
+            # measure / statistic / condition / subtype values directly from
+            # the description text.  Strict substring grounding prevents
+            # hallucination.  Bypasses SBERT phrase clustering entirely.
+            if (use_per_row_role_extraction
+                    and per_row_role_extractor_fn is not None
+                    and len(vi_list) >= 4):
+                try:
+                    role_result = discover_roles_via_per_row_extraction(
+                        local_texts,
+                        text_col_names=text_col_names,
+                        per_row_extractor_fn=per_row_role_extractor_fn,
+                        group_name=_top_grp,
+                        regularity_threshold=role_regularity_threshold,
+                    )
+                    role_regularity = role_result.get('regularity')
+                    # Always capture audit data, even when valid=False, so the
+                    # Role Decomposition tab can show what the LLM proposed
+                    # and why proposals were rejected.  group_path + vi_list
+                    # let the display map each audit row back to its variable.
+                    try:
+                        st.session_state.a2_per_row_audit.append({
+                            'group_name':   _top_grp,
+                            'group_path':   group_path,
+                            'vi_list':      [int(v) for v in vi_list],
+                            'n_rows':       len(vi_list),
+                            'coverage':     role_result.get('coverage'),
+                            'valid':        role_result.get('valid'),
+                            'roles_final':  list(role_result.get('roles', {}).keys()),
+                            'summary':      role_result.get('extractor_summary', {}),
+                            'per_row_audit': role_result.get('per_row_audit', []),
+                        })
+                    except Exception:
+                        pass
+                    if role_result['valid']:
+                        role_built = build_role_hierarchy(
+                            vi_list, role_result, can,
+                            pid, group_path, nodes, node_map, var_nodes,
+                            max_depth_remaining=max(2, max_depth - existing_depth),
+                            post_split_fn=lambda vi_grp, p, d:
+                                _aspect_recurse_for_group(vi_grp, p, d,
+                                                            is_top_level=False),
+                            min_post_split_size=max(6, min_cluster_size + 4))
+                        if role_built:
+                            route_label = 'per_row_llm_extraction'
+                except Exception:
+                    role_built = False
+
+            # ── ROUTING 1B: SBERT phrase clustering fallback (Option D original) ──
+            if not role_built and (use_role_decomposition and sbert_model is not None
+                    and len(vi_list) >= 6):
+                try:
+                    role_result = discover_roles_via_sbert_phrase_clustering(
+                        local_texts,
+                        text_col_names=text_col_names,
+                        sbert_model=sbert_model,
+                        llm_role_namer_fn=llm_role_namer_fn,
+                        group_name=_top_grp,
+                        regularity_threshold=role_regularity_threshold,
+                    )
+                    role_regularity = role_result.get('regularity')
+                    if role_result['valid']:
+                        role_built = build_role_hierarchy(
+                            vi_list, role_result, can,
+                            pid, group_path, nodes, node_map, var_nodes,
+                            max_depth_remaining=max(2, max_depth - existing_depth),
+                            post_split_fn=lambda vi_grp, p, d:
+                                _aspect_recurse_for_group(vi_grp, p, d,
+                                                            is_top_level=False),
+                            min_post_split_size=max(6, min_cluster_size + 4))
+                        if role_built:
+                            route_label = 'sbert_phrase_clustering'
+                except Exception:
+                    role_built = False
+            if role_built:
+                anchor = node_map.get(pid)
+                if anchor is not None:
+                    anchor.setdefault('structure_provenance', {})
+                    anchor['structure_provenance']['phrase_regularity'] = role_regularity
+                    anchor['structure_provenance']['route_used'] = route_label
+                continue
+
+            # ── ROUTING 2: phrase-slot mining (IE / mutual-exclusion) ──
+            slot_built = False
+            if use_slot_mining and len(vi_list) >= 6:
+                try:
+                    slot_result = mine_phrase_slots(
+                        local_texts,
+                        text_col_names=text_col_names,
+                        group_name=_top_grp,
+                        llm_role_classifier_fn=llm_role_classifier_fn,
+                    )
+                    if slot_result['valid']:
+                        slot_built = build_slot_hierarchy(
+                            vi_list, slot_result, can,
+                            pid, group_path, nodes, node_map, var_nodes,
+                            max_depth_remaining=max(2, max_depth - existing_depth),
+                            post_slot_split_fn=lambda vi_grp, p, d:
+                                _aspect_recurse_for_group(vi_grp, p, d,
+                                                            is_top_level=False),
+                            min_post_slot_size=max(6, min_cluster_size + 4))
+                except Exception:
+                    slot_built = False
+            if slot_built:
+                anchor = node_map.get(pid)
+                if anchor is not None:
+                    anchor.setdefault('structure_provenance', {})
+                    anchor['structure_provenance']['phrase_regularity'] = role_regularity
+                    anchor['structure_provenance']['route_used'] = 'slot_mining_fallback'
+                continue
+
+            # ── ROUTING 3: aspect discovery (FASTopic / NMF) ──
+            anchor = node_map.get(pid)
+            if anchor is not None:
+                anchor.setdefault('structure_provenance', {})
+                anchor['structure_provenance']['phrase_regularity'] = role_regularity
+                anchor['structure_provenance']['route_used'] = 'aspect_clustering_fallback'
+            _aspect_recurse_for_group(vi_list, pid, is_top_level=True)
+
+    # Clear the progress bar
+    try:
+        if _show_progress and _pbar is not None:
+            _pbar.empty()
+    except Exception:
+        pass
+
+    # ── POST-BUILD PASS 1 — sibling common-prefix factoring [Castanet 2007] ──
+    try:
+        n_factored = factor_sibling_common_prefixes(
+            nodes, min_siblings=3, min_prefix_tokens=2)
+    except Exception:
+        n_factored = 0
+
+    # ── POST-BUILD PASS 2 — conservative noise-label pruning ──────────────────
+    try:
+        n_dissolved = prune_low_quality_aggregations(nodes)
+    except Exception:
+        n_dissolved = 0
+
+    # ── POST-BUILD PASS 3 — strip group prefix from child labels ──────────────
+    # 'DMS — Total Errors' under DMS → 'Total Errors'.  Reduces visual
+    # redundancy without altering tree structure.
+    try:
+        n_stripped = strip_group_prefix_from_labels(nodes)
+    except Exception:
+        n_stripped = 0
+
+    # ── POST-BUILD PASS 4 — enforce single parent (collapse DAG → tree) ───────
+    # Keeps each variable under its most specific parent so the hierarchy is a
+    # true tree, matching the viewer/evaluator assumptions and rendering proportionally.
+    try:
+        n_reparented = enforce_single_parent(nodes)
+    except Exception:
+        n_reparented = 0
+
+    # ── POST-BUILD PASS 5 — drop aggregation nodes left childless by PASS 4 ───
+    try:
+        n_empty_pruned = prune_empty_aggregations(nodes)
+    except Exception:
+        n_empty_pruned = 0
+
+    # Annotate the root with post-build statistics
+    if nodes and nodes[0].get('type') == 'root':
+        nodes[0]['post_build_stats'] = {
+            'sibling_factor_nodes_inserted': int(n_factored),
+            'low_quality_nodes_dissolved':   int(n_dissolved),
+            'group_prefix_labels_stripped':  int(n_stripped),
+            'dag_links_removed':             int(n_reparented),
+            'empty_aggregations_pruned':     int(n_empty_pruned),
+        }
+
+    # Deduplicate children
+    for nd in nodes:
+        nd['related'] = list(dict.fromkeys(int(x) for x in nd['related']))
+
+    return nodes
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 9  — EVALUATION  [ZHU Table 2 / TraCo / TICL §3.4]
+# ──────────────────────────────────────────────────────────────────────────────
+def purity_score(true_labels, pred_labels) -> float:
+    """
+    Return the clustering purity of `pred_labels` measured against `true_labels`.
+
+    For every predicted cluster the size of its most frequent true class is
+    counted; purity is the sum of those counts divided by the number of items.
+
+    Class frequencies are tallied with `np.unique(..., return_counts=True)`,
+    so the true labels may be any sortable values (negative integers,
+    strings, ...) rather than only the non-negative integers that
+    `np.bincount` accepts.
+
+    Args:
+        true_labels: sequence of reference labels, one per item.
+        pred_labels: sequence of predicted cluster ids, aligned with
+            `true_labels`.
+
+    Returns:
+        0.0 when `true_labels` is empty, otherwise the purity as a plain
+        Python float.
+    """
+    true = np.asarray(true_labels)
+    pred = np.asarray(pred_labels)
+    total = len(true)
+    if total == 0:
+        return 0.0
+    matched = 0
+    for cl in np.unique(pred):
+        # Each value returned by np.unique(pred) occurs at least once in pred,
+        # so the selection below is never empty and needs no extra guard.
+        _, class_counts = np.unique(true[pred == cl], return_counts=True)
+        matched += int(class_counts.max())
+    return matched / total
+
+def evaluate(true_labels, pred_labels) -> dict:
+    """
+    Score a flat clustering against reference labels.
+
+    The reference labels are stringified and integer-encoded with a
+    `LabelEncoder`; the predictions are cast to an integer array.  Each
+    score is rounded to four decimals.
+
+    Returns:
+        dict with the keys 'NMI', 'ARI' and 'Purity'.
+    """
+    encoded_true = LabelEncoder().fit_transform(
+        [str(label) for label in true_labels])
+    predicted = np.array(pred_labels, dtype=int)
+
+    nmi = float(normalized_mutual_info_score(encoded_true, predicted))
+    ari = float(adjusted_rand_score(encoded_true, predicted))
+    purity = purity_score(encoded_true, predicted)
+
+    scores = {}
+    scores['NMI'] = round(nmi, 4)
+    scores['ARI'] = round(ari, 4)
+    scores['Purity'] = round(purity, 4)
+    return scores
+
+def hierarchy_quality_metrics(nodes: list, tfidf: TfidfVectorizer) -> dict:
+    """
+    Compute lightweight structural quality metrics inspired by TraCo (AAAI 2024).
+
+    [TraCo] Wu et al. identify three failure modes in hierarchical topic models:
+    low affinity (children unrelated to parent), low rationality (children not
+    more specific than parent), and low diversity (sibling topics too similar).
+
+    Here we measure:
+    • parent-child coherence: mean cosine(parent_label, child_label) — proxy for affinity.
+    • sibling diversity: mean (1 - cosine) over distinct sibling pairs — proxy for diversity.
+
+    Only aggregation nodes with at least two aggregation children contribute
+    (children named 'Group' are ignored).  Labels are vectorised with
+    `tfidf.transform`; a node whose labels cannot be scored is skipped.
+
+    Returns:
+        dict with 'mean_parent_child_coherence', 'mean_sibling_diversity'
+        (both rounded to 4 decimals, 0.0 when no node qualified),
+        'n_aggregation_nodes' and 'n_singleton_splits' (aggregation nodes
+        with exactly one child).
+    """
+    node_map = {n['id']: n for n in nodes}
+    coherence_scores, diversity_scores = [], []
+
+    for n in nodes:
+        if n['type'] != 'aggregation' or not n['related']:
+            continue
+        children     = [node_map[c] for c in n['related'] if c in node_map]
+        child_labels = [c['name'] for c in children
+                        if c.get('type') == 'aggregation' and c['name'] != 'Group']
+        n_sib = len(child_labels)
+        if n_sib < 2:
+            continue
+
+        try:
+            parent_vec  = tfidf.transform([n['name']])
+            child_vecs  = tfidf.transform(child_labels)
+            # Parent-child coherence
+            coh = float(cosine_similarity(parent_vec, child_vecs).mean())
+            coherence_scores.append(coh)
+            # Sibling diversity: average only the off-diagonal (i != j) cells.
+            # Zeroing the diagonal and then averaging over all n_sib**2 cells
+            # would dilute the mean similarity and overstate diversity.
+            sib_sims = cosine_similarity(child_vecs)
+            off_diag_total = float(sib_sims.sum() - np.trace(sib_sims))
+            mean_pair_sim = off_diag_total / (n_sib * (n_sib - 1))
+            diversity_scores.append(1.0 - mean_pair_sim)
+        except Exception:
+            # Best-effort diagnostic: a node that cannot be vectorised or
+            # scored must not abort the whole report.
+            continue
+
+    return {
+        'mean_parent_child_coherence': round(float(np.mean(coherence_scores)), 4) if coherence_scores else 0.0,
+        'mean_sibling_diversity':      round(float(np.mean(diversity_scores)),  4) if diversity_scores else 0.0,
+        'n_aggregation_nodes':         len([n for n in nodes if n['type'] == 'aggregation']),
+        'n_singleton_splits':          len([n for n in nodes
+                                           if n['type'] == 'aggregation'
+                                           and len(n['related']) == 1]),
+    }
+
+# ──────────────────────────────────────────────────────────────────────────────
+# DISPLAY-TIME ONE-CHILD CHAIN COMPRESSION  (visualization only — structure preserved in JSON)
+# ──────────────────────────────────────────────────────────────────────────────
+def compress_one_child_chains(nodes: list) -> list:
+    """
+    Display-only transformation: collapse chains where an aggregation node has
+    exactly one aggregation child (e.g. 'DMS → DMS Recommended Standard').
+    The collapsed node displays as 'DMS / DMS Recommended Standard'.
+    Structural data in the exported JSON is NOT modified — this returns a new
+    node list used only for visualisation.
+
+    The merged node keeps the parent's id and list position, takes every
+    other field from the child, and gets a combined name and description.
+    Dissolved / hidden nodes are removed first via `_filter_dissolved`.
+    """
+    nodes = _filter_dissolved(nodes)
+    # Shallow per-node copies: only top-level keys are reassigned below, so
+    # the caller's node dicts and their 'related' lists are left untouched.
+    nm = {int(n['id']): dict(n) for n in nodes}
+
+    def _chain_child(nid, n):
+        """Return the id of n's sole aggregation child, or None if n is not a chain link."""
+        if n.get('type') != 'aggregation':
+            return None
+        children = n.get('related', [])
+        if len(children) != 1:
+            return None
+        child_id = int(children[0])
+        # A node listing itself as its only child must not be merged with
+        # (and then deleted as) its own child.
+        if child_id == nid:
+            return None
+        if nm.get(child_id, {}).get('type') != 'aggregation':
+            return None
+        return child_id
+
+    # Single forward pass.  A merge never turns an earlier node into a chain
+    # link (its child count is unchanged and the merged node stays an
+    # aggregation), so continuing from the current position performs the same
+    # merges, in the same order, as rescanning from the start after each one.
+    for nid in list(nm):
+        if nid not in nm:        # already absorbed into an earlier node
+            continue
+        while True:
+            n = nm[nid]
+            child_id = _chain_child(nid, n)
+            if child_id is None:
+                break
+            child = nm.pop(child_id)
+            # Merge: child takes parent's id, parent's slot, but combined name
+            new_node = dict(child)
+            new_node['id']   = nid
+            new_node['name'] = f"{n['name']} / {child['name']}"
+            new_node['desc'] = f"{n.get('desc','')} | {child.get('desc','')}"
+            nm[nid] = new_node
+            # Re-point any remaining references to child_id → nid
+            for other in nm.values():
+                other['related'] = [nid if int(c) == child_id else int(c)
+                                    for c in other.get('related', [])]
+
+    return list(nm.values())
+
+# ──────────────────────────────────────────────────────────────────────────────
+# VISUALISATION HELPERS
+# ──────────────────────────────────────────────────────────────────────────────
+def _filter_dissolved(nodes: list) -> list:
+    """
+    Remove dissolved / hidden nodes before rendering.
+
+    A node is removed when its type is 'dissolved' or its 'isShown' flag is
+    exactly False.  Surviving nodes are returned as shallow copies whose
+    'related' lists no longer mention any removed id.  When nothing needs
+    removing, the input list itself is returned unchanged.
+    """
+    hidden = set()
+    for node in nodes:
+        if node.get('type') == 'dissolved' or node.get('isShown') is False:
+            hidden.add(int(node['id']))
+    if not hidden:
+        return nodes
+
+    kept = []
+    for node in nodes:
+        if int(node['id']) in hidden:
+            continue
+        clone = dict(node)
+        clone['related'] = [cid for cid in map(int, node.get('related', []))
+                            if cid not in hidden]
+        kept.append(clone)
+    return kept
+
+def _leaf_ids(nodes: list, nid: int) -> list:
+    """
+    Return the ids of all 'attribute' nodes reachable from `nid`.
+
+    Ids are listed once each, in depth-first (pre-order) first-encounter
+    order following each node's 'related' list.  An unknown `nid` yields an
+    empty list; an attribute `nid` yields `[nid]`.
+
+    The walk is iterative and remembers visited nodes, so a cyclic 'related'
+    graph terminates and a subtree shared by several parents is expanded
+    only once.
+    """
+    by_id = {int(n['id']): n for n in nodes}
+    leaves: list = []
+    seen: set = set()
+    stack = [int(nid)]
+    while stack:
+        cur = stack.pop()
+        if cur in seen:
+            continue
+        seen.add(cur)
+        node = by_id.get(cur)
+        if not node:
+            continue
+        if node.get('type') == 'attribute':
+            leaves.append(cur)
+            continue
+        # Push children reversed so they are popped in their listed order.
+        stack.extend([int(c) for c in node.get('related', [])][::-1])
+    return leaves
+
+def _parent_map(nodes: list) -> dict:
+    """
+    Map each child id to the id of the first node that lists it in 'related'.
+
+    Nodes are scanned in list order, so when a child appears under several
+    parents the earliest parent wins.  All ids are coerced to int.
+    """
+    first_parent = {}
+    listed = ((int(child), holder)
+              for holder in nodes
+              for child in holder.get('related', []))
+    for child_id, holder in listed:
+        if child_id in first_parent:
+            continue
+        first_parent[child_id] = int(holder['id'])
+    return first_parent
+
+def _tree_value_map(nodes: list, pm: dict) -> dict:
+    """
+    Leaf count per node, measured on the single-parent tree described by `pm`
+    (child id → parent id) rather than on the nodes' own 'related' lists.
+
+    Charts drawn with branchvalues='total' need every parent's value to equal
+    the sum of its children's values.  The stored hierarchy may be a DAG, in
+    which case a unique-leaf count per node (as produced by `_leaf_ids`) can
+    be smaller than the sum over its children.  Counting along `pm`, where
+    every node has exactly one parent, keeps the numbers additive without
+    touching the hierarchy.
+
+    An 'attribute' node counts 1; any other node counts the sum over its
+    `pm` children, or 1 when it has none.  Only ids present in `nodes` appear
+    in the result.
+    """
+    children_of = {}
+    for child_id, parent_id in pm.items():
+        children_of.setdefault(int(parent_id), []).append(int(child_id))
+    by_id = {int(node['id']): node for node in nodes}
+    cache = {}
+
+    def leaf_total(node_id: int) -> int:
+        cached = cache.get(node_id)
+        if cached is not None:
+            return cached
+        # Provisional value: a node met again while its own subtree is still
+        # being summed (a cycle) contributes 1 instead of recursing forever.
+        cache[node_id] = 1
+        node = by_id.get(node_id)
+        if node is not None and node.get('type') == 'attribute':
+            return 1
+        below = children_of.get(node_id, [])
+        if below:
+            subtotal = sum(leaf_total(kid) for kid in below)
+        else:
+            subtotal = 1
+        cache[node_id] = max(1, subtotal)
+        return cache[node_id]
+
+    return {node_id: leaf_total(node_id) for node_id in by_id}
+
+def _wrap_hover(text: str, width: int = 80) -> str:
+    """Break a long description into lines of at most `width` characters,
+    joined with <br>, so a Plotly hover tooltip can show the whole text.
+
+    Existing newlines are kept as line breaks; falsy input yields ''."""
+    import textwrap as _tw
+    content = str(text or '')
+    if content == '':
+        return ''
+    wrapped = []
+    for segment in content.split('\n'):
+        pieces = _tw.wrap(segment, width=width)
+        if pieces:
+            wrapped += pieces
+        else:
+            # textwrap returns [] for an empty/blank line; keep the break.
+            wrapped.append('')
+    return '<br>'.join(wrapped)
+
+def plot_sunburst(nodes: list, max_depth: int = 4):
+    """
+    Build a Plotly sunburst of the hierarchy.
+
+    Dissolved / hidden nodes are filtered out first.  Sector sizes come from
+    `_tree_value_map`; the hover text reports each node's type, its number of
+    reachable variables (`_leaf_ids`) and its wrapped description.
+
+    Args:
+        nodes: hierarchy node dicts; the root is expected to have id 0.
+        max_depth: number of rings shown at once (Plotly `maxdepth`).
+
+    Returns:
+        The `go.Figure` holding the sunburst trace.
+    """
+    nodes = _filter_dissolved(nodes)
+    pm = _parent_map(nodes)
+    # Non-root nodes that nobody lists as a child are drawn under the root.
+    # Give them that parent *before* computing values, so the root's value
+    # includes them and parent == sum(children) holds for branchvalues='total'.
+    render_pm = dict(pm)
+    for n in nodes:
+        nid = int(n['id'])
+        if nid != 0:
+            render_pm.setdefault(nid, 0)
+    vm = _tree_value_map(nodes, render_pm)
+    ids, labels, parents, values, hover = [], [], [], [], []
+    for n in nodes:
+        nid = int(n['id'])
+        lc  = len(_leaf_ids(nodes, nid))
+        ids.append(str(nid))
+        labels.append(str(n.get('name', ''))[:40])
+        parents.append('' if nid == 0 else str(render_pm[nid]))
+        values.append(vm.get(nid, 1))
+        hover.append(f"<b>{n.get('name','')}</b><br>Type: {n.get('type','')}<br>"
+                     f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc',''))}")
+    fig = go.Figure(go.Sunburst(
+        ids=ids, labels=labels, parents=parents, values=values,
+        branchvalues='total', hovertext=hover, hoverinfo='text',
+        maxdepth=max_depth, insidetextorientation='radial',
+        marker=dict(colorscale='Viridis', line=dict(width=1, color='white'))))
+    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
+                      title=dict(text='Click sector to drill down — click centre to go back',
+                                 font=dict(size=13), x=0.5))
+    return fig
+
+def plot_treemap(nodes: list):
+    """
+    Build a Plotly treemap of the hierarchy.
+
+    Dissolved / hidden nodes are filtered out first.  Tile sizes come from
+    `_tree_value_map`; the hover text reports each node's number of reachable
+    variables (`_leaf_ids`) and its wrapped description.
+
+    Args:
+        nodes: hierarchy node dicts; the root is expected to have id 0.
+
+    Returns:
+        The `go.Figure` holding the treemap trace.
+    """
+    nodes = _filter_dissolved(nodes)
+    pm = _parent_map(nodes)
+    # Non-root nodes that nobody lists as a child are drawn under the root.
+    # Give them that parent *before* computing values, so the root's value
+    # includes them and parent == sum(children) holds for branchvalues='total'.
+    render_pm = dict(pm)
+    for n in nodes:
+        nid = int(n['id'])
+        if nid != 0:
+            render_pm.setdefault(nid, 0)
+    vm = _tree_value_map(nodes, render_pm)
+    ids, labels, parents, values, hover = [], [], [], [], []
+    for n in nodes:
+        nid = int(n['id'])
+        lc  = len(_leaf_ids(nodes, nid))
+        ids.append(str(nid))
+        labels.append(str(n.get('name', ''))[:40])
+        parents.append('' if nid == 0 else str(render_pm[nid]))
+        values.append(vm.get(nid, 1))
+        hover.append(f"<b>{n.get('name','')}</b><br>Variables: {lc}<br>"
+                     f"{_wrap_hover(n.get('desc',''))}")
+    fig = go.Figure(go.Treemap(
+        ids=ids, labels=labels, parents=parents, values=values,
+        branchvalues='total', hovertext=hover, hoverinfo='text',
+        textinfo='label+value',
+        marker=dict(colorscale='Viridis', line=dict(width=1, color='white'))))
+    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
+    return fig
+
+# ──────────────────────────────────────────────────────────────────────────────
+# NODE-LINK TREE  — Reingold-Tilford layout (matches Approach 1.1 interface)
+# ──────────────────────────────────────────────────────────────────────────────
+def _a2_node_color(n: dict) -> str:
+    """Return the marker colour for a node, chosen by its 'type';
+    any type without a dedicated colour gets the neutral grey."""
+    kind = n.get('type', '')
+    palette = (
+        ('root',      '#c44e52'),
+        ('attribute', '#4C72B0'),
+        ('collapsed', '#bbbbbb'),
+    )
+    for known_kind, colour in palette:
+        if kind == known_kind:
+            return colour
+    return '#8C8C8C'
+
+def _display_graph(nodes: list, max_depth: int = 4, show_hidden: bool = False):
+    """
+    Walk the hierarchy from node 0 down to `max_depth` and collect what to draw.
+
+    A node at the depth limit that still has children gets one synthetic
+    'collapsed' child (ids counted up from 10**9) naming how many variables
+    lie below it.  Unless `show_hidden` is set, nodes whose 'isShown' flag is
+    exactly False are left out.
+
+    Returns:
+        (display_nodes, edges) — the node dicts in first-visit order and the
+        (parent_id, child_id) pairs to draw.
+    """
+    by_id = {int(n['id']): n for n in nodes}
+    shown: dict = {}
+    links: list = []
+    placeholder_id = 10 ** 9
+
+    def concealed(node) -> bool:
+        return (not show_hidden) and node.get('isShown') is False
+
+    def walk(raw_id, level):
+        nonlocal placeholder_id
+        node_id = int(raw_id)
+        node = by_id.get(node_id)
+        if not node:
+            return
+        if level > 0 and concealed(node):
+            return
+        shown[node_id] = node
+
+        if level >= max_depth and node.get('related'):
+            # Depth limit reached: stand in for the whole subtree.
+            placeholder_id += 1
+            below = len(_leaf_ids(nodes, raw_id))
+            shown[placeholder_id] = {'id': placeholder_id,
+                                     'name': f'… {below} variables',
+                                     'type': 'collapsed', 'dtype': 'determine',
+                                     'related': [],
+                                     'desc': f"Collapsed: {node.get('name')}",
+                                     'isShown': True}
+            links.append((node_id, placeholder_id))
+            return
+
+        for raw_child in node.get('related', []):
+            child_id = int(raw_child)
+            child = by_id.get(child_id)
+            if not child or concealed(child):
+                continue
+            links.append((node_id, child_id))
+            walk(child_id, level + 1)
+
+    walk(0, 0)
+    return list(shown.values()), links
+
+def _positions(dnodes: list, edges: list):
+    """
+    Lay the display graph out in Reingold-Tilford style, starting at node 0.
+
+    x grows with depth (3.0 per level).  Every node without outgoing edges
+    takes the next free row (rows are 1.8 apart); every other node is placed
+    at the mean y of its children.
+
+    Returns:
+        dict mapping node id → (x, y).
+    """
+    x_step = 3.0
+    y_step = 1.8
+    kids: dict = defaultdict(list)
+    for parent, child in edges:
+        kids[parent].append(child)
+    coords: dict = {}
+    next_row = 0
+
+    def place(node_id, level):
+        nonlocal next_row
+        below = kids.get(node_id, [])
+        if below:
+            y = float(np.mean([place(kid, level + 1) for kid in below]))
+        else:
+            y = next_row * y_step
+            next_row += 1
+        coords[node_id] = (level * x_step, y)
+        return y
+
+    place(0, 0)
+    return coords
+
+def plot_node_link(nodes: list, max_depth: int = 4,
+                    show_hidden: bool = False, show_leaf_labels: bool = False):
+    """
+    Node-link tree with elbow edges (matches Approach 1.1 layout).
+    Best for exploring structure at moderate depth — Sunburst remains
+    recommended for large hierarchies per Taxonomizer (Bian et al. 2020).
+
+    Args:
+        nodes: hierarchy node dicts; the walk starts at node id 0.
+        max_depth: depth at which deeper branches are replaced by a
+            'collapsed' placeholder.
+        show_hidden: also draw nodes whose 'isShown' flag is False.
+        show_leaf_labels: print variable names next to the leaf markers.
+
+    Returns:
+        The `go.Figure` with one line trace for the edges and up to two
+        marker traces (grouping nodes, leaf nodes).
+    """
+    nodes = _filter_dissolved(nodes)
+    dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
+    pos = _positions(dnodes, edges)
+
+    # Elbow edges: horizontal to the midpoint, vertical, then horizontal again.
+    ex, ey = [], []
+    for p, c in edges:
+        if p not in pos or c not in pos:
+            continue
+        x0, y0 = pos[p]
+        x1, y1 = pos[c]
+        xm = (x0 + x1) / 2
+        ex += [x0, xm, xm, x1, None]   # None breaks the line between edges
+        ey += [y0, y0, y1, y1, None]
+    traces = [go.Scatter(x=ex, y=ey, mode='lines',
+                          line=dict(width=1, color='#c8c8c8'),
+                          hoverinfo='skip', showlegend=False)]
+
+    agg_xs, agg_ys, agg_labels, agg_colors, agg_hover = [], [], [], [], []
+    lf_xs,  lf_ys,  lf_labels,  lf_colors,  lf_hover  = [], [], [], [], []
+
+    for n in dnodes:
+        nid = int(n['id'])
+        if nid not in pos:
+            continue
+        x, y = pos[nid]
+        lc   = len(_leaf_ids(nodes, nid))
+        lab  = n.get('name', str(nid))
+        htxt = (f"<b>{n.get('name','')}</b><br>"
+                f"Type: {n.get('type','')}<br>"
+                f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc',''))}")
+        col  = _a2_node_color(n)
+
+        if n.get('type') in ('root', 'aggregation', 'collapsed'):
+            display_lab = (lab + (f' ({lc})' if lc else ''))[:50]
+            agg_xs.append(x); agg_ys.append(y)
+            agg_labels.append(display_lab); agg_colors.append(col); agg_hover.append(htxt)
+        else:
+            display_lab = lab[:40] if show_leaf_labels else ''
+            lf_xs.append(x); lf_ys.append(y)
+            lf_labels.append(display_lab); lf_colors.append(col); lf_hover.append(htxt)
+
+    if agg_xs:
+        traces.append(go.Scatter(
+            x=agg_xs, y=agg_ys, mode='markers+text',
+            text=agg_labels, textposition='middle right',
+            hovertext=agg_hover, hoverinfo='text',
+            marker=dict(size=16, color=agg_colors,
+                        line=dict(color='white', width=2)),
+            showlegend=False))
+    if lf_xs:
+        traces.append(go.Scatter(
+            x=lf_xs, y=lf_ys, mode='markers+text',
+            text=lf_labels, textposition='middle right',
+            hovertext=lf_hover, hoverinfo='text',
+            marker=dict(size=7, color=lf_colors, symbol='circle',
+                        opacity=0.75, line=dict(color='white', width=1)),
+            showlegend=False))
+
+    # The layout gives one vertical row to every positioned node that has no
+    # outgoing edge — 'collapsed' placeholders and childless aggregations as
+    # well as attribute leaves.  Size the figure from that row count; counting
+    # attribute markers alone squeezes a depth-cut tree into the minimum height.
+    non_terminal = {p for p, _ in edges}
+    n_rows = sum(1 for node_id in pos if node_id not in non_terminal)
+    n_leaves = max(12, n_rows)
+    fig = go.Figure(traces)
+    fig.update_layout(
+        height=max(700, min(4000, int(n_leaves * 32))),
+        margin=dict(l=20, r=220, t=40, b=20),
+        plot_bgcolor='white', paper_bgcolor='white',
+        xaxis=dict(visible=False, fixedrange=False),
+        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
+        dragmode='pan',
+        annotations=[dict(
+            text='Tip: Sunburst is better for large hierarchies [Taxonomizer 2020]',
+            xref='paper', yref='paper', x=0.0, y=1.01,
+            showarrow=False, font=dict(size=11, color='grey'), align='left')]
+    )
+    return fig
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STREAMLIT APP
+# ──────────────────────────────────────────────────────────────────────────────
+st.set_page_config(page_title='Approach 2 — Multi-Aspect Hierarchy',
+                   layout='wide')
+st.title('Approach 2 — Role-Decomposed Metadata Hierarchy')
+st.caption('Group anchoring → LLM role extraction → role-nested LoD tree. '
+           'Full method details and citations in the Method tab.')
+
+# Session-state init — defensive: make sure every key the display layer reads
+# exists with a safe default.  setdefault only writes a key that is missing,
+# so values stored by an earlier build in the same session are left untouched
+# and the same code works on both new and existing sessions.
+# NOTE(review): the original author reports attribute-style access
+# (st.session_state.key) occasionally failing for keys set in a previous run;
+# that behaviour is not demonstrated here — treat it as unverified.
+st.session_state.setdefault('a2_nodes', None)          # built tree (list of node dicts)
+st.session_state.setdefault('a2_can',   None)          # canonical metadata table
+st.session_state.setdefault('a2_meta',  {})            # global-aspect artefacts (K, W, H, ...)
+st.session_state.setdefault('a2_per_row_audit', [])   # list of group audits
+
+# Local-LLM auto-detection: probe Ollama server.  LLM refinement defaults to
+# ON when Ollama is reachable; user can switch it off any time.  Env vars
+# OLLAMA_URL and OLLAMA_MODEL override the localhost / qwen2.5:3b defaults.
+# The `.strip() or DEFAULT` idiom makes a blank / whitespace-only env var fall
+# back to the built-in default instead of producing an empty setting.
+_ollama_url     = os.environ.get('OLLAMA_URL', OLLAMA_URL_DEFAULT).strip() or OLLAMA_URL_DEFAULT
+_ollama_model   = os.environ.get('OLLAMA_MODEL', OLLAMA_MODEL_DEFAULT).strip() or OLLAMA_MODEL_DEFAULT
+# Probed at module level (so on every script execution), against the
+# env/default URL only.
+_ollama_reachable = _ping_ollama(_ollama_url)
+
+# Groq detection: env-var GROQ_API_KEY makes the cloud option available.
+_groq_url       = os.environ.get('GROQ_URL', GROQ_URL_DEFAULT).strip() or GROQ_URL_DEFAULT
+_groq_model     = os.environ.get('GROQ_MODEL', GROQ_MODEL_DEFAULT).strip() or GROQ_MODEL_DEFAULT
+_groq_key_env   = os.environ.get('GROQ_API_KEY', '').strip()
+
+# Groq is the preselected provider whenever a key is present and the client
+# library flag is set; otherwise Ollama.
+_default_provider = 'groq' if (_groq_key_env and _LLM_CLIENT_AVAILABLE) else 'ollama'
+# Truthy when some backend looks usable (Ollama reachable or a Groq key is
+# present) AND the client library flag is set.
+# NOTE(review): not read anywhere in this section of the script — check
+# whether it is still used elsewhere.
+_default_llm_on   = (
+    (_ollama_reachable or bool(_groq_key_env)) and _LLM_CLIENT_AVAILABLE)
+
+with st.sidebar:
+    st.header('1 · Input')
+    uploads = st.file_uploader('Metadata / data-dictionary file(s)',
+                                type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
+                                accept_multiple_files=True)
+
+    st.header('2 · Algorithm')
+    max_aspects     = st.slider('Max aspects (K upper bound)', 3, 15, 6)
+    max_depth       = st.slider('Max tree depth', 2, 10, 6)
+    min_cluster_sz  = st.slider('Min variables per cluster', 1, 10, 2)
+    sil_thresh      = st.slider('Silhouette threshold',
+                                0.01, 0.30,
+                                value=0.04 if not _SBERT_AVAILABLE else 0.05,
+                                step=0.01)
+    max_k_split     = st.slider('Max child clusters per split', 2, 12, 5)
+    use_sbert       = st.checkbox('SBERT embeddings',
+                                  value=_SBERT_AVAILABLE,
+                                  disabled=not _SBERT_AVAILABLE)
+    local_nmf       = st.checkbox('Local NMF fallback', value=True)
+    min_local_nmf   = st.slider('Min group size for NMF', 5, 30, 6)
+    use_slot_mining = st.checkbox('Phrase-slot mining', value=True)
+    use_fastopic   = st.checkbox(
+        'FASTopic aspects',
+        value=_FASTOPIC_AVAILABLE,
+        disabled=not _FASTOPIC_AVAILABLE,
+    )
+    fastopic_min_sz = st.slider('Min group size for FASTopic', 6, 100, 40)
+    if not _FASTOPIC_AVAILABLE:
+        st.warning('FASTopic not installed — using NMF.')
+
+    st.header('3 · LLM provider')
+    provider_options = ['Ollama (local)', 'Groq (cloud)']
+    provider_default_idx = 1 if _default_provider == 'groq' else 0
+    provider_label = st.radio('Provider', provider_options,
+                              index=provider_default_idx, horizontal=True)
+    llm_provider = 'groq' if provider_label.startswith('Groq') else 'ollama'
+
+    if llm_provider == 'groq':
+        if not _LLM_CLIENT_AVAILABLE:
+            st.warning('`openai` package not installed.')
+        elif _groq_key_env:
+            st.success('GROQ_API_KEY detected → ready')
+        groq_key_in = st.text_input(
+            'Groq API key', value=_groq_key_env, type='password',
+            help='Free key at console.groq.com/keys, or set GROQ_API_KEY env var.')
+        groq_model_in = st.text_input('Groq model', value=_groq_model)
+        llm_base_url = _groq_url
+        llm_model    = groq_model_in
+        llm_api_key  = groq_key_in
+        llm_ready    = bool(groq_key_in and _LLM_CLIENT_AVAILABLE)
+    else:
+        if not _LLM_CLIENT_AVAILABLE:
+            st.warning('`openai` package not installed.')
+        elif _ollama_reachable:
+            st.success(f'Ollama reachable at `{_ollama_url}`')
+        else:
+            st.warning(f'Ollama not reachable at `{_ollama_url}`.')
+        ollama_url_in   = st.text_input('Ollama URL', value=_ollama_url)
+        ollama_model_in = st.text_input('Ollama model', value=_ollama_model)
+        llm_base_url = ollama_url_in
+        llm_model    = ollama_model_in
+        llm_api_key  = ''
+        llm_ready    = bool(_LLM_CLIENT_AVAILABLE and _ollama_reachable)
+
+    st.header('4 · LLM features')
+    use_per_row_role_extraction = st.checkbox(
+        'Per-row role extraction (primary route)',
+        value=llm_ready,
+        disabled=not llm_ready,
+        help='One LLM call per variable extracts measure / statistic / '
+             'condition / subtype, grounded to the description text.'
+    )
+    use_llm = st.checkbox(
+        'LLM label refinement',
+        value=llm_ready,
+        disabled=not llm_ready,
+    )
+    use_llm_roles = st.checkbox(
+        'LLM phrase-role classifier (fallback)',
+        value=llm_ready,
+        disabled=not llm_ready,
+    )
+    use_role_decomposition = st.checkbox(
+        'SBERT phrase clustering (fallback)',
+        value=_SBERT_AVAILABLE and llm_ready,
+        disabled=not (_SBERT_AVAILABLE and llm_ready),
+    )
+    role_namer_constrained = st.checkbox(
+        'Constrained role vocabulary',
+        value=True,
+    )
+    role_regularity_threshold = st.slider(
+        'Min phrase regularity', 0.05, 0.80, 0.20, 0.05,
+    )
+
+    st.header('5 · Project')
+    project_name = st.text_input('Project name', value='project')
+
+# ── load and configure files ──────────────────────────────────────────────────
+if uploads:
+    import tempfile
+    tmp    = Path(tempfile.mkdtemp())
+    raw_by = {}
+    cfg_by = {}
+
+    st.subheader('Step 1 — Inspect metadata')
+    for f in uploads:
+        p = tmp / safe_name(f.name)
+        p.write_bytes(f.getbuffer())
+        try:
+            df = load_any(p)
+            raw_by[f.name] = df
+            cfg_by[f.name] = detect_roles(df)
+            with st.expander(f'{f.name}', expanded=False):
+                st.write(f'Rows: **{len(df):,}**  Columns: **{len(df.columns)}**')
+                st.dataframe(df.head(8), use_container_width=True)
+        except Exception as e:
+            st.error(f'Could not load {f.name}: {e}')
+
+    st.subheader('Step 2 — Confirm column roles')
+    configs = {}
+    for name, df in raw_by.items():
+        cols = list(df.columns)
+        auto = cfg_by[name]
+        with st.expander(f'{name}', expanded=True):
+            c1, c2 = st.columns(2)
+            with c1:
+                leaf  = st.multiselect('Leaf variable column(s)', cols,
+                                       default=[c for c in auto['leaf_cols']  if c in cols],
+                                       key=f'lf_{name}')
+                group = st.multiselect('Group / task column(s)', cols,
+                                       default=[c for c in auto['group_cols'] if c in cols],
+                                       key=f'gr_{name}')
+            with c2:
+                text  = st.multiselect('Description column(s)', cols,
+                                       default=[c for c in auto['text_cols']  if c in cols],
+                                       key=f'tx_{name}')
+                meta  = st.multiselect('Type / unit column(s)', cols,
+                                       default=[c for c in auto['meta_cols']  if c in cols],
+                                       key=f'mt_{name}')
+            configs[name] = {'leaf_cols': leaf, 'group_cols': group,
+                             'text_cols': text,  'meta_cols':  meta}
+
+    if st.button('Build Approach 2 Hierarchy', type='primary'):
+        try:
+            # Clear stale audit data from any previous build
+            st.session_state.a2_per_row_audit = []
+
+            cans  = [build_canonical(df, configs[name], name)
+                     for name, df in raw_by.items()]
+            can   = pd.concat(cans, ignore_index=True)
+            st.session_state.a2_can = can
+            texts = can['_text'].fillna('').astype(str).tolist()
+
+            # Global NMF (used as fallback and for facet trees)
+            with st.spinner('Step 3 — Discovering global aspects via NMF …'):
+                tfidf, X, nmf, W, H, K, alabels = discover_aspects(texts, max_aspects)
+                st.session_state.a2_meta = {
+                    'K': K, 'aspect_labels': alabels, 'tfidf': tfidf, 'W': W, 'H': H
+                }
+                st.info(f'Global aspects discovered: **{K}** — {" | ".join(alabels)}')
+
+            sbert_model = None
+            if use_sbert and _SBERT_AVAILABLE:
+                with st.spinner('Loading SBERT model …'):
+                    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+            with st.spinner('Step 4 — Building global per-aspect representations …'):
+                reprs = per_aspect_representations(texts, H, tfidf, sbert_model)
+
+            # Build LLM function trio — same provider + endpoint for all.
+            llm_fn = None
+            if use_llm:
+                llm_fn = make_llm_label_fn(llm_base_url, llm_model,
+                                             provider=llm_provider,
+                                             api_key=llm_api_key)
+                if llm_fn is None:
+                    st.warning('LLM unreachable — deterministic labels only.')
+
+            llm_role_namer = None
+            if use_role_decomposition:
+                llm_role_namer = make_llm_role_namer_fn(
+                    llm_base_url, llm_model,
+                    constrained=role_namer_constrained,
+                    provider=llm_provider, api_key=llm_api_key)
+
+            llm_role_fn = None
+            if use_llm_roles:
+                llm_role_fn = make_llm_role_classifier_fn(
+                    llm_base_url, llm_model,
+                    provider=llm_provider, api_key=llm_api_key)
+
+            # NEW: per-row role extractor (Zhu et al. EMNLP 2025) — primary route
+            per_row_extractor = None
+            if use_per_row_role_extraction:
+                per_row_extractor = make_per_row_role_extractor_fn(
+                    llm_base_url, llm_model,
+                    provider=llm_provider, api_key=llm_api_key)
+                if per_row_extractor:
+                    st.info(f'Per-row role extraction: **{llm_provider}** · `{llm_model}`')
+                else:
+                    st.warning('LLM unreachable for per-row role extraction.')
+
+            # Collect detected text-column names across all uploaded configs —
+            # used by phrase-slot mining to identify description-like fields.
+            all_text_cols: list = []
+            for cfg in configs.values():
+                for c in cfg.get('text_cols', []):
+                    if c not in all_text_cols:
+                        all_text_cols.append(c)
+
+            # Replace the module-level FIELD_NAME_NOISE with a noise set DERIVED
+            # from the actual detected column names — zero hardcoding.  All
+            # downstream calls (label_cluster, _bigram_preferred_terms) read
+            # the module-level name so this swap propagates everywhere.
+            import sys as _sys
+            _sys.modules[__name__].FIELD_NAME_NOISE = build_field_noise(configs)
+
+            with st.spinner('Step 6 — Building group-anchored LoD tree '
+                            '(slot mining → FASTopic → NMF) …'):
+                nodes = build_dynamic_lod_tree(
+                    can, reprs, alabels, tfidf,
+                    max_depth=max_depth,
+                    min_cluster_size=min_cluster_sz,
+                    sil_threshold=sil_thresh,
+                    max_clusters_per_split=max_k_split,
+                    project=project_name,
+                    local_nmf=local_nmf,
+                    min_local_nmf_size=min_local_nmf,
+                    max_aspects=max_aspects,
+                    sbert_model=sbert_model,
+                    llm_label_fn=llm_fn,
+                    use_slot_mining=use_slot_mining,
+                    text_col_names=all_text_cols,
+                    use_fastopic=use_fastopic,
+                    fastopic_min_size=fastopic_min_sz,
+                    llm_role_classifier_fn=llm_role_fn,
+                    use_role_decomposition=use_role_decomposition,
+                    llm_role_namer_fn=llm_role_namer,
+                    role_regularity_threshold=role_regularity_threshold,
+                    per_row_role_extractor_fn=per_row_extractor,
+                    use_per_row_role_extraction=use_per_row_role_extraction,
+                )
+                st.session_state.a2_nodes = nodes
+
+
+            # Concise build summary — per-route node counts
+            route_counts: dict = Counter()
+            for n in nodes:
+                if n.get('type') == 'aggregation':
+                    route_counts[
+                        n.get('structure_provenance', {}).get('route', '—')] += 1
+            n_leaves   = len([n for n in nodes if n.get('type') == 'attribute'])
+            n_internal = len([n for n in nodes if n.get('type') == 'aggregation'])
+            route_str  = ' · '.join(f'{r}: {c}' for r, c in route_counts.most_common())
+            st.success(f'Done — {n_leaves} variables · {n_internal} internal nodes '
+                       f'({route_str})')
+
+            # If any LLM call hit a rate-limit (429), the model ran out of
+            # tokens — tell the user to switch model in the sidebar and rebuild.
+            ran_out = any(
+                ('RateLimit' in str(r) or '429' in str(r))
+                for a in (st.session_state.get('a2_per_row_audit') or [])
+                for r in (a.get('summary') or {})
+            )
+            if ran_out:
+                st.error(f'Ran out of tokens on `{llm_model}`. '
+                         f'Switch to another Groq model in the sidebar '
+                         f'(e.g. llama-3.1-8b-instant) and rebuild.')
+        except Exception as e:
+            st.error(f'Build failed: {e}')
+            import traceback; st.code(traceback.format_exc())
+
+# ── display ───────────────────────────────────────────────────────────────────
+# Robust session-state reads — use .get() so a partial/incomplete build that
+# wrote some keys but not others doesn't crash the display layer.
+# Nothing below this guard runs until a tree has been stored in the session:
+# st.stop() ends the current script run.
+if st.session_state.get('a2_nodes') is None:
+    st.info('Upload a metadata file and click **Build Approach 2 Hierarchy** to start.')
+    st.stop()
+
+nodes  = st.session_state.get('a2_nodes')
+can    = st.session_state.get('a2_can')
+meta   = st.session_state.get('a2_meta') or {}   # None / missing -> empty dict
+
+# The order of this list fixes the tabs[i] indices used by the sections below.
+tabs = st.tabs(['LoD Tree', 'Evaluation', 'Role Decomposition',
+                'Label Provenance', 'Metadata', 'Export', 'Method'])
+
+with tabs[0]:
+    # ── Visualization controls (above chart — easy to find, matches Approach 1.1) ─
+    vc1, vc2, vc3, vc4, vc5 = st.columns([2, 2, 1, 1, 1])
+    with vc1:
+        viz_mode = st.radio(
+            'View mode',
+            ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
+            horizontal=True, index=0,
+            help='Sunburst best for large hierarchies [Taxonomizer]. '
+                 'Node-link best for moderate depth structure inspection.'
+        )
+    with vc2:
+        depth_display = st.slider('Depth (Level of Detail)', 1, 8, 4, 1)
+    with vc3:
+        show_leaf_labels = st.checkbox('Leaf labels', value=False)
+    with vc4:
+        show_hidden = st.checkbox('Hidden nodes', value=False)
+    with vc5:
+        compress_chains = st.checkbox('Compress chains', value=True,
+                                       help='Merge one-child aggregation chains '
+                                            '(e.g. "DMS → DMS Recommended Standard") '
+                                            'for display. Export JSON keeps original structure.')
+    st.divider()
+
+    display_nodes = compress_one_child_chains(nodes) if compress_chains else nodes
+
+    if viz_mode == 'Sunburst (drill-down)':
+        st.plotly_chart(plot_sunburst(display_nodes, max_depth=depth_display),
+                        use_container_width=True)
+    elif viz_mode == 'Treemap':
+        st.plotly_chart(plot_treemap(display_nodes), use_container_width=True)
+    else:
+        st.plotly_chart(plot_node_link(display_nodes, depth_display,
+                                        show_hidden, show_leaf_labels),
+                        use_container_width=True)
+
+    n_l = len([n for n in nodes if n.get('type') == 'attribute'])
+    n_i = len([n for n in nodes if n.get('type') == 'aggregation'])
+    # max depth
+    pm  = _parent_map(nodes)
+    def _node_depth(nid):
+        d = 0; cur = nid
+        while cur in pm:
+            cur = pm[cur]; d += 1
+        return d
+    max_d = max((_node_depth(n['id']) for n in nodes), default=0)
+    c1, c2, c3, c4 = st.columns(4)
+    c1.metric('Variables', n_l)
+    c2.metric('Internal nodes', n_i)
+    c3.metric('Global aspects', meta.get('K', '?'))
+    c4.metric('Max depth', max_d)
+
+# Evaluation tab: reference-free metrics (computed on demand and cached in the
+# session), label-quality proxies, structural statistics, a group-column
+# self-consistency figure, and a diagnostic table of the global NMF weights.
+with tabs[1]:
+    # Imported here rather than at file top, so it is only needed once the
+    # display layer is reached.
+    import hierarchy_eval as he
+
+    st.markdown('### Evaluation')
+    # Both the canonical table and the NMF weight matrix W are required below.
+    if can is None or meta.get('W') is None:
+        st.info('Run the builder first.')
+    else:
+        st.caption(
+            'No manually curated reference taxonomy is available for this experiment. '
+            'The primary metrics below are **reference-free** — they assess the hierarchy '
+            'itself. Group-column figures later are descriptive self-consistency checks only. '
+            'Identical definitions to the Baseline and Approach 1 apps make the primary '
+            'diagnostics directly comparable.'
+        )
+
+        # ── PRIMARY: reference-free hierarchy quality (compute on demand) ──────
+        # These use SBERT, which is slow to load. Computing them only on a button
+        # click keeps the tree, sliders and Save button instant.
+        st.markdown('#### Primary — reference-free hierarchy quality')
+        if st.button('▶Compute reference-free metrics', key='a2_eval_btn'):
+            with st.spinner('Computing reference-free metrics (loads SBERT once)…'):
+                tm   = he.traco_metrics(nodes)
+                npmi = he.npmi_coherence(nodes, can['_text'].tolist())
+            # Cached so the numbers survive later reruns without recomputation.
+            st.session_state['a2_eval_cache'] = {'tm': tm, 'npmi': npmi}
+
+        # Render from the cache (if any), not from the button branch, so the
+        # metrics stay visible on reruns where the button was not clicked.
+        _ev = st.session_state.get('a2_eval_cache')
+        if _ev:
+            tm, npmi = _ev['tm'], _ev['npmi']
+            p1, p2, p3 = st.columns(3)
+            p1.metric('Parent–child coherence', tm['pc_coherence'],
+                      help='TraCo (Wu et al., AAAI 2024). Children nest under parent theme.')
+            p2.metric('Sibling diversity', tm['sibling_diversity'],
+                      help='TraCo (Wu et al., AAAI 2024). Higher = distinct siblings; LOW = redundant.')
+            p3.metric('NPMI label coherence', npmi,
+                      help='Lau et al., EACL 2014. Label terms genuinely co-occur in the data.')
+            st.caption(f'Embedding backend: **{tm["encoder"]}**.')
+        else:
+            st.info('Click the button above to compute coherence / diversity / NPMI '
+                    '(takes a few seconds the first time while SBERT loads).')
+
+        # ── Label-quality proxies (interpretability) ──────────────────────────
+        # Unlike the primary metrics, these are recomputed on every script run.
+        st.markdown('#### Label quality *(interpretability — reference-free)*')
+        lq = he.label_quality(nodes)
+        l1, l2, l3 = st.columns(3)
+        l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%",
+                  help='% of internal labels that read as a real concept (short noun '
+                       'phrase, WordNet head) rather than a "/"-joined term fragment.')
+        l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%",
+                  help='% of internal labels duplicating a sibling label (lower is better).')
+        l3.metric('Avg label words', lq['avg_label_words'],
+                  help='Mean label length in words.')
+
+        # ── Structural statistics ─────────────────────────────────────────────
+        st.markdown('#### Structural statistics')
+        sm = he.structural_stats(nodes)
+        s1, s2, s3, s4, s5 = st.columns(5)
+        s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
+        s2.metric('Max leaf depth',    sm['max_depth'])
+        s3.metric('Avg leaf depth',    sm['avg_leaf_depth'])
+        s4.metric('Avg branching',     sm['avg_branching_factor'])
+        s5.metric('Singleton nodes',   f"{sm['singleton_nodes_%']}%")
+
+        # ── Group-structure self-consistency (descriptive, NOT accuracy) ───────
+        st.markdown('#### Group-structure self-consistency *(descriptive — not accuracy)*')
+        st.caption(
+            'The group column is a **construction input** (group-anchored L1/L2), so this '
+            'only confirms the NMF aspect partition reflects its own input — expected high, '
+            'NOT a quality signal and NOT comparable across methods as an accuracy metric.'
+        )
+        # Reference label = first segment of the ' > '-joined group path.
+        true_labels = can['_group'].apply(
+            lambda x: str(x).split(' > ')[0].strip()).tolist()
+        W        = meta['W']
+        # Predicted label = column index of the largest value in each row of W.
+        # NOTE(review): assumes W has one row per row of `can`, in the same
+        # order — confirm against discover_aspects.
+        pred_nmf = np.argmax(W, axis=1).tolist()
+        metrics  = evaluate(true_labels, pred_nmf)
+        st.metric('ARI (self-consistency)', metrics['ARI'])
+
+        # ── legacy global aspect table (diagnostic only) ─────────────────────
+        with st.expander('Legacy global NMF aspect table (diagnostic — not the '
+                          'primary result)'):
+            st.caption('Global NMF aspects are a coarse lexical fallback. The '
+                       'hierarchy is built from per-variable role extraction — '
+                       'see the Role Decomposition tab for the actual result.')
+            alabs = meta.get('aspect_labels', [])
+            # Column headers carry the first 30 characters of each aspect label.
+            # NOTE(review): alabs[k] raises IndexError if there are fewer
+            # labels than columns in W — presumably they always match.
+            W_df  = pd.DataFrame(
+                W, columns=[f'Aspect {k+1}: {alabs[k][:30]}' for k in range(W.shape[1])])
+            W_df.insert(0, 'Variable', can['_label'].tolist())
+            st.dataframe(W_df.round(4), use_container_width=True)
+
+# Role Decomposition tab: per-group routing summary, a per-variable role table
+# (audit data merged with roles read off the tree), and a drill-down view of
+# the per-row extraction audit.
+with tabs[2]:
+    st.markdown('### Role decomposition')
+    st.caption('Each variable decomposed into measure / statistic / condition / subtype.')
+
+    if can is None:
+        st.info('Build a hierarchy first.')
+    else:
+        # ── Per-group summary ─────────────────────────────────────────────────
+        # One row per node whose structure provenance has route 'group_anchor'
+        # and carries a 'phrase_regularity' entry.
+        st.markdown('#### Per-group routing')
+        reg_rows = []
+        for n in nodes:
+            sp = n.get('structure_provenance', {})
+            if sp.get('route') == 'group_anchor' and 'phrase_regularity' in sp:
+                reg_rows.append({
+                    'Group':       n.get('name', ''),
+                    'Regularity':  sp.get('phrase_regularity'),
+                    'Route used':  sp.get('route_used', '—'),
+                })
+        if reg_rows:
+            reg_df = pd.DataFrame(reg_rows).sort_values(
+                'Regularity', ascending=False, na_position='last')
+            st.dataframe(reg_df, use_container_width=True, hide_index=True)
+
+        # ── Per-variable role table ───────────────────────────────────────────
+        st.markdown('#### Per-variable role table')
+
+        # Primary source: raw per-row LLM extractions captured in the audit
+        # (complete — includes roles that didn't become tree levels, e.g.
+        # condition values skipped by singleton prevention).
+        # Fallback per variable: roles collected from tree-ancestor nodes.
+        # The key is vi_list[row_idx]; presumably vi_list maps a group-local
+        # row index to the variable's row position in `can` — TODO confirm.
+        audit_roles_by_vi: dict = {}
+        for a in (st.session_state.get('a2_per_row_audit') or []):
+            vi_list_a = a.get('vi_list') or []
+            for r in a.get('per_row_audit', []):
+                ridx = r.get('row_idx')
+                accepted = r.get('accepted_roles') or {}
+                # Skip rows without an index, out-of-range indices, and rows
+                # with no accepted roles.
+                if ridx is not None and ridx < len(vi_list_a) and accepted:
+                    audit_roles_by_vi[int(vi_list_a[ridx])] = accepted
+
+        # Tree-walk fallback (roles that became hierarchy levels)
+        node_map_disp = {int(n['id']): n for n in nodes}
+        # child id -> parent id, read from each node's 'related' list;
+        # setdefault keeps the first parent seen for a child.
+        parent_lookup: dict = {}
+        for n in nodes:
+            for c in n.get('related', []):
+                parent_lookup.setdefault(int(c), int(n['id']))
+
+        def _tree_roles_for_attr(node_id: int) -> dict:
+            """Collect role -> node name from the ancestors of *node_id*.
+
+            Walks parent links upward via parent_lookup.  A role is read from
+            the ancestor's label provenance ('role') or, failing that, its
+            structure provenance ('slot_role').  The nearest ancestor wins
+            when a role appears more than once.  The walk stops early if an
+            ancestor id has no entry in node_map_disp.
+            """
+            roles_here: dict = {}
+            cur = node_id
+            while cur in parent_lookup:
+                cur = parent_lookup[cur]
+                cur_node = node_map_disp.get(cur)
+                if not cur_node:
+                    break
+                lp = cur_node.get('label_provenance', {})
+                role = (lp.get('role')
+                         or cur_node.get('structure_provenance', {}).get('slot_role'))
+                if role and role not in roles_here:
+                    roles_here[role] = cur_node.get('name', '')
+            return roles_here
+
+        role_rows = []
+        # NOTE(review): relies on attribute node ids being 1..n in the same
+        # order as the rows of `can` — confirm against the tree builder.
+        for vi, (_, crow) in enumerate(can.iterrows()):
+            # Audit roles win; tree roles fill any gaps
+            roles_here = dict(_tree_roles_for_attr(vi + 1))   # attribute ids = 1..n
+            for r, v in (audit_roles_by_vi.get(vi) or {}).items():
+                if v:
+                    roles_here[r] = v
+            row = {
+                'Group':    str(crow.get('_group', '')).split(' > ')[0].strip(),
+                'Variable': str(crow.get('_label', '')),
+            }
+            # Standard roles get their own column; pop() removes them so that
+            # whatever is left can be listed under 'Other roles'.
+            for std_role in ('measure', 'statistic', 'condition',
+                              'subtype', 'outcome', 'modifier'):
+                row[std_role.title()] = roles_here.pop(std_role, '')
+            if roles_here:
+                row['Other roles'] = '; '.join(
+                    f'{r}: {v}' for r, v in roles_here.items())
+            role_rows.append(row)
+
+        if role_rows:
+            role_df = pd.DataFrame(role_rows)
+            st.dataframe(role_df, use_container_width=True, hide_index=True)
+            st.download_button(
+                'Download per-variable role CSV',
+                data=role_df.to_csv(index=False).encode('utf-8'),
+                file_name=f'{safe_name(project_name)}_approach2_role_decomposition.csv',
+                mime='text/csv',
+            )
+
+        # ── Per-row LLM extractor audit ───────────────────────────────────────
+        audits = st.session_state.get('a2_per_row_audit', []) or []
+        if audits:
+            st.markdown('#### Extraction audit')
+
+            # Summary table per group
+            sum_rows = []
+            for a in audits:
+                sum_rows.append({
+                    'Group':         a.get('group_name', ''),
+                    'Rows':          a.get('n_rows', 0),
+                    'Coverage':      a.get('coverage'),
+                    'Valid':         a.get('valid'),
+                    'Roles found':   ', '.join(a.get('roles_final', []))[:60],
+                    'Reasons':       ', '.join(f'{k}:{v}' for k, v in
+                                                (a.get('summary', {}) or {}).items()),
+                })
+            st.dataframe(pd.DataFrame(sum_rows), use_container_width=True,
+                          hide_index=True)
+
+            # Drill-down per group
+            grp_names = [a.get('group_name', '?') for a in audits]
+            if grp_names:
+                sel_grp = st.selectbox(
+                    'Drill into a group to see per-row proposals + rejections:',
+                    grp_names)
+                # First audit whose group name matches the selection.
+                # NOTE(review): if two audits share a group name only the
+                # first one can ever be shown.
+                sel_audit = next((a for a in audits
+                                    if a.get('group_name') == sel_grp), None)
+                if sel_audit:
+                    row_rows = []
+                    # Only the first 60 audited rows are listed; the cell
+                    # texts are truncated to 140 / 200 characters.
+                    for r in sel_audit.get('per_row_audit', [])[:60]:
+                        accepted = r.get('accepted_roles', {}) or {}
+                        rejected = r.get('rejected', []) or []
+                        row_rows.append({
+                            'Row #':      r.get('row_idx', ''),
+                            'Description': r.get('description_snippet', ''),
+                            'Accepted':   '; '.join(f'{k}={v}'
+                                                     for k, v in accepted.items())[:140],
+                            # A rejection with at least three fields is shown
+                            # as role=value plus its missing stems; anything
+                            # else is shown via str().
+                            'Rejected':   '; '.join(
+                                f'{x[0]}={x[1]!r} (missing stems: {x[2]})'
+                                if isinstance(x, (list, tuple)) and len(x) >= 3
+                                else str(x) for x in rejected)[:200],
+                            'Reason':     r.get('reason', ''),
+                        })
+                    if row_rows:
+                        st.dataframe(pd.DataFrame(row_rows),
+                                      use_container_width=True, hide_index=True)
+                        # Download as CSV for offline analysis
+                        csv_bytes = pd.DataFrame(row_rows).to_csv(index=False).encode('utf-8')
+                        st.download_button(
+                            'Download per-row audit for this group',
+                            data=csv_bytes,
+                            file_name=f'{safe_name(project_name)}_audit_{safe_name(sel_grp)}.csv',
+                            mime='text/csv',
+                        )
+        else:
+            # Shown whenever the session holds no per-row audit entries.
+            st.info('No role assignments recorded yet — Option D may have '
+                    'fallen back to slot mining or aspect clustering for all '
+                    'groups in this dataset.')
+
+with tabs[3]:
+    st.markdown('### Label provenance')
+    st.caption('Audit trail: which stage produced each node label.')
+    rows = []
+    for n in nodes:
+        if n.get('type') != 'aggregation':
+            continue
+        p = n.get('label_provenance', {})
+        s = n.get('structure_provenance', {})
+        rows.append({
+            'Node':         n.get('name', ''),
+            'Source':       p.get('label_source', '—'),
+            'Route':        s.get('route', '—'),
+            'Aspect method': s.get('aspect_method') or '—',
+            'Silhouette':   s.get('silhouette') if s.get('silhouette') is not None else '—',
+            'LLM used':     p.get('llm_used', False),
+            'LLM rejected': p.get('llm_rejected', False),
+            'LLM proposed': p.get('llm_raw_label', ''),
+            'LLM reason':   p.get('llm_reason', '')[:60],
+            'Confidence':   round(float(p.get('confidence', 1.0)), 3),
+            'Evidence':     ', '.join(str(t) for t in p.get('evidence_terms', []))[:120],
+        })
+    if not rows:
+        st.info('No internal nodes yet — build a hierarchy first.')
+    else:
+        prov_df = pd.DataFrame(rows)
+
+        # ── Labels by source ──────────────────────────────────────────────────
+        source_counts = prov_df['Source'].value_counts()
+        st.write('**Labels by source**')
+        cols_src = st.columns(min(5, max(2, len(source_counts))))
+        for i, (src, cnt) in enumerate(source_counts.items()):
+            cols_src[i % len(cols_src)].metric(str(src), int(cnt))
+
+        # ── Structure routes ──────────────────────────────────────────────────
+        am_counts = prov_df['Aspect method'].value_counts()
+        st.write('**Structure routes used**')
+        cols_am = st.columns(min(5, max(2, len(am_counts))))
+        for i, (am, cnt) in enumerate(am_counts.items()):
+            cols_am[i % len(cols_am)].metric(str(am), int(cnt))
+
+        # ── LLM usage — split per-row extraction from the downstream refiner ──
+        # Per-row nodes are LLM-BUILT (source 'per_row_llm_role'); the refiner
+        # only renames deterministically-labeled nodes (source 'llm' when its
+        # proposal is accepted).  Counting them together made the panel read
+        # "N calls, 0 accepted".  One mask feeds the rejected count AND the table.
+        n_per_row        = int((prov_df['Source'] == 'per_row_llm_role').sum())
+        refiner_accepted = int((prov_df['Source'] == 'llm').sum())
+        refiner_rej_mask = ((prov_df['LLM rejected'] == True)  # noqa: E712
+                            & (prov_df['Source'] != 'per_row_llm_role'))
+        refiner_rejected = int(refiner_rej_mask.sum())
+        st.write('**LLM usage**')
+        cL1, cL2, cL3 = st.columns(3)
+        cL1.metric('Per-row extraction nodes', n_per_row)
+        cL2.metric('Refiner accepted', refiner_accepted)
+        cL3.metric('Refiner rejected', refiner_rejected)
+        if refiner_accepted == 0 and refiner_rejected == 0 and n_per_row > 0:
+            st.caption('Label refiner did not run — the tree was built entirely '
+                        'by per-row extraction, leaving no deterministic labels '
+                        'to refine.')
+        has_proposal = prov_df['LLM proposed'].astype(str).str.len() > 0
+        rej = prov_df[refiner_rej_mask & has_proposal]  # rows the expander can actually show
+        if len(rej):
+            with st.expander('Rejected refiner proposals'):
+                st.dataframe(rej[['Node', 'LLM proposed', 'LLM reason']],
+                              use_container_width=True, hide_index=True)
+
+        # ── Full provenance table ─────────────────────────────────────────────
+        st.write('**Full per-node provenance**')
+        st.dataframe(prov_df, use_container_width=True, hide_index=True)
+
+with tabs[4]:
+    if can is not None:
+        _can_view = can.drop(columns=['_row'], errors='ignore')  # '_row' is hidden when present
+        st.dataframe(_can_view, use_container_width=True)
+
+with tabs[5]:
+    # ── Export tab: the filename basis comes from the uploaded file names so ──
+    # different CSVs get different output names (e.g. ai-mind-…json vs HCP_S1200_…json).
+    csv_basis = ''
+    if can is not None and '_source' in can.columns:
+        sources = can['_source'].dropna().unique().tolist()
+        stems = (safe_name(Path(str(s)).stem) for s in sources)  # extension dropped
+        csv_basis = '+'.join(b for b in stems if b)  # '+' joins merged files; empty stems skipped
+    if not csv_basis:
+        csv_basis = safe_name(project_name)
+
+    # Build each filename once: reused by the buttons, the disk write and the message.
+    lod_fname = f'{csv_basis}_approach2_lod.json'
+    canon_fname = f'{csv_basis}_approach2_canonical.csv'
+
+    st.caption(f'Filename basis: **{csv_basis}**  '
+                f'(taken from the uploaded CSV — different CSVs export under different names)')
+
+    col1, col2 = st.columns(2)
+    with col1:
+        if nodes:
+            st.download_button(
+                'LoD tree JSON',
+                data=json.dumps(nodes, indent=2, ensure_ascii=False).encode(),
+                file_name=lod_fname,
+                mime='application/json',
+                use_container_width=True,
+            )
+    with col2:
+        if can is not None:
+            st.download_button(
+                'Canonical CSV',
+                data=can.to_csv(index=False).encode('utf-8'),
+                file_name=canon_fname,
+                mime='text/csv',
+                use_container_width=True,
+            )
+
+    st.divider()
+    # ── Save directly into the project's outputs/approach_2/ folder ────────────
+    _out_dir = Path(__file__).resolve().parent / 'outputs' / 'approach_2'
+    st.markdown('### Save to project folder')
+    st.caption(
+        'The download buttons above go to your browser’s Downloads folder (a browser '
+        f'restriction). This button instead writes the files into `{_out_dir}` with the '
+        'dataset name — convenient for `evaluate_all.py`.'
+    )
+    if st.button('Save all to outputs/approach_2/', type='primary',
+                 use_container_width=True):
+        try:
+            _out_dir.mkdir(parents=True, exist_ok=True)
+            saved = []
+            if nodes:
+                (_out_dir / lod_fname).write_text(
+                    json.dumps(nodes, indent=2, ensure_ascii=False), encoding='utf-8')
+                saved.append(lod_fname)
+            if can is not None:
+                can.to_csv(_out_dir / canon_fname, index=False)
+                saved.append(canon_fname)
+            if saved:
+                st.success(f'Saved to `{_out_dir}`:\n\n- ' + '\n- '.join(saved))
+            else:  # nothing was written — do not report an empty success list
+                st.warning('Nothing to save yet — build a hierarchy first.')
+        except Exception as _e:  # UI boundary: surface any failure instead of crashing the app
+            st.error(f'Could not save: {_e}')
+
+with tabs[6]:
+    st.markdown("""
+## Approach 2 — Role-Decomposed Hierarchy via SBERT Phrase Clustering
+### Option D primary route + slot mining + FASTopic + constrained LLM labels
+
+### Algorithm
+
+```
+Step 1  Build metadata text objects (variable name + description + group)
+        [GON §3] — Gonçalves et al. (2019)
+
+Step 2  Group-anchored L1/L2 structure  [NEW]
+        — detected group columns → path nodes (no hardcoding)
+        — e.g. category > assessment, or task > variant
+        — falls back to global NMF if no groups detected
+
+Step 3a Phrase-slot mining (slot-first routing)  [IE / slot induction]
+        — For each terminal group, attempt deterministic phrase-slot
+          decomposition of variable descriptions:
+            • extract concept-prefix per row
+            • mine repeated n-grams (1–4 tokens, ≥2 rows)
+            • compute mutual-exclusion across phrase pairs:
+                M[a,b] = 1 − cooc[a,b] / min(count[a], count[b])
+            • cluster phrases by mutual exclusion → slots
+        — A slot is a set of phrases that rarely co-occur within a row but
+          each co-occur with phrases from other slots (= alternatives at
+          the same semantic position).
+        — Activates only when slot structure is statistically strong:
+            • ≥ 2 slots discovered
+            • ≥ 55% row coverage
+            • each slot ≥ 2 distinct phrases
+        — No domain hardcoding: phrase content is discovered from the data;
+          slot names = the highest-coverage phrase in each slot.
+        — When valid, the local hierarchy is built directly from slot values.
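+        — When invalid (free-form descriptions, e.g. parts of HCP), routing
+          falls through to aspect discovery (Step 3b): FASTopic first, then NMF.
+
+Step 3b Local aspect discovery per terminal group  [FASTopic / ZHU §3.1 adapted]
+        — Fallback path when slot mining does not apply: FASTopic is primary; NMF covers small groups or a missing FASTopic / SBERT model.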
+        — NMF runs inside each group, not across all variables
+        — prevents globally-dominant terms from polluting local aspects
+        — K selected by reconstruction-error elbow (deterministic)
+        — falls back to global embeddings for small groups (< min_local_nmf_size)
+
+Step 4  Per-aspect variable representations  [ZHU §3.1]
+        — for each aspect k: filter text to top-T terms → SBERT or masked TF-IDF
+
+Step 5  Independent per-aspect GMM clustering  [ZHU §3.2]
+        — GMM with diagonal covariance + BIC for stable k selection
+        — runs inside each group's aspect space
+
+Step 6  Simplified best-aspect split  [ZHU §3.3 adapted]
+        — at each node: evaluate all K aspects by silhouette score
+        — highest silhouette → GMM split → child aggregation nodes
+        — NOTE: this is a silhouette-based greedy split, not the full
+          probabilistic search of Zhu et al. Eq. 6/7
+        — singleton prevention: 1-variable clusters attach directly (no wrapper node)
+
+Step 5a UPSTREAM LLM phrase-role classification  [TopicGPT, NAACL 2024 adapted]
+        — One LLM call per terminal group:
+          input  : (i) all repeated mined phrases (verbatim from the corpus)
+                   (ii) 2–4 sample variable descriptions for context
+                   (iii) the group name
+          output : {role_name: [phrases]}  e.g. {measure: [...], statistic: [...],
+                                                  condition: [...]}
+        — Anti-hallucination:
+          • every phrase in the returned roles MUST match an input phrase
+            verbatim (validator drops anything else)
+          • role names must be 1–2 generic English words (drops fancy/long names)
+          • ≥ 2 valid roles required; else falls back to mutual-exclusion
+        — Phrase ↔ role mapping drives the slot hierarchy in Step 6:
+          measure-like roles become outer levels, condition-like roles inner.
+        — STRUCTURAL use of the LLM (TopicGPT discovers topics).  Distinct
+          from the downstream label-refiner step (TopicTag, see Step 7e).
+
+Step 7  Node labeling  [ZHU §4.3 / TopicTag DocEng 2024]
+        a) description-prefix phrase shared by ≥60% of cluster
+        b) group-purity prefix: if ≥70% share one _group top-level value
+        c) data-driven boilerplate + FIELD_NAME_NOISE filter
+        d) bigram-preferred discriminative TF-IDF suffix
+        e) OPTIONAL downstream LLM refinement [TopicTag]:
+           — receives only evidence terms + parent path + sample descriptions
+           — strict grounding check: every label word must appear in evidence
+           — rejected proposals fall back to deterministic label
+           — provenance stored on each node (label_source, confidence, evidence_terms)
+
+Step 8  Evaluation  [TraCo AAAI 2024 / Lau EACL 2014 / TICL §3.4]
+        — reference-free parent-child coherence and sibling diversity
+        — reference-free NPMI label coherence and label-quality proxies
+        — optional group-structure self-consistency, explicitly not manual-reference accuracy
+        — label and structure provenance audit tables
+```
+
+### Key design decisions
+
+| Decision | Rationale |
+|---|---|
+| FASTopic replaces NMF as primary aspect discovery | NMF (1999) is lexical only; FASTopic (NeurIPS 2024) uses pretrained Transformer + Dual Semantic-relation Reconstruction → semantic, not lexical. |
+| NMF kept as fallback | Required for very small groups or when FASTopic / SBERT model is unavailable. |
+| Slot mining tried first | Decomposes variables along multiple semantic dimensions before any topic model. No document-level method (NMF, BERTopic, FASTopic) can do this — they all collapse one variable into one vector. |
+| No facet trees | Removed: a single coherent LoD tree is easier to defend than parallel views of one clustering. |
+| Deterministic labels = default thesis result | Reproducible without API access. The LLM is an optional re-phrasing layer that can be switched off. |
+| LLM via local Ollama | Localhost OpenAI-compatible endpoint (`http://localhost:11434/v1`) → LLM ON by default whenever Ollama is reachable; easy to disable. Override `OLLAMA_URL` / `OLLAMA_MODEL` env vars for non-default deployments. No external API, no key management, fully reproducible from a known model checkpoint. |
+| Strict LLM grounding | Every label word must appear in evidence — labels come from the CSV, LLM only rewords. |
+| Per-node provenance | Audit trail: `label_source ∈ {description_prefix, tfidf_bigram, group_anchor, phrase_slot, per_row_llm_role, llm, fallback}`. |
+
+### Thesis wording (defense-safe)
+
+*Approach 2 is a dataset-constrained multi-aspect hierarchy with strict separation
+between structural decisions and label generation. The hierarchy topology is produced
+deterministically: detected group metadata anchors the upper levels, IE-style phrase-slot
+mining decomposes variables along multiple semantic dimensions when description structure
+permits, and FASTopic (NeurIPS 2024) discovers latent semantic aspects in the remaining
+groups, with NMF retained as a lexical fallback for small groups. Concept labels are
+generated by a deterministic five-stage pipeline whose evidence comes exclusively from
+the dataset itself. An optional TopicTag-style LLM refinement layer may re-phrase these
+labels, but every LLM proposal must pass a grounding check — each word in the proposed
+label must appear in the extracted evidence — and every label records its provenance
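+(source stage, confidence, evidence terms). The label-refinement LLM can neither alter the tree
+structure nor introduce vocabulary absent from the input CSV; the separate upstream phrase-role step (Step 5a) only regroups phrases taken verbatim from the corpus.*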
+
+### Papers used
+
+| Ref | Citation | Role in this method |
+|---|---|---|
+| [ZHU] | Zhu et al. (2025). *Context-Aware Hierarchical Taxonomy Generation via LLM-Guided Multi-Aspect Clustering.* EMNLP 2025. | Main scaffold — adapted (FASTopic+NMF replace LLM aspect generation; greedy silhouette replaces Eq. 6/7 search). |
+| [FASTopic] | Wu et al. (2024). *FASTopic: Pretrained Transformer is a Fast, Adaptive, Stable, and Transferable Topic Model.* NeurIPS 2024 (arXiv:2405.17978). | Recent SOTA replacement for NMF — semantic topic discovery via Dual Semantic-relation Reconstruction with optimal transport. |
+| [IE-Slot] | Established IE literature on slot induction (surveyed *ACM Computing Surveys* 2022). | Phrase-slot mining adaptation — decomposes one variable into multiple alternative-phrase signals before clustering. |
+| [GON] | Gonçalves et al. (2019). ESWC 2019. | Canonical metadata text-object construction. |
+| [TopicGPT] | Pham et al. (2024). *TopicGPT: A Prompt-based Topic Modeling Framework.* NAACL 2024 (arXiv:2311.01449). | **STRUCTURAL** upstream LLM use — discovers semantic-role schema from mined phrases (one call per group); drives slot ordering in the hierarchy. Anti-hallucination contract: every phrase verbatim from corpus. |
+| [TopicTag] | Eren et al. (2024). DocEng 2024 (arXiv:2407.19616). | Constrained LLM label-refinement pattern — LLM only names existing clusters, never modifies structure. Downstream use only. |
+| [Qwen2.5] | Qwen Team (2024). *Qwen 2.5 Technical Report.* arXiv:2412.15115. | Open instruction-tuned model used as the local LLM (via Ollama) for label refinement. Replaces a hosted LLM for full offline reproducibility. |
+| [TraCo] | Wu et al. (2024). AAAI 2024 (arXiv:2401.14113). | Diagnostic metrics: parent-child coherence + sibling diversity. |
+| [TaxoAdapt] | Kargupta et al. (2025). ACL 2025 (arXiv:2506.10737). | Multidimensional taxonomy motivation. |
+| [SC-Taxo] | (2026). arXiv:2605.00620. | Future work — bidirectional semantic consistency. |
+| [BISE-26] | Motamedi, Novalija, Rei (2026). Springer BISE. | Validates group-anchored entry strategy. |
+| [TICL] | Kejriwal et al. (2022). EAAI 108, 104548. | Concept-label evaluation framework. |
+
+### Known limitations (honest)
+
+* **FASTopic is still document-level** — better than NMF semantically, but a single variable
+  is still one vector. Multi-dimension decomposition relies on phrase-slot mining.
+* **Greedy silhouette split** — not the full probabilistic search of Zhu et al. Eq. 6/7.
+* **TraCo metrics are diagnostic only** — measured, not enforced (no neural transport plan).
+* **LLM labels depend on the local Ollama model you run** — model choice affects reproducibility;
+  the deterministic pipeline is the canonical thesis result.
+""")