# approach_2.py — Approach 2: Dataset-Constrained Multi-Aspect Hierarchy
#
# ALGORITHM (no domain hardcoding):
#
#   Step 1  Build metadata text objects                                  [GON §3]
#   Step 2  Group-anchored L1/L2 from detected _group metadata          [BISE-26]
#   Step 3  Per terminal group — routing in priority order:
#           (a) Phrase-slot mining               [IE / slot induction]
#               decomposes one variable into multiple alternative-phrase
#               signals before clustering — fixes the document-level ceiling
#               that NMF/BERTopic/FASTopic share.
#           (b) FASTopic semantic aspect discovery [Wu et al. NeurIPS 2024]
#               transformer-based Dual Semantic-relation Reconstruction with
#               optimal transport — recent SOTA replacement for NMF.
#           (c) NMF lexical fallback             [ZHU §3.1 adapted]
#               retained for small groups or when FASTopic is unavailable.
#   Step 4  Per-aspect variable representations                          [ZHU §3.1]
#   Step 5  Independent per-aspect clustering: GMM+BIC small / KMeans large [ZHU §3.2]
#   Step 6  Top-down LoD tree (simplified silhouette best-aspect split)  [ZHU §3.3 adapted]
#   Step 7  Node labeling — deterministic by default:
#           description-prefix → group anchor → IDF + FIELD_NAME filter →
#           bigram-preferred discriminative TF-IDF suffix
#           OPTIONAL: constrained LLM re-phrasing                         [TopicTag, DocEng 2024]
#                     — every label word must appear in evidence (grounding check)
#                     — provenance stored per node (audit trail)
#   Step 8  Reference-free diagnostics: parent-child coherence, sibling diversity,
#           label coherence, label quality, structure/provenance audit
#                                                                        [TraCo, AAAI 2024]
#
# Facet trees (Castanet, 2007) removed in this version — a single coherent LoD tree.
#
# PAPERS:
#   [ZHU]      Zhu et al. (2025). EMNLP 2025.   Main scaffold (adapted)
#   [FASTopic] Wu et al. (2024). NeurIPS 2024.  Semantic aspect discovery
#   [GON]      Gonçalves et al. (2019). ESWC.   Canonical text objects
#   [TopicTag] Eren et al. (2024). DocEng.      Constrained LLM label refinement
#   [TraCo]    Wu et al. (2024). AAAI.          Affinity + diversity metrics
#   [TICL]     Kejriwal et al. (2022). EAAI.    Optional clustering diagnostics
#   [BISE-26]  Motamedi et al. (2026). BISE.    Group-anchored entry validation
#   [IE-Slot]  IE / slot-induction literature (surveyed Xu et al., FCS 2024).

from __future__ import annotations
import json
import os
import warnings
from collections import Counter, defaultdict
from pathlib import Path
from typing import Callable, Optional

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import streamlit as st
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder

# NOTE(review): this silences *every* warning category process-wide, including
# deprecation and convergence warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Optional dependency: sentence-transformers.  The flag records whether the
# import succeeded so that callers can check availability before using it.
try:
    from sentence_transformers import SentenceTransformer
    _SBERT_AVAILABLE = True
except Exception:   # broken torch raises OSError (WinError 127), not ImportError
    _SBERT_AVAILABLE = False

# Optional dependency: the `openai` client package.  Only a missing package
# (ImportError) is tolerated here; any other import failure propagates.
try:
    from openai import OpenAI as _OpenAIClient
    _LLM_CLIENT_AVAILABLE = True
except ImportError:
    _LLM_CLIENT_AVAILABLE = False

# Ollama defaults (overridable via env vars OLLAMA_URL / OLLAMA_MODEL).
# NOTE(review): the env-var override is not applied in this part of the file —
# confirm where OLLAMA_URL / OLLAMA_MODEL are actually read.
OLLAMA_URL_DEFAULT   = 'http://localhost:11434/v1'
OLLAMA_MODEL_DEFAULT = 'qwen2.5:3b-instruct'
# Groq defaults: base URL of its OpenAI-compatible endpoint and a default model id.
GROQ_URL_DEFAULT     = 'https://api.groq.com/openai/v1'
GROQ_MODEL_DEFAULT   = 'qwen/qwen3-32b'

def _ping_ollama(base_url: str = OLLAMA_URL_DEFAULT, timeout: float = 1.5) -> bool:
    """
    Quick reachability check for the local Ollama server.

    Parameters
    ----------
    base_url : OpenAI-compatible base URL, normally ending in ``/v1``.
    timeout  : seconds to wait for the HTTP response.

    Returns
    -------
    True only when the ``/api/tags`` endpoint answers with HTTP 200.
    Returns False when the openai client package is missing or when any
    error occurs while connecting (best-effort probe, never raises).
    """
    if not _LLM_CLIENT_AVAILABLE:
        return False
    # Remove the exact '/v1' suffix.  str.rstrip('/v1') strips any trailing run
    # of the characters '/', 'v' and '1', which also eats port digits
    # (e.g. 'http://host:11431/v1' -> 'http://host:1143').
    root = base_url.rstrip('/')
    if root.endswith('/v1'):
        root = root[:-len('/v1')]
    try:
        import urllib.request as _urlreq
        # /v1/models is OpenAI-compat; Ollama also exposes /api/tags
        with _urlreq.urlopen(root + '/api/tags', timeout=timeout) as r:
            return r.status == 200
    except Exception:
        return False

def _make_llm_client(provider: str, base_url: str, api_key: str = '') -> Optional[object]:
    """
    Return an OpenAI-compatible client for cloud Groq or local Ollama.

    The same client class serves both providers because each exposes an
    OpenAI-compatible REST endpoint; only ``base_url`` and the credential
    differ.  ``None`` is returned when no usable client can be built:
    the openai package is missing, a Groq key was not supplied, or the
    Ollama server does not answer the reachability probe.
    """
    if not _LLM_CLIENT_AVAILABLE:
        return None

    if provider == 'groq':
        # Groq needs a real key; without one there is nothing to build.
        return _OpenAIClient(base_url=base_url, api_key=api_key) if api_key else None

    # Any other provider value is treated as Ollama.
    if _ping_ollama(base_url):
        # Ollama ignores the key but the SDK requires a non-empty string
        return _OpenAIClient(base_url=base_url, api_key='ollama-local')
    return None

def _parse_json_response(raw: str) -> dict:
    """
    Pull a JSON object out of an LLM reply and parse it.

    Reply shapes that are tolerated:
      • bare JSON                     {"role": "measure"}
      • markdown-fenced JSON          ```json\n{...}\n```
      • reasoning preamble            <think>...</think>\n{...}
      • prose around the JSON         Here is the result: {...}

    Every <think>...</think> block is removed first; an unclosed <think>
    discards everything from the tag onward.  The text between the first
    '{' and the last '}' is then handed to ``json.loads``.

    Raises ValueError for an empty reply, for a reply without a JSON
    object, and (via json.JSONDecodeError) for malformed JSON.
    """
    text = (raw or '').strip()
    if not text:
        raise ValueError('empty response')

    open_tag, close_tag = '<think>', '</think>'
    while True:
        before, has_open, after = text.partition(open_tag)
        if not has_open:
            break
        _, has_close, remainder = after.partition(close_tag)
        if not has_close:
            # Unclosed reasoning block: keep only what preceded it.
            text = before
            break
        text = before + remainder
    text = text.strip()

    # Outermost braces cover fenced output and prose prefixes/suffixes alike.
    first = text.find('{')
    last = text.rfind('}')
    if first < 0 or last <= first:
        raise ValueError('no JSON object in response')
    return json.loads(text[first:last + 1])

def _safe_chat_completion(client, model: str, prompt: str,
                            max_tokens: int = 200, temperature: float = 0.1):
    """
    Run one chat completion, preferring JSON mode and degrading gracefully.

    The first attempt sends ``response_format={"type": "json_object"}``.
    If that attempt fails with something that looks like a bad request
    (exception class name contains 'BadRequest', or the message contains
    '400' or 'response_format'), the identical request is re-sent without
    JSON mode.  Any other exception propagates unchanged, as does an
    exception raised by the retry itself.

    Returns whatever ``client.chat.completions.create`` returns.
    """
    request = dict(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    try:
        return client.chat.completions.create(
            **request, response_format={'type': 'json_object'})
    except Exception as exc:
        message = str(exc)
        json_mode_rejected = (
            'BadRequest' in type(exc).__name__
            or '400' in message
            or 'response_format' in message
        )
        if not json_mode_rejected:
            raise
        # Provider/model does not accept JSON mode: retry as a plain completion.
        return client.chat.completions.create(**request)

# Optional dependency: FASTopic.  _FASTOPIC_AVAILABLE is consulted by
# discover_aspects_fastopic, which falls back to NMF when it is False.
try:
    from fastopic import FASTopic                # type: ignore[import-not-found]
    _FASTOPIC_AVAILABLE = True
except Exception:   # fastopic pulls in torch; broken torch raises OSError, not ImportError
    _FASTOPIC_AVAILABLE = False

# ──────────────────────────────────────────────────────────────────────────────
# ROLE KEYS  (domain-agnostic)
# ──────────────────────────────────────────────────────────────────────────────
# Keyword lists used by detect_roles (through _ks) to score column names.
# Each list is matched as lowercase substrings of the normalised column name:
#   LEAF_KEYS  -> columns that identify an individual variable
#   GROUP_KEYS -> columns that name a grouping / category
#   TEXT_KEYS  -> free-text description columns
#   META_KEYS  -> technical metadata columns (type, unit, range, ...)
LEAF_KEYS  = 'variable var field column attribute name code id item indicator question measure concept'.split()
GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split()
TEXT_KEYS  = 'description definition desc label title question meaning note notes text display full details explanation'.split()
META_KEYS  = 'type dtype data_type datatype unit units format decimal precision values value coding range min max scale'.split()

# ──────────────────────────────────────────────────────────────────────────────
# FILE LOADING
# ──────────────────────────────────────────────────────────────────────────────
def safe_name(n: str) -> str:
    """Return *n* with every character other than alphanumerics, '-', '_' and '.' replaced by '_'."""
    def _keep_or_mask(ch: str) -> str:
        if ch.isalnum() or ch in '-_.':
            return ch
        return '_'

    return ''.join(map(_keep_or_mask, n))

def try_read_csv(path: Path) -> pd.DataFrame:
    """
    Read a delimited text file by trying several encodings and separators.

    Every (encoding, separator) combination is attempted with the Python
    parser engine; a separator of ``None`` lets pandas sniff the delimiter.
    Parses that fail or yield fewer than two columns are discarded.  The
    remaining candidates are ranked by ``10 * n_columns - mean NaN fraction``
    and the first candidate with the strictly highest score wins.

    Column names of the winner are stringified, stripped, and cleared of ';'.

    Raises ValueError when no combination produces a usable table.
    """
    encodings = ('utf-8-sig', 'utf-8', 'latin1')
    separators = (',', '\t', ';', '|', None)
    attempts = ((enc, sep) for enc in encodings for sep in separators)

    winner = None
    top_score = -1
    for encoding, delimiter in attempts:
        try:
            frame = pd.read_csv(path, sep=delimiter, engine='python',
                                encoding=encoding, on_bad_lines='skip')
            n_cols = frame.shape[1]
            if n_cols < 2:
                # A single column almost always means the delimiter was wrong.
                continue
            score = n_cols * 10 - float(frame.isna().mean().mean())
            if score > top_score:
                winner, top_score = frame, score
        except Exception:
            # Best-effort probing: a failed combination is simply skipped.
            continue

    if winner is None:
        raise ValueError(f'Could not read {path.name}')

    winner.columns = [str(name).strip().replace(';', '') for name in winner.columns]
    return winner

def load_any(path: Path) -> pd.DataFrame:
    """
    Load a tabular file into a DataFrame, dispatching on the file suffix.

    Supported suffixes (case-insensitive):
      • .csv / .tsv / .txt  → try_read_csv (encoding + separator probing)
      • .xlsx / .xls        → pandas.read_excel
      • .json               → a top-level array, or the first array-valued
                              entry of a top-level object, is flattened with
                              pandas.json_normalize

    Raises
    ------
    ValueError
        for an unsupported suffix, or for a JSON document that contains no
        array of records (reported as such rather than as "Unsupported").
    """
    suffix = path.suffix.lower()
    if suffix in ('.csv', '.tsv', '.txt'):
        return try_read_csv(path)
    if suffix in ('.xlsx', '.xls'):
        return pd.read_excel(path)
    if suffix == '.json':
        # 'utf-8-sig' reads plain UTF-8 as well and drops a leading BOM, which
        # json.loads would otherwise reject.
        obj = json.loads(path.read_text(encoding='utf-8-sig', errors='replace'))
        if isinstance(obj, list):
            return pd.json_normalize(obj)
        if isinstance(obj, dict):
            for v in obj.values():
                if isinstance(v, list):
                    return pd.json_normalize(v)
        # The suffix IS supported; the content is the problem — say so.
        raise ValueError(
            f'No tabular records in {path.name}: expected a JSON array or an '
            'object containing an array')
    raise ValueError(f'Unsupported: {suffix}')

# ──────────────────────────────────────────────────────────────────────────────
# COLUMN ROLE DETECTION  (domain-agnostic heuristic scoring)
# ──────────────────────────────────────────────────────────────────────────────
def _norm(c: str) -> str:
    """Lower-case *c* (stringified first) and turn every non-alphanumeric character into a space."""
    lowered = str(c).lower()
    pieces = []
    for ch in lowered:
        pieces.append(ch if ch.isalnum() else ' ')
    return ''.join(pieces)

def _ks(c: str, keys: list) -> int:
    """
    Count how many keywords in *keys* occur as substrings of the normalised
    column name *c* (normalised with ``_norm``).

    Matching is by substring, so a keyword also matches inside a longer word.
    """
    # Normalise once instead of once per keyword.
    normalised = _norm(c)
    return sum(1 for k in keys if k in normalised)

def detect_roles(df: pd.DataFrame) -> dict:
    """
    Guess which columns act as leaf identifier, grouping, free text and
    technical metadata, using keyword hits on the column name plus simple
    statistics of the column values.

    Scoring per column:
      • leaf  : 4 per LEAF_KEYS hit, +3 when the unique ratio is in [0.5, 1]
      • group : 4 per GROUP_KEYS hit, +3 when 1 < n_unique < min(n/2, 300)
      • text  : 5 per TEXT_KEYS hit, +4 when the mean string length > 50
      • meta  : 4 per META_KEYS hit, +2 when 1 < n_unique < min(0.8 n, 100)

    Selection:
      • leaf_cols  – the single best leaf-scoring column
      • text_cols  – all columns with text_score ≥ 4 (falls back to leaf_cols)
      • group_cols – up to 3 columns with group_score ≥ 4, excluding the leaf
      • meta_cols  – up to 4 columns with meta_score ≥ 4 not already chosen

    Returns a dict with keys 'leaf_cols', 'group_cols', 'text_cols',
    'meta_cols'.  A frame without any columns yields four empty lists
    instead of failing on the (non-existent) score columns.
    """
    if len(df.columns) == 0:
        return {'leaf_cols': [], 'group_cols': [], 'text_cols': [], 'meta_cols': []}

    n = max(len(df), 1)
    out = []
    for col in df.columns:
        s = df[col]
        nun = int(s.nunique(dropna=True))
        ur  = nun / n
        avg = float(s.dropna().astype(str).map(len).mean()) if s.notna().any() else 0
        # Raised group uniqueness ceiling from 80 → 300 so large datasets (e.g. HCP
        # with 100+ assessment categories) are not excluded.
        out.append({
            'column':      str(col),
            'leaf_score':  4 * _ks(col, LEAF_KEYS)  + (3 if 0.5 <= ur <= 1 else 0),
            'group_score': 4 * _ks(col, GROUP_KEYS) + (3 if 1 < nun < min(n * 0.5, 300) else 0),
            'text_score':  5 * _ks(col, TEXT_KEYS)  + (4 if avg > 50 else 0),
            'meta_score':  4 * _ks(col, META_KEYS)  + (2 if 1 < nun < min(n * 0.8, 100) else 0),
        })
    prof  = pd.DataFrame(out)
    leaf  = prof.sort_values('leaf_score', ascending=False).head(1)['column'].tolist()
    text  = (prof[prof.text_score >= 4]
             .sort_values('text_score', ascending=False)['column'].tolist()) or leaf[:]
    group = (prof[(prof.group_score >= 4) & (~prof.column.isin(leaf))]
             .sort_values('group_score', ascending=False).head(3)['column'].tolist())
    meta  = (prof[(prof.meta_score >= 4) & (~prof.column.isin(text + leaf + group))]
             .sort_values('meta_score', ascending=False).head(4)['column'].tolist())
    return {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'meta_cols': meta}

def sv(x) -> str:
    """
    Return *x* as a stripped string, mapping missing values to ''.

    Scalars that pandas regards as missing (None, NaN, NaT, pd.NA) become
    the empty string.  Container cells (lists, arrays — e.g. produced by
    json_normalize) are stringified as-is: ``pd.isna`` answers element-wise
    for them, which cannot be used as a single truth value.
    """
    missing = pd.isna(x)
    if isinstance(missing, (bool, np.bool_)) and missing:
        return ''
    return str(x).strip()

def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
    """
    Build normalised per-variable rows with a unified _text field [GON §3].

    Parameters
    ----------
    df     : raw metadata table, one row per variable.
    cfg    : role configuration with optional keys 'leaf_cols', 'group_cols',
             'text_cols', 'meta_cols' (lists of column names).
    source : identifier stored in the '_source' column of every row.

    Returns
    -------
    DataFrame with columns _source, _row, _label, _id, _group, _text:
      • _label – leaf values joined with ' / ' (or 'var_<index>' if empty)
      • _group – group values joined with ' > ' (or 'Ungrouped')
      • _text  – 'column: value' pairs joined with ' | ' over all role columns
      • _id    – '<group>.<label>', suffixed '__<n>' from the 2nd duplicate on
    An empty input produces an empty frame that still carries these columns.
    """
    leaf_cols  = cfg.get('leaf_cols', [])
    group_cols = cfg.get('group_cols', [])
    text_cols  = cfg.get('text_cols', [])
    meta_cols  = cfg.get('meta_cols', [])
    # Ordered, de-duplicated union of all role columns (same for every row).
    all_cols = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))
    out_columns = ['_source', '_row', '_label', '_id', '_group', '_text']

    rows = []
    for i, row in df.iterrows():
        label = (' / '.join(p for p in [sv(row.get(c, '')) for c in leaf_cols] if p)
                 or f'var_{i}')
        group = (' > '.join(p for p in [sv(row.get(c, '')) for c in group_cols]
                             if p and p.lower() not in ['nan', 'none'])
                 or 'Ungrouped')
        parts = []
        for c in all_cols:
            value = sv(row.get(c, ''))
            if value:
                parts.append(f'{c}: {value}')
        rows.append({'_source': source, '_row': int(i), '_label': label,
                     '_id': f'{group}.{label}', '_group': group,
                     '_text': ' | '.join(parts)})

    # Explicit columns keep the schema stable even when there are no rows.
    can = pd.DataFrame(rows, columns=out_columns)

    # Make ids unique: the first occurrence keeps its id, later ones get '__n'.
    cnt: dict = defaultdict(int)
    ids = []
    for lid in can['_id']:
        cnt[lid] += 1
        ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
    can['_id'] = ids
    return can

# ──────────────────────────────────────────────────────────────────────────────
# STEP 3  — ASPECT DISCOVERY VIA NMF  [ZHU §3.1 adapted]
# ──────────────────────────────────────────────────────────────────────────────
def _elbow_k(errors: list, k_range: range) -> int:
    """
    Choose K at the elbow of a reconstruction-error curve.

    *errors* holds one error per candidate in *k_range*, in the same order.
    The elbow is the position with the largest second difference.  With two
    or fewer errors the first candidate is returned; when the second
    differences are numerically constant, ``max(2, int(sqrt(len(errors))))``
    is returned instead.
    """
    candidates = list(k_range)
    if len(errors) <= 2:
        return candidates[0]

    curvature = np.diff(errors, n=2)
    spread = curvature.max() - curvature.min()
    if spread < 1e-8:
        # Flat curvature: no elbow to speak of, use the square-root heuristic.
        return max(2, int(np.sqrt(len(errors))))

    # curvature[i] is centred on errors[i + 1], hence the +1 shift.
    position = int(np.argmax(curvature)) + 1
    position = min(position, len(k_range) - 1)
    return candidates[position]

def discover_aspects(texts: list, max_aspects: int = 10):
    """
    Find K latent semantic aspects by factorising TF-IDF with NMF  [ZHU §3.1 adapted].

    NMF stands in for the LLM aspect generation of Zhu et al. (deterministic,
    no hallucination).  Candidate K values start at 2 and are capped by
    ``max_aspects``, half the number of documents, and 15; K is picked at the
    elbow of the reconstruction error over those candidates.

    Returns tfidf, X, nmf, W, H, K, labels — the fitted vectoriser, the
    document-term matrix, the fitted NMF model, its document-aspect and
    aspect-term factors, the chosen K, and one label per aspect built from
    its four highest-weighted terms.
    """
    vectoriser = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                                 max_features=3000, min_df=1)
    doc_term = vectoriser.fit_transform(texts)
    vocab = np.array(vectoriser.get_feature_names_out())
    n_docs = doc_term.shape[0]

    upper = min(max_aspects + 1, n_docs // 2 + 1, 16)
    candidates = range(2, upper)
    if len(candidates) < 2:
        # Too few documents for a real sweep: evaluate K=2 only.
        candidates = range(2, 3)

    def _new_nmf(n_components: int):
        return NMF(n_components=n_components, random_state=42,
                   max_iter=400, init='nndsvda')

    recon_errors = []
    for n_components in candidates:
        trial = _new_nmf(n_components)
        trial.fit_transform(doc_term)
        recon_errors.append(trial.reconstruction_err_)

    K = _elbow_k(recon_errors, candidates)
    nmf = _new_nmf(K)
    W = nmf.fit_transform(doc_term)
    H = nmf.components_

    # Aspect labels: four strongest terms of each component, strongest first.
    labels = [' / '.join(vocab[np.argsort(H[k])[-4:][::-1]]) for k in range(K)]

    return vectoriser, doc_term, nmf, W, H, K, labels

# ──────────────────────────────────────────────────────────────────────────────
# STEP 3 (FASTopic variant)  — semantic aspect discovery  [Wu et al. NeurIPS 2024]
# ──────────────────────────────────────────────────────────────────────────────
def discover_aspects_fastopic(texts: list,
                               max_aspects: int = 10,
                               fallback_tfidf: bool = True):
    """
    Semantic aspect discovery via FASTopic [Wu et al. NeurIPS 2024,
    arXiv:2405.17978], shaped as a drop-in replacement for `discover_aspects`.

    Returns (tfidf, X, model, W, H_proxy, K, labels):
      • tfidf / X – TF-IDF vectoriser and matrix over *texts*
      • model     – the fitted FASTopic model
      • W         – document-topic matrix (float32, one row per text)
      • H_proxy   – K × n_terms matrix; each topic's top words receive the
                    weight 1 / (rank + 1) on the matching TF-IDF term
      • labels    – first four top words per topic joined with ' / '

    K is derived from corpus size: max(2, min(max_aspects, ceil(sqrt(n)) + 1,
    n - 1)).

    Fallback: when FASTopic is not importable, the group has fewer than 6
    texts, fitting raises, or the returned document-topic matrix has the
    wrong shape, the NMF path (`discover_aspects`) is used if
    ``fallback_tfidf`` is True.

    Raises
    ------
    RuntimeError
        if FASTopic cannot be used (or returns a malformed matrix) and
        ``fallback_tfidf`` is False.  Exceptions raised while fitting are
        re-raised unchanged in that case.
    """
    n_vars = len(texts)

    if not _FASTOPIC_AVAILABLE or n_vars < 6:
        if fallback_tfidf:
            return discover_aspects(texts, max_aspects)
        raise RuntimeError('FASTopic unavailable and TF-IDF fallback disabled')

    # FASTopic K selection: bounded similar to NMF elbow but simpler — pick a
    # reasonable K from corpus size (avoids overfitting tiny groups).
    K = max(2, min(max_aspects, int(np.ceil(np.sqrt(n_vars))) + 1, n_vars - 1))

    try:
        model = FASTopic(num_topics=K,
                          doc_embed_model='all-MiniLM-L6-v2',
                          verbose=False)
        # fit_transform returns (top_words_per_topic, doc_topic_dist)
        result = model.fit_transform(texts)
        if isinstance(result, tuple) and len(result) == 2:
            top_words, doc_topic = result
        else:
            # Some FASTopic versions return only doc_topic; pull top words via API
            # NOTE(review): confirm this keyword signature against the installed
            # FASTopic version; a mismatch raises and lands in the fallback below.
            doc_topic = result
            top_words = [model.get_top_words(topic_id=k, num_top_words=10)
                         for k in range(K)]
    except Exception:
        # Robust fallback if FASTopic fails (small corpus, OOM, etc.)
        if fallback_tfidf:
            return discover_aspects(texts, max_aspects)
        raise

    W = np.asarray(doc_topic, dtype=np.float32)
    if W.ndim != 2 or W.shape[0] != n_vars:
        if fallback_tfidf:
            return discover_aspects(texts, max_aspects)
        # Never continue with a matrix that does not line up with the texts.
        raise RuntimeError(
            f'FASTopic returned a document-topic matrix of shape {W.shape}; '
            f'expected {n_vars} rows')

    # TF-IDF is built only now: every fallback above delegates to
    # discover_aspects, which fits its own vectoriser.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                             max_features=3000, min_df=1)
    X = tfidf.fit_transform(texts)
    terms = np.array(tfidf.get_feature_names_out())

    # Build H_proxy: K × n_terms with weight = position-decay of each top word
    n_terms = len(terms)
    term_to_idx = {t: i for i, t in enumerate(terms)}
    H_proxy = np.zeros((K, n_terms), dtype=np.float32)
    labels  = []
    for k in range(K):
        words_k = top_words[k] if k < len(top_words) else []
        # A topic delivered as ONE space-joined string must be split into
        # words first; iterating the string itself would yield characters.
        if isinstance(words_k, str):
            words_k = words_k.split()
        # Each entry may be 'word', or 'word score', or (word, score)
        clean: list = []
        for w in words_k:
            if isinstance(w, (list, tuple)):
                w = w[0]
            w = str(w).split(' ')[0].strip().lower()
            if w:
                clean.append(w)
        for rank, w in enumerate(clean):
            if w in term_to_idx:
                H_proxy[k, term_to_idx[w]] += 1.0 / (rank + 1)
        labels.append(' / '.join(clean[:4]) if clean else f'aspect {k+1}')

    return tfidf, X, model, W, H_proxy, K, labels

# ──────────────────────────────────────────────────────────────────────────────
# STEP 4  — PER-ASPECT VARIABLE REPRESENTATIONS  [ZHU §3.1]
# ──────────────────────────────────────────────────────────────────────────────
def per_aspect_representations(texts: list, H: np.ndarray,
                                tfidf: TfidfVectorizer,
                                sbert_model=None) -> list:
    """
    Build K independent representation matrices — one per aspect  [ZHU §3.1].

    Parameters
    ----------
    texts       : one text per variable.
    H           : K × n_terms aspect-term weights aligned with *tfidf*'s vocabulary.
    tfidf       : fitted vectoriser (provides the vocabulary and, without
                  SBERT, the document-term matrix).
    sbert_model : optional encoder exposing ``encode(list_of_str, ...)``.

    For each aspect k:
      • with *sbert_model*: every text is reduced to its whitespace tokens
        found among the aspect's top-T terms (T = min(30, n_terms)); a text
        with no such token is encoded unchanged.
      • without: the TF-IDF rows are weighted by H[k] and L2-normalised
        (all-zero rows stay zero).

    Returns a list of K float32 arrays, each with one row per text.
    """
    terms = np.array(tfidf.get_feature_names_out())
    K     = H.shape[0]
    T     = min(30, len(terms))
    reprs = []

    use_sbert = sbert_model is not None
    if use_sbert:
        # Tokenise once; the token lists do not depend on the aspect.
        token_lists = [txt.lower().split() for txt in texts]
        X_arr = None
    else:
        # The dense TF-IDF matrix is only needed on the masked TF-IDF path.
        X_arr = tfidf.transform(texts).toarray()

    for k in range(K):
        if use_sbert:
            top_terms = set(terms[np.argsort(H[k])[-T:]])
            filtered = []
            for txt, tokens in zip(texts, token_lists):
                kept = ' '.join(t for t in tokens if t in top_terms)
                filtered.append(kept if kept.strip() else txt)
            emb = sbert_model.encode(filtered, show_progress_bar=False,
                                     batch_size=64, normalize_embeddings=True)
        else:
            emb   = X_arr * H[k][np.newaxis, :]
            norms = np.linalg.norm(emb, axis=1, keepdims=True)
            norms[norms == 0] = 1.0      # keep all-zero rows at zero
            emb = emb / norms

        reprs.append(emb.astype(np.float32))

    return reprs

# ──────────────────────────────────────────────────────────────────────────────
# STEP 5  — INDEPENDENT PER-ASPECT CLUSTERING  [ZHU §3.2]
# ──────────────────────────────────────────────────────────────────────────────
def cluster_aspect_gmm(emb: np.ndarray, max_k: int = 8, fast_threshold: int = 30):
    """
    Cluster variables within one aspect space  [ZHU §3.2].

    Hybrid strategy:
      • n ≤ fast_threshold → GMM (diagonal covariance), k chosen by lowest BIC
      • n  > fast_threshold → MiniBatchKMeans, k chosen by highest silhouette
        (engineering adaptation for large groups, e.g. HCP Cognition / FreeSurfer)

    Embeddings wider than min(20, n - 1) dimensions are first reduced with
    TruncatedSVD.  Candidate k runs from 2 to min(max_k, n - 1); a candidate
    whose fit raises is skipped.

    Returns
    -------
    (labels, k, silhouette)
      labels     – int array with one cluster id per row of *emb*
      k          – number of clusters requested for the chosen model; 1 when
                   there are at most two rows or when no candidate produced
                   a usable partition
      silhouette – silhouette of the returned labels, 0.0 if undefined
    """
    n = emb.shape[0]
    if n <= 2:
        return np.zeros(n, dtype=int), 1, 0.0

    d_target = min(20, emb.shape[1], n - 1)
    if emb.shape[1] > d_target:
        svd   = TruncatedSVD(n_components=d_target, random_state=42)
        emb_r = svd.fit_transform(emb)
    else:
        emb_r = emb.copy()

    best_score, best_labels, best_k = -np.inf, None, 2

    if n > fast_threshold:
        # Fast path: MiniBatchKMeans + silhouette  (large groups)
        for k in range(2, min(max_k + 1, n)):
            try:
                km     = MiniBatchKMeans(n_clusters=k, random_state=42,
                                          n_init=3, batch_size=min(256, n),
                                          max_iter=100)
                labels = km.fit_predict(emb_r)
                if len(set(labels)) < 2:
                    continue
                sil = float(silhouette_score(emb_r, labels))
                if sil > best_score:
                    best_score, best_labels, best_k = sil, labels, k
            except Exception:
                continue
    else:
        # Accurate path: GMM + BIC  (small groups)
        best_bic = np.inf
        for k in range(2, min(max_k + 1, n)):
            try:
                gmm = GaussianMixture(n_components=k, random_state=42,
                                      covariance_type='diag', reg_covar=1e-3,
                                      max_iter=80, n_init=1,
                                      init_params='random_from_data')
                gmm.fit(emb_r)
                bic    = gmm.bic(emb_r)
                labels = gmm.predict(emb_r)
                if bic < best_bic:
                    best_bic, best_labels, best_k = bic, labels, k
            except Exception:
                continue

    if best_labels is None:
        # Nothing could be fitted: everything is one cluster, so report k=1
        # (matching the n ≤ 2 case) rather than the untouched default of 2.
        return np.zeros(n, dtype=int), 1, 0.0

    sil = 0.0
    if len(set(best_labels)) > 1:
        try:
            sil = float(silhouette_score(emb_r, best_labels))
        except Exception:
            pass

    return best_labels.astype(int), best_k, sil

# ──────────────────────────────────────────────────────────────────────────────
# STEP 8a  — OPTIONAL CONSTRAINED LLM LABEL REFINEMENT  [TopicTag, DocEng 2024]
# ──────────────────────────────────────────────────────────────────────────────
def _light_stem(w: str) -> str:
    """
    Minimal English morphological normalisation — no NLTK dependency.

    Used by the LLM grounding check so that 'latencies' matches 'latency',
    'errors' matches 'error', 'completion' matches 'completed', and
    'observations' matches 'observation'.

    Rules are tried longest suffix first and applied repeatedly until none
    fits, so an inflectional ending and a derivational one can both be
    removed ('observations' → 'observation' → 'observ').  A rule only fires
    when at least three characters remain in front of the suffix.  '-ies' /
    '-ied' are rewritten to '-y' instead of being dropped.

    Porter-stemmer-style suffix stripping (Porter 1980, heavily simplified):
    the result is a matching key, not necessarily a dictionary word.
    """
    rules = (
        ('ization', ''), ('isation', ''), ('ousness', ''), ('iveness', ''),
        ('ities', ''), ('iness', ''),
        ('ation', ''), ('ments', ''), ('ness', ''),
        ('ies', 'y'), ('ied', 'y'), ('ing', ''), ('ers', ''), ('ion', ''),
        ('ed', ''), ('es', ''), ('er', ''), ('ly', ''), ('s', ''),
    )
    w = w.lower().strip()
    while True:
        for suffix, replacement in rules:
            if w.endswith(suffix) and len(w) > len(suffix) + 2:
                # Every rule shortens the word, so the loop terminates.
                w = w[:-len(suffix)] + replacement
                break
        else:
            return w

def make_llm_label_fn(base_url: str = OLLAMA_URL_DEFAULT,
                       model: str = OLLAMA_MODEL_DEFAULT,
                       provider: str = 'ollama',
                       api_key: str = '') -> Optional[Callable]:
    """
    Build a TopicTag-style constrained LLM label refinement function backed
    by an OpenAI-compatible endpoint — a local Ollama server by default, or
    Groq when ``provider='groq'`` and an ``api_key`` are given.

    [TopicTag] Eren et al. (2024) run NMF to discover topics, then use an LLM
    to generate human-readable concept labels from the NMF topic terms.
    The LLM receives ONLY the extracted evidence from the CSV — it cannot
    alter the tree, cannot introduce new vocabulary, and must pass a strict
    grounding check (every word in the proposed label must appear in evidence).

    Local-LLM default (Qwen 2.5 3B Instruct via Ollama) is deliberate:
      • zero cost, zero API dependency, fully reproducible
      • no external data transmission (privacy + thesis defensibility)
      • TopicTag itself benchmarks open models (Llama, Mistral) — using an
        open local model matches the paper's evaluation setup more closely
        than a closed hosted model.

    Grounding check: the label and the evidence (top terms, sample texts,
    parent path) are lower-cased, split on non-alphanumeric characters,
    reduced to words of three or more characters and stemmed with
    `_light_stem`.  The label is accepted only if every label stem occurs
    among the evidence stems.  The ``evidence_terms`` reported by the model
    are recorded in the metadata but are NOT counted as evidence — otherwise
    the model could vouch for its own invented words.

    Returns a callable (candidate, top_terms, parent_path, sample_texts)
    → (label, metadata_dict) with metadata keys 'confidence',
    'evidence_terms', 'reason' and 'raw_label'.  The callable never raises
    for LLM/parsing failures: it returns the unchanged candidate and records
    the cause in 'reason'.  Returns None if no client can be created (openai
    package missing, Ollama unreachable, or no Groq key).
    """
    client = _make_llm_client(provider, base_url, api_key)
    if client is None:
        return None

    def _tokens(text: str) -> list:
        # Split on anything that is not a letter or digit so punctuation glued
        # to a word ('latency,' / '(errors)' / 'name:') cannot block a match.
        spaced = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower())
        return [t for t in spaced.split() if len(t) >= 3]

    def _refine(candidate: str, top_terms: list, parent_path: str,
                sample_texts: list):
        meta = {'confidence': 0.0, 'evidence_terms': [],
                'reason': '', 'raw_label': ''}
        # Normalise inputs up front so prompt building cannot fail on
        # non-string terms or None.
        top_terms = [str(t) for t in (top_terms or [])]
        sample_texts = list(sample_texts or [])
        prompt = (
            'You are labeling a cluster in a metadata variable hierarchy.\n'
            'The label MUST be derived strictly from the evidence terms and '
            'sample variable descriptions provided. Do not introduce concepts '
            'or vocabulary that are not visible in the evidence.\n\n'
            f'Parent path: {parent_path}\n'
            f'Evidence terms (from NMF/TF-IDF over the cluster): {", ".join(top_terms[:10])}\n'
            f'Sample variable descriptions:\n'
            + '\n'.join(f'  - {str(t)[:160]}' for t in sample_texts[:4]) + '\n'
            f'\nCurrent candidate label: {candidate}\n\n'
            'Task: Return a concise 2–5 word concept label that PARAPHRASES '
            'the evidence into a cleaner concept name.\n'
            'Rules:\n'
            '1. Every word in the label must appear in (or be an obvious '
            'morphological variant of) the evidence terms or sample descriptions.\n'
            '2. Do not invent domain concepts that are not in the evidence.\n'
            '3. Prefer multi-word noun phrases over single keywords.\n'
            '4. Avoid generic words: data, score, variable, assessment, total, '
            'description, value, decimal.\n'
            '5. Use base forms — singular nouns (Latency not Latencies), and '
            'avoid -ing / -ed verb suffixes unless required.\n'
            '6. Output strict JSON only — no prose, no markdown.\n\n'
            'Output: {"label": "...", "evidence_terms": ["...", "..."], "confidence": 0.0}'
        )
        try:
            # max_tokens generous: reasoning models (Qwen3) emit <think> traces
            # that consume budget before the JSON appears.
            resp = _safe_chat_completion(client, model, prompt,
                                           max_tokens=1200, temperature=0.2)
            raw = (resp.choices[0].message.content or '').strip()
            result = _parse_json_response(raw)
            label  = str(result.get('label') or '').strip()

            # Confidence is informational only (see NOTE below), so a missing
            # or non-numeric value must not abort the refinement.
            try:
                conf = float(result.get('confidence', 0))
            except (TypeError, ValueError):
                conf = 0.0

            evid = result.get('evidence_terms', []) or []
            if isinstance(evid, str):
                evid = [evid]
            elif not isinstance(evid, (list, tuple)):
                evid = []

            meta['confidence'], meta['evidence_terms'] = conf, evid
            meta['raw_label'] = label
            if not label:
                meta['reason'] = 'empty'
                return candidate, meta
            # NOTE: we IGNORE the LLM's self-reported confidence.  Qwen 3B
            # routinely returns conf ≈ 0.5 on perfectly good labels — using it
            # as a gate rejected useful refinements.  Grounding (below) is the
            # real anti-hallucination check; if every label word stems back to
            # the corpus, the label is accepted regardless of self-confidence.
            #
            # The haystack is built from caller-supplied evidence only; the
            # model's own 'evidence_terms' are deliberately left out.
            evidence_text = ' '.join(
                top_terms[:10]
                + [str(s) for s in sample_texts[:4]]
                + [str(parent_path)]
            )
            haystack_stems = {_light_stem(w) for w in _tokens(evidence_text)}
            label_stems = {_light_stem(w) for w in _tokens(label)}
            ungrounded  = label_stems - haystack_stems
            if ungrounded:
                meta['reason'] = f'ungrounded_words: {sorted(ungrounded)}'
                return candidate, meta
            meta['reason'] = 'accepted'
            return label, meta
        except Exception as e:
            meta['reason'] = f'exception: {type(e).__name__}'
            return candidate, meta

    return _refine

# ──────────────────────────────────────────────────────────────────────────────
# STEP 5a — UPSTREAM LLM PHRASE-ROLE CLASSIFIER  [TopicGPT, NAACL 2024 adapted]
#
# Used to discover semantic roles (Measure / Statistic / Condition / Subtype
# style dimensions) from the corpus and ASSIGN each mined phrase to a role.
# This is fundamentally different from TopicTag-style label refinement:
#   • TopicTag (and the make_llm_label_fn above) uses LLMs to RENAME clusters
#     after the tree structure is already decided.  LLM is downstream.
#   • This function uses an LLM to CREATE the role schema before the tree is
#     built.  Slot ordering and parent-child placement are driven by the
#     LLM-derived roles.  LLM is upstream — closer to TopicGPT (Pham et al.,
#     NAACL 2024), where LLMs perform topic discovery, not topic labeling.
#
# Anti-hallucination contract (preserved):
#   • Every phrase in the role assignment MUST come from the corpus verbatim.
#     Outputs are validated against the input phrase list — anything new is
#     dropped.
#   • Role NAMES are proposed by the LLM but kept to 1–2 generic English
#     words; long or fanciful role names are rejected.
#   • If validation fails, the caller falls back to deterministic mutual-
#     exclusion clustering (current slot mining behaviour).  No silent
#     failure modes.
# ──────────────────────────────────────────────────────────────────────────────
def make_llm_role_classifier_fn(base_url: str = OLLAMA_URL_DEFAULT,
                                  model: str = OLLAMA_MODEL_DEFAULT,
                                  provider: str = 'ollama',
                                  api_key: str = '') -> Optional[Callable]:
    """
    Build a callable that classifies repeated corpus phrases into semantic
    roles using an LLM (Ollama or Groq).

    Returns None when `_make_llm_client` yields no client.  Otherwise returns
    `_classify(phrases, sample_descs, group_name='')`, which returns
    `(roles, meta)`:

      roles  dict mapping a normalised role name (stripped, lowercased) to the
             list of input phrases assigned to it, or None when the reply
             fails validation or the call raises.
      meta   dict with 'reason' (why the result was accepted / rejected),
             'raw' (first 400 characters of the model reply) and 'role_count'.

    Validation applied to the reply:
      • role names longer than 2 words or 24 characters are skipped;
      • a phrase is kept only if it matches an input phrase (case-insensitive)
        and has not already been assigned to an earlier role;
      • role names that become identical after normalisation are merged;
      • a role needs at least 2 phrases, and at least 2 roles must survive.
    """
    client = _make_llm_client(provider, base_url, api_key)
    if client is None:
        return None

    def _classify(phrases: list, sample_descs: list, group_name: str = ''):
        meta = {'reason': '', 'raw': '', 'role_count': 0}
        if not phrases:
            meta['reason'] = 'no_phrases'
            return None, meta
        # Truncate to keep the prompt small and the model focused
        phrases_list = list(dict.fromkeys(str(p) for p in phrases))[:60]
        # Tolerate a missing context list (same as the role-namer callable)
        # instead of raising before the guarded section is reached.
        context_descs = list(sample_descs or [])[:4]
        prompt = (
            'You are analysing a list of repeated phrases mined from a '
            'data-dictionary corpus.\n\n'
            f'Group context: {group_name or "(unknown)"}\n\n'
            'Repeated phrases (verbatim from the corpus):\n'
            + '\n'.join(f'  - {p}' for p in phrases_list) + '\n\n'
            'Sample variable descriptions for context:\n'
            + '\n'.join(f'  - {str(s)[:180]}' for s in context_descs) + '\n\n'
            'TASK: Group these phrases into 2–5 SEMANTIC ROLES. Each role '
            'represents one ORTHOGONAL DIMENSION of what the variable measures '
            '(for example: what is measured, what statistic is used, under '
            'what condition).\n\n'
            'STRICT RULES:\n'
            '1. Use 2 to 5 roles. Fewer is better when phrases do not really '
            'belong to different dimensions.\n'
            '2. Role names: ONE OR TWO generic English words, lowercase. '
            'Examples of good role names: "measure", "statistic", "condition", '
            '"outcome", "subtype", "modifier". Do not invent fancy or domain-'
            'specific role names.\n'
            '3. Every phrase from the input list MUST appear EXACTLY ONCE in '
            'exactly one role. Copy phrases VERBATIM. Do not rephrase, '
            'normalise, plural-strip, or invent new phrases.\n'
            '4. Phrases that are alternatives (rarely co-occur in the same '
            'variable) should go in the SAME role.\n'
            '5. Phrases that describe DIFFERENT dimensions of the same '
            'variable should go in DIFFERENT roles.\n'
            '6. Output strict JSON only — no prose, no markdown fences.\n\n'
            'Output schema:\n'
            '{\n'
            '  "roles": {\n'
            '    "role_name_1": ["phrase a", "phrase b", ...],\n'
            '    "role_name_2": ["phrase c", "phrase d", ...]\n'
            '  }\n'
            '}'
        )
        try:
            resp = _safe_chat_completion(client, model, prompt,
                                           max_tokens=2000, temperature=0.1)
            raw = (resp.choices[0].message.content or '').strip()
            meta['raw'] = raw[:400]
            result = _parse_json_response(raw)
            roles_raw = result.get('roles', {}) or {}
            if not isinstance(roles_raw, dict):
                meta['reason'] = 'roles_not_dict'
                return None, meta

            # ── Validation ────────────────────────────────────────────────────
            input_set = {p.lower(): p for p in phrases_list}
            roles_merged: dict = {}
            seen_phrases: set = set()
            for role_name, items in roles_raw.items():
                # Role name must be 1–2 generic words
                rn = str(role_name).strip().lower()
                if not rn or len(rn.split()) > 2 or len(rn) > 24:
                    continue
                if not isinstance(items, list):
                    continue
                kept: list = []
                for it in items:
                    s = str(it).strip().lower()
                    if not s or s in seen_phrases:
                        continue
                    if s not in input_set:
                        # Phrase invented by LLM — drop (anti-hallucination)
                        continue
                    kept.append(input_set[s])   # original casing
                    seen_phrases.add(s)
                if kept:
                    # Keys such as "Measure" and "measure " normalise to the
                    # same role: merge them rather than letting the later key
                    # overwrite phrases already validated for the earlier one.
                    roles_merged.setdefault(rn, []).extend(kept)
            roles_clean = {rn: ps for rn, ps in roles_merged.items()
                           if len(ps) >= 2}
            if len(roles_clean) < 2:
                meta['reason'] = 'too_few_valid_roles'
                return None, meta
            meta['reason']     = 'accepted'
            meta['role_count'] = len(roles_clean)
            return roles_clean, meta
        except Exception as e:
            # Any client / parsing failure is reported through meta so the
            # caller can fall back instead of crashing.
            meta['reason'] = f'exception: {type(e).__name__}'
            return None, meta

    return _classify

# ──────────────────────────────────────────────────────────────────────────────
# STEP 8b–d  — NODE LABELING  [ZHU §4.3 / TopicTag]
# ──────────────────────────────────────────────────────────────────────────────
# Generic metadata field-name boilerplate that appears in essentially any
# data-dictionary header (description, value, name, ...).  Dataset-specific
# column tokens (e.g. 'fullDisplayName' for HCP, 'Decimal Places' for AI-MIND)
# are added at runtime by build_field_noise() — derived from the user-confirmed
# column names, never enumerated by hand.
_GENERIC_FIELD_NOISE = frozenset({
    'description', 'definition', 'value', 'metadata', 'desc',
    'name', 'item', 'variable', 'field', 'attribute',
    'code', 'type', 'dtype',
})

def build_field_noise(configs: dict) -> frozenset:
    """
    Build the field-name boilerplate set entirely from the user-confirmed
    column roles.  For each detected column name we extract alphanumeric tokens
    and add them to the noise set.

    Examples (derived, NOT hardcoded):
      HCP   'fullDisplayName' → {fulldisplayname}
      HCP   'columnHeader'    → {columnheader}
      AI-MIND 'Decimal Places' → {decimal, places}

    Result: the same effective filter as a hand-crafted list, but produced
    deterministically from whatever columns the current CSV has.  Datasets we
    have never seen get an automatically-tailored noise set.
    """
    noise = set(_GENERIC_FIELD_NOISE)
    for cfg in (configs or {}).values():
        for col_list in cfg.values():
            for col in col_list:
                tokens = ''.join(c if c.isalnum() else ' '
                                 for c in str(col)).split()
                noise.update(t.lower() for t in tokens if len(t) >= 2)
    return frozenset(noise)

# Field-name noise set consulted by _bigram_preferred_terms when it filters
# candidate label terms.  It starts out as the generic boilerplate only.  Per
# the original note, the Streamlit pipeline further down rebinds it at build
# time to a configs-derived set (see build_field_noise); that rebinding is
# outside this section — verify there.
FIELD_NAME_NOISE: frozenset = _GENERIC_FIELD_NOISE

def _extract_common_prefix_phrase(cluster_texts: list,
                                    min_coverage: float = 0.6) -> str:
    """
    Many data dictionaries write 'Concept Name: definition...' in the
    description.  If most cluster members share a concept-name prefix, that
    prefix IS the concept label.  Fully data-driven — works on any dictionary
    using the 'name: definition' convention.

    Returns a Title-cased phrase, or '' if no shared prefix is strong enough.
    """
    # _text is "col1: val1 | col2: val2 | ...".  Find description-like field
    # and take its prefix before the inner colon.
    prefixes = []
    for t in cluster_texts:
        for chunk in str(t).split(' | '):
            if ':' not in chunk:
                continue
            key, val = chunk.split(':', 1)
            key_l = key.lower()
            if 'descrip' in key_l or 'def' in key_l or 'full' in key_l:
                phrase = val.split(':')[0].strip()
                tokens = phrase.split()
                if 2 <= len(tokens) <= 6:
                    prefixes.append(tokens)
                break

    if not prefixes:
        return ''

    n_thresh = max(1, int(min_coverage * len(prefixes)))
    max_len  = max(len(p) for p in prefixes)
    for length in range(min(6, max_len), 1, -1):
        starts = Counter(tuple(p[:length]) for p in prefixes if len(p) >= length)
        if not starts:
            continue
        top, cnt = starts.most_common(1)[0]
        if cnt >= n_thresh:
            return ' '.join(top).title()
    return ''

def _bigram_preferred_terms(diff: np.ndarray, terms: np.ndarray,
                             boilerplate: set, prefix_lower: str,
                             n_terms: int) -> list:
    """
    Pick top-n discriminative terms, preferring bigrams and removing redundancy.

    Rules:
    1. Skip short tokens (<3 chars), boilerplate, field-name noise, and any
       token already in the prefix.
    2. When a bigram is selected, drop any previously-picked unigram that is
       a substring of it.
    3. Skip unigrams that are substrings of any already-picked bigram.
    """
    order  = np.argsort(diff)[::-1]
    picked, picked_lower = [], []
    for i in order:
        t  = terms[i]; tl = t.lower()
        if (len(t) < 3 or tl in boilerplate or tl in FIELD_NAME_NOISE
                or (prefix_lower and tl in prefix_lower)):
            continue
        is_unigram = ' ' not in t
        # Rule 3: unigram already covered by a picked bigram?
        if is_unigram and any(tl in pl for pl in picked_lower if ' ' in pl):
            continue
        # Rule 2: replace picked unigrams subsumed by this new bigram
        if not is_unigram:
            keep = [(p, pl) for p, pl in zip(picked, picked_lower)
                    if not (' ' not in pl and pl in tl)]
            picked, picked_lower = [k[0] for k in keep], [k[1] for k in keep]
        picked.append(t); picked_lower.append(tl)
        if len(picked) >= n_terms:
            break
    return picked

def label_cluster(cluster_texts: list, all_texts: list,
                  tfidf: TfidfVectorizer, n_terms: int = 3,
                  cluster_groups: Optional[list] = None,
                  parent_path: str = '',
                  llm_label_fn: Optional[Callable] = None,
                  return_provenance: bool = False):
    """
    Label a cluster node using a five-stage deterministic pipeline.

    A) Description-prefix candidate: extract concept phrase from 'Name: def...'
       pattern shared by ≥60% of cluster members.
    B) Group-purity prefix: if ≥70% share a top-level _group value, prepend it.
    C) Boilerplate filter: drop terms with TF-IDF IDF ≤ 1.7 plus FIELD_NAME_NOISE.
    D) Bigram-preferred discriminative suffix [ZHU §4.3].
    E) Optional constrained LLM refinement [TopicTag, DocEng 2024].

    Args:
        cluster_texts: metadata text of the cluster members.
        all_texts: metadata text of the comparison corpus; the discriminative
            score is mean TF-IDF in the cluster minus mean TF-IDF over these.
        tfidf: fitted vectorizer (its vocabulary, `idf_` and `transform` are
            used).
        n_terms: maximum number of discriminative terms in the suffix.
        cluster_groups: group paths ('top > sub > ...') of the members, used
            for the group-purity prefix.
        parent_path: passed through to `llm_label_fn` unchanged.
        llm_label_fn: optional callable
            (candidate, top_terms, parent_path, sample_texts) → (label, meta).
        return_provenance: when True, return (label, provenance_dict).

    When return_provenance=True returns a (label, provenance_dict) tuple where
    provenance records which stage produced the label (description_prefix /
    tfidf_bigram / group_anchor / llm / fallback) plus evidence terms and (for
    LLM) the confidence score and a grounding check result.  If the TF-IDF
    stage raises, the exception type is stored under 'tfidf_error' and the
    label falls back to the remaining evidence.
    """
    prov: dict = {'label_source': 'fallback',
                  'evidence_terms': [],
                  'confidence': 1.0,
                  'llm_used': False,
                  'llm_rejected': False}

    if not cluster_texts:
        return ('Group', prov) if return_provenance else 'Group'

    terms    = np.array(tfidf.get_feature_names_out())
    idf_vals = tfidf.idf_
    boilerplate = set(terms[idf_vals <= 1.7].tolist())

    # ── B) group-purity prefix ───────────────────────────────────────────────
    prefix = ''
    if cluster_groups:
        top_lvl = [str(g).split(' > ')[0].strip() for g in cluster_groups]
        top_grp, cnt = Counter(top_lvl).most_common(1)[0]
        if cnt / len(top_lvl) >= 0.70 and top_grp not in ('', 'Ungrouped'):
            prefix = top_grp

    # ── A) description-prefix candidate phrase ───────────────────────────────
    phrase = _extract_common_prefix_phrase(cluster_texts)
    if phrase and prefix and phrase.lower().startswith(prefix.lower()):
        remainder = phrase[len(prefix):]
        # Remove the duplicated group name only at a word boundary.  Without
        # this check a group 'Cog' would turn 'Cognitive Flexibility' into
        # 'nitive Flexibility'.
        if not remainder or not remainder[0].isalnum():
            phrase = remainder.strip()

    # ── D) bigram-preferred discriminative terms ─────────────────────────────
    top_terms_raw, words = [], []
    try:
        X_all  = tfidf.transform(all_texts).toarray()
        X_clus = tfidf.transform(cluster_texts).toarray()
        diff   = X_clus.mean(axis=0) - X_all.mean(axis=0)
        top_terms_raw = [terms[i] for i in np.argsort(diff)[::-1][:20]]
        words = _bigram_preferred_terms(diff, terms, boilerplate,
                                          prefix.lower(), n_terms)
    except Exception as _tfidf_err:
        # Best-effort stage: labeling continues with the prefix / phrase
        # evidence, but the failure is kept in the audit trail.
        prov['tfidf_error'] = type(_tfidf_err).__name__

    # ── compose deterministic candidate (records which stage produced it) ────
    if phrase:
        candidate = f'{prefix} — {phrase}' if prefix else phrase
        prov['label_source']   = 'description_prefix'
        prov['evidence_terms'] = [phrase] + ([prefix] if prefix else [])
    elif words:
        suffix    = ' / '.join(w.title() for w in words)
        candidate = f'{prefix} — {suffix}' if prefix else suffix
        prov['label_source']   = 'tfidf_bigram'
        prov['evidence_terms'] = list(words) + ([prefix] if prefix else [])
    elif prefix:
        candidate = prefix
        prov['label_source']   = 'group_anchor'
        prov['evidence_terms'] = [prefix]
    else:
        candidate = 'Group'

    # ── E) optional constrained LLM refinement [TopicTag] ────────────────────
    # LLM is used ONLY as a re-phraser of evidence already present in the
    # cluster.  Refinement is rejected if the returned label is not grounded
    # in the evidence terms — keeping the user's contract that "labels should
    # come from the csv itself".
    if llm_label_fn and candidate != 'Group':
        prov['llm_used'] = True
        try:
            refined, llm_meta = llm_label_fn(
                candidate, top_terms_raw[:10],
                parent_path, cluster_texts[:4])
            # Always record the raw LLM proposal and the grounding-check reason
            # — even when rejected — so the Provenance tab can show "what did
            # the LLM suggest and why was it dropped?"
            prov['llm_raw_label'] = llm_meta.get('raw_label', '')
            prov['llm_reason']    = llm_meta.get('reason', '')
            if refined and refined != candidate:
                candidate = refined
                prov['label_source']   = 'llm'
                prov['confidence']     = llm_meta.get('confidence', 0.0)
                prov['evidence_terms'] = llm_meta.get('evidence_terms',
                                                       prov['evidence_terms'])
            else:
                prov['llm_rejected'] = True
        except Exception as _e:
            prov['llm_rejected'] = True
            prov['llm_reason']   = f'exception: {type(_e).__name__}'

    if return_provenance:
        return candidate, prov
    return candidate

# ──────────────────────────────────────────────────────────────────────────────
# STEP 5b  — PHRASE-SLOT MINING  (data-driven IE-style slot induction)
#
# Information-extraction adaptation for structured metadata dictionaries.
# Many data dictionaries (e.g. AI-MIND cognitive tests, HCP Study Completion)
# express each variable as a regular phrase combining several semantic
# dimensions (measure type, statistic, condition, etc.).  Document-level
# methods (NMF, BERTopic, CTM) cannot separate these because all dimensions
# collapse into one vector.  Phrase-slot mining decomposes each description
# into multiple phrases, identifies which phrases are mutually-exclusive
# alternatives across the corpus, and uses each alternative-set as a slot.
#
# No domain hardcoding: slot phrases are discovered from n-gram co-occurrence
# patterns in the actual descriptions.  Slot names are derived from the
# phrases themselves; if structure is too weak, the algorithm reports
# `valid=False` and the caller falls back to NMF.
# ──────────────────────────────────────────────────────────────────────────────
# ── Generic dictionary markers stripped before slot mining ────────────────────
# These are universal data-dictionary conventions (`KEY: ...`, `Note: ...`,
# `Question: ...`) — not domain knowledge.  Without stripping, they become
# false slot phrases (e.g. AI-MIND output contained `Key` nodes promoted from
# `KEY: DMS Percent Correct ...`).
_LEADING_MARKERS = frozenset({
    'key', 'note', 'notes', 'definition', 'description', 'desc',
    'question', 'q', 'item', 'value', 'meaning', 'label',
})

def _strip_leading_markers(text: str) -> str:
    """Iteratively strip leading dictionary markers like 'KEY:' or 'Note:'."""
    out = text
    for _ in range(4):  # bounded — never more than a few stacked markers
        if ':' not in out:
            break
        head, rest = out.split(':', 1)
        if head.strip().lower() in _LEADING_MARKERS:
            out = rest.strip()
        else:
            break
    return out

def _split_concept_and_body(text: str) -> tuple:
    """
    Separate a '<concept phrase>: <definition>' description into its two parts.

    Leading dictionary markers are removed first (via _strip_leading_markers),
    then the text is cut at its first colon.  The part before the colon is
    accepted as the concept only when it is 1–14 whitespace-separated words
    long, i.e. phrase-length rather than a full sentence.

        'DMS Correct Latency Standard Deviation (SD) (0 second delay):
         The standard deviation of response latencies for trials ...'
        → concept = text before the colon, body = text after it

    Returns (concept, body), both whitespace-stripped.  When there is no colon,
    or the head is empty or too long, returns (whole cleaned text, '').
    Generic — no domain knowledge.
    """
    cleaned = _strip_leading_markers(str(text).strip())
    concept, colon, definition = cleaned.partition(':')
    if not colon:
        return cleaned, ''

    concept = concept.strip()
    definition = definition.strip()
    word_count = len(concept.split())
    # A real concept name is short; an empty or sentence-length head means the
    # colon is not a concept/definition boundary.
    if word_count < 1 or word_count > 14:
        return cleaned, ''
    return concept, definition

# Generic data-dictionary definition-prose openers.  Role values starting with
# these are explanatory fragments, not concept labels — reject them.  Generic
# English, not domain vocabulary.
_BOILERPLATE_VALUE_PREFIXES = (
    'the number of', 'number of', 'the subject', 'a subject', 'the participant',
    'a participant', 'this measure', 'this variable', 'calculated across',
    'calculated as', 'the percentage of', 'the proportion of', 'the total number',
    'the mean of', 'the median of', 'the standard deviation of', 'the amount of',
    'the time', 'the length of', 'expressed as', 'defined as', 'measured as',
)

def _is_boilerplate_value(val: str) -> bool:
    """True if a role value looks like a definition fragment rather than a label."""
    v = str(val).strip().lower()
    if not v:
        return True
    if any(v.startswith(p) for p in _BOILERPLATE_VALUE_PREFIXES):
        return True
    # Sentence-length values are definitions, not concept labels
    if len(v.split()) > 6:
        return True
    return False

# ── English stopwords for phrase-quality filtering ────────────────────────────
# Reject phrases like 'and', 'them', 'to be', 'have a lot of' from becoming
# slot nodes.  This is general English filtering, not domain knowledge.
_STOPWORDS = frozenset(
    'a an the and or but of in on at by for with about into during '
    'through over under above below from to as is are was were be being '
    'been have has had do does did this that these those they them their '
    'it its he she his her you your we our i my me us not no nor '
    'how often when where which who why what '
    'lot lots much many more most some any all none '
    'very also too just so such only even still even also '
    'one two three first second '   # 'second' as standalone — careful: kept in n-grams via context
    .split()
)

def _phrase_is_meaningful(phrase: str, group_name_lower: str = '') -> bool:
    """
    Reject phrases that should never be a hierarchy node.

    Rules (all generic, no domain knowledge):
      • all-stopword phrases ('and', 'to be', 'have a lot of')
      • boundary stopwords ('the response time' → 'the' boundary)
      • phrase equals the parent group name ('DMS' inside DMS branch)
      • single dictionary-marker words ('key', 'note')
      • pure numeric tokens
    """
    p = phrase.strip().lower()
    if not p:
        return False
    if p == group_name_lower:
        return False
    if p in _LEADING_MARKERS:
        return False
    tokens = p.split()
    if not tokens:
        return False
    # Boundary stopwords
    if tokens[0] in _STOPWORDS or tokens[-1] in _STOPWORDS:
        return False
    # Need at least one content token (non-stopword, length ≥ 2)
    content = [t for t in tokens if t not in _STOPWORDS and len(t) >= 2]
    if not content:
        return False
    # Single-token phrases must be a meaningful word, not a bare digit
    if len(tokens) == 1 and tokens[0].isdigit():
        return False
    return True

# ──────────────────────────────────────────────────────────────────────────────
# OPTION D — SBERT PHRASE CLUSTERING + CONSTRAINED LLM ROLE NAMING
#
# Following the EDC pattern (Zhang & Soh, EMNLP 2024) and ZOES (arXiv
# 2506.04458, 2025):
#   Extract   → mine repeated phrases from each variable description
#   Define    → semantically cluster phrases via SBERT + AgglomerativeClustering
#   Canonicalize → constrained-vocabulary LLM names each cluster with a role
#                  (measure / statistic / condition / subtype / outcome / modifier)
#
# Anti-hallucination contract:
#   • Phrases are extracted verbatim from the corpus, never invented
#   • LLM only NAMES discovered clusters — cannot move phrases
#   • Role names are constrained to a fixed generic English vocabulary
#     (configurable via constrained=True/False)
#   • Anonymous fallback when LLM is unavailable or invalid (cluster_0, ...)
#
# Different from Approach 1: Approach 1 embeds the WHOLE description as one
# vector; this embeds each EXTRACTED PHRASE as its own vector.  Approach 1
# clusters variables; this clusters phrases.  Approach 1 produces a single
# similarity tree; this produces a multi-role decomposition + nested tree.
# ──────────────────────────────────────────────────────────────────────────────
_ALLOWED_ROLES_CONSTRAINED = frozenset({
    'measure', 'statistic', 'condition', 'subtype', 'outcome', 'modifier',
})

# Generic role priority for hierarchy nesting (no domain knowledge).
# Higher priority = outermost (closest to root) level inside the group.
_ROLE_PRIORITY = {
    'measure':   6,
    'outcome':   5,
    'statistic': 4,
    'condition': 3,
    'modifier':  2,
    'subtype':   1,
}

def make_llm_role_namer_fn(base_url: str = OLLAMA_URL_DEFAULT,
                            model: str = OLLAMA_MODEL_DEFAULT,
                            constrained: bool = True,
                            provider: str = 'ollama',
                            api_key: str = '') -> Optional[Callable]:
    """
    Build a callable (phrases, sample_descs) → (role_name, meta) that asks an
    LLM (Ollama or Groq) to name the role shared by one phrase cluster.

    Returns None when `_make_llm_client` yields no client.

    The callable returns (role, meta) on success and (None, meta) otherwise;
    meta carries 'reason', 'raw' (first 200 characters of the reply) and the
    'constrained' flag.  With constrained=True the role must be one of
    _ALLOWED_ROLES_CONSTRAINED; otherwise it must be at most 2 words and 24
    characters, purely alphabetic and lowercase.
    """
    llm = _make_llm_client(provider, base_url, api_key)
    if llm is None:
        return None

    # The instruction block depends only on the mode, so pick it once.
    if constrained:
        task = (
            'Choose EXACTLY ONE semantic role from this fixed list that '
            'best names what these phrases share. Pick the SINGLE best fit:\n'
            '  - measure   (the base concept being measured)\n'
            '  - statistic (mean, median, standard deviation, total, ...)\n'
            '  - condition (when/where/under what circumstances)\n'
            '  - outcome   (result/output type)\n'
            '  - subtype   (a specific kind of measure)\n'
            '  - modifier  (a qualifier)\n\n'
            'Output strict JSON only — no markdown:\n'
            '{"role": "measure"}'
        )
    else:
        task = (
            'Propose ONE generic 1–2 word lowercase English noun that names '
            'the semantic role these phrases share. Examples of good roles: '
            'measure, statistic, condition, subtype, location, time, '
            'identifier, frequency, severity, quality.\n\n'
            'Output strict JSON only:\n'
            '{"role": "..."}'
        )

    def _name_cluster(phrases: list, sample_descs: list = None):
        meta = {'reason': '', 'raw': '', 'constrained': constrained}

        def _reject(reason: str):
            meta['reason'] = reason
            return None, meta

        if not phrases:
            return _reject('empty_phrases')

        descs = sample_descs or []
        phrase_lines = '\n'.join(f'  - {p}' for p in phrases[:20])
        desc_lines = '\n'.join(f'  - {str(s)[:160]}' for s in descs[:3])
        prompt = (
            'Below is a cluster of semantically related phrases mined from a '
            'data-dictionary corpus. The phrases were grouped by sentence-'
            'transformer embedding similarity.\n\n'
            'Phrases in cluster:\n'
            f'{phrase_lines}\n\n'
            'Sample variable descriptions for context:\n'
            f'{desc_lines}\n\n'
        ) + task

        try:
            # The token budget has to leave room for a reasoning model's
            # <think> trace (Qwen3) ahead of the short JSON answer.
            reply = _safe_chat_completion(llm, model, prompt,
                                            max_tokens=800, temperature=0.1)
            raw_text = (reply.choices[0].message.content or '').strip()
            meta['raw'] = raw_text[:200]
            parsed = _parse_json_response(raw_text)
            role = str(parsed.get('role', '')).strip().lower()

            if not role:
                return _reject('empty_role')
            if constrained:
                if role not in _ALLOWED_ROLES_CONSTRAINED:
                    return _reject(f'role_not_allowed: {role}')
            else:
                words = role.split()
                if len(words) > 2 or len(role) > 24:
                    return _reject('role_too_long')
                if not all(w.isalpha() and w.islower() for w in words):
                    return _reject('role_not_alpha_lower')

            meta['reason'] = 'accepted'
            return role, meta
        except Exception as exc:
            return _reject(f'exception: {type(exc).__name__}')

    return _name_cluster

def _extract_phrases_for_role_clustering(texts: list,
                                           text_col_names: Optional[list],
                                           group_name: str,
                                           min_phrase_count: int = 2
                                           ) -> tuple:
    """
    Shared helper: extract repeated meaningful phrases from a group of
    descriptions. Returns (phrases_list, per_row_phrase_sets, regularity).
    Regularity = fraction of rows that contain ≥2 mined phrases.
    """
    text_keys = ({c.strip().lower() for c in (text_col_names or [])}
                 if text_col_names else None)
    n = len(texts)

    # 1. extract description-prefix per row
    prefixes = []
    for t in texts:
        prefix = ''
        for chunk in str(t).split(' | '):
            if ':' not in chunk:
                continue
            key, val = chunk.split(':', 1)
            key_l = key.strip().lower()
            if text_keys is not None and key_l not in text_keys:
                continue
            prefix = val.split(':')[0].strip()
            break
        if not prefix:
            prefix = str(t).strip()
        prefix = _strip_leading_markers(prefix)
        prefixes.append(prefix)

    # 2. normalise
    def _norm(p):
        s = ''.join(ch if ch.isalnum() or ch == ' ' else ' '
                    for ch in str(p).lower())
        return ' '.join(s.split())
    norm = [_norm(p) for p in prefixes]

    # 3. mine n-grams 1..4 ≥3 chars
    row_phrases: list = []
    phrase_count: Counter = Counter()
    for text in norm:
        tokens = text.split()
        row_set = set()
        for ngram_n in range(1, 5):
            for i in range(len(tokens) - ngram_n + 1):
                phrase = ' '.join(tokens[i:i + ngram_n])
                if len(phrase) >= 3:
                    row_set.add(phrase)
        row_phrases.append(row_set)
        for p in row_set:
            phrase_count[p] += 1

    # 4. keep repeated, retain maximal per row
    keep = {p for p, c in phrase_count.items() if c >= min_phrase_count}
    refined = []
    for row in row_phrases:
        sorted_by_len = sorted(row & keep, key=lambda p: -len(p.split()))
        kept: list = []
        for p in sorted_by_len:
            if not any(p != q and p in q for q in kept):
                kept.append(p)
        refined.append(set(kept))

    phrase_count = Counter()
    for r in refined:
        for p in r:
            phrase_count[p] += 1

    group_lower = (group_name or '').strip().lower()
    phrases = sorted([p for p, c in phrase_count.items()
                       if c >= min_phrase_count
                       and _phrase_is_meaningful(p, group_lower)])

    regularity = sum(1 for r in refined if len(r) >= 2) / max(1, n)
    return phrases, refined, round(regularity, 4)

# ──────────────────────────────────────────────────────────────────────────────
# PER-ROW LLM ROLE EXTRACTOR  [Zhu et al. EMNLP 2025 — proper implementation]
#
# For each variable description, ONE LLM call extracts role values directly:
#   measure   = base concept being measured (e.g. "Correct Latency")
#   statistic = statistical aggregation (e.g. "Standard Deviation")
#   condition = experimental condition (e.g. "0 second delay")
#   subtype   = specific error / outcome subtype (e.g. "Incorrect Colour")
#
# Anti-hallucination contract:
#   • Each returned value MUST be a literal substring of the input description
#     (grounding check rejects anything else).
#   • LLM cannot invent new roles — only the four canonical roles are output.
#   • Empty string is a valid output ("this row has no statistic" etc.).
#
# Cost: ONE call per variable.  AI-MIND ≈ 108 calls, HCP ≈ 813.
# ──────────────────────────────────────────────────────────────────────────────
# The four role keys named in the per-row extractor contract above.
# NOTE(review): presumably iterated by the extractor when it validates the
# LLM reply — its body is outside this section, so confirm there.
_PER_ROW_ROLES = ('measure', 'statistic', 'condition', 'subtype')

def make_per_row_role_extractor_fn(base_url: str = OLLAMA_URL_DEFAULT,
                                     model: str = OLLAMA_MODEL_DEFAULT,
                                     provider: str = 'ollama',
                                     api_key: str = '') -> Optional[Callable]:
    """
    Build the per-row LLM role extractor.

    Returns None when `_make_llm_client` yields no client.  Otherwise returns
    a callable

        _extract(concept, sample_concepts=None, full_text=None) → (roles, meta)

    where
      • `concept`         is the concept-name phrase of one variable,
      • `sample_concepts` are a few sibling concept names used as context
                          (only the first three are put in the prompt),
      • `full_text`       is concept + definition body; it is what extracted
                          values are grounded against (defaults to `concept`),
      • `roles`           maps a subset of _PER_ROW_ROLES to accepted values,
      • `meta`            carries 'reason', 'raw' (first 300 chars of the
                          reply) and, after validation, 'rejected',
                          'n_extracted', 'n_rejected'.

    Implements the Zhu et al. (2025, EMNLP) multi-aspect encoding pattern:
    instead of clustering phrases and naming clusters, ask the LLM to extract
    each role value directly from each row's description.

    Anti-hallucination: a value is accepted only if it is a case-insensitive
    substring of the grounding text, or if every content-word stem of the
    value occurs among the grounding text's stems.
    """
    client = _make_llm_client(provider, base_url, api_key)
    if client is None:
        return None

    def _content_stems(text_lower: str) -> set:
        """Stems of the ≥3-char, non-stopword tokens of an already-lowercased text."""
        tokens = [w.strip(',.()[]{}"\'') for w in text_lower.split()]
        return {_light_stem(w) for w in tokens
                if len(w) >= 3 and w not in _STOPWORDS}

    def _extract(concept: str, sample_concepts: Optional[list] = None,
                 full_text: Optional[str] = None):
        meta = {'reason': '', 'raw': ''}
        if not concept or len(str(concept).strip()) < 3:
            return {}, {'reason': 'empty_description'}

        # `concept` = the clean concept-name phrase (preferred extraction source)
        # `full_text` = concept + definition body (used for subtype + grounding)
        # Both are truncated to keep the prompt bounded.
        concept = str(concept)[:300]
        full    = str(full_text or concept)[:600]
        sample_concepts = sample_concepts or []
        prompt = (
            'You are extracting semantic role values from one variable in a '
            'data dictionary.\n\n'
            f'CONCEPT NAME (use this for measure / statistic / condition):\n'
            f'  {concept}\n\n'
            f'FULL DEFINITION (use ONLY for subtype, and only if needed):\n'
            f'  {full}\n\n'
            'Similar concept names in the same group (context):\n'
            + '\n'.join(f'  - {str(s)[:120]}' for s in sample_concepts[:3]) + '\n\n'
            'TASK: Extract values for these four semantic roles. Take '
            'measure, statistic and condition from the CONCEPT NAME wherever '
            'possible. Every value MUST be a verbatim substring of the CONCEPT '
            'NAME or FULL DEFINITION. Empty string if a role does not apply.\n\n'
            '  measure   = the base quantity being measured\n'
            '              (e.g. "reaction time", "accuracy", "score")\n'
            '  statistic = a statistical aggregation operator\n'
            '              (e.g. "mean", "median", "standard deviation", "total")\n'
            '  condition = an experimental condition or scope\n'
            '              (e.g. "baseline", "follow-up", "task condition")\n'
            '  subtype   = a specific subtype / kind / error type\n'
            '              (e.g. "error type", "response type", "trial type")\n\n'
            'STRICT RULES:\n'
            '1. Each value COPIED VERBATIM — do not invent, summarise, paraphrase.\n'
            '2. Prefer short concept phrases over long definition fragments.\n'
            '3. Do NOT return a value that is a sentence or starts with "the '
            'number of", "the subject", "calculated across" — those are '
            'definition prose, not labels.\n'
            '4. Empty string "" for roles that do not apply.\n'
            '5. Output strict JSON only:\n\n'
            '{"measure": "...", "statistic": "...", "condition": "...", "subtype": ""}'
        )

        # Best-effort by design: any client / parsing failure is reported via
        # meta['reason'] instead of being raised to the caller.
        try:
            resp = _safe_chat_completion(client, model, prompt,
                                           max_tokens=1500, temperature=0.1)
            raw = (resp.choices[0].message.content or '').strip()
            meta['raw'] = raw[:300]
            result = _parse_json_response(raw)
        except Exception as e:
            meta['reason'] = f'exception: {type(e).__name__}: {str(e)[:80]}'
            return {}, meta

        # The role loop below needs a mapping.  A reply that parsed to None, a
        # list or a scalar is reported explicitly (with the raw text kept in
        # meta) rather than failing with AttributeError on `.get`.
        if not isinstance(result, dict):
            meta['reason']      = 'unparseable_response'
            meta['rejected']    = []
            meta['n_extracted'] = 0
            meta['n_rejected']  = 0
            return {}, meta

        # Grounding is checked against the FULL text (concept + body) so that
        # subtype values living in the definition body still pass.
        ground_lower = full.lower()
        ground_stems = _content_stems(ground_lower)

        roles: dict = {}
        rejected: list = []
        for role in _PER_ROW_ROLES:
            val = result.get(role, '')
            if not isinstance(val, str):
                continue
            val_clean = val.strip().strip('"').strip("'")
            if not val_clean:
                continue
            # P2: reject definition-prose fragments before grounding
            if _is_boilerplate_value(val_clean):
                rejected.append((role, val_clean, ['boilerplate']))
                continue
            # Strict substring (cheapest, most common)
            if val_clean.lower() in ground_lower:
                roles[role] = val_clean
                continue
            # Token-stem grounding fallback (morphological variants)
            val_stems = _content_stems(val_clean.lower())
            if val_stems and val_stems.issubset(ground_stems):
                roles[role] = val_clean
            else:
                missing = (sorted(val_stems - ground_stems) if val_stems
                           else ['(no content tokens)'])
                rejected.append((role, val_clean, missing))

        meta['reason']   = 'accepted' if roles else 'all_rejected'
        meta['rejected'] = rejected
        meta['n_extracted'] = len(roles)
        meta['n_rejected']  = len(rejected)
        return roles, meta

    return _extract

def discover_roles_via_per_row_extraction(
        texts: list,
        text_col_names: Optional[list],
        per_row_extractor_fn: Callable,
        group_name: str = '',
        regularity_threshold: float = 0.40) -> dict:
    """
    Per-row role discovery (Zhu et al., EMNLP 2025 style).

    Every row is passed once to `per_row_extractor_fn`, which returns the
    measure / statistic / condition / subtype values it could ground in that
    row.  A role is retained when it shows at least two distinct values over
    the group; the group is valid when at least two roles are retained and
    the share of rows carrying a retained role reaches `regularity_threshold`.

    The result has the same dict shape as
    discover_roles_via_sbert_phrase_clustering(), so it can be handed to
    build_role_hierarchy().
    """
    n_rows = len(texts)
    if n_rows < 4:
        return {'valid': False, 'reason': 'too_few_rows',
                'regularity': 0.0, 'roles': {}, 'row_assignments': []}

    # Lower-cased names of the description-like columns; None = accept any key.
    allowed_keys = None
    if text_col_names:
        allowed_keys = {c.strip().lower() for c in text_col_names}

    def _description_of(row_text) -> str:
        """Value of the first accepted 'key: value' chunk, else the whole row."""
        raw = str(row_text)
        for chunk in raw.split(' | '):
            key, colon, value = chunk.partition(':')
            if not colon:
                continue
            if allowed_keys is None or key.strip().lower() in allowed_keys:
                return value.strip()
        return raw.strip()

    # Per row: the concept-name prefix (source for measure/statistic/condition)
    # and the concept + body text (grounding text, source for subtype).
    row_concepts: list = []
    row_fulls:    list = []
    for row_text in texts:
        concept, body = _split_concept_and_body(_description_of(row_text))
        row_concepts.append(concept)
        if body:
            row_fulls.append(' '.join((concept, body)).strip())
        else:
            row_fulls.append(concept)

    context_samples = row_concepts[:3]

    # Optional Streamlit progress bar; silently absent outside Streamlit.
    progress_bar = None
    try:
        import streamlit as _st_p
        progress_bar = _st_p.progress(
            0.0, text=f'Extracting roles for "{group_name}"…')
    except Exception:
        progress_bar = None

    per_row_roles: list = []
    per_row_audit: list = []          # proposed + rejected values, row by row
    values_by_role: dict = {}         # role → distinct values, first-seen order
    reason_counts: dict = defaultdict(int)

    for idx, (concept, full) in enumerate(zip(row_concepts, row_fulls)):
        if progress_bar is not None:
            try:
                progress_bar.progress((idx + 1) / max(1, n_rows),
                                      text=f'[{group_name}] row {idx+1}/{n_rows}')
            except Exception:
                pass

        try:
            roles, meta = per_row_extractor_fn(
                concept, context_samples, full_text=full)
        except Exception as e:
            roles, meta = {}, {'reason': f'exception: {type(e).__name__}'}

        per_row_roles.append(roles)
        # Audit record consumed by the Role Decomposition tab.
        per_row_audit.append({
            'row_idx':    idx,
            'description_snippet': str(concept)[:120],
            'accepted_roles':      dict(roles),
            'rejected':            meta.get('rejected', []),
            'reason':              meta.get('reason', ''),
            'raw':                 meta.get('raw', ''),
        })
        reason_counts[meta.get('reason', 'unknown')] += 1

        for role, value in roles.items():
            if not value:
                continue
            seen = values_by_role.setdefault(role, [])
            if value not in seen:
                seen.append(value)

    if progress_bar is not None:
        try:
            progress_bar.empty()
        except Exception:
            pass

    # A role is informative only if it takes ≥ 2 distinct values in the group.
    roles_final: dict = {
        role: values_by_role[role]
        for role in _PER_ROW_ROLES
        if len(values_by_role.get(role, [])) >= 2
    }

    # Coverage = share of rows with a non-empty value for any retained role.
    covered = 0
    for assigned in per_row_roles:
        if any(assigned.get(role) for role in roles_final):
            covered += 1
    coverage = covered / max(1, n_rows)
    regularity = coverage   # for the per-row extractor the two coincide

    is_valid = (len(roles_final) >= 2 and coverage >= regularity_threshold)

    return {
        'roles':             roles_final,
        'row_assignments':   per_row_roles,
        'coverage':          round(coverage, 4),
        'regularity':        round(regularity, 4),
        'valid':             is_valid,
        'role_source':       'per_row_llm_extraction',
        'extractor_summary': dict(reason_counts),
        'per_row_audit':     per_row_audit,
        'group_name':        group_name,
    }

def discover_roles_via_sbert_phrase_clustering(
        texts: list,
        text_col_names: Optional[list],
        sbert_model,
        llm_role_namer_fn: Optional[Callable] = None,
        min_phrase_count: int = 2,
        min_role_size: int = 2,
        n_clusters_range: tuple = (2, 6),
        group_name: str = '',
        regularity_threshold: float = 0.40) -> dict:
    """
    Option D core: discover semantic-role schema for a group via
        SBERT phrase clustering  +  constrained-vocab LLM cluster naming.

    Pipeline (EDC / ZOES style):
      1. Extract repeated phrases per row (shared helper).
      2. Compute group regularity = fraction of rows with ≥2 mined phrases.
         If < regularity_threshold → return invalid (caller falls back to
         existing slot mining / FASTopic / NMF path).
      3. SBERT-embed each unique phrase.
      4. Agglomerative-cluster phrases by cosine similarity; select K by
         silhouette score over n_clusters_range (inclusive, capped below the
         number of phrases).
      5. Name each cluster via LLM (constrained vocab).  Anonymous
         'cluster_<id>' fallback when the LLM is off, returns no name, or
         raises.
      6. Per-row: assign one phrase per role — the longest (by character
         length); equal-length candidates are resolved alphabetically so the
         result is reproducible from run to run.
      7. Return roles + row_assignments + diagnostics.

    Args:
        texts:                one metadata text per variable in the group.
        text_col_names:       description-like column names, forwarded to the
                              phrase extractor.
        sbert_model:          object exposing `.encode(...)`; None → invalid.
        llm_role_namer_fn:    optional (cluster_phrases, sample_descs) →
                              (role_name, meta).
        min_phrase_count:     forwarded to the phrase extractor.
        min_role_size:        minimum phrases for a cluster to become a role.
        n_clusters_range:     (k_min, k_max) searched by silhouette.
        group_name:           forwarded to the phrase extractor.
        regularity_threshold: minimum regularity to attempt clustering.

    Returns a dict with 'roles', 'row_assignments', 'coverage', 'regularity',
    'valid' plus diagnostics; invalid results carry a 'reason'.  It can flow
    straight into build_role_hierarchy.
    """
    n = len(texts)
    if n < 4 or sbert_model is None:
        return {'valid': False, 'reason': 'too_few_rows_or_no_sbert',
                'regularity': 0.0, 'roles': {}, 'row_assignments': []}

    # 1. extract phrases
    phrases, refined, regularity = _extract_phrases_for_role_clustering(
        texts, text_col_names, group_name, min_phrase_count)

    if regularity < regularity_threshold:
        return {'valid': False, 'reason': f'low_regularity ({regularity:.2f})',
                'regularity': regularity, 'roles': {}, 'row_assignments': []}
    if len(phrases) < 4:
        return {'valid': False, 'reason': 'too_few_phrases',
                'regularity': regularity, 'roles': {}, 'row_assignments': []}

    # 2. SBERT embed
    try:
        embs = sbert_model.encode(phrases, normalize_embeddings=True,
                                    show_progress_bar=False, batch_size=64)
    except Exception as e:
        return {'valid': False, 'reason': f'sbert_failed: {type(e).__name__}',
                'regularity': regularity, 'roles': {}, 'row_assignments': []}

    # 3. agglomerative + silhouette K selection
    #    A failing K is skipped so one bad setting does not abort the search.
    best_score, best_labels, best_k = -1.0, None, 2
    for k in range(n_clusters_range[0],
                    min(n_clusters_range[1] + 1, len(phrases))):
        try:
            ac = AgglomerativeClustering(n_clusters=k, metric='cosine',
                                          linkage='average')
            labels = ac.fit_predict(embs)
            if len(set(labels)) < 2:
                continue
            sil = float(silhouette_score(embs, labels, metric='cosine'))
            if sil > best_score:
                best_score, best_labels, best_k = sil, labels, k
        except Exception:
            continue

    if best_labels is None:
        return {'valid': False, 'reason': 'no_clusters',
                'regularity': regularity, 'roles': {}, 'row_assignments': []}

    # 4. group phrases by cluster id
    clusters_by_id: dict = defaultdict(list)
    for p, lbl in zip(phrases, best_labels):
        clusters_by_id[int(lbl)].append(p)
    valid_clusters = {cid: ps for cid, ps in clusters_by_id.items()
                      if len(ps) >= min_role_size}
    if len(valid_clusters) < 2:
        return {'valid': False, 'reason': 'too_few_valid_clusters',
                'regularity': regularity, 'roles': {}, 'row_assignments': []}

    # 5. name each cluster
    sample_descs = [str(t)[:200] for t in texts[:3]]
    roles: dict = {}
    naming_meta: dict = {}
    used_names: set = set()
    for cid, cluster_phrases in valid_clusters.items():
        role_name = None
        if llm_role_namer_fn:
            # Naming is optional decoration: a failing namer must not discard
            # the clustering, so fall back to the anonymous name and record
            # the failure in the diagnostics.
            try:
                role_name, name_meta = llm_role_namer_fn(cluster_phrases,
                                                         sample_descs)
            except Exception as e:
                role_name = None
                name_meta = {'reason': f'exception: {type(e).__name__}'}
            naming_meta[cid] = name_meta
        if not role_name:
            role_name = f'cluster_{cid}'
        # Disambiguate if LLM gave the same name to two clusters
        original = role_name
        suffix = 2
        while role_name in used_names:
            role_name = f'{original}_{suffix}'
            suffix += 1
        used_names.add(role_name)
        roles[role_name] = cluster_phrases

    # 6. per-row role assignment (longest phrase per role per row)
    phrase_to_role = {p: r for r, ps in roles.items() for p in ps}
    row_assignments: list = []
    covered = 0
    for r in refined:
        assignment: dict = {}
        # `r` is a set, whose iteration order follows string hashing and can
        # differ between interpreter runs.  Sorting makes the winner among
        # equal-length phrases stable.
        for p in sorted(r):
            role = phrase_to_role.get(p)
            if not role:
                continue
            if role not in assignment or len(p) > len(assignment[role]):
                assignment[role] = p
        if assignment:
            covered += 1
        row_assignments.append(assignment)

    coverage = covered / n
    valid = (coverage >= 0.50 and len(roles) >= 2)

    return {
        'roles':             roles,
        'row_assignments':   row_assignments,
        'coverage':          round(coverage, 4),
        'regularity':        regularity,
        'valid':             valid,
        'role_source':       'sbert_phrase_clustering',
        'phrase_silhouette': round(float(best_score), 4),
        'n_clusters':        best_k,
        'naming_meta':       naming_meta,
    }

def build_role_hierarchy(vi_list: list,
                          role_result: dict,
                          can: pd.DataFrame,
                          parent_id: int,
                          parent_path: str,
                          nodes: list,
                          node_map: dict,
                          var_nodes: dict,
                          max_depth_remaining: int = 4,
                          post_split_fn: Optional[Callable] = None,
                          min_post_split_size: int = 4) -> bool:
    """
    Nest the variables of one group by their discovered role values.

    Roles are applied one tree level at a time, highest `_ROLE_PRIORITY`
    first (roles missing from that table rank as 0).  At each level the
    variables are bucketed by their value for the current role:
      • a bucket with ≥ 2 variables becomes a new aggregation node and is
        split further by the next role,
      • a single-variable bucket is attached directly to the current parent,
      • variables with no value for the role stay at the current parent
        (optionally refined by `post_split_fn`),
      • a role that yields fewer than two buckets is skipped.

    `nodes` and `node_map` are extended in place.  `row_assignments` in
    `role_result` must be index-aligned with `vi_list`.

    Returns True when at least two aggregation nodes were created (counting
    those reported by `post_split_fn`), False otherwise — including when the
    role result is invalid, misaligned, or has no roles.
    """
    if not role_result.get('valid'):
        return False
    role_values = role_result['roles']
    assignments = role_result['row_assignments']
    if len(assignments) != len(vi_list):
        return False

    ordered_roles = sorted(role_values.keys(),
                           key=lambda name: _ROLE_PRIORITY.get(name, 0),
                           reverse=True)
    if not ordered_roles:
        return False

    position_of = {vi: pos for pos, vi in enumerate(vi_list)}
    n_aggregations = 0

    # Provenance wording depends only on which discovery route produced the
    # result, so it is resolved once for every node built below.
    route = role_result.get('role_source', 'sbert_phrase_clustering')
    if route == 'per_row_llm_extraction':
        label_source = 'per_row_llm_role'
        source_text = 'per-row LLM extraction (Zhu et al. 2025)'
    else:
        label_source = 'sbert_phrase_role'
        source_text = 'SBERT phrase cluster + LLM role naming'

    def _attach_leaves(members: list, parent: int, depth_left: int):
        """Hang `members` under `parent`, via post_split_fn when it applies."""
        nonlocal n_aggregations
        wants_post_split = (post_split_fn is not None
                            and len(members) >= min_post_split_size
                            and depth_left > 0)
        if wants_post_split:
            try:
                added = post_split_fn(members, parent, depth_left)
                if added > 0:
                    n_aggregations += added
                    return
            except Exception:
                # Best effort: fall through to the flat attachment.
                pass
        for vi in members:
            _add_child(node_map, parent, var_nodes[vi])

    def _descend(members: list, role_pos: int, parent: int, depth_left: int):
        """Split `members` by the role at `role_pos`, recursing per bucket."""
        nonlocal n_aggregations
        exhausted = (depth_left <= 0
                     or role_pos >= len(ordered_roles)
                     or len(members) <= 1)
        if exhausted:
            _attach_leaves(members, parent, depth_left)
            return

        role = ordered_roles[role_pos]
        by_value: dict = {}
        without_value: list = []
        for vi in members:
            pos = position_of.get(vi)
            value = None if pos is None else assignments[pos].get(role)
            if value:
                by_value.setdefault(value, []).append(vi)
            else:
                without_value.append(vi)

        # Fewer than two buckets: this role does not partition the subset.
        if len(by_value) < 2:
            _descend(members, role_pos + 1, parent, depth_left)
            return

        # Largest buckets first.
        for value, bucket in sorted(by_value.items(),
                                    key=lambda item: -len(item[1])):
            if len(bucket) == 1:
                _add_child(node_map, parent, var_nodes[bucket[0]])
                continue
            nid = _next_id(nodes)
            nd  = _make_node(nid, value.title(),
                              desc=(f'Role: {role} | Value: "{value}" | '
                                    f'Variables: {len(bucket)} | '
                                    f'Source: {source_text}'))
            nd['label_provenance'] = {
                'label_source':  label_source,
                'evidence_terms': [value],
                'confidence':    1.0,
                'llm_used':      True,
                'llm_rejected':  False,
                'role':          role,
            }
            nd['structure_provenance'] = {
                'route':            route,
                'aspect_method':    route,
                'slot_role':        role,
                'phrase_silhouette': role_result.get('phrase_silhouette'),
                'regularity':       role_result.get('regularity'),
                'n_clusters':       role_result.get('n_clusters'),
            }
            nodes.append(nd)
            node_map[nid] = nd
            _add_child(node_map, parent, nid)
            n_aggregations += 1
            _descend(bucket, role_pos + 1, nid, depth_left - 1)

        if without_value:
            _attach_leaves(without_value, parent, depth_left)

    _descend(vi_list, 0, parent_id, max_depth_remaining)
    return n_aggregations >= 2

def strip_group_prefix_from_labels(nodes: list) -> int:
    """
    Post-build pass: for every aggregation node, if its name starts with the
    parent group's name (case-insensitive) followed by a separator, strip the
    prefix and the separator run.

    Effect: 'DMS — Total Errors' under DMS becomes 'Total Errors'.
            'Pal Total Errors' under PAL becomes 'Total Errors'.

    Details:
      • Separators are space, em dash, en dash, hyphen, colon and slash.  A
        name that merely shares leading letters with the parent
        ('Palette Size' under 'PAL') is left alone.
      • The parent name is whitespace-trimmed before both matching and
        slicing, so padding around a parent name cannot shift the cut point.
      • A label is only rewritten when at least 2 characters remain.
      • Children are compared against the parent's name as it stands after
        the parent itself was processed.
      • The walk starts at node id 0 and follows 'related' child links.

    Args:
        nodes: node dicts with 'id', 'type', 'name' and 'related'; modified
               in place.

    Returns the number of labels modified.
    """
    separators = ' \u2014\u2013-:/'   # space, em dash, en dash, hyphen, colon, slash
    node_map = {int(n['id']): n for n in nodes}
    modified = 0

    def _walk(nid: int, parent_name: str):
        nonlocal modified
        n = node_map.get(int(nid))
        if not n:
            return
        if n.get('type') == 'aggregation' and parent_name:
            current = str(n.get('name', '')).strip()
            prefix = str(parent_name).strip()
            # Split at the trimmed prefix length and compare the heads, so the
            # matched span and the removed span are always the same characters.
            head, tail = current[:len(prefix)], current[len(prefix):]
            if (prefix and tail and tail[0] in separators
                    and head.lower() == prefix.lower()):
                stripped = tail.lstrip(separators).strip()
                if len(stripped) >= 2:
                    n['name'] = stripped
                    modified += 1
        new_parent = (n.get('name', '') if n.get('type') in ('aggregation', 'root')
                       else parent_name)
        for cid in n.get('related', []):
            _walk(int(cid), new_parent)

    _walk(0, '')
    return modified

def enforce_single_parent(nodes: list) -> int:
    """
    POST-BUILD PASS 4 — guarantee the result is a tree (each node has exactly
    one parent link).

    The role builder can attach a variable both directly to a group and again
    under a sub-aggregation of that same group — e.g. a leaf under '3 Targets'
    *and* under '3 Targets > False Alarm Sequences'.  That makes the branch a
    DAG, which (a) diverges from the single-parent tree format, (b) fragments
    the branch, and (c) breaks proportional ('total') sunburst/treemap sizing.

    For every node with more than one parent link, keep ONE link from the
    MOST SPECIFIC (deepest) parent and drop all others.  Keeping the deepest
    parent removes the redundant direct attachment while preserving the finer
    sub-grouping the role extractor discovered.  Fully generic — no domain
    knowledge, no hardcoded labels.

    Details:
      • Depth is the shortest distance from the root (id 0) along 'related'
        links; parents not reachable from the root count as depth 0.
      • Among equally deep parents the one listed first in `nodes` is kept.
      • Ids are compared after `int()` conversion everywhere, so lists
        holding ids in another representation (e.g. strings) are handled the
        same way as plain ints; surviving entries keep their original form.
      • A child listed more than once by the kept parent is reduced to a
        single entry.

    Args:
        nodes: node dicts with 'id' and 'related'; 'related' lists are
               replaced in place on the affected parents.

    Returns the number of redundant parent links removed.
    """
    from collections import deque
    node_map = {int(n['id']): n for n in nodes}

    # depth = shortest distance from root (id 0) along child edges
    depth = {0: 0}
    frontier = deque([0])
    while frontier:
        cur = frontier.popleft()
        for c in node_map.get(cur, {}).get('related', []):
            c = int(c)
            if c not in depth:
                depth[c] = depth[cur] + 1
                frontier.append(c)

    # child id → parent ids, one entry per link (repeats are kept on purpose
    # so duplicate links are noticed below)
    parents = defaultdict(list)
    for n in nodes:
        for c in n.get('related', []):
            parents[int(c)].append(int(n['id']))

    removed = 0
    for child, ps in parents.items():
        if len(ps) <= 1:
            continue
        keep = max(ps, key=lambda p: depth.get(p, 0))  # deepest = most specific
        for p in dict.fromkeys(ps):                    # each parent once, in order
            par = node_map.get(p)
            if not par:
                continue
            links = par.get('related', [])
            quota = 1 if p == keep else 0              # links to `child` allowed here
            survivors = []
            for x in links:
                if int(x) != child:
                    survivors.append(x)
                elif quota:
                    quota -= 1
                    survivors.append(x)
                else:
                    removed += 1
            if len(survivors) != len(links):
                par['related'] = survivors
    return removed

def prune_empty_aggregations(nodes: list) -> int:
    """
    POST-BUILD PASS 5 — drop aggregation nodes that ended up with no children.

    `enforce_single_parent` can empty a shallow aggregation when all of its
    variables were kept under a deeper/sibling parent (e.g. 'RVP Response
    Latency' losing every leaf to a more specific group).  An empty category
    node is export noise — it renders as a blank sector and has no members.

    Repeats until no childless aggregation remains, because removing a node
    can leave its parent childless in turn.  Only nodes whose 'type' is
    'aggregation' are ever removed.

    Args:
        nodes: node dicts with 'id', 'type' and 'related'.  The list is
               rewritten in place (slice assignment), and the 'related' list
               of every affected parent is replaced by its surviving ids
               converted to int.

    Returns the number of nodes removed.
    """
    removed = 0
    while True:
        empties = {int(n['id']) for n in nodes
                   if n.get('type') == 'aggregation' and not n.get('related')}
        if not empties:
            break
        nodes[:] = [n for n in nodes if int(n['id']) not in empties]
        for n in nodes:
            children = n.get('related', [])
            if any(int(c) in empties for c in children):
                n['related'] = [int(c) for c in children
                                if int(c) not in empties]
        removed += len(empties)
    return removed

def mine_phrase_slots(texts: list,
                       text_col_names: Optional[list] = None,
                       min_phrase_count: int = 2,
                       min_slot_size: int = 2,
                       coverage_threshold: float = 0.55,
                       excl_threshold: float = 0.75,
                       group_name: str = '',
                       llm_role_classifier_fn: Optional[Callable] = None) -> dict:
    """
    Discover phrase slots in a group of variable descriptions.

    Algorithm:
      1. Extract concept-prefix from each description (text before ':' in a
         description-like column; full text if no such column).
      2. Tokenise + lowercase; generate n-grams (1–4 tokens, ≥3 chars).
      3. Keep n-grams that appear in ≥ min_phrase_count rows.
      4. For each row, retain only maximal phrases (drop sub-phrases of
         longer phrases present in the same row).
      5. Compute mutual-exclusion score per phrase pair:
            M[a,b] = 1 − cooc[a,b] / min(count[a], count[b])
         Phrases with M[a,b] ≥ excl_threshold are 'alternatives' (rarely
         appear together → likely fill the same slot in different rows).
      6. Cluster phrases into slots.  If `llm_role_classifier_fn` is given and
         yields ≥ 2 usable roles, those roles are the slots; otherwise slots
         are grown greedily by mutual exclusion, seeding from the
         most-frequent phrase.
      7. A slot is valid if it has ≥ min_slot_size distinct phrases.
      8. Coverage: fraction of rows that contain ≥1 phrase from ≥1 slot.
         If coverage < threshold or < 2 slots survive → valid=False.

    Returns:
      {
        'slots':            list[ {phrases: set[str], best: str,
                                   role_name: str} ],
        'row_assignments':  list[ dict[slot_id → phrase] ] for each row,
        'coverage':         float in [0,1],
        'valid':            bool — True if slot structure is strong enough,
        'slot_source':      'llm_role_classification' or 'mutual_exclusion'.
      }
      The early-exit results (too few rows / phrases / slots) contain only
      'slots', 'row_assignments', 'coverage' and 'valid'.
    """
    n = len(texts)
    # Groups of fewer than 4 rows are not mined at all.
    if n < 4:
        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}

    # Lower-cased description-column names; None means "accept any key".
    text_keys = ({c.strip().lower() for c in (text_col_names or [])}
                 if text_col_names else None)

    # ── 1. extract concept prefixes ──────────────────────────────────────────
    # Each text is a ' | '-joined list of 'key: value' chunks.  The first chunk
    # whose key is accepted supplies the prefix: the part of its value that
    # precedes the next ':'.
    prefixes = []
    for t in texts:
        prefix = ''
        for chunk in str(t).split(' | '):
            if ':' not in chunk:
                continue
            key, val = chunk.split(':', 1)
            key_l = key.strip().lower()
            if text_keys is not None and key_l not in text_keys:
                continue
            prefix = val.split(':')[0].strip()
            break
        # No accepted chunk (or an empty prefix): fall back to the whole text.
        if not prefix:
            prefix = str(t).strip()
        # Strip 'KEY:', 'Note:', etc. before tokenising
        prefix = _strip_leading_markers(prefix)
        prefixes.append(prefix)

    # ── 2. normalise: lowercase; alphanumerics and spaces are kept, every ────
    #       other character (parentheses included) becomes a space, and runs
    #       of whitespace are collapsed.
    def _normalize(p: str) -> str:
        s = ''.join(ch if ch.isalnum() or ch == ' ' else ' '
                    for ch in str(p).lower())
        return ' '.join(s.split())

    norm = [_normalize(p) for p in prefixes]

    # ── 3. extract n-grams (1..4) per row, keep ≥3 chars ─────────────────────
    # phrase_count is a document frequency: each row contributes at most 1 per
    # phrase because phrases are collected in a per-row set first.
    row_phrases: list = []
    phrase_count: Counter = Counter()
    for text in norm:
        tokens = text.split()
        row_set = set()
        for ngram_n in range(1, 5):
            for i in range(len(tokens) - ngram_n + 1):
                phrase = ' '.join(tokens[i:i + ngram_n])
                if len(phrase) >= 3:
                    row_set.add(phrase)
        row_phrases.append(row_set)
        for p in row_set:
            phrase_count[p] += 1

    # ── 4. keep repeated phrases, retain only maximal phrases per row ────────
    # Longest (by token count) first; a phrase is dropped when it is a
    # character-level substring of an already-kept phrase of the same row.
    keep = {p for p, c in phrase_count.items() if c >= min_phrase_count}
    refined = []
    for row in row_phrases:
        sorted_by_len = sorted(row & keep, key=lambda p: -len(p.split()))
        kept: list = []
        for p in sorted_by_len:
            if not any(p != q and p in q for q in kept):
                kept.append(p)
        refined.append(set(kept))

    # Recount after refinement, re-filter
    phrase_count = Counter()
    for r in refined:
        for p in r:
            phrase_count[p] += 1
    # NEW: filter out non-meaningful phrases (stopwords, group-name echoes,
    # dictionary markers) before they enter mutual-exclusion clustering.
    group_lower = (group_name or '').strip().lower()
    phrases = sorted([p for p, c in phrase_count.items()
                       if c >= min_phrase_count
                       and _phrase_is_meaningful(p, group_lower)])
    # Two slots of min_slot_size phrases each are the minimum useful outcome.
    if len(phrases) < 2 * min_slot_size:
        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}

    # Index the surviving phrases (alphabetical order) for matrix work.
    p_idx  = {p: i for i, p in enumerate(phrases)}
    n_p    = len(phrases)
    counts = np.array([phrase_count[p] for p in phrases])

    # ── 5. co-occurrence + mutual-exclusion matrix ───────────────────────────
    # cooc[i, j] = number of rows whose refined phrase set holds both i and j.
    cooc = np.zeros((n_p, n_p), dtype=int)
    for r in refined:
        idxs = [p_idx[p] for p in r if p in p_idx]
        for i in idxs:
            for j in idxs:
                if i != j:
                    cooc[i, j] += 1
    min_counts = np.minimum.outer(counts, counts).astype(float)
    min_counts[min_counts == 0] = 1.0      # guard against division by zero
    mut_excl   = 1.0 - cooc / min_counts
    np.fill_diagonal(mut_excl, 0)          # a phrase is not its own alternative

    # ── 6a. UPSTREAM ROUTE: ask the LLM to classify phrases into roles ───────
    # The LLM proposes a role schema (e.g. {measure: [...], statistic: [...],
    # condition: [...]}) — phrases are assigned to roles, role names provide
    # semantic ordering for the hierarchy.  Anti-hallucination: every phrase
    # must come back verbatim, otherwise rejected by the validator inside
    # make_llm_role_classifier_fn.
    slot_source = 'mutual_exclusion'
    role_names: list = []
    slots: list = []  # list[set[int]]  — phrase indices per slot

    if llm_role_classifier_fn is not None:
        # Best effort: a failing classifier simply leaves the fallback route
        # in charge.  NOTE(review): classifier_meta is assigned here but not
        # read anywhere else in this function.
        try:
            classified, classifier_meta = llm_role_classifier_fn(
                phrases, texts, group_name)
        except Exception:
            classified, classifier_meta = None, {'reason': 'exception'}
        if classified:
            # Build slots in the order the LLM proposed them.  Each role is
            # one slot containing the phrase-index set.  Phrases that are not
            # among the mined `phrases` are ignored.
            for role_name, role_phrases in classified.items():
                idx_set = {p_idx[p] for p in role_phrases if p in p_idx}
                if len(idx_set) >= min_slot_size:
                    slots.append(idx_set)
                    role_names.append(role_name)
            # The LLM route is adopted only if it produced ≥ 2 usable slots;
            # otherwise 6b below discards whatever was collected here.
            if len(slots) >= 2:
                slot_source = 'llm_role_classification'

    # ── 6b. FALLBACK: greedy mutual-exclusion expansion ──────────────────────
    if slot_source == 'mutual_exclusion':
        visited: set = set()
        slots = []
        order   = np.argsort(-counts)      # most frequent phrase first
        for seed in order:
            if seed in visited:
                continue
            slot = {int(seed)}
            # The work list is consumed with pop() (newest entry first), i.e.
            # it behaves as a stack.
            queue = [int(seed)]
            while queue:
                cur = queue.pop()
                for j in np.where(mut_excl[cur] >= excl_threshold)[0]:
                    j = int(j)
                    if j in slot:
                        continue
                    # A candidate must also be (nearly) exclusive with every
                    # phrase already in the slot; the bar is relaxed by 0.15.
                    if all(mut_excl[j, k] >= excl_threshold - 0.15 for k in slot):
                        slot.add(j)
                        queue.append(j)
            # NOTE(review): `visited` only prevents re-seeding; a phrase that
            # already belongs to an accepted slot can still be pulled into a
            # later one, so slots may overlap.  Confirm this is intended.
            if len(slot) >= min_slot_size:
                slots.append(slot)
                visited |= slot
        # Synthesise anonymous role names from the most-frequent phrase in each
        # slot — these become the visible slot tags in provenance.
        role_names = [phrases[max(s, key=lambda i: counts[i])] for s in slots]

    if len(slots) < 2:
        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}

    # ── 7. assign per-row phrase per slot ────────────────────────────────────
    # For each slot a row gets its most frequent matching phrase; a row counts
    # as covered when it matches at least one slot.
    row_assignments: list = []
    covered = 0
    for r in refined:
        row_idx = {p_idx[p] for p in r if p in p_idx}
        assignment: dict = {}
        any_match = False
        for slot_id, slot in enumerate(slots):
            matched = row_idx & slot
            if matched:
                best = max(matched, key=lambda i: counts[i])
                assignment[slot_id] = phrases[best]
                any_match = True
        if any_match:
            covered += 1
        row_assignments.append(assignment)

    coverage = covered / n
    valid    = (coverage >= coverage_threshold and len(slots) >= 2)

    return {
        'slots': [{'phrases':  {phrases[i] for i in s},
                    'best':     phrases[max(s, key=lambda i: counts[i])],
                    'role_name': role_names[idx] if idx < len(role_names) else ''}
                   for idx, s in enumerate(slots)],
        'row_assignments': row_assignments,
        'coverage':        round(coverage, 4),
        'valid':           valid,
        'slot_source':     slot_source,    # 'llm_role_classification' or 'mutual_exclusion'
    }


def build_slot_hierarchy(vi_list: list,
                          slot_result: dict,
                          can: pd.DataFrame,
                          parent_id: int,
                          parent_path: str,
                          nodes: list,
                          node_map: dict,
                          var_nodes: dict,
                          max_depth_remaining: int = 4,
                          post_slot_split_fn: Optional[Callable] = None,
                          min_post_slot_size: int = 4) -> bool:
    """
    Build a hierarchy for `vi_list` using inferred phrase slots.

    Slots are ordered by a partition-quality score (more distinct values and
    fewer singletons score higher; see `_slot_score`).  When the slot schema
    came from LLM role classification, the role name is used as the primary
    sort key and the score as the tie-breaker.  Each slot that actually
    partitions the current subset becomes one tree level; a slot that yields
    at most one group for a subset is skipped for that subset.  Variables
    without a phrase for the current slot are attached at the current parent
    (or handed to `post_slot_split_fn`).

    The tree is built in place: aggregation nodes are appended to `nodes`,
    registered in `node_map`, and linked through the `related` lists.  If the
    build is rejected (fewer than two aggregation nodes were created), every
    mutation made by this call is rolled back so that the caller's fallback
    starts from the state it passed in and does not attach leaves twice.

    Args:
        vi_list: variable indices, positionally aligned with
            `slot_result['row_assignments']`.
        slot_result: slot-mining result; reads 'valid', 'slots',
            'row_assignments', and optionally 'slot_source' / 'coverage'.
        can: not read by this function (kept for interface compatibility).
        parent_id: id of the node the new sub-tree is attached to.
        parent_path: not read by this function (kept for interface
            compatibility).
        nodes: flat node list, extended in place.
        node_map: id → node dict, extended in place.
        var_nodes: variable index → leaf node id.
        max_depth_remaining: maximum number of slot levels to add.
        post_slot_split_fn: optional callback
            `(vi_grp, parent, depth_left) -> int` used to deepen a terminal
            group; its return value is added to the aggregation counter.
        min_post_slot_size: minimum group size for calling the callback.

    Returns:
        True when at least two aggregation nodes were created; False when the
        input is invalid/misaligned, no slot has a positive score, or the
        result was too shallow (in which case the tree is left unchanged and
        the caller should fall back to NMF/GMM).
    """
    import logging

    if not slot_result.get('valid'):
        return False

    slots       = slot_result['slots']
    row_assigns = slot_result['row_assignments']
    if len(vi_list) != len(row_assigns):
        return False

    # Index map: global vi → local position in row_assigns
    vi_to_local = {vi: idx for idx, vi in enumerate(vi_list)}

    # Order slots by partition-quality + semantic-shape heuristics.
    # Higher score → used at a shallower level in the hierarchy.
    #
    # Heuristics (all generic, no domain knowledge):
    #   + many distinct values, low singleton fraction (existing)
    #   + average phrase token-length (multi-word noun phrases preferred)
    #   + total row coverage of the slot
    #   − slots whose top phrases look like pure conditions
    #     (numeric token + temporal/quantity word)
    #   − slots where every phrase is just a number or 'all' / 'none' modifier
    _CONDITION_HINTS = {'second', 'seconds', 'minute', 'minutes', 'hour',
                        'hours', 'day', 'days', 'month', 'months', 'year',
                        'years', 'week', 'weeks', 'box', 'boxes', 'token',
                        'tokens', 'pattern', 'patterns', 'trial', 'trials'}
    _QUANTIFIERS = {'all', 'none', 'simultaneous'}

    def _looks_like_condition(phrase: str) -> bool:
        """True when the phrase has a number/quantifier AND a unit-like hint."""
        toks = phrase.split()
        if not toks:
            return False
        has_num = any(t.isdigit() or t in _QUANTIFIERS for t in toks)
        has_hint = any(t in _CONDITION_HINTS for t in toks)
        return has_num and has_hint

    def _slot_score(slot_id: int) -> float:
        """Data-driven partition quality of one slot (-1.0 when unused)."""
        vals = Counter()
        for a in row_assigns:
            v = a.get(slot_id)
            if v:
                vals[v] += 1
        if not vals:
            return -1.0
        n_distinct   = len(vals)
        n_singletons = sum(1 for c in vals.values() if c == 1)
        coverage     = sum(vals.values()) / max(1, len(row_assigns))
        avg_tokens   = float(np.mean([len(p.split()) for p in vals]))
        condition_frac = sum(1 for p in vals if _looks_like_condition(p)) / n_distinct

        base = n_distinct - 0.6 * n_singletons
        base += 0.4 * coverage
        base += 0.3 * (avg_tokens - 1)
        base -= 0.8 * condition_frac
        return base

    # When the LLM produced the role schema, trust its role ordering for slots
    # whose role name is broadly "measure-like" (base concept) over
    # "statistic/condition/subtype" (modifiers).  This is generic English
    # vocabulary, not domain knowledge — same heuristic used by IE slot-
    # induction work (cf. Xu et al., FCS 2024 IE survey).
    slot_source = slot_result.get('slot_source', 'mutual_exclusion')
    _MEASURE_LIKE = {'measure', 'outcome', 'metric', 'variable', 'quantity'}
    _STATISTIC_LIKE = {'statistic', 'stat', 'aggregate', 'summary'}
    _CONDITION_LIKE = {'condition', 'modifier', 'context', 'setting'}
    _SUBTYPE_LIKE   = {'subtype', 'type', 'kind', 'category'}

    def _role_priority(role: str) -> int:
        """Rank a role name by substring match; 0 means unknown role."""
        r = (role or '').strip().lower()
        if any(k in r for k in _MEASURE_LIKE):   return 4
        if any(k in r for k in _STATISTIC_LIKE): return 3
        if any(k in r for k in _CONDITION_LIKE): return 2
        if any(k in r for k in _SUBTYPE_LIKE):   return 1
        return 0   # unknown role — fall back to data-driven score

    # Score every slot exactly once; the score is reused as sort key and as
    # the positivity filter below.
    slot_scores = [_slot_score(i) for i in range(len(slots))]

    if slot_source == 'llm_role_classification':
        ordered_slots = sorted(
            range(len(slots)),
            key=lambda i: (_role_priority(slots[i].get('role_name', '')),
                            slot_scores[i]),
            reverse=True)
    else:
        ordered_slots = sorted(range(len(slots)),
                               key=lambda i: slot_scores[i], reverse=True)
    ordered_slots = [s for s in ordered_slots if slot_scores[s] > 0]
    if not ordered_slots:
        return False

    aggregations_made = [0]   # mutable counter for fallback decision

    def _snapshot(anchor_id: int) -> tuple:
        """Record what is needed to undo everything attached below `anchor_id`."""
        anchor = node_map.get(int(anchor_id))
        kids = anchor.get('related') if anchor else None
        return len(nodes), anchor, (list(kids) if kids is not None else None)

    def _rollback(snap: tuple) -> None:
        """Drop nodes appended since `snap` and restore the anchor's children."""
        n_before, anchor, kids_before = snap
        for extra in nodes[n_before:]:
            node_map.pop(int(extra['id']), None)
        del nodes[n_before:]
        if kids_before is not None:
            anchor['related'][:] = kids_before

    def _attach_or_sub_recurse(vi_grp: list, parent: int, depth_left: int):
        """
        Terminal-leaf attach point inside slot mining.  When slots are
        exhausted but the cluster still has enough variables AND we have a
        post-slot callback (NMF/FASTopic + GMM splitter), recurse further to
        deepen the tree.  Otherwise attach leaves directly.
        """
        if (post_slot_split_fn is not None
                and len(vi_grp) >= min_post_slot_size
                and depth_left > 0):
            snap = _snapshot(parent)
            try:
                added = post_slot_split_fn(vi_grp, parent, depth_left)
                if added > 0:
                    aggregations_made[0] += added
                    return
            except Exception:
                # Best-effort: the callback is optional.  Undo whatever it
                # attached before failing so the direct leaf attach below
                # does not duplicate variables under half-built nodes.
                _rollback(snap)
                logging.getLogger(__name__).warning(
                    'post-slot split failed; attaching %d variables directly',
                    len(vi_grp), exc_info=True)
        for vi in vi_grp:
            _add_child(node_map, parent, var_nodes[vi])

    def _split(vi_subset: list, slot_ord_pos: int, current_parent: int,
               depth_remaining: int):
        """Partition `vi_subset` by the slot at `slot_ord_pos`, then recurse."""
        if (depth_remaining <= 0 or slot_ord_pos >= len(ordered_slots)
                or len(vi_subset) <= 1):
            _attach_or_sub_recurse(vi_subset, current_parent, depth_remaining)
            return

        slot_id = ordered_slots[slot_ord_pos]
        groups: dict = defaultdict(list)
        unassigned: list = []
        for vi in vi_subset:
            local = vi_to_local.get(vi)
            val = row_assigns[local].get(slot_id) if local is not None else None
            if val:
                groups[val].append(vi)
            else:
                unassigned.append(vi)

        # If this slot doesn't partition the subset, move to next slot
        if len(groups) <= 1:
            _split(vi_subset, slot_ord_pos + 1, current_parent, depth_remaining)
            return

        # The slot's role name (from LLM classification, when applicable)
        slot_role = slots[slot_id].get('role_name', '') if slot_id < len(slots) else ''
        # Largest groups first so the biggest branches appear first.
        for val, vi_grp in sorted(groups.items(), key=lambda x: -len(x[1])):
            if len(vi_grp) == 1:
                # Singleton prevention: no wrapper node for a single variable.
                _add_child(node_map, current_parent, var_nodes[vi_grp[0]])
                continue
            nid = _next_id(nodes)
            nd  = _make_node(nid, val.title(),
                             desc=(f'Role: {slot_role or "—"} | '
                                   f'Slot phrase: "{val}" | '
                                   f'Variables: {len(vi_grp)} | '
                                   f'Source: phrase-slot mining ({slot_source})'))
            nd['label_provenance'] = {
                'label_source': 'phrase_slot',
                'evidence_terms': [val],
                'confidence': 1.0,
                'llm_used': slot_source == 'llm_role_classification',
                'llm_rejected': False,
                'role':        slot_role,
            }
            nd['structure_provenance'] = {
                'route':           'slot_mining',
                'aspect_method':   slot_source,
                'silhouette':      None,
                'slot_coverage':   round(float(slot_result.get('coverage', 0)), 3),
                'slot_role':       slot_role,
            }
            nodes.append(nd); node_map[nid] = nd
            _add_child(node_map, current_parent, nid)
            aggregations_made[0] += 1
            _split(vi_grp, slot_ord_pos + 1, nid, depth_remaining - 1)

        if unassigned:
            _attach_or_sub_recurse(unassigned, current_parent, depth_remaining)

    build_snapshot = _snapshot(parent_id)
    _split(vi_list, 0, parent_id, max_depth_remaining)

    # Reject the slot-built tree if it added almost no structure (likely the
    # slots were not actually useful for this group).
    if aggregations_made[0] >= 2:
        return True
    # Rejected: undo the partial tree so the caller's fallback starts clean.
    _rollback(build_snapshot)
    return False

# ──────────────────────────────────────────────────────────────────────────────
# STEP 6  — DYNAMIC TOP-DOWN LOD TREE  [ZHU §3.3 adapted]
# ──────────────────────────────────────────────────────────────────────────────
def _next_id(nodes: list) -> int:
    return max((int(n['id']) for n in nodes), default=0) + 1

def _add_child(node_map: dict, parent_id: int, child_id: int):
    p = node_map.get(int(parent_id))
    if p and int(child_id) not in p['related']:
        p['related'].append(int(child_id))

def _make_node(nid, name, ntype='aggregation', desc='', dtype='determine') -> dict:
    return {'id': int(nid), 'name': str(name), 'related': [],
            'type': ntype, 'desc': str(desc), 'dtype': dtype, 'isShown': True}

# ──────────────────────────────────────────────────────────────────────────────
# POST-BUILD PASS 1  — SIBLING COMMON-PREFIX FACTORING
# ──────────────────────────────────────────────────────────────────────────────
def factor_sibling_common_prefixes(nodes: list,
                                     min_siblings: int = 3,
                                     min_prefix_tokens: int = 2) -> int:
    """
    For each parent whose ≥`min_siblings` aggregation children share a
    common multi-token title prefix, insert a new intermediate parent named
    by that prefix and re-attach the matching siblings under it (with the
    prefix stripped from each name).

    Prefix matching is case-insensitive and always leaves at least one token
    in the child's name.  Prefixes that start or end with a stopword (or are
    all stopwords) are rejected.  The longest qualifying prefix is factored
    first; the parent and the new node are then re-examined, so several
    groups and nested prefixes are handled.

    Generic, no domain knowledge.  Inspired by sibling-label factoring
    common in faceted-classification systems (Stoica & Hearst, NAACL 2007,
    'Castanet') — collapsing redundant repeated tokens in sibling names.

    Args:
        nodes: flat node list; mutated in place (children renamed, parents
            rewired, new nodes appended).
        min_siblings: minimum number of children that must share the prefix.
        min_prefix_tokens: minimum prefix length in whitespace tokens.

    Returns the number of factor-parents inserted.
    """
    from collections import deque

    node_map = {int(n['id']): n for n in nodes}
    inserted = 0
    # Work list seeded with a snapshot of the current aggregation/root nodes;
    # deque gives O(1) pops from the front.
    queue = deque(int(n['id']) for n in nodes
                  if n.get('type') in ('aggregation', 'root'))
    while queue:
        parent_id = queue.popleft()
        parent = node_map.get(parent_id)
        if not parent:
            continue
        # Gather aggregation children as (id, original tokens, lowered tokens)
        agg_children = []
        for cid in parent.get('related', []):
            child = node_map.get(int(cid))
            if not child or child.get('type') != 'aggregation':
                continue
            toks = str(child.get('name', '')).split()
            if len(toks) >= min_prefix_tokens:
                agg_children.append(
                    (int(cid), toks, tuple(t.lower() for t in toks)))
        if not agg_children or len(agg_children) < min_siblings:
            continue
        # Greedy: find the longest prefix shared by ≥ min_siblings children
        best_prefix: list = []
        best_group: list = []
        # Sort children by name tokens for stable grouping
        agg_children.sort(key=lambda x: x[1])
        # Try each possible prefix length from longest down.  A child must
        # keep something AFTER the prefix, so the longest useful length is
        # one less than the longest name.
        max_len = max(len(toks) for _, toks, _ in agg_children)
        for length in range(max_len - 1, min_prefix_tokens - 1, -1):
            prefix_counts: Counter = Counter()
            for _, toks, low in agg_children:
                if len(toks) > length:
                    prefix_counts[low[:length]] += 1
            for pfx, cnt in prefix_counts.most_common():
                if cnt < min_siblings:
                    continue
                # Reject low-quality prefixes: all-stopword, or starting/
                # ending with a stopword (e.g. "the number of", "the").
                # A good factored parent is a real concept phrase.
                if all(t in _STOPWORDS for t in pfx):
                    continue
                if pfx[0] in _STOPWORDS or pfx[-1] in _STOPWORDS:
                    continue
                group = [(cid, toks) for cid, toks, low in agg_children
                          if len(toks) > length and low[:length] == pfx]
                if len(group) >= min_siblings:
                    best_prefix = list(pfx)
                    best_group  = group
                    break
            if best_prefix:
                break
        if not best_prefix:
            continue

        # Build the new intermediate parent
        new_id = max(node_map) + 1
        # Keep the original capitalisation of the prefix
        # (take it from the first matched child's tokens)
        orig_tokens = best_group[0][1][:len(best_prefix)]
        prefix_name = ' '.join(orig_tokens)
        moved_id_list = [cid for cid, _ in best_group]
        # Textual evidence for the new label: the sibling names as they were
        # before the prefix is stripped.  (The node ids themselves are kept
        # in structure_provenance['factored_from'].)
        sibling_names = [' '.join(toks) for _, toks in best_group]
        new_node = _make_node(new_id, prefix_name, ntype='aggregation',
                               desc=(f'Factored common prefix: "{prefix_name}" | '
                                     f'Siblings: {len(best_group)} | '
                                     f'Source: sibling factoring [Castanet 2007]'))
        new_node['label_provenance'] = {
            'label_source':     'factored_common_prefix',
            'evidence_terms':   sibling_names,
            'confidence':       1.0,
            'llm_used':         False,
            'llm_rejected':     False,
        }
        new_node['structure_provenance'] = {
            'route':            'sibling_factoring',
            'aspect_method':    None,
            'silhouette':       None,
            'slot_coverage':    None,
            'factored_from':    list(moved_id_list),
            'common_prefix_tokens': len(best_prefix),
        }
        # Rename the factored siblings (strip the prefix from their names)
        for cid, toks in best_group:
            child = node_map[cid]
            new_name = ' '.join(toks[len(best_prefix):]).strip()
            if new_name:
                child['name'] = new_name
        # Rewire parent → new_node → factored siblings
        moved_ids = set(moved_id_list)
        parent['related'] = [c for c in parent['related']
                             if int(c) not in moved_ids]
        parent['related'].append(new_id)
        new_node['related'] = list(moved_id_list)
        nodes.append(new_node)
        node_map[new_id] = new_node
        inserted += 1
        # Re-examine this parent in case multiple prefix groups exist
        queue.append(parent_id)
        # Also examine the new parent for further nesting
        queue.append(new_id)
    return inserted

# ──────────────────────────────────────────────────────────────────────────────
# POST-BUILD PASS 2  — TRACO-INSPIRED LOW-QUALITY NODE PRUNING
# ──────────────────────────────────────────────────────────────────────────────
def prune_low_quality_aggregations(nodes: list,
                                     tfidf: TfidfVectorizer = None,
                                     min_coherence: float = 0.0,
                                     max_child_ratio: float = 1.0) -> int:
    """
    Conservative noise pruning.

    ONLY rule implemented (Rule A): dissolve aggregation nodes whose name is
    pure noise — single-word stopword titles ('And', 'Them'), dictionary
    markers ('Key', 'Note'), all-stopword titles ('To Be', 'Have A Lot Of'),
    or an empty name.  Dissolving promotes the node's children to its parent
    and marks the node `type='dissolved'`, `isShown=False`.

    Group anchors (route='group_anchor') and sibling-factoring nodes
    (route='sibling_factoring') are NEVER pruned — they are structural and
    legitimately have short names that may not score well on TF-IDF metrics.
    Aggregation nodes without a parent are also left alone.

    Rules B (parent-child coherence) and C (lopsided split) are intentionally
    NOT applied — both metrics punish good hierarchies where children
    are legitimately more specific than parents (low TF-IDF cosine) or where
    a slot mining branch happens to be dominated by one large subgroup.
    NOTE: `tfidf`, `min_coherence` and `max_child_ratio` are accepted for
    interface compatibility but are not read by this function; passing them
    does not enable Rules B or C.

    Inspired by TraCo (Wu et al., AAAI 2024): affinity / rationality /
    diversity failures are common in hierarchical topic models.  Here we
    enforce the most conservative form of that — only obviously-noise names.

    Returns the number of nodes dissolved.
    """
    node_map = {int(n['id']): n for n in nodes}
    # child id → parent id (first live parent wins).  Nodes dissolved by an
    # earlier pass still list their former children in 'related'; they are
    # skipped so a stale node is never mistaken for the live parent.
    parent_of: dict = {}
    for n in nodes:
        if n.get('type') == 'dissolved':
            continue
        for c in n.get('related', []):
            parent_of.setdefault(int(c), int(n['id']))

    FORBIDDEN = frozenset(_STOPWORDS) | frozenset({
        'key', 'note', 'item', 'label', 'group', 'cluster',
    })
    # Routes that are ALWAYS structural — never prune these even if the name
    # is short / stopword-like (e.g. 'MOT' is a group anchor, not noise).
    PROTECTED_ROUTES = frozenset({'group_anchor', 'sibling_factoring'})

    # A token is "noise" if it is a stopword/marker OR shorter than 2 chars
    # AND not a digit (numeric labels like '2' / '4' / '6' / '8' are kept —
    # they may be meaningful, e.g. PAL pattern counts).
    def _is_noise_tok(t: str) -> bool:
        if t in FORBIDDEN:
            return True
        return len(t) < 2 and not t.isdigit()

    to_dissolve: list = []
    for n in nodes:
        if n.get('type') != 'aggregation':
            continue
        nid = int(n['id'])
        if nid not in parent_of:
            # No parent to promote the children to.
            continue
        sp = n.get('structure_provenance') or {}
        if sp.get('route') in PROTECTED_ROUTES:
            continue

        name_lower = str(n.get('name', '')).strip().lower()
        if not name_lower:
            to_dissolve.append(nid)
            continue

        toks = name_lower.split()
        all_noise = bool(toks) and all(_is_noise_tok(t) for t in toks)
        if name_lower in FORBIDDEN or all_noise:
            to_dissolve.append(nid)

    # Dissolve: promote children to grandparent
    dissolved = 0
    for nid in to_dissolve:
        n = node_map.get(nid)
        if not n:
            continue
        gp_id = parent_of.get(nid)
        if gp_id is None:
            continue
        gp = node_map.get(gp_id)
        if not gp:
            continue
        gp['related'] = [c for c in gp['related'] if int(c) != nid]
        for cid in n.get('related', []):
            if int(cid) not in gp['related']:
                gp['related'].append(int(cid))
            # Keep the map current so chained dissolves (a dissolved node
            # directly under another dissolved node) promote correctly.
            parent_of[int(cid)] = gp_id
        n['isShown'] = False
        n['type']    = 'dissolved'
        n.setdefault('structure_provenance', {})['dissolved_reason'] = \
            'noise_label_only'
        dissolved += 1
    return dissolved

def build_dynamic_lod_tree(can: pd.DataFrame,
                            aspect_reprs: list,
                            aspect_labels: list,
                            tfidf: TfidfVectorizer,
                            max_depth: int = 6,
                            min_cluster_size: int = 2,
                            sil_threshold: float = 0.04,
                            max_clusters_per_split: int = 8,
                            project: str = 'project',
                            local_nmf: bool = True,
                            min_local_nmf_size: int = 8,
                            max_aspects: int = 10,
                            sbert_model=None,
                            llm_label_fn: Optional[Callable] = None,
                            use_slot_mining: bool = True,
                            text_col_names: Optional[list] = None,
                            use_fastopic: bool = True,
                            fastopic_min_size: int = 8,
                            llm_role_classifier_fn: Optional[Callable] = None,
                            use_role_decomposition: bool = True,
                            llm_role_namer_fn: Optional[Callable] = None,
                            role_regularity_threshold: float = 0.40,
                            per_row_role_extractor_fn: Optional[Callable] = None,
                            use_per_row_role_extraction: bool = True) -> list:
    """
    Build a dynamic top-down LoD tree.

    Entry strategy (new — fully data-driven, no hardcoding):
      • If detected _group metadata provides L1/L2 structure, materialise those
        path segments as aggregation nodes first.
      • Then apply NMF aspect discovery and GMM clustering *locally* inside each
        terminal group (not globally across all variables).
      • Falls back to global NMF from root when no group structure is found.

    Within each recursive split [ZHU §3.3 adapted]:
      • Evaluate K aspects by silhouette — select the highest  (simplified
        best-aspect split; not the full probabilistic search of Zhu Eq.6/7).
      • Single-variable clusters are attached directly without an aggregation
        wrapper (singleton prevention).
    """
    texts  = can['_text'].fillna('').astype(str).tolist()
    n_vars = len(can)

    # ── build leaf attribute nodes (ids 1..n_vars) ───────────────────────────
    nodes: list    = [_make_node(0, project, ntype='root', desc='Root node')]
    var_nodes: dict = {}   # can positional index → node id
    for i, (_, row) in enumerate(can.iterrows(), start=1):
        nd = _make_node(i, row['_label'], ntype='attribute',
                        desc=row['_text'], dtype='determine')
        nd['metadata'] = {
            'leaf_id': str(row['_id']),
            'row_index': int(row['_row']),
            'group': row['_group'],
        }
        nodes.append(nd)
        var_nodes[int(row.name)] = i
    node_map: dict = {int(n['id']): n for n in nodes}

    # ── recursive splitter ────────────────────────────────────────────────────
    def _recurse(vi_global: list,
                 cur_reprs: list,
                 cur_labels: list,
                 cur_tfidf: TfidfVectorizer,
                 parent_id: int,
                 depth: int,
                 parent_path: str,
                 aspect_method_tag: str = 'nmf'):
        """
        vi_global  : global positional indices into `can` for this node's variables.
        cur_reprs  : list of K arrays, each shape (len(vi_global), d).
                     Rows correspond positionally to vi_global — no global indexing.
        cur_labels : NMF aspect labels for cur_reprs.
        cur_tfidf  : TF-IDF vectorizer fitted on this scope's texts.
        """
        if depth >= max_depth or len(vi_global) < min_cluster_size:
            for vi in vi_global:
                _add_child(node_map, parent_id, var_nodes[vi])
            return

        # NEW: shortcut for tiny homogeneous clusters — same _group, ≤3 vars.
        # Avoids spending GMM/silhouette evaluation on already-meaningful leaves.
        if len(vi_global) <= 3:
            groups_here = {str(can.iloc[vi]['_group']) for vi in vi_global}
            if len(groups_here) == 1:
                for vi in vi_global:
                    _add_child(node_map, parent_id, var_nodes[vi])
                return

        sub_texts = [texts[vi] for vi in vi_global]

        # Evaluate every aspect — pick the one with highest silhouette [ZHU §3.3]
        best_sil, best_k_idx, best_lbls = -1.0, -1, None
        for k_idx, rep in enumerate(cur_reprs):
            if rep.shape[0] < 3:
                continue
            lbls, _, sil = cluster_aspect_gmm(
                rep, max_k=min(max_clusters_per_split, len(vi_global) // 2))
            if sil > best_sil:
                best_sil, best_k_idx, best_lbls = sil, k_idx, lbls

        if best_k_idx == -1 or best_sil < sil_threshold or best_lbls is None:
            for vi in vi_global:
                _add_child(node_map, parent_id, var_nodes[vi])
            return

        # NEW: split-quality rejection (TraCo-inspired structural check).
        # Reject splits that are extremely imbalanced or mostly singletons —
        # silhouette can be high even when one cluster swallows most variables.
        cluster_sizes = list(Counter(best_lbls).values())
        max_ratio     = max(cluster_sizes) / sum(cluster_sizes)
        n_singletons  = sum(1 for s in cluster_sizes if s == 1)
        if max_ratio > 0.85 or n_singletons > len(cluster_sizes) // 2:
            for vi in vi_global:
                _add_child(node_map, parent_id, var_nodes[vi])
            return

        # Group variables by cluster assignment
        cluster_to_global: dict = defaultdict(list)
        for local_pos, (vi, cl) in enumerate(zip(vi_global, best_lbls)):
            cluster_to_global[int(cl)].append(vi)

        aspect_name     = cur_labels[best_k_idx]
        global_to_local = {vi: idx for idx, vi in enumerate(vi_global)}

        for cl_id, cl_global in cluster_to_global.items():
            if not cl_global:
                continue

            # Singleton prevention: attach single-variable clusters directly [NEW]
            if len(cl_global) == 1:
                _add_child(node_map, parent_id, var_nodes[cl_global[0]])
                continue

            cl_texts  = [texts[vi] for vi in cl_global]
            cl_groups = [str(can.iloc[vi]['_group']) for vi in cl_global]
            lbl, prov = label_cluster(cl_texts, sub_texts, cur_tfidf,
                                       cluster_groups=cl_groups,
                                       parent_path=parent_path,
                                       llm_label_fn=llm_label_fn,
                                       return_provenance=True)
            desc = (f'Aspect: {aspect_name} | '
                    f'Silhouette: {best_sil:.3f} | '
                    f'Variables: {len(cl_global)}')
            nid = _next_id(nodes)
            nd  = _make_node(nid, lbl, desc=desc)
            nd['label_provenance'] = prov
            nd['structure_provenance'] = {
                'route':           'aspect_clustering',
                'aspect_method':   aspect_method_tag,
                'silhouette':      round(float(best_sil), 4),
                'slot_coverage':   None,
            }
            nodes.append(nd)
            node_map[nid] = nd
            _add_child(node_map, parent_id, nid)

            # Slice embeddings to this cluster's local positions and recurse
            cl_pos     = [global_to_local[vi] for vi in cl_global]
            sub_reprs  = [rep[cl_pos] for rep in cur_reprs]
            _recurse(cl_global, sub_reprs, cur_labels, cur_tfidf,
                     nid, depth + 1, f'{parent_path} > {lbl}',
                     aspect_method_tag=aspect_method_tag)

    # ── group-anchored entry  ─────────────────────────────────────────────────
    # The _group column is built from user-confirmed group_cols — fully data-driven.
    # We materialise each path segment as an aggregation node, then run NMF/GMM
    # only inside each terminal group.

    group_node_ids: dict = {}   # path string → node id

    def _get_or_create_group_node(path: str, parent_id: int) -> int:
        if path in group_node_ids:
            return group_node_ids[path]
        seg_name = path.split(' > ')[-1].strip()
        nid = _next_id(nodes)
        nd  = _make_node(nid, seg_name, ntype='aggregation', desc=f'Group: {path}')
        nd['structure_provenance'] = {
            'route':           'group_anchor',
            'aspect_method':   None,
            'silhouette':      None,
            'slot_coverage':   None,
        }
        nodes.append(nd)
        node_map[nid] = nd
        _add_child(node_map, parent_id, nid)
        group_node_ids[path] = nid
        return nid

    # Collect global variable positions per terminal group path
    terminal_groups: dict = defaultdict(list)
    for vi, (_, row) in enumerate(can.iterrows()):
        terminal_groups[str(row['_group'])].append(vi)

    non_ungrouped = [g for g in terminal_groups if g.strip().lower() != 'ungrouped']

    if not non_ungrouped:
        # No group structure detected → fall back to global NMF from root
        full_reprs = [rep[list(range(n_vars))] for rep in aspect_reprs]
        _recurse(list(range(n_vars)), full_reprs, aspect_labels, tfidf,
                 0, 0, project)
    else:
        # Optional Streamlit progress bar — visible feedback for large datasets
        # like HCP where the per-group loop dominates runtime.
        try:
            import streamlit as _st_progress
            _pbar = _st_progress.progress(0.0, text='Building groups…')
            _show_progress = True
        except Exception:
            _pbar = None
            _show_progress = False

        sorted_groups = sorted(terminal_groups.items())
        n_groups_total = len(sorted_groups)

        for _g_idx, (group_path, vi_list) in enumerate(sorted_groups):
            if _show_progress and _pbar is not None:
                try:
                    short_path = group_path[:60] + ('…' if len(group_path) > 60 else '')
                    _pbar.progress((_g_idx + 1) / max(1, n_groups_total),
                                    text=f'[{_g_idx + 1}/{n_groups_total}] '
                                         f'{short_path}  ({len(vi_list)} vars)')
                except Exception:
                    pass
            # Build L1/L2 path nodes from detected group metadata
            segments = [s.strip() for s in group_path.split(' > ') if s.strip()]
            pid = 0
            for depth_seg in range(len(segments)):
                cumpath = ' > '.join(segments[:depth_seg + 1])
                pid = _get_or_create_group_node(cumpath, pid)
            existing_depth = len(segments)

            if len(vi_list) < min_cluster_size:
                for vi in vi_list:
                    _add_child(node_map, pid, var_nodes[vi])
                continue

            # ── Aspect-discovery callback for this group ──
            # Top-level: optionally uses FASTopic (slow, transformer-based,
            # semantic).  Sub-recursion: NMF only (fast, lexical).  This split
            # gives the best of both: rich top-level structure + quick deeper
            # splits.  Critical for HCP performance (~10× speedup vs running
            # FASTopic in every sub-recursion).
            def _aspect_recurse_for_group(vi_sub: list, parent: int,
                                            depth_left: int = 99,
                                            is_top_level: bool = False) -> int:
                """
                Split the variables in `vi_sub` under node `parent` via `_recurse`,
                choosing the aspect representation in this priority order:

                  1. FASTopic — only when `is_top_level` is set, `use_fastopic` is
                     on, the package is available and the group has at least
                     `fastopic_min_size` members.
                  2. Local NMF — when `local_nmf` is on and the group has at least
                     `min_local_nmf_size` members.
                  3. Row-slices of the enclosing scope's `aspect_reprs`
                     (tagged 'sliced_global').

                A failure in step 1 or 2 is swallowed and the next option is tried.

                Returns the number of nodes of type 'aggregation' that were added
                to `nodes` by the `_recurse` call (never negative).
                """
                local_texts = [texts[vi] for vi in vi_sub]
                use_reprs, use_labels, use_tfidf = None, None, None
                # Default tag; overwritten when FASTopic or local NMF succeeds.
                aspect_method_tag = 'sliced_global'

                if (is_top_level and use_fastopic and _FASTOPIC_AVAILABLE
                        and len(vi_sub) >= fastopic_min_size):
                    try:
                        f_tfidf, _, _, _, f_H, _, f_labels = discover_aspects_fastopic(
                            local_texts, max_aspects=max_aspects)
                        f_reprs = per_aspect_representations(
                            local_texts, f_H, f_tfidf, sbert_model)
                        use_reprs, use_labels, use_tfidf = f_reprs, f_labels, f_tfidf
                        aspect_method_tag = 'fastopic'
                    except Exception:
                        # Best-effort: fall through to NMF / sliced global aspects.
                        use_reprs = None
                if use_reprs is None and local_nmf and len(vi_sub) >= min_local_nmf_size:
                    try:
                        l_tfidf, _, _, _, l_H, _, l_labels = discover_aspects(
                            local_texts, max_aspects)
                        l_reprs = per_aspect_representations(
                            local_texts, l_H, l_tfidf, sbert_model)
                        use_reprs, use_labels, use_tfidf = l_reprs, l_labels, l_tfidf
                        aspect_method_tag = 'nmf'
                    except Exception:
                        # Best-effort: fall through to the sliced global aspects.
                        use_reprs = None
                if use_reprs is None:
                    # Last resort: reuse the enclosing scope's representations,
                    # restricted to this group's rows.
                    use_reprs  = [rep[vi_sub] for rep in aspect_reprs]
                    use_labels, use_tfidf = aspect_labels, tfidf

                # Count aggregation nodes before and after so the caller learns
                # how many were created by this call.
                before = sum(1 for n in nodes if n.get('type') == 'aggregation')
                # Clamp the caller's remaining depth into [0, max_depth].
                effective_depth = max(0, min(depth_left, max_depth))
                # NOTE(review): the sixth positional argument looks like the
                # starting depth for `_recurse` (max_depth minus the remaining
                # budget) — confirm against `_recurse`'s signature.
                _recurse(vi_sub, use_reprs, use_labels, use_tfidf,
                         parent, max(0, max_depth - effective_depth), group_path,
                         aspect_method_tag=aspect_method_tag)
                after = sum(1 for n in nodes if n.get('type') == 'aggregation')
                return max(0, after - before)

            local_texts = [texts[vi] for vi in vi_list]
            _top_grp = group_path.split(' > ')[0].strip()

            role_built = False
            role_regularity = None
            route_label = None

            # ── ROUTING 1A: PER-ROW LLM ROLE EXTRACTION (Zhu et al. EMNLP 2025) ──
            # The cleanest semantic route: one LLM call per row extracts
            # measure / statistic / condition / subtype values directly from
            # the description text.  Strict substring grounding prevents
            # hallucination.  Bypasses SBERT phrase clustering entirely.
            if (use_per_row_role_extraction
                    and per_row_role_extractor_fn is not None
                    and len(vi_list) >= 4):
                try:
                    role_result = discover_roles_via_per_row_extraction(
                        local_texts,
                        text_col_names=text_col_names,
                        per_row_extractor_fn=per_row_role_extractor_fn,
                        group_name=_top_grp,
                        regularity_threshold=role_regularity_threshold,
                    )
                    role_regularity = role_result.get('regularity')
                    # Always capture audit data, even when valid=False, so the
                    # Role Decomposition tab can show what the LLM proposed
                    # and why proposals were rejected.  group_path + vi_list
                    # let the display map each audit row back to its variable.
                    try:
                        st.session_state.a2_per_row_audit.append({
                            'group_name':   _top_grp,
                            'group_path':   group_path,
                            'vi_list':      [int(v) for v in vi_list],
                            'n_rows':       len(vi_list),
                            'coverage':     role_result.get('coverage'),
                            'valid':        role_result.get('valid'),
                            'roles_final':  list(role_result.get('roles', {}).keys()),
                            'summary':      role_result.get('extractor_summary', {}),
                            'per_row_audit': role_result.get('per_row_audit', []),
                        })
                    except Exception:
                        pass
                    if role_result['valid']:
                        role_built = build_role_hierarchy(
                            vi_list, role_result, can,
                            pid, group_path, nodes, node_map, var_nodes,
                            max_depth_remaining=max(2, max_depth - existing_depth),
                            post_split_fn=lambda vi_grp, p, d:
                                _aspect_recurse_for_group(vi_grp, p, d,
                                                            is_top_level=False),
                            min_post_split_size=max(6, min_cluster_size + 4))
                        if role_built:
                            route_label = 'per_row_llm_extraction'
                except Exception:
                    role_built = False

            # ── ROUTING 1B: SBERT phrase clustering fallback (Option D original) ──
            if not role_built and (use_role_decomposition and sbert_model is not None
                    and len(vi_list) >= 6):
                try:
                    role_result = discover_roles_via_sbert_phrase_clustering(
                        local_texts,
                        text_col_names=text_col_names,
                        sbert_model=sbert_model,
                        llm_role_namer_fn=llm_role_namer_fn,
                        group_name=_top_grp,
                        regularity_threshold=role_regularity_threshold,
                    )
                    role_regularity = role_result.get('regularity')
                    if role_result['valid']:
                        role_built = build_role_hierarchy(
                            vi_list, role_result, can,
                            pid, group_path, nodes, node_map, var_nodes,
                            max_depth_remaining=max(2, max_depth - existing_depth),
                            post_split_fn=lambda vi_grp, p, d:
                                _aspect_recurse_for_group(vi_grp, p, d,
                                                            is_top_level=False),
                            min_post_split_size=max(6, min_cluster_size + 4))
                        if role_built:
                            route_label = 'sbert_phrase_clustering'
                except Exception:
                    role_built = False
            if role_built:
                anchor = node_map.get(pid)
                if anchor is not None:
                    anchor.setdefault('structure_provenance', {})
                    anchor['structure_provenance']['phrase_regularity'] = role_regularity
                    anchor['structure_provenance']['route_used'] = route_label
                continue

            # ── ROUTING 2: phrase-slot mining (IE / mutual-exclusion) ──
            slot_built = False
            if use_slot_mining and len(vi_list) >= 6:
                try:
                    slot_result = mine_phrase_slots(
                        local_texts,
                        text_col_names=text_col_names,
                        group_name=_top_grp,
                        llm_role_classifier_fn=llm_role_classifier_fn,
                    )
                    if slot_result['valid']:
                        slot_built = build_slot_hierarchy(
                            vi_list, slot_result, can,
                            pid, group_path, nodes, node_map, var_nodes,
                            max_depth_remaining=max(2, max_depth - existing_depth),
                            post_slot_split_fn=lambda vi_grp, p, d:
                                _aspect_recurse_for_group(vi_grp, p, d,
                                                            is_top_level=False),
                            min_post_slot_size=max(6, min_cluster_size + 4))
                except Exception:
                    slot_built = False
            if slot_built:
                anchor = node_map.get(pid)
                if anchor is not None:
                    anchor.setdefault('structure_provenance', {})
                    anchor['structure_provenance']['phrase_regularity'] = role_regularity
                    anchor['structure_provenance']['route_used'] = 'slot_mining_fallback'
                continue

            # ── ROUTING 3: aspect discovery (FASTopic / NMF) ──
            anchor = node_map.get(pid)
            if anchor is not None:
                anchor.setdefault('structure_provenance', {})
                anchor['structure_provenance']['phrase_regularity'] = role_regularity
                anchor['structure_provenance']['route_used'] = 'aspect_clustering_fallback'
            _aspect_recurse_for_group(vi_list, pid, is_top_level=True)

    # Clear the progress bar
    try:
        if _show_progress and _pbar is not None:
            _pbar.empty()
    except Exception:
        pass

    # ── POST-BUILD PASS 1 — sibling common-prefix factoring [Castanet 2007] ──
    try:
        n_factored = factor_sibling_common_prefixes(
            nodes, min_siblings=3, min_prefix_tokens=2)
    except Exception:
        n_factored = 0

    # ── POST-BUILD PASS 2 — conservative noise-label pruning ──────────────────
    try:
        n_dissolved = prune_low_quality_aggregations(nodes)
    except Exception:
        n_dissolved = 0

    # ── POST-BUILD PASS 3 — strip group prefix from child labels ──────────────
    # 'DMS — Total Errors' under DMS → 'Total Errors'.  Reduces visual
    # redundancy without altering tree structure.
    try:
        n_stripped = strip_group_prefix_from_labels(nodes)
    except Exception:
        n_stripped = 0

    # ── POST-BUILD PASS 4 — enforce single parent (collapse DAG → tree) ───────
    # Keeps each variable under its most specific parent so the hierarchy is a
    # true tree, matching the viewer/evaluator assumptions and rendering proportionally.
    try:
        n_reparented = enforce_single_parent(nodes)
    except Exception:
        n_reparented = 0

    # ── POST-BUILD PASS 5 — drop aggregation nodes left childless by PASS 4 ───
    try:
        n_empty_pruned = prune_empty_aggregations(nodes)
    except Exception:
        n_empty_pruned = 0

    # Annotate the root with post-build statistics
    if nodes and nodes[0].get('type') == 'root':
        nodes[0]['post_build_stats'] = {
            'sibling_factor_nodes_inserted': int(n_factored),
            'low_quality_nodes_dissolved':   int(n_dissolved),
            'group_prefix_labels_stripped':  int(n_stripped),
            'dag_links_removed':             int(n_reparented),
            'empty_aggregations_pruned':     int(n_empty_pruned),
        }

    # Deduplicate children
    for nd in nodes:
        nd['related'] = list(dict.fromkeys(int(x) for x in nd['related']))

    return nodes

# ──────────────────────────────────────────────────────────────────────────────
# STEP 9  — EVALUATION  [ZHU Table 2 / TraCo / TICL §3.4]
# ──────────────────────────────────────────────────────────────────────────────
def purity_score(true_labels, pred_labels) -> float:
    """
    Cluster purity of a predicted partition against reference labels.

    For every predicted cluster the size of its majority reference class is
    taken; purity is the sum of those sizes divided by the number of samples.

    Args:
        true_labels: Reference label per sample.  Any label type that NumPy
            can compare for equality is accepted (ints — including negative
            ones — or strings); labels are compared by value and are not
            truncated to integers.
        pred_labels: Predicted cluster id per sample, same length as
            `true_labels`.

    Returns:
        Purity in [0, 1] as a Python float; 0.0 for empty input.
    """
    true = np.asarray(true_labels)
    pred = np.asarray(pred_labels)
    total = len(true)
    if total == 0:
        return 0.0
    matched = 0
    for cl in np.unique(pred):
        # Every value yielded by np.unique(pred) occurs in pred at least once,
        # so the selection below is never empty.
        _, counts = np.unique(true[pred == cl], return_counts=True)
        matched += int(counts.max())
    return float(matched / total)

def evaluate(true_labels, pred_labels) -> dict:
    """
    Score a predicted clustering against reference labels.

    Reference labels are stringified and label-encoded to integers; predicted
    labels are converted to an integer array.  Returns a dict with 'NMI',
    'ARI' and 'Purity', each rounded to four decimals.
    """
    encoded_true = LabelEncoder().fit_transform([str(lbl) for lbl in true_labels])
    predicted = np.array(pred_labels, dtype=int)

    nmi = float(normalized_mutual_info_score(encoded_true, predicted))
    ari = float(adjusted_rand_score(encoded_true, predicted))
    purity = purity_score(encoded_true, predicted)

    return {
        'NMI':    round(nmi, 4),
        'ARI':    round(ari, 4),
        'Purity': round(purity, 4),
    }

def hierarchy_quality_metrics(nodes: list, tfidf: TfidfVectorizer) -> dict:
    """
    Compute lightweight structural quality metrics inspired by TraCo (AAAI 2024).

    [TraCo] Wu et al. identify three failure modes in hierarchical topic models:
    low affinity (children unrelated to parent), low rationality (children not
    more specific than parent), and low diversity (sibling topics too similar).

    Here we measure, for every aggregation node that has at least two
    aggregation children whose name is not 'Group':
    • parent-child coherence: mean cosine(parent_label, child_label) — proxy for affinity.
    • sibling diversity: 1 - mean pairwise cosine over *distinct* sibling
      pairs — proxy for diversity.  The diagonal (self-similarity) is excluded
      from the mean, so identical siblings score 0 and orthogonal ones score 1.

    Args:
        nodes: Hierarchy node dicts with 'id', 'type', 'name' and 'related'.
        tfidf: Fitted vectorizer used to embed node labels.

    Returns:
        Dict with 'mean_parent_child_coherence', 'mean_sibling_diversity'
        (both rounded to 4 decimals, 0.0 when nothing could be scored),
        'n_aggregation_nodes' and 'n_singleton_splits' (aggregation nodes
        with exactly one child).
    """
    node_map = {n['id']: n for n in nodes}
    coherence_scores, diversity_scores = [], []

    for n in nodes:
        if n['type'] != 'aggregation' or not n['related']:
            continue
        children     = [node_map[c] for c in n['related'] if c in node_map]
        child_labels = [c['name'] for c in children
                        if c.get('type') == 'aggregation' and c['name'] != 'Group']
        n_sib = len(child_labels)
        if n_sib < 2:
            continue

        try:
            parent_vec = tfidf.transform([n['name']])
            child_vecs = tfidf.transform(child_labels)
            # Parent-child coherence
            coh = float(cosine_similarity(parent_vec, child_vecs).mean())
            # Sibling diversity: average over the n*(n-1) off-diagonal entries
            # only.  Subtracting the trace (rather than assuming a diagonal of
            # ones) also copes with all-zero label vectors.
            sib_sims = cosine_similarity(child_vecs)
            off_diag_sum = float(sib_sims.sum() - np.trace(sib_sims))
            div = 1.0 - off_diag_sum / (n_sib * (n_sib - 1))
        except Exception:
            # Best-effort diagnostic: a node whose labels cannot be vectorised
            # is left out of both averages so the two lists stay aligned.
            continue
        coherence_scores.append(coh)
        diversity_scores.append(div)

    aggregation_nodes = [n for n in nodes if n['type'] == 'aggregation']
    return {
        'mean_parent_child_coherence': (round(float(np.mean(coherence_scores)), 4)
                                        if coherence_scores else 0.0),
        'mean_sibling_diversity':      (round(float(np.mean(diversity_scores)), 4)
                                        if diversity_scores else 0.0),
        'n_aggregation_nodes':         len(aggregation_nodes),
        'n_singleton_splits':          sum(1 for n in aggregation_nodes
                                           if len(n['related']) == 1),
    }

# ──────────────────────────────────────────────────────────────────────────────
# DISPLAY-TIME ONE-CHILD CHAIN COMPRESSION  (visualization only — structure preserved in JSON)
# ──────────────────────────────────────────────────────────────────────────────
def compress_one_child_chains(nodes: list) -> list:
    """
    Display-only transformation: collapse chains where an aggregation node has
    exactly one child and that child is itself an aggregation node
    (e.g. 'DMS → DMS Recommended Standard').
    The collapsed node displays as 'DMS / DMS Recommended Standard'.

    The merged node keeps the parent's id, takes every other field from the
    child (including the child's children), and gets a combined name and
    description.  Dissolved / hidden nodes are dropped first.

    The input node dicts are not modified: each node is shallow-copied and
    'related' lists are rebuilt rather than edited in place, so the structure
    in the exported JSON is preserved.

    Args:
        nodes: Hierarchy node dicts with 'id', 'type', 'name', 'related'.

    Returns:
        A new list of node dicts for visualisation.
    """
    nodes = _filter_dissolved(nodes)
    # Shallow per-node copies: top-level keys can be rebound safely.
    nm = {int(n['id']): dict(n) for n in nodes}

    def _is_chain_link(n):
        if n.get('type') != 'aggregation':
            return False
        children = n.get('related', [])
        return (len(children) == 1
                and nm.get(int(children[0]), {}).get('type') == 'aggregation')

    # Merge one link at a time and rescan, because every merge changes the
    # map being iterated and may create a new chain link at the same id.
    changed = True
    while changed:
        changed = False
        for nid, n in list(nm.items()):
            if not _is_chain_link(n):
                continue
            child_id = int(n['related'][0])
            child    = nm[child_id]
            # Merge: child takes parent's id and slot, with a combined name.
            new_node = dict(child)
            new_node['id']   = nid
            new_node['name'] = f"{n['name']} / {child['name']}"
            new_node['desc'] = f"{n.get('desc','')} | {child.get('desc','')}"
            nm[nid] = new_node
            # Remove the original child node
            if child_id in nm:
                del nm[child_id]
            # Re-point any remaining references to child_id → nid
            for other in nm.values():
                other['related'] = [nid if int(c) == child_id else int(c)
                                    for c in other.get('related', [])]
            changed = True
            break

    return list(nm.values())

# ──────────────────────────────────────────────────────────────────────────────
# VISUALISATION HELPERS
# ──────────────────────────────────────────────────────────────────────────────
def _filter_dissolved(nodes: list) -> list:
    """
    Drop dissolved/hidden nodes for visualisation.  Children of dissolved
    nodes were already promoted to the grandparent by the pruning pass, so
    dropping the dissolved wrapper here makes the tree render cleanly.
    """
    drop_ids = {int(n['id']) for n in nodes
                 if n.get('type') == 'dissolved' or n.get('isShown') is False}
    if not drop_ids:
        return nodes
    out = []
    for n in nodes:
        if int(n['id']) in drop_ids:
            continue
        m = dict(n)
        m['related'] = [int(c) for c in n.get('related', [])
                         if int(c) not in drop_ids]
        out.append(m)
    return out

def _leaf_ids(nodes: list, nid: int) -> list:
    m = {int(n['id']): n for n in nodes}
    out = []
    def rec(x):
        n = m.get(int(x))
        if not n: return
        if n.get('type') == 'attribute': out.append(int(x)); return
        for c in n.get('related', []): rec(int(c))
    rec(nid)
    return list(dict.fromkeys(out))

def _parent_map(nodes: list) -> dict:
    pm = {}
    for n in nodes:
        for c in n.get('related', []):
            if int(c) not in pm:
                pm[int(c)] = int(n['id'])
    return pm

def _tree_value_map(nodes: list, pm: dict) -> dict:
    """
    Leaf count per node measured along the *rendered* tree (each node has
    exactly one parent, per `pm`).  Plotly draws sectors using that same
    single-parent structure, so values built this way always satisfy
    parent == sum(children) — which is what branchvalues='total' requires.

    The full hierarchy can be a DAG (a variable promoted under more than one
    role branch), in which case `_leaf_ids` double-counts a shared leaf and a
    parent's unique-leaf count comes out *less* than the sum of its children's
    counts.  Feeding those numbers to a 'total' chart blanks it.  Counting on
    the rendered tree instead avoids that without changing the hierarchy.
    """
    kids = {}
    for child, par in pm.items():
        kids.setdefault(int(par), []).append(int(child))
    nodemap = {int(n['id']): n for n in nodes}
    memo = {}
    def count(nid: int) -> int:
        if nid in memo:
            return memo[nid]
        memo[nid] = 1  # guard against cycles while recursing
        n = nodemap.get(nid)
        if n is not None and n.get('type') == 'attribute':
            memo[nid] = 1
            return 1
        ch = kids.get(nid, [])
        v = sum(count(c) for c in ch) if ch else 1
        memo[nid] = max(1, v)
        return memo[nid]
    return {nid: count(nid) for nid in nodemap}

def _wrap_hover(text: str, width: int = 80) -> str:
    """Soft-wrap long descriptions onto multiple <br>-separated lines so the
    Plotly hover tooltip shows the full text instead of being cut off."""
    import textwrap as _tw
    s = str(text or '')
    if not s:
        return ''
    lines = []
    for raw_line in s.split('\n'):
        lines.extend(_tw.wrap(raw_line, width=width) or [''])
    return '<br>'.join(lines)

def plot_sunburst(nodes: list, max_depth: int = 4):
    """
    Build a Plotly sunburst of the hierarchy.

    Dissolved / hidden nodes are removed first.  Sector sizes come from
    `_tree_value_map` (single-parent counts); the hover text reports the
    number of unique leaves from `_leaf_ids`.  The node with id 0 is drawn as
    the root, and a node without a recorded parent is attached to it.
    Labels are truncated to 40 characters.
    """
    visible = _filter_dissolved(nodes)
    parent_of = _parent_map(visible)
    sector_size = _tree_value_map(visible, parent_of)

    ids, labels, parents, values, hover = [], [], [], [], []
    for node in visible:
        node_id = int(node['id'])
        n_vars = len(_leaf_ids(visible, node_id))
        parent_ref = '' if node_id == 0 else str(parent_of.get(node_id, 0))
        tooltip = (f"<b>{node.get('name','')}</b><br>Type: {node.get('type','')}<br>"
                   f"Variables: {n_vars}<br><br>{_wrap_hover(node.get('desc',''))}")
        ids.append(str(node_id))
        labels.append(str(node.get('name', ''))[:40])
        parents.append(parent_ref)
        values.append(sector_size.get(node_id, 1))
        hover.append(tooltip)

    trace = go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        maxdepth=max_depth, insidetextorientation='radial',
        marker={'colorscale': 'Viridis',
                'line': {'width': 1, 'color': 'white'}})
    fig = go.Figure(trace)
    fig.update_layout(
        height=700,
        margin={'l': 10, 'r': 10, 't': 40, 'b': 10},
        title={'text': 'Click sector to drill down — click centre to go back',
               'font': {'size': 13}, 'x': 0.5})
    return fig

def plot_treemap(nodes: list):
    """
    Build a Plotly treemap of the hierarchy.

    Dissolved / hidden nodes are removed first.  Tile sizes come from
    `_tree_value_map` (single-parent counts); the hover text reports the
    number of unique leaves from `_leaf_ids`.  The node with id 0 is drawn as
    the root, and a node without a recorded parent is attached to it.
    Labels are truncated to 40 characters.
    """
    visible = _filter_dissolved(nodes)
    parent_of = _parent_map(visible)
    tile_size = _tree_value_map(visible, parent_of)

    ids, labels, parents, values, hover = [], [], [], [], []
    for node in visible:
        node_id = int(node['id'])
        n_vars = len(_leaf_ids(visible, node_id))
        parent_ref = '' if node_id == 0 else str(parent_of.get(node_id, 0))
        tooltip = (f"<b>{node.get('name','')}</b><br>Variables: {n_vars}<br>"
                   f"{_wrap_hover(node.get('desc',''))}")
        ids.append(str(node_id))
        labels.append(str(node.get('name', ''))[:40])
        parents.append(parent_ref)
        values.append(tile_size.get(node_id, 1))
        hover.append(tooltip)

    trace = go.Treemap(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        textinfo='label+value',
        marker={'colorscale': 'Viridis',
                'line': {'width': 1, 'color': 'white'}})
    fig = go.Figure(trace)
    fig.update_layout(height=700, margin={'l': 10, 'r': 10, 't': 10, 'b': 10})
    return fig

# ──────────────────────────────────────────────────────────────────────────────
# NODE-LINK TREE  — Reingold-Tilford layout (matches Approach 1.1 interface)
# ──────────────────────────────────────────────────────────────────────────────
def _a2_node_color(n: dict) -> str:
    t = n.get('type', '')
    if t == 'root':      return '#c44e52'
    if t == 'attribute': return '#4C72B0'
    if t == 'collapsed': return '#bbbbbb'
    return '#8C8C8C'

def _display_graph(nodes: list, max_depth: int = 4, show_hidden: bool = False):
    """Walk tree to chosen depth, inserting 'collapsed' placeholders for cut-off branches."""
    m = {int(n['id']): n for n in nodes}
    dnodes: dict = {}
    edges: list  = []
    counter = 10 ** 9

    def rec(nid, depth):
        nonlocal counter
        n = m.get(int(nid))
        if not n:
            return
        if not show_hidden and n.get('isShown') is False and depth > 0:
            return
        dnodes[int(nid)] = n
        if depth >= max_depth and n.get('related'):
            counter += 1
            cid = counter
            n_leaves = len(_leaf_ids(nodes, nid))
            dnodes[cid] = {'id': cid,
                           'name': f'… {n_leaves} variables',
                           'type': 'collapsed', 'dtype': 'determine',
                           'related': [], 'desc': f"Collapsed: {n.get('name')}",
                           'isShown': True}
            edges.append((int(nid), cid))
            return
        for c in n.get('related', []):
            ch = m.get(int(c))
            if not ch:
                continue
            if not show_hidden and ch.get('isShown') is False:
                continue
            edges.append((int(nid), int(c)))
            rec(int(c), depth + 1)

    rec(0, 0)
    return list(dnodes.values()), edges

def _positions(dnodes: list, edges: list):
    """Reingold-Tilford style positions: x=depth, y=subtree-aware vertical."""
    H_SCALE = 3.0
    V_SPACE = 1.8
    children: dict = defaultdict(list)
    for p, c in edges:
        children[p].append(c)
    pos: dict = {}
    counter = {'v': 0}

    def rec(nid, depth):
        ch = children.get(nid, [])
        if not ch:
            y_pos = counter['v'] * V_SPACE
            counter['v'] += 1
            pos[nid] = (depth * H_SCALE, y_pos)
            return y_pos
        child_ys = [rec(c, depth + 1) for c in ch]
        y_pos = float(np.mean(child_ys))
        pos[nid] = (depth * H_SCALE, y_pos)
        return y_pos

    rec(0, 0)
    return pos

def plot_node_link(nodes: list, max_depth: int = 4,
                    show_hidden: bool = False, show_leaf_labels: bool = False):
    """
    Draw the hierarchy as a left-to-right node-link tree with elbow edges
    (matches the Approach 1.1 layout).

    Root, aggregation and collapsed nodes are drawn as large labelled markers
    whose label carries the leaf count; every other node is a small marker
    that is labelled only when `show_leaf_labels` is set.  Figure height grows
    with the number of small markers, between 700 and 4000 px.

    Suited to moderate depths — Sunburst remains recommended for large
    hierarchies per Taxonomizer (Bian et al. 2020).
    """
    visible = _filter_dissolved(nodes)
    shown_nodes, links = _display_graph(visible, max_depth, show_hidden)
    coords = _positions(shown_nodes, links)

    # Elbow connectors: horizontal to the midpoint, vertical, horizontal again.
    edge_x, edge_y = [], []
    for parent_id, child_id in links:
        if parent_id in coords and child_id in coords:
            px, py = coords[parent_id]
            cx, cy = coords[child_id]
            mid_x = (px + cx) / 2
            edge_x.extend([px, mid_x, mid_x, cx, None])
            edge_y.extend([py, py, cy, cy, None])
    traces = [go.Scatter(x=edge_x, y=edge_y, mode='lines',
                         line={'width': 1, 'color': '#c8c8c8'},
                         hoverinfo='skip', showlegend=False)]

    def _new_bucket():
        return {'x': [], 'y': [], 'text': [], 'color': [], 'hover': []}

    branch, leaf = _new_bucket(), _new_bucket()
    branch_types = ('root', 'aggregation', 'collapsed')

    for node in shown_nodes:
        node_id = int(node['id'])
        if node_id not in coords:
            continue
        x, y = coords[node_id]
        n_vars = len(_leaf_ids(visible, node_id))
        name = node.get('name', str(node_id))
        tooltip = (f"<b>{node.get('name','')}</b><br>"
                   f"Type: {node.get('type','')}<br>"
                   f"Variables: {n_vars}<br><br>{_wrap_hover(node.get('desc',''))}")
        colour = _a2_node_color(node)

        if node.get('type') in branch_types:
            bucket = branch
            caption = (name + (f' ({n_vars})' if n_vars else ''))[:50]
        else:
            bucket = leaf
            caption = name[:40] if show_leaf_labels else ''

        bucket['x'].append(x)
        bucket['y'].append(y)
        bucket['text'].append(caption)
        bucket['color'].append(colour)
        bucket['hover'].append(tooltip)

    if branch['x']:
        traces.append(go.Scatter(
            x=branch['x'], y=branch['y'], mode='markers+text',
            text=branch['text'], textposition='middle right',
            hovertext=branch['hover'], hoverinfo='text',
            marker={'size': 16, 'color': branch['color'],
                    'line': {'color': 'white', 'width': 2}},
            showlegend=False))
    if leaf['x']:
        traces.append(go.Scatter(
            x=leaf['x'], y=leaf['y'], mode='markers+text',
            text=leaf['text'], textposition='middle right',
            hovertext=leaf['hover'], hoverinfo='text',
            marker={'size': 7, 'color': leaf['color'], 'symbol': 'circle',
                    'opacity': 0.75, 'line': {'color': 'white', 'width': 1}},
            showlegend=False))

    row_count = max(12, len(leaf['x']))
    tip = {
        'text': 'Tip: Sunburst is better for large hierarchies [Taxonomizer 2020]',
        'xref': 'paper', 'yref': 'paper', 'x': 0.0, 'y': 1.01,
        'showarrow': False, 'font': {'size': 11, 'color': 'grey'},
        'align': 'left',
    }
    fig = go.Figure(traces)
    fig.update_layout(
        height=max(700, min(4000, int(row_count * 32))),
        margin={'l': 20, 'r': 220, 't': 40, 'b': 20},
        plot_bgcolor='white', paper_bgcolor='white',
        xaxis={'visible': False, 'fixedrange': False},
        yaxis={'visible': False, 'autorange': 'reversed', 'fixedrange': False},
        dragmode='pan',
        annotations=[tip],
    )
    return fig

# ──────────────────────────────────────────────────────────────────────────────
# STREAMLIT APP
# ──────────────────────────────────────────────────────────────────────────────
# Page chrome: wide layout, title and a one-line summary of the pipeline.
st.set_page_config(page_title='Approach 2 — Multi-Aspect Hierarchy',
                   layout='wide')
st.title('Approach 2 — Role-Decomposed Metadata Hierarchy')
st.caption('Group anchoring → LLM role extraction → role-nested LoD tree. '
           'Full method details and citations in the Method tab.')

# Session-state init — defensive: always ensure these keys exist with safe
# defaults.  Streamlit can occasionally drop attribute-style access if the key
# was set via [setter] in a previous run; using setdefault here works on both
# new and existing sessions.
st.session_state.setdefault('a2_nodes', None)
st.session_state.setdefault('a2_can',   None)
st.session_state.setdefault('a2_meta',  {})
st.session_state.setdefault('a2_per_row_audit', [])   # list of group audits

# Local-LLM auto-detection: probe Ollama server.  LLM refinement defaults to
# ON when Ollama is reachable; user can switch it off any time.  Env vars
# OLLAMA_URL and OLLAMA_MODEL override the localhost / qwen2.5:3b defaults.
# A blank (whitespace-only) env value falls back to the default as well.
_ollama_url     = os.environ.get('OLLAMA_URL', OLLAMA_URL_DEFAULT).strip() or OLLAMA_URL_DEFAULT
_ollama_model   = os.environ.get('OLLAMA_MODEL', OLLAMA_MODEL_DEFAULT).strip() or OLLAMA_MODEL_DEFAULT
# Probed once per script run, against the env/default URL only.
_ollama_reachable = _ping_ollama(_ollama_url)

# Groq detection: env-var GROQ_API_KEY makes the cloud option available.
_groq_url       = os.environ.get('GROQ_URL', GROQ_URL_DEFAULT).strip() or GROQ_URL_DEFAULT
_groq_model     = os.environ.get('GROQ_MODEL', GROQ_MODEL_DEFAULT).strip() or GROQ_MODEL_DEFAULT
_groq_key_env   = os.environ.get('GROQ_API_KEY', '').strip()

# Groq is pre-selected when a key is present and the client library is
# importable; otherwise the provider radio starts on Ollama.
_default_provider = 'groq' if (_groq_key_env and _LLM_CLIENT_AVAILABLE) else 'ollama'
# NOTE(review): `_default_llm_on` is not read by the sidebar code that
# follows (the checkboxes there default to `llm_ready`) — confirm it is still
# used elsewhere in the file.
_default_llm_on   = (
    (_ollama_reachable or bool(_groq_key_env)) and _LLM_CLIENT_AVAILABLE)

# Sidebar: all user-tunable inputs.  The widgets are created in a fixed order
# on every script run; the names bound here are read by the code below the
# sidebar.
with st.sidebar:
    # ── 1 · Input: one or more metadata / data-dictionary files ──
    st.header('1 · Input')
    uploads = st.file_uploader('Metadata / data-dictionary file(s)',
                                type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
                                accept_multiple_files=True)

    # ── 2 · Algorithm: clustering / tree-shape parameters ──
    st.header('2 · Algorithm')
    max_aspects     = st.slider('Max aspects (K upper bound)', 3, 15, 6)
    max_depth       = st.slider('Max tree depth', 2, 10, 6)
    min_cluster_sz  = st.slider('Min variables per cluster', 1, 10, 2)
    # Default threshold is slightly lower (0.04 vs 0.05) when SBERT is
    # not available.
    sil_thresh      = st.slider('Silhouette threshold',
                                0.01, 0.30,
                                value=0.04 if not _SBERT_AVAILABLE else 0.05,
                                step=0.01)
    max_k_split     = st.slider('Max child clusters per split', 2, 12, 5)
    # Optional-dependency toggles are disabled (and off) when the package
    # is missing.
    use_sbert       = st.checkbox('SBERT embeddings',
                                  value=_SBERT_AVAILABLE,
                                  disabled=not _SBERT_AVAILABLE)
    local_nmf       = st.checkbox('Local NMF fallback', value=True)
    min_local_nmf   = st.slider('Min group size for NMF', 5, 30, 6)
    use_slot_mining = st.checkbox('Phrase-slot mining', value=True)
    use_fastopic   = st.checkbox(
        'FASTopic aspects',
        value=_FASTOPIC_AVAILABLE,
        disabled=not _FASTOPIC_AVAILABLE,
    )
    fastopic_min_sz = st.slider('Min group size for FASTopic', 6, 100, 40)
    if not _FASTOPIC_AVAILABLE:
        st.warning('FASTopic not installed — using NMF.')

    # ── 3 · LLM provider: sets llm_base_url / llm_model / llm_api_key and
    #        the llm_ready flag that gates the LLM features in section 4 ──
    st.header('3 · LLM provider')
    provider_options = ['Ollama (local)', 'Groq (cloud)']
    provider_default_idx = 1 if _default_provider == 'groq' else 0
    provider_label = st.radio('Provider', provider_options,
                              index=provider_default_idx, horizontal=True)
    llm_provider = 'groq' if provider_label.startswith('Groq') else 'ollama'

    if llm_provider == 'groq':
        if not _LLM_CLIENT_AVAILABLE:
            st.warning('`openai` package not installed.')
        elif _groq_key_env:
            st.success('GROQ_API_KEY detected → ready')
        # The key field is pre-filled from the environment but can be typed
        # in; readiness follows the value in the field.
        groq_key_in = st.text_input(
            'Groq API key', value=_groq_key_env, type='password',
            help='Free key at console.groq.com/keys, or set GROQ_API_KEY env var.')
        groq_model_in = st.text_input('Groq model', value=_groq_model)
        llm_base_url = _groq_url
        llm_model    = groq_model_in
        llm_api_key  = groq_key_in
        llm_ready    = bool(groq_key_in and _LLM_CLIENT_AVAILABLE)
    else:
        if not _LLM_CLIENT_AVAILABLE:
            st.warning('`openai` package not installed.')
        elif _ollama_reachable:
            st.success(f'Ollama reachable at `{_ollama_url}`')
        else:
            st.warning(f'Ollama not reachable at `{_ollama_url}`.')
        ollama_url_in   = st.text_input('Ollama URL', value=_ollama_url)
        ollama_model_in = st.text_input('Ollama model', value=_ollama_model)
        llm_base_url = ollama_url_in
        llm_model    = ollama_model_in
        llm_api_key  = ''
        # NOTE(review): readiness (and the status message above) reflects the
        # startup probe of the env/default URL, not `ollama_url_in`.  Editing
        # the URL here changes `llm_base_url` but never re-probes, so the LLM
        # features below stay disabled if only the typed URL is reachable —
        # confirm whether that is intended.
        llm_ready    = bool(_LLM_CLIENT_AVAILABLE and _ollama_reachable)

    # ── 4 · LLM features: each toggle defaults to on, and is only editable,
    #        when the selected provider is ready ──
    st.header('4 · LLM features')
    use_per_row_role_extraction = st.checkbox(
        'Per-row role extraction (primary route)',
        value=llm_ready,
        disabled=not llm_ready,
        help='One LLM call per variable extracts measure / statistic / '
             'condition / subtype, grounded to the description text.'
    )
    use_llm = st.checkbox(
        'LLM label refinement',
        value=llm_ready,
        disabled=not llm_ready,
    )
    use_llm_roles = st.checkbox(
        'LLM phrase-role classifier (fallback)',
        value=llm_ready,
        disabled=not llm_ready,
    )
    # This fallback additionally requires SBERT to be available.
    use_role_decomposition = st.checkbox(
        'SBERT phrase clustering (fallback)',
        value=_SBERT_AVAILABLE and llm_ready,
        disabled=not (_SBERT_AVAILABLE and llm_ready),
    )
    role_namer_constrained = st.checkbox(
        'Constrained role vocabulary',
        value=True,
    )
    role_regularity_threshold = st.slider(
        'Min phrase regularity', 0.05, 0.80, 0.20, 0.05,
    )

    # ── 5 · Project ──
    st.header('5 · Project')
    project_name = st.text_input('Project name', value='project')

# ── load and configure files ──────────────────────────────────────────────────
# Uploaded files are copied to a scratch directory, loaded, and given
# auto-detected column roles which the user can confirm.  The build button then
# runs the pipeline and stores its results in st.session_state.
if uploads:
    import tempfile

    # Streamlit re-executes this script on every widget interaction.  Reuse a
    # single scratch directory per session instead of creating (and never
    # removing) a fresh mkdtemp() directory on each rerun.
    _tmp_dir = st.session_state.get('a2_tmp_dir')
    if not _tmp_dir or not Path(_tmp_dir).is_dir():
        _tmp_dir = tempfile.mkdtemp()
        st.session_state['a2_tmp_dir'] = _tmp_dir
    tmp    = Path(_tmp_dir)
    raw_by = {}   # upload name → loaded DataFrame
    cfg_by = {}   # upload name → auto-detected column roles

    st.subheader('Step 1 — Inspect metadata')
    for f in uploads:
        p = tmp / safe_name(f.name)
        p.write_bytes(f.getbuffer())
        try:
            df = load_any(p)
            raw_by[f.name] = df
            cfg_by[f.name] = detect_roles(df)
            with st.expander(f'{f.name}', expanded=False):
                st.write(f'Rows: **{len(df):,}**  Columns: **{len(df.columns)}**')
                st.dataframe(df.head(8), use_container_width=True)
        except Exception as e:
            # Best effort: a file that fails to load is reported and skipped so
            # the remaining uploads stay usable.
            st.error(f'Could not load {f.name}: {e}')

    st.subheader('Step 2 — Confirm column roles')
    configs = {}
    for name, df in raw_by.items():
        cols = list(df.columns)
        auto = cfg_by[name]
        with st.expander(f'{name}', expanded=True):
            c1, c2 = st.columns(2)
            # Defaults are restricted to columns that actually exist in the file.
            with c1:
                leaf  = st.multiselect('Leaf variable column(s)', cols,
                                       default=[c for c in auto['leaf_cols']  if c in cols],
                                       key=f'lf_{name}')
                group = st.multiselect('Group / task column(s)', cols,
                                       default=[c for c in auto['group_cols'] if c in cols],
                                       key=f'gr_{name}')
            with c2:
                text  = st.multiselect('Description column(s)', cols,
                                       default=[c for c in auto['text_cols']  if c in cols],
                                       key=f'tx_{name}')
                meta  = st.multiselect('Type / unit column(s)', cols,
                                       default=[c for c in auto['meta_cols']  if c in cols],
                                       key=f'mt_{name}')
            configs[name] = {'leaf_cols': leaf, 'group_cols': group,
                             'text_cols': text,  'meta_cols':  meta}

    if st.button('Build Approach 2 Hierarchy', type='primary'):
        try:
            # Clear stale data from any previous build.  The cached evaluation
            # metrics describe the previous tree, so they are reset as well —
            # otherwise the Evaluation tab keeps showing the old numbers.
            st.session_state.a2_per_row_audit = []
            st.session_state['a2_eval_cache'] = None

            cans  = [build_canonical(df, configs[name], name)
                     for name, df in raw_by.items()]
            can   = pd.concat(cans, ignore_index=True)
            st.session_state.a2_can = can
            texts = can['_text'].fillna('').astype(str).tolist()

            # Global NMF (used as the fallback route; W is also read by the
            # Evaluation tab's self-consistency check)
            with st.spinner('Step 3 — Discovering global aspects via NMF …'):
                tfidf, X, nmf, W, H, K, alabels = discover_aspects(texts, max_aspects)
                st.session_state.a2_meta = {
                    'K': K, 'aspect_labels': alabels, 'tfidf': tfidf, 'W': W, 'H': H
                }
                st.info(f'Global aspects discovered: **{K}** — {" | ".join(alabels)}')

            sbert_model = None
            if use_sbert and _SBERT_AVAILABLE:
                with st.spinner('Loading SBERT model …'):
                    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

            with st.spinner('Step 4 — Building global per-aspect representations …'):
                reprs = per_aspect_representations(texts, H, tfidf, sbert_model)

            # Build LLM function trio — same provider + endpoint for all.
            llm_fn = None
            if use_llm:
                llm_fn = make_llm_label_fn(llm_base_url, llm_model,
                                             provider=llm_provider,
                                             api_key=llm_api_key)
                if llm_fn is None:
                    st.warning('LLM unreachable — deterministic labels only.')

            llm_role_namer = None
            if use_role_decomposition:
                llm_role_namer = make_llm_role_namer_fn(
                    llm_base_url, llm_model,
                    constrained=role_namer_constrained,
                    provider=llm_provider, api_key=llm_api_key)

            llm_role_fn = None
            if use_llm_roles:
                llm_role_fn = make_llm_role_classifier_fn(
                    llm_base_url, llm_model,
                    provider=llm_provider, api_key=llm_api_key)

            # NEW: per-row role extractor (Zhu et al. EMNLP 2025) — primary route
            per_row_extractor = None
            if use_per_row_role_extraction:
                per_row_extractor = make_per_row_role_extractor_fn(
                    llm_base_url, llm_model,
                    provider=llm_provider, api_key=llm_api_key)
                if per_row_extractor:
                    st.info(f'Per-row role extraction: **{llm_provider}** · `{llm_model}`')
                else:
                    st.warning('LLM unreachable for per-row role extraction.')

            # Collect detected text-column names across all uploaded configs —
            # used by phrase-slot mining to identify description-like fields.
            all_text_cols: list = []
            for cfg in configs.values():
                for c in cfg.get('text_cols', []):
                    if c not in all_text_cols:
                        all_text_cols.append(c)

            # Replace the module-level FIELD_NAME_NOISE with a noise set DERIVED
            # from the actual detected column names — zero hardcoding.  All
            # downstream calls (label_cluster, _bigram_preferred_terms) read
            # the module-level name so this swap propagates everywhere.
            import sys as _sys
            _sys.modules[__name__].FIELD_NAME_NOISE = build_field_noise(configs)

            with st.spinner('Step 6 — Building group-anchored LoD tree '
                            '(slot mining → FASTopic → NMF) …'):
                nodes = build_dynamic_lod_tree(
                    can, reprs, alabels, tfidf,
                    max_depth=max_depth,
                    min_cluster_size=min_cluster_sz,
                    sil_threshold=sil_thresh,
                    max_clusters_per_split=max_k_split,
                    project=project_name,
                    local_nmf=local_nmf,
                    min_local_nmf_size=min_local_nmf,
                    max_aspects=max_aspects,
                    sbert_model=sbert_model,
                    llm_label_fn=llm_fn,
                    use_slot_mining=use_slot_mining,
                    text_col_names=all_text_cols,
                    use_fastopic=use_fastopic,
                    fastopic_min_size=fastopic_min_sz,
                    llm_role_classifier_fn=llm_role_fn,
                    use_role_decomposition=use_role_decomposition,
                    llm_role_namer_fn=llm_role_namer,
                    role_regularity_threshold=role_regularity_threshold,
                    per_row_role_extractor_fn=per_row_extractor,
                    use_per_row_role_extraction=use_per_row_role_extraction,
                )
                st.session_state.a2_nodes = nodes


            # Concise build summary — per-route node counts
            route_counts: dict = Counter()
            for n in nodes:
                if n.get('type') == 'aggregation':
                    route_counts[
                        n.get('structure_provenance', {}).get('route', '—')] += 1
            n_leaves   = len([n for n in nodes if n.get('type') == 'attribute'])
            n_internal = len([n for n in nodes if n.get('type') == 'aggregation'])
            route_str  = ' · '.join(f'{r}: {c}' for r, c in route_counts.most_common())
            st.success(f'Done — {n_leaves} variables · {n_internal} internal nodes '
                       f'({route_str})')

            # If any LLM call hit a rate-limit (429), the model ran out of
            # tokens — tell the user to switch model in the sidebar and rebuild.
            # The audit `summary` is iterated by key; the keys are the reason
            # strings that get scanned for the rate-limit markers.
            ran_out = any(
                ('RateLimit' in str(r) or '429' in str(r))
                for a in (st.session_state.get('a2_per_row_audit') or [])
                for r in (a.get('summary') or {})
            )
            if ran_out:
                # Only suggest a Groq model when Groq is the active provider.
                if llm_provider == 'groq':
                    _switch_hint = ('Switch to another Groq model in the sidebar '
                                    '(e.g. llama-3.1-8b-instant) and rebuild.')
                else:
                    _switch_hint = ('Switch to another model in the sidebar '
                                    'and rebuild.')
                st.error(f'Ran out of tokens on `{llm_model}`. {_switch_hint}')
        except Exception as e:
            # Top-level UI boundary: surface the failure and its traceback
            # instead of letting the whole app crash.
            st.error(f'Build failed: {e}')
            import traceback; st.code(traceback.format_exc())

# ── display ───────────────────────────────────────────────────────────────────
# Everything below renders a finished build.  Session state is read through
# .get() so that a build which stored only some of its keys cannot crash the
# display layer; with no tree stored at all, the script stops here.
nodes = st.session_state.get('a2_nodes')
if nodes is None:
    st.info('Upload a metadata file and click **Build Approach 2 Hierarchy** to start.')
    st.stop()

can  = st.session_state.get('a2_can')
meta = st.session_state.get('a2_meta') or {}

_tab_titles = ['LoD Tree', 'Evaluation', 'Role Decomposition',
               'Label Provenance', 'Metadata', 'Export', 'Method']
tabs = st.tabs(_tab_titles)

with tabs[0]:
    # ── Visualization controls (above chart — easy to find, matches Approach 1.1) ─
    _ctrl = st.columns([2, 2, 1, 1, 1])
    with _ctrl[0]:
        viz_mode = st.radio(
            'View mode',
            ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
            horizontal=True, index=0,
            help='Sunburst best for large hierarchies [Taxonomizer]. '
                 'Node-link best for moderate depth structure inspection.'
        )
    with _ctrl[1]:
        depth_display = st.slider('Depth (Level of Detail)',
                                  min_value=1, max_value=8, value=4, step=1)
    with _ctrl[2]:
        show_leaf_labels = st.checkbox('Leaf labels', value=False)
    with _ctrl[3]:
        show_hidden = st.checkbox('Hidden nodes', value=False)
    with _ctrl[4]:
        compress_chains = st.checkbox('Compress chains', value=True,
                                       help='Merge one-child aggregation chains '
                                            '(e.g. "DMS → DMS Recommended Standard") '
                                            'for display. Export JSON keeps original structure.')
    st.divider()

    # Chain compression only affects what is drawn; `nodes` itself is untouched.
    if compress_chains:
        display_nodes = compress_one_child_chains(nodes)
    else:
        display_nodes = nodes

    # Pick the figure for the selected view, then render it once.
    if viz_mode == 'Sunburst (drill-down)':
        _lod_fig = plot_sunburst(display_nodes, max_depth=depth_display)
    elif viz_mode == 'Treemap':
        _lod_fig = plot_treemap(display_nodes)
    else:
        _lod_fig = plot_node_link(display_nodes, depth_display,
                                  show_hidden, show_leaf_labels)
    st.plotly_chart(_lod_fig, use_container_width=True)

    # Summary counts are taken from the uncompressed tree.
    n_l = sum(1 for node in nodes if node.get('type') == 'attribute')
    n_i = sum(1 for node in nodes if node.get('type') == 'aggregation')

    # max depth
    pm = _parent_map(nodes)

    def _node_depth(nid):
        """Count how many parent links lie between `nid` and a parentless node."""
        steps = 0
        while nid in pm:
            nid = pm[nid]
            steps += 1
        return steps

    max_d = max(map(_node_depth, (node['id'] for node in nodes)), default=0)

    _summary = (
        ('Variables', n_l),
        ('Internal nodes', n_i),
        ('Global aspects', meta.get('K', '?')),
        ('Max depth', max_d),
    )
    for _cell, (_title, _val) in zip(st.columns(4), _summary):
        _cell.metric(_title, _val)

with tabs[1]:
    # Evaluation tab: reference-free metrics (computed on demand and cached in
    # session state), label-quality proxies, structural statistics, and a
    # descriptive group-column self-consistency check.
    import hierarchy_eval as he

    st.markdown('### Evaluation')
    if can is None or meta.get('W') is None:
        st.info('Run the builder first.')
    else:
        st.caption(
            'No manually curated reference taxonomy is available for this experiment. '
            'The primary metrics below are **reference-free** — they assess the hierarchy '
            'itself. Group-column figures later are descriptive self-consistency checks only. '
            'Identical definitions to the Baseline and Approach 1 apps make the primary '
            'diagnostics directly comparable.'
        )

        # ── PRIMARY: reference-free hierarchy quality (compute on demand) ──────
        # These use SBERT, which is slow to load. Computing them only on a button
        # click keeps the tree, sliders and Save button instant.
        st.markdown('#### Primary — reference-free hierarchy quality')
        if st.button('▶Compute reference-free metrics', key='a2_eval_btn'):
            with st.spinner('Computing reference-free metrics (loads SBERT once)…'):
                tm   = he.traco_metrics(nodes)
                # Normalise the text column exactly as the build step does
                # (missing → '', everything → str) so a NaN cell cannot reach
                # the coherence metric as a float.
                eval_texts = can['_text'].fillna('').astype(str).tolist()
                npmi = he.npmi_coherence(nodes, eval_texts)
            st.session_state['a2_eval_cache'] = {'tm': tm, 'npmi': npmi}

        _ev = st.session_state.get('a2_eval_cache')
        if _ev:
            tm, npmi = _ev['tm'], _ev['npmi']
            p1, p2, p3 = st.columns(3)
            p1.metric('Parent–child coherence', tm['pc_coherence'],
                      help='TraCo (Wu et al., AAAI 2024). Children nest under parent theme.')
            p2.metric('Sibling diversity', tm['sibling_diversity'],
                      help='TraCo (Wu et al., AAAI 2024). Higher = distinct siblings; LOW = redundant.')
            p3.metric('NPMI label coherence', npmi,
                      help='Lau et al., EACL 2014. Label terms genuinely co-occur in the data.')
            st.caption(f'Embedding backend: **{tm["encoder"]}**.')
        else:
            st.info('Click the button above to compute coherence / diversity / NPMI '
                    '(takes a few seconds the first time while SBERT loads).')

        # ── Label-quality proxies (interpretability) ──────────────────────────
        st.markdown('#### Label quality *(interpretability — reference-free)*')
        lq = he.label_quality(nodes)
        l1, l2, l3 = st.columns(3)
        l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%",
                  help='% of internal labels that read as a real concept (short noun '
                       'phrase, WordNet head) rather than a "/"-joined term fragment.')
        l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%",
                  help='% of internal labels duplicating a sibling label (lower is better).')
        l3.metric('Avg label words', lq['avg_label_words'],
                  help='Mean label length in words.')

        # ── Structural statistics ─────────────────────────────────────────────
        st.markdown('#### Structural statistics')
        sm = he.structural_stats(nodes)
        s1, s2, s3, s4, s5 = st.columns(5)
        s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
        s2.metric('Max leaf depth',    sm['max_depth'])
        s3.metric('Avg leaf depth',    sm['avg_leaf_depth'])
        s4.metric('Avg branching',     sm['avg_branching_factor'])
        s5.metric('Singleton nodes',   f"{sm['singleton_nodes_%']}%")

        # ── Group-structure self-consistency (descriptive, NOT accuracy) ───────
        st.markdown('#### Group-structure self-consistency *(descriptive — not accuracy)*')
        st.caption(
            'The group column is a **construction input** (group-anchored L1/L2), so this '
            'only confirms the NMF aspect partition reflects its own input — expected high, '
            'NOT a quality signal and NOT comparable across methods as an accuracy metric.'
        )
        # Reference label = first segment of the ' > '-joined group path;
        # prediction = index of the largest entry in each row of W.
        true_labels = can['_group'].apply(
            lambda x: str(x).split(' > ')[0].strip()).tolist()
        W        = meta['W']
        pred_nmf = np.argmax(W, axis=1).tolist()
        metrics  = evaluate(true_labels, pred_nmf)
        st.metric('ARI (self-consistency)', metrics['ARI'])

        # ── legacy global aspect table (diagnostic only) ─────────────────────
        with st.expander('Legacy global NMF aspect table (diagnostic — not the '
                          'primary result)'):
            st.caption('Global NMF aspects are a coarse lexical fallback. The '
                       'hierarchy is built from per-variable role extraction — '
                       'see the Role Decomposition tab for the actual result.')
            alabs = meta.get('aspect_labels') or []
            # Pad the label list so a stored list shorter than W's column
            # count yields blank headings instead of an IndexError.
            short_labels = [str(a)[:30] for a in alabs]
            short_labels += [''] * (W.shape[1] - len(short_labels))
            W_df  = pd.DataFrame(
                W, columns=[f'Aspect {k+1}: {short_labels[k]}'
                            for k in range(W.shape[1])])
            W_df.insert(0, 'Variable', can['_label'].tolist())
            st.dataframe(W_df.round(4), use_container_width=True)

with tabs[2]:
    # Role Decomposition tab: per-group routing summary, a per-variable role
    # table (audit extractions merged over tree-derived roles), and a
    # drill-down into the per-row extraction audit.
    st.markdown('### Role decomposition')
    st.caption('Each variable decomposed into measure / statistic / condition / subtype.')

    if can is None:
        st.info('Build a hierarchy first.')
    else:
        # ── Per-group summary ─────────────────────────────────────────────────
        st.markdown('#### Per-group routing')
        reg_rows = []
        for n in nodes:
            sp = n.get('structure_provenance', {})
            if sp.get('route') == 'group_anchor' and 'phrase_regularity' in sp:
                reg_rows.append({
                    'Group':       n.get('name', ''),
                    'Regularity':  sp.get('phrase_regularity'),
                    'Route used':  sp.get('route_used', '—'),
                })
        if reg_rows:
            reg_df = pd.DataFrame(reg_rows).sort_values(
                'Regularity', ascending=False, na_position='last')
            st.dataframe(reg_df, use_container_width=True, hide_index=True)

        # ── Per-variable role table ───────────────────────────────────────────
        st.markdown('#### Per-variable role table')

        # Primary source: raw per-row LLM extractions captured in the audit
        # (complete — includes roles that didn't become tree levels, e.g.
        # condition values skipped by singleton prevention).
        # Fallback per variable: roles collected from tree-ancestor nodes.
        audits = st.session_state.get('a2_per_row_audit') or []
        audit_roles_by_vi: dict = {}
        for a in audits:
            vi_list_a = a.get('vi_list') or []
            for r in (a.get('per_row_audit') or []):
                ridx = r.get('row_idx')
                accepted = r.get('accepted_roles') or {}
                # The lower bound matters: a negative index would silently wrap
                # around and attach the roles to the wrong variable.
                if ridx is not None and 0 <= ridx < len(vi_list_a) and accepted:
                    audit_roles_by_vi[int(vi_list_a[ridx])] = accepted

        # Tree-walk fallback (roles that became hierarchy levels)
        node_map_disp = {int(n['id']): n for n in nodes}
        parent_lookup: dict = {}
        for n in nodes:
            for c in n.get('related', []):
                # First parent seen wins if a child is listed more than once.
                parent_lookup.setdefault(int(c), int(n['id']))

        def _tree_roles_for_attr(node_id: int) -> dict:
            """Collect role → ancestor name while walking up from `node_id`.

            The nearest ancestor carrying a given role wins; the walk stops at
            a node without a parent or at an id missing from the node map.
            """
            roles_here: dict = {}
            cur = node_id
            while cur in parent_lookup:
                cur = parent_lookup[cur]
                cur_node = node_map_disp.get(cur)
                if not cur_node:
                    break
                lp = cur_node.get('label_provenance', {})
                role = (lp.get('role')
                         or cur_node.get('structure_provenance', {}).get('slot_role'))
                if role and role not in roles_here:
                    roles_here[role] = cur_node.get('name', '')
            return roles_here

        role_rows = []
        for vi, (_, crow) in enumerate(can.iterrows()):
            # Audit roles win; tree roles fill any gaps
            roles_here = dict(_tree_roles_for_attr(vi + 1))   # attribute ids = 1..n
            for r, v in (audit_roles_by_vi.get(vi) or {}).items():
                if v:
                    roles_here[r] = v
            row = {
                'Group':    str(crow.get('_group', '')).split(' > ')[0].strip(),
                'Variable': str(crow.get('_label', '')),
            }
            # Standard roles get their own column; whatever is left over is
            # folded into 'Other roles'.
            for std_role in ('measure', 'statistic', 'condition',
                              'subtype', 'outcome', 'modifier'):
                row[std_role.title()] = roles_here.pop(std_role, '')
            if roles_here:
                row['Other roles'] = '; '.join(
                    f'{r}: {v}' for r, v in roles_here.items())
            role_rows.append(row)

        if role_rows:
            role_df = pd.DataFrame(role_rows)
            st.dataframe(role_df, use_container_width=True, hide_index=True)
            st.download_button(
                'Download per-variable role CSV',
                data=role_df.to_csv(index=False).encode('utf-8'),
                file_name=f'{safe_name(project_name)}_approach2_role_decomposition.csv',
                mime='text/csv',
            )

        # ── Per-row LLM extractor audit ───────────────────────────────────────
        if audits:
            st.markdown('#### Extraction audit')

            # Summary table per group
            sum_rows = []
            for a in audits:
                sum_rows.append({
                    'Group':         a.get('group_name', ''),
                    'Rows':          a.get('n_rows', 0),
                    'Coverage':      a.get('coverage'),
                    'Valid':         a.get('valid'),
                    'Roles found':   ', '.join(a.get('roles_final', []))[:60],
                    'Reasons':       ', '.join(f'{k}:{v}' for k, v in
                                                (a.get('summary', {}) or {}).items()),
                })
            st.dataframe(pd.DataFrame(sum_rows), use_container_width=True,
                          hide_index=True)

            # Drill-down per group
            grp_names = [a.get('group_name', '?') for a in audits]
            if grp_names:
                sel_grp = st.selectbox(
                    'Drill into a group to see per-row proposals + rejections:',
                    grp_names)
                sel_audit = next((a for a in audits
                                    if a.get('group_name') == sel_grp), None)
                if sel_audit:
                    # Build a row for EVERY audited description; only the
                    # on-screen table is capped, so the CSV download is complete.
                    max_shown = 60
                    row_rows = []
                    for r in (sel_audit.get('per_row_audit') or []):
                        accepted = r.get('accepted_roles', {}) or {}
                        rejected = r.get('rejected', []) or []
                        row_rows.append({
                            'Row #':      r.get('row_idx', ''),
                            'Description': r.get('description_snippet', ''),
                            'Accepted':   '; '.join(f'{k}={v}'
                                                     for k, v in accepted.items())[:140],
                            'Rejected':   '; '.join(
                                f'{x[0]}={x[1]!r} (missing stems: {x[2]})'
                                if isinstance(x, (list, tuple)) and len(x) >= 3
                                else str(x) for x in rejected)[:200],
                            'Reason':     r.get('reason', ''),
                        })
                    if row_rows:
                        audit_df = pd.DataFrame(row_rows)
                        st.dataframe(audit_df.head(max_shown),
                                      use_container_width=True, hide_index=True)
                        if len(audit_df) > max_shown:
                            st.caption(f'Showing the first {max_shown} of '
                                        f'{len(audit_df)} rows — the download '
                                        f'contains all rows.')
                        # Download as CSV for offline analysis
                        csv_bytes = audit_df.to_csv(index=False).encode('utf-8')
                        st.download_button(
                            'Download per-row audit for this group',
                            data=csv_bytes,
                            file_name=f'{safe_name(project_name)}_audit_{safe_name(sel_grp)}.csv',
                            mime='text/csv',
                        )
        else:
            st.info('No role assignments recorded yet — Option D may have '
                    'fallen back to slot mining or aspect clustering for all '
                    'groups in this dataset.')

with tabs[3]:
    # Label Provenance tab: one row per aggregation node showing where its
    # label and structure came from, plus LLM usage summaries.
    st.markdown('### Label provenance')
    st.caption('Audit trail: which stage produced each node label.')
    rows = []
    for n in nodes:
        if n.get('type') != 'aggregation':
            continue
        # `or {}` / `or ''` below: a key that is present but holds None must
        # not crash the tab (slicing, float() and join all reject None).
        p = n.get('label_provenance') or {}
        s = n.get('structure_provenance') or {}
        confidence = p.get('confidence')
        rows.append({
            'Node':         n.get('name', ''),
            'Source':       p.get('label_source', '—'),
            'Route':        s.get('route', '—'),
            'Aspect method': s.get('aspect_method') or '—',
            'Silhouette':   s.get('silhouette') if s.get('silhouette') is not None else '—',
            'LLM used':     p.get('llm_used', False),
            'LLM rejected': p.get('llm_rejected', False),
            'LLM proposed': p.get('llm_raw_label') or '',
            'LLM reason':   (p.get('llm_reason') or '')[:60],
            'Confidence':   round(float(1.0 if confidence is None else confidence), 3),
            'Evidence':     ', '.join(str(t) for t in (p.get('evidence_terms') or []))[:120],
        })
    if not rows:
        st.info('No internal nodes yet — build a hierarchy first.')
    else:
        prov_df = pd.DataFrame(rows)

        # ── Labels by source ──────────────────────────────────────────────────
        source_counts = prov_df['Source'].value_counts()
        st.write('**Labels by source**')
        # Between 2 and 5 columns; extra entries wrap around via the modulo.
        cols_src = st.columns(min(5, max(2, len(source_counts))))
        for i, (src, cnt) in enumerate(source_counts.items()):
            cols_src[i % len(cols_src)].metric(str(src), int(cnt))

        # ── Structure routes ──────────────────────────────────────────────────
        am_counts = prov_df['Aspect method'].value_counts()
        st.write('**Structure routes used**')
        cols_am = st.columns(min(5, max(2, len(am_counts))))
        for i, (am, cnt) in enumerate(am_counts.items()):
            cols_am[i % len(cols_am)].metric(str(am), int(cnt))

        # ── LLM usage — split per-row extraction from the downstream refiner ──
        # Per-row nodes are LLM-BUILT (source 'per_row_llm_role'); the refiner
        # only renames deterministically-labeled nodes (source 'llm' when its
        # proposal is accepted).  Counting them together made the panel read
        # "N calls, 0 accepted" even on a fully successful build.
        n_per_row        = int((prov_df['Source'] == 'per_row_llm_role').sum())
        refiner_accepted = int((prov_df['Source'] == 'llm').sum())
        # One mask drives both the count and the table below, so the two can
        # never disagree about what counts as a refiner rejection.
        refiner_rejected_mask = ((prov_df['LLM rejected'] == True)  # noqa: E712
                                 & (prov_df['Source'] != 'per_row_llm_role'))
        refiner_rejected = int(refiner_rejected_mask.sum())
        st.write('**LLM usage**')
        cL1, cL2, cL3 = st.columns(3)
        cL1.metric('Per-row extraction nodes', n_per_row)
        cL2.metric('Refiner accepted', refiner_accepted)
        cL3.metric('Refiner rejected', refiner_rejected)
        if refiner_accepted == 0 and refiner_rejected == 0 and n_per_row > 0:
            st.caption('Label refiner did not run — the tree was built entirely '
                        'by per-row extraction, leaving no deterministic labels '
                        'to refine.')
        if refiner_rejected > 0:
            with st.expander('Rejected refiner proposals'):
                # Only rejections that actually carry a proposed label are listed.
                rej = prov_df[refiner_rejected_mask
                              & (prov_df['LLM proposed'].astype(str).str.len() > 0)]
                if len(rej):
                    st.dataframe(rej[['Node', 'LLM proposed', 'LLM reason']],
                                  use_container_width=True, hide_index=True)

        # ── Full provenance table ─────────────────────────────────────────────
        st.write('**Full per-node provenance**')
        st.dataframe(prov_df, use_container_width=True, hide_index=True)

with tabs[4]:
    # Metadata tab: the canonical table, minus the '_row' column when present.
    if can is not None:
        _metadata_view = can.drop(columns=['_row'], errors='ignore')
        st.dataframe(_metadata_view, use_container_width=True)

with tabs[5]:
    # Export tab: browser downloads of the tree JSON and canonical CSV, plus a
    # button that writes the same two files into outputs/approach_2/.
    # ── derive a per-CSV base name from the uploaded files ────────────────────
    # Uses the actual uploaded file names so different CSVs get different
    # output filenames (e.g. ai-mind-…json vs HCP_S1200_…json).
    csv_basis = ''
    if can is not None and '_source' in can.columns:
        sources = [str(s) for s in can['_source'].dropna().unique().tolist()]
        # Drop extensions, join with '+' if multiple files merged
        bases = [safe_name(Path(s).stem) for s in sources]
        csv_basis = '+'.join(bases) if bases else safe_name(project_name)
    if not csv_basis:
        csv_basis = safe_name(project_name)

    # Filenames and the serialised tree are computed once and shared by the
    # download buttons and the save-to-folder action so they cannot drift apart.
    lod_fname = f'{csv_basis}_approach2_lod.json'
    can_fname = f'{csv_basis}_approach2_canonical.csv'
    lod_json  = (json.dumps(nodes, indent=2, ensure_ascii=False)
                 if nodes else None)

    st.caption(f'Filename basis: **{csv_basis}**  '
                f'(taken from the uploaded CSV — different CSVs export under different names)')

    col1, col2 = st.columns(2)
    with col1:
        if lod_json is not None:
            st.download_button(
                'LoD tree JSON',
                data=lod_json.encode(),
                file_name=lod_fname,
                mime='application/json',
                use_container_width=True,
            )
    with col2:
        if can is not None:
            st.download_button(
                'Canonical CSV',
                data=can.to_csv(index=False).encode('utf-8'),
                file_name=can_fname,
                mime='text/csv',
                use_container_width=True,
            )

    st.divider()
    # ── Save directly into the project's outputs/approach_2/ folder ────────────
    _out_dir = Path(__file__).resolve().parent / 'outputs' / 'approach_2'
    st.markdown('### Save to project folder')
    st.caption(
        'The download buttons above go to your browser’s Downloads folder (a browser '
        f'restriction). This button instead writes the files into `{_out_dir}` with the '
        'dataset name — convenient for `evaluate_all.py`.'
    )
    if st.button('Save all to outputs/approach_2/', type='primary',
                 use_container_width=True):
        try:
            _out_dir.mkdir(parents=True, exist_ok=True)
            saved = []
            if lod_json is not None:
                (_out_dir / lod_fname).write_text(lod_json, encoding='utf-8')
                saved.append(lod_fname)
            if can is not None:
                can.to_csv(_out_dir / can_fname, index=False)
                saved.append(can_fname)
            if saved:
                st.success(f'Saved to `{_out_dir}`:\n\n- ' + '\n- '.join(saved))
            else:
                # Avoid a success banner with an empty bullet list.
                st.warning('Nothing to save yet — build a hierarchy first.')
        except Exception as _e:
            # Filesystem/serialisation errors are reported in the UI rather
            # than crashing the app.
            st.error(f'Could not save: {_e}')

with tabs[6]:
    st.markdown("""
## Approach 2 — Role-Decomposed Hierarchy via SBERT Phrase Clustering
### Option D primary route + slot mining + FASTopic + constrained LLM labels

### Algorithm

```
Step 1  Build metadata text objects (variable name + description + group)
        [GON §3] — Gonçalves et al. (2019)

Step 2  Group-anchored L1/L2 structure  [NEW]
        — detected group columns → path nodes (no hardcoding)
        — e.g. category > assessment, or task > variant
        — falls back to global NMF if no groups detected

Step 3a Phrase-slot mining (slot-first routing)  [IE / slot induction]
        — For each terminal group, attempt deterministic phrase-slot
          decomposition of variable descriptions:
            • extract concept-prefix per row
            • mine repeated n-grams (1–4 tokens, ≥2 rows)
            • compute mutual-exclusion across phrase pairs:
                M[a,b] = 1 − cooc[a,b] / min(count[a], count[b])
            • cluster phrases by mutual exclusion → slots
        — A slot is a set of phrases that rarely co-occur within a row but
          each co-occur with phrases from other slots (= alternatives at
          the same semantic position).
        — Activates only when slot structure is statistically strong:
            • ≥ 2 slots discovered
            • ≥ 55% row coverage
            • each slot ≥ 2 distinct phrases
        — No domain hardcoding: phrase content is discovered from the data;
          slot names = the highest-coverage phrase in each slot.
        — When valid, the local hierarchy is built directly from slot values.
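        — When invalid (free-form descriptions, e.g. parts of HCP), routing
          falls through to FASTopic / NMF aspect discovery (Step 3b).

Step 3b Local aspect discovery per terminal group (FASTopic → NMF)  [FASTopic / ZHU §3.1 adapted]
        — Fallback path when slot mining does not apply.
        — FASTopic is the primary aspect model; NMF is the lexical fallback
          for small groups or when FASTopic is unavailable.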
        — NMF runs inside each group, not across all variables
        — prevents globally-dominant terms from polluting local aspects
        — K selected by reconstruction-error elbow (deterministic)
        — falls back to global embeddings for small groups (< min_local_nmf_size)

Step 4  Per-aspect variable representations  [ZHU §3.1]
        — for each aspect k: filter text to top-T terms → SBERT or masked TF-IDF

Step 5  Independent per-aspect GMM clustering  [ZHU §3.2]
        — GMM with diagonal covariance + BIC for stable k selection
        — runs inside each group's aspect space

Step 6  Simplified best-aspect split  [ZHU §3.3 adapted]
        — at each node: evaluate all K aspects by silhouette score
        — highest silhouette → GMM split → child aggregation nodes
        — NOTE: this is a silhouette-based greedy split, not the full
          probabilistic search of Zhu et al. Eq. 6/7
        — singleton prevention: 1-variable clusters attach directly (no wrapper node)

Step 5a UPSTREAM LLM phrase-role classification  [TopicGPT, NAACL 2024 adapted]
        — One LLM call per terminal group:
          input  : (i) all repeated mined phrases (verbatim from the corpus)
                   (ii) 2–4 sample variable descriptions for context
                   (iii) the group name
          output : {role_name: [phrases]}  e.g. {measure: [...], statistic: [...],
                                                  condition: [...]}
        — Anti-hallucination:
          • every phrase in the returned roles MUST match an input phrase
            verbatim (validator drops anything else)
          • role names must be 1–2 generic English words (drops fancy/long names)
          • ≥ 2 valid roles required; else falls back to mutual-exclusion
        — Phrase ↔ role mapping drives the slot hierarchy in Step 6:
          measure-like roles become outer levels, condition-like roles inner.
        — STRUCTURAL use of the LLM (TopicGPT discovers topics).  Distinct
          from the downstream label-refiner step (TopicTag, see Step 7e).

Step 7  Node labeling  [ZHU §4.3 / TopicTag DocEng 2024]
        a) description-prefix phrase shared by ≥60% of cluster
        b) group-purity prefix: if ≥70% share one _group top-level value
        c) data-driven boilerplate + FIELD_NAME_NOISE filter
        d) bigram-preferred discriminative TF-IDF suffix
        e) OPTIONAL downstream LLM refinement [TopicTag]:
           — receives only evidence terms + parent path + sample descriptions
           — strict grounding check: every label word must appear in evidence
           — rejected proposals fall back to deterministic label
           — provenance stored on each node (label_source, confidence, evidence_terms)

Step 8  Evaluation  [TraCo AAAI 2024 / Lau EACL 2014 / TICL §3.4]
        — reference-free parent-child coherence and sibling diversity
        — reference-free NPMI label coherence and label-quality proxies
        — optional group-structure self-consistency, explicitly not manual-reference accuracy
        — label and structure provenance audit tables
```

### Key design decisions

| Decision | Rationale |
|---|---|
| FASTopic replaces NMF as primary aspect discovery | NMF (1999) is lexical only; FASTopic (NeurIPS 2024) uses pretrained Transformer + Dual Semantic-relation Reconstruction → semantic, not lexical. |
| NMF kept as fallback | Required for very small groups or when FASTopic / SBERT model is unavailable. |
| Slot mining tried first | Decomposes variables along multiple semantic dimensions before any topic model. No document-level method (NMF, BERTopic, FASTopic) can do this — they all collapse one variable into one vector. |
| No facet trees | Removed: a single coherent LoD tree is easier to defend than parallel views of one clustering. |
| Deterministic labels = default thesis result | Reproducible without API access. LLM is opt-in re-phrasing only. |
| LLM via local Ollama | Localhost OpenAI-compatible endpoint (`http://localhost:11434/v1`) → LLM ON by default whenever Ollama is reachable; easy to disable. Override `OLLAMA_URL` / `OLLAMA_MODEL` env vars for non-default deployments. No external API, no key management, fully reproducible from a known model checkpoint. |
| Strict LLM grounding | Every label word must appear in evidence — labels come from the CSV, LLM only rewords. |
| Per-node provenance | Audit trail: `label_source ∈ {description_prefix, tfidf_bigram, group_anchor, phrase_slot, llm, fallback}`. |

### Thesis wording (defense-safe)

*Approach 2 is a dataset-constrained multi-aspect hierarchy with strict separation
between structural decisions and label generation. The hierarchy topology is produced
deterministically: detected group metadata anchors the upper levels, IE-style phrase-slot
mining decomposes variables along multiple semantic dimensions when description structure
permits, and FASTopic (NeurIPS 2024) discovers latent semantic aspects in the remaining
groups, with NMF retained as a lexical fallback for small groups. Concept labels are
generated by a deterministic five-stage pipeline whose evidence comes exclusively from
the dataset itself. An optional TopicTag-style LLM refinement layer may re-phrase these
labels, but every LLM proposal must pass a grounding check — each word in the proposed
label must appear in the extracted evidence — and every label records its provenance
(source stage, confidence, evidence terms). The LLM can neither alter the tree structure
nor introduce vocabulary absent from the input CSV.*

### Papers used

| Ref | Citation | Role in this method |
|---|---|---|
| [ZHU] | Zhu et al. (2025). *Context-Aware Hierarchical Taxonomy Generation via LLM-Guided Multi-Aspect Clustering.* EMNLP 2025. | Main scaffold — adapted (FASTopic+NMF replace LLM aspect generation; greedy silhouette replaces Eq. 6/7 search). |
| [FASTopic] | Wu et al. (2024). *FASTopic: Pretrained Transformer is a Fast, Adaptive, Stable, and Transferable Topic Model.* NeurIPS 2024 (arXiv:2405.17978). | Recent SOTA replacement for NMF — semantic topic discovery via Dual Semantic-relation Reconstruction with optimal transport. |
| [IE-Slot] | Established IE literature on slot induction (surveyed *ACM Computing Surveys* 2022). | Phrase-slot mining adaptation — decomposes one variable into multiple alternative-phrase signals before clustering. |
| [GON] | Gonçalves et al. (2019). ESWC 2019. | Canonical metadata text-object construction. |
| [TopicGPT] | Pham et al. (2024). *TopicGPT: A Prompt-based Topic Modeling Framework.* NAACL 2024 (arXiv:2311.01449). | **STRUCTURAL** upstream LLM use — discovers semantic-role schema from mined phrases (one call per group); drives slot ordering in the hierarchy. Anti-hallucination contract: every phrase verbatim from corpus. |
| [TopicTag] | Eren et al. (2024). DocEng 2024 (arXiv:2407.19616). | Constrained LLM label-refinement pattern — LLM only names existing clusters, never modifies structure. Downstream use only. |
| [Qwen2.5] | Qwen Team (2024). *Qwen 2.5 Technical Report.* arXiv:2412.15115. | Open instruction-tuned model used as the local LLM (via Ollama) for label refinement. Replaces a hosted LLM for full offline reproducibility. |
| [TraCo] | Wu et al. (2024). AAAI 2024 (arXiv:2401.14113). | Diagnostic metrics: parent-child coherence + sibling diversity. |
| [TaxoAdapt] | Kargupta et al. (2025). ACL 2025 (arXiv:2506.10737). | Multidimensional taxonomy motivation. |
| [SC-Taxo] | (2026). arXiv:2605.00620. | Future work — bidirectional semantic consistency. |
| [BISE-26] | Motamedi, Novalija, Rei (2026). Springer BISE. | Validates group-anchored entry strategy. |
| [TICL] | Kejriwal et al. (2022). EAAI 108, 104548. | Concept-label evaluation framework. |

### Known limitations (honest)

* **FASTopic is still document-level** — better than NMF semantically, but a single variable
  is still one vector. Multi-dimension decomposition relies on phrase-slot mining.
* **Greedy silhouette split** — not the full probabilistic search of Zhu et al. Eq. 6/7.
* **TraCo metrics are diagnostic only** — measured, not enforced (no neural transport plan).
* **LLM labels depend on the local Ollama model** — the chosen `OLLAMA_MODEL` checkpoint (e.g. Qwen 2.5) affects reproducibility;
  the deterministic pipeline is the canonical thesis result.
""")