diff --git "a/version2/approach_2.py" "b/version2/approach_2.py"
new file mode 100644
--- /dev/null
+++ "b/version2/approach_2.py"
@@ -0,0 +1,4405 @@
+# approach_2.py — Approach 2: Dataset-Constrained Multi-Aspect Hierarchy
+#
+# ALGORITHM (no domain hardcoding):
+#
+# Step 1 Build metadata text objects [GON §3]
+# Step 2 Group-anchored L1/L2 from detected _group metadata [BISE-26]
+# Step 3 Per terminal group — routing in priority order:
+# (a) Phrase-slot mining [IE / slot induction]
+# decomposes one variable into multiple alternative-phrase
+# signals before clustering — fixes the document-level ceiling
+# that NMF/BERTopic/FASTopic share.
+# (b) FASTopic semantic aspect discovery [Wu et al. NeurIPS 2024]
+# transformer-based Dual Semantic-relation Reconstruction with
+# optimal transport — recent SOTA replacement for NMF.
+# (c) NMF lexical fallback [ZHU §3.1 adapted]
+# retained for small groups or when FASTopic is unavailable.
+# Step 4 Per-aspect variable representations [ZHU §3.1]
+# Step 5 Independent per-aspect clustering: GMM+BIC small / KMeans large [ZHU §3.2]
+# Step 6 Top-down LoD tree (simplified silhouette best-aspect split) [ZHU §3.3 adapted]
+# Step 7 Node labeling — deterministic by default:
+# description-prefix → group anchor → IDF + FIELD_NAME filter →
+# bigram-preferred discriminative TF-IDF suffix
+# OPTIONAL: constrained LLM re-phrasing [TopicTag, DocEng 2024]
+# — every label word must appear in evidence (grounding check)
+# — provenance stored per node (audit trail)
+# Step 8 Reference-free diagnostics: parent-child coherence, sibling diversity,
+# label coherence, label quality, structure/provenance audit
+# [TraCo, AAAI 2024]
+#
+# Facet trees (Castanet, 2007) removed in this version — a single coherent LoD tree.
+#
+# PAPERS:
+# [ZHU] Zhu et al. (2025). EMNLP 2025. Main scaffold (adapted)
+# [FASTopic] Wu et al. (2024). NeurIPS 2024. Semantic aspect discovery
+# [GON] Gonçalves et al. (2019). ESWC. Canonical text objects
+# [TopicTag] Eren et al. (2024). DocEng. Constrained LLM label refinement
+# [TraCo] Wu et al. (2024). AAAI. Affinity + diversity metrics
+# [TICL] Kejriwal et al. (2022). EAAI. Optional clustering diagnostics
+# [BISE-26] Motamedi et al. (2026). BISE. Group-anchored entry validation
+# [IE-Slot] IE / slot-induction literature (surveyed Xu et al., FCS 2024).
+
+from __future__ import annotations
+import json
+import os
+import warnings
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Callable, Optional
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+import streamlit as st
+from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
+from sklearn.decomposition import NMF, TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.mixture import GaussianMixture
+from sklearn.preprocessing import LabelEncoder
+
+warnings.filterwarnings('ignore')
+
+try:
+ from sentence_transformers import SentenceTransformer
+ _SBERT_AVAILABLE = True
+except Exception: # broken torch raises OSError (WinError 127), not ImportError
+ _SBERT_AVAILABLE = False
+
+try:
+ from openai import OpenAI as _OpenAIClient
+ _LLM_CLIENT_AVAILABLE = True
+except ImportError:
+ _LLM_CLIENT_AVAILABLE = False
+
+# Ollama defaults (overridable via env vars OLLAMA_URL / OLLAMA_MODEL).
+# The URL is the OpenAI-compatible `/v1` root; `_ping_ollama` derives the
+# native `/api/tags` health-check URL from it.
+OLLAMA_URL_DEFAULT = 'http://localhost:11434/v1'
+OLLAMA_MODEL_DEFAULT = 'qwen2.5:3b-instruct'
+# Groq defaults: OpenAI-compatible cloud endpoint and default model name.
+GROQ_URL_DEFAULT = 'https://api.groq.com/openai/v1'
+GROQ_MODEL_DEFAULT = 'qwen/qwen3-32b'
+
+def _ping_ollama(base_url: str = OLLAMA_URL_DEFAULT, timeout: float = 1.5) -> bool:
+    """
+    Quick reachability check for the local Ollama server.
+
+    Parameters
+    ----------
+    base_url : OpenAI-compatible base URL, normally ending in ``/v1``.
+    timeout  : seconds to wait for the HTTP response.
+
+    Returns True only when ``<server root>/api/tags`` answers with HTTP 200.
+    Any failure (client package missing, connection refused, timeout,
+    malformed URL) yields False; this function never raises.
+    """
+    if not _LLM_CLIENT_AVAILABLE:
+        return False
+    try:
+        import urllib.request as _urlreq
+        # /v1/models is OpenAI-compat; Ollama also exposes /api/tags.
+        # Remove the '/v1' path suffix explicitly: str.rstrip('/v1') strips the
+        # character set {'/', 'v', '1'} and would also eat trailing digits of
+        # the port (e.g. ':11411/v1' -> ':114').
+        root = base_url.rstrip('/')
+        if root.endswith('/v1'):
+            root = root[:-len('/v1')]
+        with _urlreq.urlopen(root + '/api/tags', timeout=timeout) as r:
+            return r.status == 200
+    except Exception:
+        return False
+
+def _make_llm_client(provider: str, base_url: str, api_key: str = '') -> Optional[object]:
+    """
+    Return an OpenAI-SDK client aimed at cloud Groq or at a local Ollama server.
+
+    Both back-ends speak the OpenAI REST dialect, so one client class serves
+    both; only ``base_url`` and the credential differ.
+
+    Returns None when the ``openai`` package is missing, when ``provider`` is
+    'groq' and no ``api_key`` was supplied, or when the Ollama server at
+    ``base_url`` does not answer the reachability ping.
+    """
+    if not _LLM_CLIENT_AVAILABLE:
+        return None
+
+    if provider != 'groq':
+        # Every non-Groq provider is handled as Ollama.
+        reachable = _ping_ollama(base_url)
+        if reachable:
+            # Ollama ignores the key but the SDK requires a non-empty string
+            return _OpenAIClient(base_url=base_url, api_key='ollama-local')
+        return None
+
+    if api_key:
+        return _OpenAIClient(base_url=base_url, api_key=api_key)
+    return None
+
+def _parse_json_response(raw: str) -> dict:
+    """
+    Robustly extract a JSON object from an LLM response.
+
+    Handles the response shapes seen across providers/models:
+      • plain JSON:                 {"role": "measure"}
+      • markdown-fenced:            ```json\\n{...}\\n```
+      • reasoning-model preamble:   <think>...</think>\\n{...}
+        (Qwen3 / DeepSeek-R1 style models emit thinking traces before the
+        answer when JSON mode is unavailable — e.g. qwen/qwen3-32b on Groq)
+      • prose then JSON:            Here is the result: {...}
+
+    Strategy: strip <think>…</think> blocks, then take the substring from the
+    first '{' to the last '}' and parse it.
+
+    Raises ValueError if the response is empty, contains no JSON object, or
+    the extracted span is not valid JSON (json.JSONDecodeError is a
+    ValueError subclass).
+    """
+    open_tag, close_tag = '<think>', '</think>'
+    s = (raw or '').strip()
+    if not s:
+        raise ValueError('empty response')
+    # Strip reasoning blocks (Qwen3 / R1 style). The tag literals must be
+    # non-empty: with '' the membership test is always true and the loop
+    # never terminates.
+    while open_tag in s:
+        start = s.find(open_tag)
+        end = s.find(close_tag, start)
+        if end == -1:
+            # Unclosed think block — drop everything from <think> onward,
+            # the JSON (if any) would be before it
+            s = s[:start]
+            break
+        s = s[:start] + s[end + len(close_tag):]
+    # Some chat templates pre-fill the opening tag, so only '</think>' shows
+    # up in the completion; the answer is whatever follows the last one.
+    if close_tag in s:
+        s = s.rsplit(close_tag, 1)[1]
+    s = s.strip()
+    # Take first '{' .. last '}' — covers fences and prose prefixes/suffixes
+    i, j = s.find('{'), s.rfind('}')
+    if i == -1 or j == -1 or j <= i:
+        raise ValueError('no JSON object in response')
+    return json.loads(s[i:j + 1])
+
+def _safe_chat_completion(client, model: str, prompt: str,
+                          max_tokens: int = 200, temperature: float = 0.1):
+    """
+    Run one chat completion, preferring JSON mode and degrading gracefully.
+
+    The request is first sent with ``response_format={"type": "json_object"}``.
+    Some provider/model pairs (notably some Qwen 3 variants on Groq) reject
+    that option with an HTTP 400 BadRequestError. When the raised exception
+    looks like such a rejection — its class name contains 'BadRequest', or its
+    message contains '400' or 'response_format' — the identical request is
+    re-sent without JSON mode. Any other exception propagates unchanged.
+
+    Prompts in this codebase already ask for JSON-only output, so the plain
+    retry still yields parseable responses.
+
+    Returns whatever ``client.chat.completions.create`` returns.
+    """
+    request = dict(
+        model=model,
+        messages=[{'role': 'user', 'content': prompt}],
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+    json_mode = {'type': 'json_object'}
+    try:
+        return client.chat.completions.create(**request, response_format=json_mode)
+    except Exception as exc:
+        message = str(exc)
+        rejected_json_mode = (
+            'BadRequest' in type(exc).__name__
+            or '400' in message
+            or 'response_format' in message
+        )
+        if not rejected_json_mode:
+            raise
+        # Retry without JSON mode on bad-request / unsupported-feature errors
+        return client.chat.completions.create(**request)
+
+try:
+ from fastopic import FASTopic # type: ignore[import-not-found]
+ _FASTOPIC_AVAILABLE = True
+except Exception: # fastopic pulls in torch; broken torch raises OSError, not ImportError
+ _FASTOPIC_AVAILABLE = False
+
+# ──────────────────────────────────────────────────────────────────────────────
+# ROLE KEYS (domain-agnostic)
+# ──────────────────────────────────────────────────────────────────────────────
+# Keyword lists used by detect_roles() to score column headers. Each keyword
+# is matched as a substring of the normalised (lower-cased, punctuation-free)
+# column name by _ks().
+# LEAF_KEYS: hints that a column names the individual variable (leaf).
+LEAF_KEYS = 'variable var field column attribute name code id item indicator question measure concept'.split()
+# GROUP_KEYS: hints that a column holds a grouping / category value.
+GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split()
+# TEXT_KEYS: hints that a column holds free-text descriptions.
+TEXT_KEYS = 'description definition desc label title question meaning note notes text display full details explanation'.split()
+# META_KEYS: hints that a column holds technical metadata (type, unit, range, ...).
+META_KEYS = 'type dtype data_type datatype unit units format decimal precision values value coding range min max scale'.split()
+
+# ──────────────────────────────────────────────────────────────────────────────
+# FILE LOADING
+# ──────────────────────────────────────────────────────────────────────────────
+def safe_name(n: str) -> str:
+    """Return *n* with every character other than alphanumerics, '-', '_' and '.' replaced by '_'."""
+    allowed_punct = '-_.'
+    pieces = []
+    for ch in n:
+        keep = ch.isalnum() or ch in allowed_punct
+        pieces.append(ch if keep else '_')
+    return ''.join(pieces)
+
+def try_read_csv(path: Path) -> pd.DataFrame:
+    """
+    Read a delimited text file by trying several encodings and separators.
+
+    Every (encoding, separator) pair is attempted with the Python parser
+    engine, skipping malformed lines. Candidates with fewer than two columns
+    are rejected as likely parsing failures. The remaining ones are scored as
+    ``10 * n_columns - mean NaN fraction`` and the first candidate reaching
+    the highest score wins.
+
+    Returns the winning frame with column names stripped and ';' removed.
+    Raises ValueError when no combination yields at least two columns.
+    """
+    encodings = ('utf-8-sig', 'utf-8', 'latin1')
+    # Explicit comma first (most common), other separators next, sniffing (None) last.
+    separators = (',', '\t', ';', '|', None)
+
+    winner = None
+    winner_score = -1
+    for encoding in encodings:
+        for delimiter in separators:
+            try:
+                frame = pd.read_csv(path, sep=delimiter, engine='python',
+                                    encoding=encoding, on_bad_lines='skip')
+                n_cols = frame.shape[1]
+                if n_cols >= 2:
+                    score = n_cols * 10 - float(frame.isna().mean().mean())
+                    if score > winner_score:
+                        winner, winner_score = frame, score
+            except Exception:
+                # This combination cannot parse the file; try the next one.
+                continue
+
+    if winner is None:
+        raise ValueError(f'Could not read {path.name}')
+
+    cleaned = []
+    for col in winner.columns:
+        cleaned.append(str(col).strip().replace(';', ''))
+    winner.columns = cleaned
+    return winner
+
+def load_any(path: Path) -> pd.DataFrame:
+    """
+    Load a tabular file into a DataFrame, dispatching on the file extension.
+
+    * .csv / .tsv / .txt — delegated to try_read_csv
+    * .xlsx / .xls       — pandas.read_excel
+    * .json              — a top-level list is flattened with json_normalize;
+                           for a top-level dict, the first value that is a
+                           list is flattened instead
+
+    Raises ValueError('Unsupported: <suffix>') for any other extension, and
+    also for JSON documents that contain no list to flatten.
+    """
+    suffix = path.suffix.lower()
+
+    if suffix in ('.csv', '.tsv', '.txt'):
+        return try_read_csv(path)
+
+    if suffix in ('.xlsx', '.xls'):
+        return pd.read_excel(path)
+
+    if suffix == '.json':
+        payload = json.loads(path.read_text(encoding='utf-8', errors='replace'))
+        if isinstance(payload, list):
+            return pd.json_normalize(payload)
+        if isinstance(payload, dict):
+            list_values = (v for v in payload.values() if isinstance(v, list))
+            first_list = next(list_values, None)
+            if first_list is not None:
+                return pd.json_normalize(first_list)
+
+    raise ValueError(f'Unsupported: {suffix}')
+
+# ──────────────────────────────────────────────────────────────────────────────
+# COLUMN ROLE DETECTION (domain-agnostic heuristic scoring)
+# ──────────────────────────────────────────────────────────────────────────────
+def _norm(c: str) -> str:
+    """Lower-case *c* (coerced to str) and turn every non-alphanumeric character into a space."""
+    return ''.join(ch if ch.isalnum() else ' ' for ch in str(c).lower())
+
+def _ks(c: str, keys: list) -> int:
+    """
+    Count how many keywords in *keys* occur as substrings of the normalised
+    column name *c*.
+
+    The name is normalised once, outside the loop, rather than once per keyword.
+    """
+    normalised = _norm(c)
+    return sum(1 for k in keys if k in normalised)
+
+def detect_roles(df: pd.DataFrame) -> dict:
+    """
+    Guess which columns act as leaf identifiers, groups, text and metadata.
+
+    Each column gets four heuristic scores: a keyword score from its header
+    (via _ks and the *_KEYS lists) plus a bonus derived from its values
+    (uniqueness ratio, distinct-value count, or mean string length).
+
+    Returns a dict with keys 'leaf_cols', 'group_cols', 'text_cols' and
+    'meta_cols', each a list of column names:
+      * leaf  — the single best leaf_score column
+      * text  — all columns with text_score >= 4 (falls back to the leaf column)
+      * group — up to 3 non-leaf columns with group_score >= 4
+      * meta  — up to 4 columns with meta_score >= 4 not already used above
+    """
+    n_rows = max(len(df), 1)
+    profile_rows = []
+    for col in df.columns:
+        series = df[col]
+        n_unique = int(series.nunique(dropna=True))
+        unique_ratio = n_unique / n_rows
+        if series.notna().any():
+            mean_len = float(series.dropna().astype(str).map(len).mean())
+        else:
+            mean_len = 0
+
+        leaf_bonus = 3 if 0.5 <= unique_ratio <= 1 else 0
+        # Raised group uniqueness ceiling from 80 → 300 so large datasets (e.g. HCP
+        # with 100+ assessment categories) are not excluded.
+        group_bonus = 3 if 1 < n_unique < min(n_rows * 0.5, 300) else 0
+        text_bonus = 4 if mean_len > 50 else 0
+        meta_bonus = 2 if 1 < n_unique < min(n_rows * 0.8, 100) else 0
+
+        profile_rows.append({
+            'column': str(col),
+            'leaf_score': 4 * _ks(col, LEAF_KEYS) + leaf_bonus,
+            'group_score': 4 * _ks(col, GROUP_KEYS) + group_bonus,
+            'text_score': 5 * _ks(col, TEXT_KEYS) + text_bonus,
+            'meta_score': 4 * _ks(col, META_KEYS) + meta_bonus,
+        })
+    prof = pd.DataFrame(profile_rows)
+
+    def _ranked(mask, score_col):
+        """Column names of the masked rows, ordered by descending score."""
+        return prof[mask].sort_values(score_col, ascending=False)['column']
+
+    leaf = prof.sort_values('leaf_score', ascending=False)['column'].head(1).tolist()
+
+    text = _ranked(prof['text_score'] >= 4, 'text_score').tolist()
+    if not text:
+        text = list(leaf)
+
+    group_mask = (prof['group_score'] >= 4) & (~prof['column'].isin(leaf))
+    group = _ranked(group_mask, 'group_score').head(3).tolist()
+
+    meta_mask = (prof['meta_score'] >= 4) & (~prof['column'].isin(text + leaf + group))
+    meta = _ranked(meta_mask, 'meta_score').head(4).tolist()
+
+    return {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'meta_cols': meta}
+
+def sv(x) -> str:
+    """Return *x* as a whitespace-stripped string, or '' when pandas treats it as missing."""
+    if pd.isna(x):
+        return ''
+    return str(x).strip()
+
+def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
+    """
+    Build normalised per-variable rows with a unified _text field [GON §3].
+
+    Parameters
+    ----------
+    df     : raw data-dictionary table, one row per variable.
+    cfg    : role mapping with optional keys 'leaf_cols', 'group_cols',
+             'text_cols', 'meta_cols' (lists of column names).
+    source : label stored in the '_source' column of every row.
+
+    Returns a DataFrame with columns
+    ['_source', '_row', '_label', '_id', '_group', '_text']:
+      * _label — non-empty leaf values joined by ' / ' (fallback 'var_<index>')
+      * _group — non-empty group values joined by ' > ' (fallback 'Ungrouped')
+      * _text  — 'col: value' pairs of all role columns joined by ' | '
+      * _id    — '<group>.<label>', with '__<n>' appended to repeats
+
+    An empty input yields an empty frame that still has these columns.
+    """
+    leaf_cols = cfg.get('leaf_cols', [])
+    group_cols = cfg.get('group_cols', [])
+    text_cols = cfg.get('text_cols', [])
+    meta_cols = cfg.get('meta_cols', [])
+    # Order-preserving de-duplication; identical for every row, so computed once.
+    all_cols = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))
+    out_columns = ['_source', '_row', '_label', '_id', '_group', '_text']
+
+    rows = []
+    for i, row in df.iterrows():
+        label = (' / '.join(p for p in [sv(row.get(c, '')) for c in leaf_cols] if p)
+                 or f'var_{i}')
+        group = (' > '.join(p for p in [sv(row.get(c, '')) for c in group_cols]
+                            if p and p.lower() not in ['nan', 'none'])
+                 or 'Ungrouped')
+        cells = [(c, sv(row.get(c, ''))) for c in all_cols]
+        text = ' | '.join(f'{c}: {v}' for c, v in cells if v)
+        rows.append({'_source': source, '_row': int(i), '_label': label,
+                     '_id': f'{group}.{label}', '_group': group, '_text': text})
+
+    # Explicit columns keep the schema stable when `rows` is empty; without
+    # them pd.DataFrame([]) has no '_id' column and the lookup below raises.
+    can = pd.DataFrame(rows, columns=out_columns)
+
+    # Make ids unique: the first occurrence keeps its id, later ones get '__<n>'.
+    cnt: dict = defaultdict(int)
+    ids = []
+    for lid in can['_id']:
+        cnt[lid] += 1
+        ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
+    can['_id'] = ids
+    return can
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 3 — ASPECT DISCOVERY VIA NMF [ZHU §3.1 adapted]
+# ──────────────────────────────────────────────────────────────────────────────
+def _elbow_k(errors: list, k_range: range) -> int:
+    """
+    Pick K at the elbow of a reconstruction-error curve.
+
+    *errors* holds one error per candidate in *k_range*. With two or fewer
+    errors the first candidate is returned. Otherwise the elbow is the point
+    of largest second difference; when the second differences are flat
+    (spread below 1e-8) the result is max(2, floor(sqrt(len(errors)))).
+    """
+    candidates = list(k_range)
+    if len(errors) <= 2:
+        return candidates[0]
+
+    curvature = np.diff(np.diff(errors))
+    spread = curvature.max() - curvature.min()
+    if spread < 1e-8:
+        return max(2, int(np.sqrt(len(errors))))
+
+    position = int(np.argmax(curvature)) + 1
+    last = len(candidates) - 1
+    return candidates[position if position < last else last]
+
+def discover_aspects(texts: list, max_aspects: int = 10):
+    """
+    Discover K latent semantic aspects via NMF on TF-IDF [ZHU §3.1 adapted].
+
+    NMF stands in for Zhu et al.'s LLM aspect generation (deterministic, no
+    hallucination). Candidate K values run from 2 up to
+    min(max_aspects, n_docs // 2, 15); K is chosen at the elbow of the
+    reconstruction-error curve, then NMF is fitted once more with that K.
+
+    Returns (tfidf, X, nmf, W, H, K, labels), where each label joins the
+    four highest-weighted terms of one component with ' / '.
+    """
+    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
+                                 max_features=3000, min_df=1)
+    X = vectorizer.fit_transform(texts)
+    vocab = np.array(vectorizer.get_feature_names_out())
+    n_docs = X.shape[0]
+
+    upper = min(max_aspects + 1, n_docs // 2 + 1, 16)
+    k_range = range(2, upper)
+    if len(k_range) < 2:
+        # Too few candidates for an elbow: only K = 2 is tried.
+        k_range = range(2, 3)
+
+    def _fit(n_components):
+        """Fit one NMF model; return it together with its document weights."""
+        model = NMF(n_components=n_components, random_state=42,
+                    max_iter=400, init='nndsvda')
+        doc_weights = model.fit_transform(X)
+        return model, doc_weights
+
+    errors = [_fit(k)[0].reconstruction_err_ for k in k_range]
+    K = _elbow_k(errors, k_range)
+
+    nmf, W = _fit(K)
+    H = nmf.components_
+
+    # Aspect labels: top-4 terms per NMF component
+    labels = [' / '.join(vocab[np.argsort(H[k])[-4:][::-1]]) for k in range(K)]
+
+    return vectorizer, X, nmf, W, H, K, labels
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 3 (FASTopic variant) — semantic aspect discovery [Wu et al. NeurIPS 2024]
+# ──────────────────────────────────────────────────────────────────────────────
+def discover_aspects_fastopic(texts: list,
+                              max_aspects: int = 10,
+                              fallback_tfidf: bool = True):
+    """
+    Recent SOTA semantic aspect discovery via FASTopic [Wu et al. NeurIPS 2024,
+    arXiv:2405.17978].
+
+    FASTopic uses a pretrained Transformer (SBERT) to embed documents, then
+    learns topic/word embeddings via Dual Semantic-relation Reconstruction
+    (DSR) with optimal transport.
+
+    Adapter shape matches `discover_aspects` so it is a drop-in replacement:
+        returns (tfidf, X, model, W, H, K, labels)
+    `tfidf` and `X` are still produced; FASTopic provides W (doc-topic),
+    H_proxy (topic-term scores derived from top words), K, and labels.
+
+    Falls back to NMF (`discover_aspects`) when `fallback_tfidf` is true and
+    FASTopic is not installed, the group has fewer than 6 documents, FASTopic
+    raises, or its doc-topic matrix does not have shape (n_docs, K).
+
+    Raises
+    ------
+    RuntimeError
+        With `fallback_tfidf=False`, when FASTopic is unavailable / the group
+        is too small, or when the doc-topic matrix is malformed.
+    Exception
+        With `fallback_tfidf=False`, whatever FASTopic itself raised.
+    """
+    n_vars = len(texts)
+    # Keep a TF-IDF matrix available for downstream code paths
+    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
+                            max_features=3000, min_df=1)
+    X = tfidf.fit_transform(texts)
+    terms = np.array(tfidf.get_feature_names_out())
+
+    if not _FASTOPIC_AVAILABLE or n_vars < 6:
+        if fallback_tfidf:
+            return discover_aspects(texts, max_aspects)
+        raise RuntimeError('FASTopic unavailable and TF-IDF fallback disabled')
+
+    # FASTopic K selection: bounded similar to NMF elbow but simpler — pick a
+    # reasonable K from corpus size (avoids overfitting tiny groups).
+    K = max(2, min(max_aspects, int(np.ceil(np.sqrt(n_vars))) + 1, n_vars - 1))
+
+    try:
+        model = FASTopic(num_topics=K,
+                         doc_embed_model='all-MiniLM-L6-v2',
+                         verbose=False)
+        # fit_transform returns (top_words_per_topic, doc_topic_dist)
+        result = model.fit_transform(texts)
+        if isinstance(result, tuple) and len(result) == 2:
+            top_words, doc_topic = result
+        else:
+            # Some FASTopic versions return only doc_topic; pull top words via API
+            doc_topic = result
+            top_words = [model.get_top_words(topic_id=k, num_top_words=10)
+                         for k in range(K)]
+    except Exception:
+        # Robust fallback if FASTopic fails (small corpus, OOM, etc.)
+        if fallback_tfidf:
+            return discover_aspects(texts, max_aspects)
+        raise
+
+    W = np.asarray(doc_topic, dtype=np.float32)
+    if W.ndim != 2 or W.shape[0] != n_vars or W.shape[1] != K:
+        if fallback_tfidf:
+            return discover_aspects(texts, max_aspects)
+        # Never hand a mis-shaped matrix downstream just because the fallback
+        # is disabled; fail loudly like the "unavailable" branch does.
+        raise RuntimeError(
+            f'FASTopic returned a doc-topic matrix of shape {W.shape}; '
+            f'expected ({n_vars}, {K}) and TF-IDF fallback is disabled')
+
+    # Build H_proxy: K × n_terms with weight = position-decay of each top word
+    n_terms = len(terms)
+    term_to_idx = {t: i for i, t in enumerate(terms)}
+    H_proxy = np.zeros((K, n_terms), dtype=np.float32)
+    labels = []
+    for k in range(K):
+        words_k = top_words[k] if k < len(top_words) else []
+        if isinstance(words_k, str):
+            # A topic given as one space-joined string must be split into
+            # words; iterating the string itself would yield characters.
+            words_k = words_k.split()
+        # Each entry may be 'word', or 'word score', or (word, score)
+        clean: list = []
+        for w in words_k:
+            if isinstance(w, (list, tuple)):
+                w = w[0]
+            w = str(w).split(' ')[0].strip().lower()
+            if w:
+                clean.append(w)
+        for rank, w in enumerate(clean):
+            if w in term_to_idx:
+                H_proxy[k, term_to_idx[w]] += 1.0 / (rank + 1)
+        labels.append(' / '.join(clean[:4]) if clean else f'aspect {k+1}')
+
+    return tfidf, X, model, W, H_proxy, K, labels
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 4 — PER-ASPECT VARIABLE REPRESENTATIONS [ZHU §3.1]
+# ──────────────────────────────────────────────────────────────────────────────
+def per_aspect_representations(texts: list, H: np.ndarray,
+                               tfidf: TfidfVectorizer,
+                               sbert_model=None) -> list:
+    """
+    Build K independent representation matrices — one per aspect [ZHU §3.1].
+
+    For each aspect k (one row of H):
+      • with `sbert_model`: keep only the tokens of each text that belong to
+        the aspect's top-T (T ≤ 30) positively-weighted terms and encode the
+        filtered text; a text left empty by the filter is encoded unfiltered.
+      • without it: weight the dense TF-IDF rows by H[k] and L2-normalise
+        (all-zero rows are left as zeros).
+
+    Parameters
+    ----------
+    texts       : variable texts, in row order.
+    H           : array with one row per aspect and one column per TF-IDF term.
+    tfidf       : the fitted vectorizer whose vocabulary H refers to.
+    sbert_model : optional encoder exposing `.encode(list_of_str, ...)`.
+
+    Returns a list of K float32 arrays, each with one row per text.
+    """
+    terms = np.array(tfidf.get_feature_names_out())
+    K = H.shape[0]
+    T = min(30, len(terms))
+    reprs = []
+
+    if sbert_model is not None:
+        # Tokenise exactly as the vectorizer did, so tokens are comparable to
+        # vocabulary terms. A whitespace split leaves punctuation attached
+        # ('latency,' / 'description:') and such tokens never match.
+        preprocess = tfidf.build_preprocessor()
+        tokenize = tfidf.build_tokenizer()
+        token_lists = [tokenize(preprocess(txt)) for txt in texts]
+    else:
+        # The dense TF-IDF matrix is only needed by the masked fallback.
+        X_arr = tfidf.transform(texts).toarray()
+
+    for k in range(K):
+        if sbert_model is not None:
+            top_idx = np.argsort(H[k])[-T:]
+            # Zero-weight terms carry no evidence for this aspect; without
+            # this filter a sparse row pads its top-T with arbitrary terms.
+            top_idx = top_idx[H[k][top_idx] > 0]
+            top_terms = set(terms[top_idx])
+
+            filtered = []
+            for txt, tokens in zip(texts, token_lists):
+                kept = ' '.join(t for t in tokens if t in top_terms)
+                filtered.append(kept if kept.strip() else txt)
+            emb = np.asarray(
+                sbert_model.encode(filtered, show_progress_bar=False,
+                                   batch_size=64, normalize_embeddings=True))
+        else:
+            mask = H[k]
+            emb = X_arr * mask[np.newaxis, :]
+            norms = np.linalg.norm(emb, axis=1, keepdims=True)
+            norms[norms == 0] = 1.0  # avoid division by zero for empty rows
+            emb = emb / norms
+
+        reprs.append(emb.astype(np.float32))
+
+    return reprs
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 5 — INDEPENDENT PER-ASPECT CLUSTERING [ZHU §3.2]
+# ──────────────────────────────────────────────────────────────────────────────
+def cluster_aspect_gmm(emb: np.ndarray, max_k: int = 8, fast_threshold: int = 30):
+    """
+    Cluster variables within one aspect space [ZHU §3.2].
+
+    Hybrid strategy:
+      • n ≤ fast_threshold → GMM + BIC (lowest BIC wins)
+      • n > fast_threshold → MiniBatchKMeans + silhouette (highest wins)
+
+    Both paths try k = 2 .. min(max_k, n - 1) on the embedding reduced with
+    TruncatedSVD to at most min(20, n_features, n - 1) dimensions; the GMM
+    path uses diagonal covariances.
+
+    Returns (labels, k, silhouette):
+      * labels     — int array of length n
+      * k          — the selected number of clusters; 1 when n ≤ 2 or when
+                     no candidate model could be fitted
+      * silhouette — silhouette of the final labelling on the reduced
+                     embedding, 0.0 when fewer than two clusters result
+    """
+    n = emb.shape[0]
+    if n <= 2:
+        return np.zeros(n, dtype=int), 1, 0.0
+
+    d_target = min(20, emb.shape[1], n - 1)
+    if emb.shape[1] > d_target:
+        svd = TruncatedSVD(n_components=d_target, random_state=42)
+        emb_r = svd.fit_transform(emb)
+    else:
+        emb_r = emb.copy()
+
+    # best_k starts at 1 so that the "nothing could be fitted" fallback
+    # (all-zero labels) reports a cluster count that matches its labels.
+    best_score, best_labels, best_k = -np.inf, None, 1
+
+    if n > fast_threshold:
+        # Fast path: MiniBatchKMeans + silhouette (large groups)
+        for k in range(2, min(max_k + 1, n)):
+            try:
+                km = MiniBatchKMeans(n_clusters=k, random_state=42,
+                                     n_init=3, batch_size=min(256, n),
+                                     max_iter=100)
+                labels = km.fit_predict(emb_r)
+                if len(set(labels)) < 2:
+                    continue
+                sil = float(silhouette_score(emb_r, labels))
+                if sil > best_score:
+                    best_score, best_labels, best_k = sil, labels, k
+            except Exception:
+                # Best-effort model selection: skip a k that cannot be fitted.
+                continue
+    else:
+        # Accurate path: GMM + BIC (small groups)
+        best_bic = np.inf
+        for k in range(2, min(max_k + 1, n)):
+            try:
+                gmm = GaussianMixture(n_components=k, random_state=42,
+                                      covariance_type='diag', reg_covar=1e-3,
+                                      max_iter=80, n_init=1,
+                                      init_params='random_from_data')
+                gmm.fit(emb_r)
+                bic = gmm.bic(emb_r)
+                labels = gmm.predict(emb_r)
+                if bic < best_bic:
+                    best_bic, best_labels, best_k = bic, labels, k
+            except Exception:
+                # Best-effort model selection: skip a k that cannot be fitted.
+                continue
+
+    if best_labels is None:
+        best_labels = np.zeros(n, dtype=int)
+
+    sil = 0.0
+    if len(set(best_labels)) > 1:
+        try:
+            sil = float(silhouette_score(emb_r, best_labels))
+        except Exception:
+            pass  # keep 0.0 when the score cannot be computed
+
+    return best_labels.astype(int), best_k, sil
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 8a — OPTIONAL CONSTRAINED LLM LABEL REFINEMENT [TopicTag, DocEng 2024]
+# ──────────────────────────────────────────────────────────────────────────────
+def _light_stem(w: str) -> str:
+    """
+    Minimal English morphological normalisation — no NLTK dependency.
+
+    Used by the LLM grounding check so that 'latencies' matches 'latency',
+    'errors' matches 'error', 'completion' matches 'completed', etc.
+    Avoids rejecting plurals and common tense variants while still requiring
+    every label word to derive from evidence vocabulary.
+
+    Rules are tried in order (longer suffixes first) and the first one that
+    leaves more than two characters of stem is applied. Suffixes that replace
+    a base-form '-y' ('ies', 'ied', 'ities', 'iness') restore it, and the
+    restored form is stemmed once more so that inflected and base forms agree
+    ('families' → 'family' → 'fami', same as 'family').
+
+    Based on Porter-stemmer-style suffix stripping (Porter 1980, adapted).
+    """
+    w = w.lower().strip()
+    rules = (
+        ('ization', ''), ('isation', ''), ('ousness', ''), ('iveness', ''),
+        ('ities', 'ity'), ('iness', 'y'),
+        ('ation', ''), ('ments', ''), ('ness', ''), ('ions', ''),
+        ('ies', 'y'), ('ied', 'y'), ('ing', ''), ('ers', ''), ('ion', ''),
+        ('ed', ''), ('es', ''), ('er', ''), ('ly', ''), ('s', ''),
+    )
+    for suffix, replacement in rules:
+        if w.endswith(suffix) and len(w) > len(suffix) + 2:
+            stem = w[:-len(suffix)] + replacement
+            # A replacement always shortens the word, so this recursion ends.
+            return _light_stem(stem) if replacement else stem
+    return w
+
+def make_llm_label_fn(base_url: str = OLLAMA_URL_DEFAULT,
+                      model: str = OLLAMA_MODEL_DEFAULT,
+                      provider: str = 'ollama',
+                      api_key: str = '') -> Optional[Callable]:
+    """
+    Build a TopicTag-style constrained LLM label refinement function backed
+    by an OpenAI-compatible endpoint: a local Ollama server (default) or
+    Groq when ``provider='groq'`` and an ``api_key`` are given.
+
+    [TopicTag] Eren et al. (2024) run NMF to discover topics, then use an LLM
+    to generate human-readable concept labels from the NMF topic terms.
+    The LLM receives ONLY the extracted evidence from the CSV — it cannot
+    alter the tree, cannot introduce new vocabulary, and must pass a strict
+    grounding check (every word in the proposed label must appear in evidence).
+
+    Local-LLM default (Qwen 2.5 3B Instruct via Ollama) is deliberate:
+      • zero cost, zero API dependency, fully reproducible
+      • no external data transmission (privacy + thesis defensibility)
+      • TopicTag itself benchmarks open models (Llama, Mistral) — using an
+        open local model matches the paper's evaluation setup more closely
+        than a closed hosted model.
+
+    Returns a callable (candidate, top_terms, parent_path, sample_texts)
+    → (label, metadata_dict), or None if no client could be built (server
+    unreachable, missing API key, or openai package missing).
+
+    The callable never raises for LLM/parsing failures: it returns the
+    unchanged ``candidate`` and records why in ``metadata_dict['reason']``
+    ('empty', 'ungrounded_words: [...]', 'exception: <Type>', or 'accepted').
+    """
+    client = _make_llm_client(provider, base_url, api_key)
+    if client is None:
+        return None
+
+    def _stems(text) -> set:
+        """Stems of all alphanumeric tokens (length >= 3) found in *text*."""
+        # Split on every non-alphanumeric character so punctuation glued to
+        # a word ('latency,' / '(errors)') cannot hide it from the check.
+        cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in str(text).lower())
+        return {_light_stem(tok) for tok in cleaned.split() if len(tok) >= 3}
+
+    def _refine(candidate: str, top_terms: list, parent_path: str,
+                sample_texts: list):
+        meta = {'confidence': 0.0, 'evidence_terms': [],
+                'reason': '', 'raw_label': ''}
+        terms = [str(t) for t in list(top_terms)[:10]]
+        samples = [str(t) for t in list(sample_texts)[:4]]
+        prompt = (
+            'You are labeling a cluster in a metadata variable hierarchy.\n'
+            'The label MUST be derived strictly from the evidence terms and '
+            'sample variable descriptions provided. Do not introduce concepts '
+            'or vocabulary that are not visible in the evidence.\n\n'
+            f'Parent path: {parent_path}\n'
+            f'Evidence terms (from NMF/TF-IDF over the cluster): {", ".join(terms)}\n'
+            f'Sample variable descriptions:\n'
+            + '\n'.join(f' - {t[:160]}' for t in samples) + '\n'
+            f'\nCurrent candidate label: {candidate}\n\n'
+            'Task: Return a concise 2–5 word concept label that PARAPHRASES '
+            'the evidence into a cleaner concept name.\n'
+            'Rules:\n'
+            '1. Every word in the label must appear in (or be an obvious '
+            'morphological variant of) the evidence terms or sample descriptions.\n'
+            '2. Do not invent domain concepts that are not in the evidence.\n'
+            '3. Prefer multi-word noun phrases over single keywords.\n'
+            '4. Avoid generic words: data, score, variable, assessment, total, '
+            'description, value, decimal.\n'
+            '5. Use base forms — singular nouns (Latency not Latencies), and '
+            'avoid -ing / -ed verb suffixes unless required.\n'
+            '6. Output strict JSON only — no prose, no markdown.\n\n'
+            'Output: {"label": "...", "evidence_terms": ["...", "..."], "confidence": 0.0}'
+        )
+        try:
+            # max_tokens generous: reasoning models (Qwen3) emit traces
+            # that consume budget before the JSON appears.
+            resp = _safe_chat_completion(client, model, prompt,
+                                         max_tokens=1200, temperature=0.2)
+            raw = (resp.choices[0].message.content or '').strip()
+            result = _parse_json_response(raw)
+            label = str(result.get('label', '')).strip()
+
+            # Confidence is informational only (see NOTE below), so a
+            # malformed value must not discard an otherwise valid label.
+            try:
+                conf = float(result.get('confidence', 0))
+            except (TypeError, ValueError):
+                conf = 0.0
+            # Models sometimes return one string instead of a list.
+            evid = result.get('evidence_terms', []) or []
+            if isinstance(evid, str):
+                evid = [evid]
+            elif not isinstance(evid, (list, tuple)):
+                evid = []
+            evid = [str(e) for e in evid]
+
+            meta['confidence'], meta['evidence_terms'] = conf, evid
+            meta['raw_label'] = label
+            if not label:
+                meta['reason'] = 'empty'
+                return candidate, meta
+            # NOTE: we IGNORE the LLM's self-reported confidence. Qwen 3B
+            # routinely returns conf ≈ 0.5 on perfectly good labels — using it
+            # as a gate rejected useful refinements. Grounding (below) is the
+            # real anti-hallucination check; if every label word stems back to
+            # the corpus, the label is accepted regardless of self-confidence.
+            #
+            # Stem-aware grounding accepts morphological variants
+            # (latencies ↔ latency, errors ↔ error, completion ↔ completed).
+            haystack_stems = _stems(
+                ' '.join(terms + evid + samples + [str(parent_path)]))
+            ungrounded = _stems(label) - haystack_stems
+            if ungrounded:
+                meta['reason'] = f'ungrounded_words: {sorted(ungrounded)}'
+                return candidate, meta
+            meta['reason'] = 'accepted'
+            return label, meta
+        except Exception as e:
+            meta['reason'] = f'exception: {type(e).__name__}'
+            return candidate, meta
+
+    return _refine
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 5a — UPSTREAM LLM PHRASE-ROLE CLASSIFIER [TopicGPT, NAACL 2024 adapted]
+#
+# Used to discover semantic roles (Measure / Statistic / Condition / Subtype
+# style dimensions) from the corpus and ASSIGN each mined phrase to a role.
+# This is fundamentally different from TopicTag-style label refinement:
+# • TopicTag (and the make_llm_label_fn above) uses LLMs to RENAME clusters
+# after the tree structure is already decided. LLM is downstream.
+# • This function uses an LLM to CREATE the role schema before the tree is
+# built. Slot ordering and parent-child placement are driven by the
+# LLM-derived roles. LLM is upstream — closer to TopicGPT (Pham et al.,
+# NAACL 2024), where LLMs perform topic discovery, not topic labeling.
+#
+# Anti-hallucination contract (preserved):
+# • Every phrase in the role assignment MUST come from the corpus verbatim.
+# Outputs are validated against the input phrase list — anything new is
+# dropped.
+# • Role NAMES are proposed by the LLM but kept to 1–2 generic English
+# words; long or fanciful role names are rejected.
+# • If validation fails, the caller falls back to deterministic mutual-
+# exclusion clustering (current slot mining behaviour). No silent
+# failure modes.
+# ──────────────────────────────────────────────────────────────────────────────
+def make_llm_role_classifier_fn(base_url: str = OLLAMA_URL_DEFAULT,
+ model: str = OLLAMA_MODEL_DEFAULT,
+ provider: str = 'ollama',
+ api_key: str = '') -> Optional[Callable]:
+ """
+ Build a callable that classifies repeated corpus phrases into semantic
+ roles using an LLM (Ollama or Groq).
+
+ Returns None when no LLM client can be built for the given provider.
+
+ The returned callable has the signature
+ (phrases, sample_descs, group_name='') → (roles, meta)
+ where `roles` is a dict {role_name: [phrases]} holding only phrases taken
+ from the input list, or None when the input is empty, the response cannot
+ be used, or fewer than two valid roles remain after validation.
+ `meta` is a dict with keys 'reason', 'raw' (first 400 characters of the
+ response) and 'role_count'. Exceptions raised while calling the LLM or
+ parsing its answer are caught and reported through meta['reason'].
+ """
+ client = _make_llm_client(provider, base_url, api_key)
+ if client is None:
+ return None
+
+ def _classify(phrases: list, sample_descs: list, group_name: str = ''):
+ meta = {'reason': '', 'raw': '', 'role_count': 0}
+ if not phrases:
+ meta['reason'] = 'no_phrases'
+ return None, meta
+ # Truncate to keep the prompt small and the model focused
+ # (order-preserving de-duplication, then at most 60 phrases).
+ phrases_list = list(dict.fromkeys(str(p) for p in phrases))[:60]
+ prompt = (
+ 'You are analysing a list of repeated phrases mined from a '
+ 'data-dictionary corpus.\n\n'
+ f'Group context: {group_name or "(unknown)"}\n\n'
+ 'Repeated phrases (verbatim from the corpus):\n'
+ + '\n'.join(f' - {p}' for p in phrases_list) + '\n\n'
+ 'Sample variable descriptions for context:\n'
+ + '\n'.join(f' - {str(s)[:180]}' for s in sample_descs[:4]) + '\n\n'
+ 'TASK: Group these phrases into 2–5 SEMANTIC ROLES. Each role '
+ 'represents one ORTHOGONAL DIMENSION of what the variable measures '
+ '(for example: what is measured, what statistic is used, under '
+ 'what condition).\n\n'
+ 'STRICT RULES:\n'
+ '1. Use 2 to 5 roles. Fewer is better when phrases do not really '
+ 'belong to different dimensions.\n'
+ '2. Role names: ONE OR TWO generic English words, lowercase. '
+ 'Examples of good role names: "measure", "statistic", "condition", '
+ '"outcome", "subtype", "modifier". Do not invent fancy or domain-'
+ 'specific role names.\n'
+ '3. Every phrase from the input list MUST appear EXACTLY ONCE in '
+ 'exactly one role. Copy phrases VERBATIM. Do not rephrase, '
+ 'normalise, plural-strip, or invent new phrases.\n'
+ '4. Phrases that are alternatives (rarely co-occur in the same '
+ 'variable) should go in the SAME role.\n'
+ '5. Phrases that describe DIFFERENT dimensions of the same '
+ 'variable should go in DIFFERENT roles.\n'
+ '6. Output strict JSON only — no prose, no markdown fences.\n\n'
+ 'Output schema:\n'
+ '{\n'
+ ' "roles": {\n'
+ ' "role_name_1": ["phrase a", "phrase b", ...],\n'
+ ' "role_name_2": ["phrase c", "phrase d", ...]\n'
+ ' }\n'
+ '}'
+ )
+ try:
+ resp = _safe_chat_completion(client, model, prompt,
+ max_tokens=2000, temperature=0.1)
+ raw = (resp.choices[0].message.content or '').strip()
+ meta['raw'] = raw[:400]
+ result = _parse_json_response(raw)
+ roles_raw = result.get('roles', {}) or {}
+ if not isinstance(roles_raw, dict):
+ meta['reason'] = 'roles_not_dict'
+ return None, meta
+
+ # ── Validation ────────────────────────────────────────────────────
+ # Lower-cased phrase → original phrase, for case-insensitive matching.
+ input_set = {p.lower(): p for p in phrases_list}
+ roles_clean: dict = {}
+ # Phrases already assigned; the first role (in response order) that
+ # lists a phrase keeps it.
+ seen_phrases: set = set()
+ for role_name, items in roles_raw.items():
+ # Role name must be 1–2 generic words
+ rn = str(role_name).strip().lower()
+ if not rn or len(rn.split()) > 2 or len(rn) > 24:
+ continue
+ kept: list = []
+ if not isinstance(items, list):
+ continue
+ for it in items:
+ s = str(it).strip().lower()
+ if not s or s in seen_phrases:
+ continue
+ if s not in input_set:
+ # Phrase invented by LLM — drop (anti-hallucination)
+ continue
+ kept.append(input_set[s]) # original casing
+ seen_phrases.add(s)
+ # A role is kept only when at least two validated phrases remain.
+ if len(kept) >= 2:
+ roles_clean[rn] = kept
+ if len(roles_clean) < 2:
+ meta['reason'] = 'too_few_valid_roles'
+ return None, meta
+ meta['reason'] = 'accepted'
+ meta['role_count'] = len(roles_clean)
+ return roles_clean, meta
+ except Exception as e:
+ meta['reason'] = f'exception: {type(e).__name__}'
+ return None, meta
+
+ return _classify
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 8b–d — NODE LABELING [ZHU §4.3 / TopicTag]
+# ──────────────────────────────────────────────────────────────────────────────
+# Generic metadata field-name boilerplate that appears in essentially any
+# data-dictionary header (description, value, name, ...). Dataset-specific
+# column tokens (e.g. 'fullDisplayName' for HCP, 'Decimal Places' for AI-MIND)
+# are added at runtime by build_field_noise() — derived from the user-confirmed
+# column names, never enumerated by hand.
+_GENERIC_FIELD_NOISE = frozenset({
+ 'description', 'definition', 'value', 'metadata', 'desc',
+ 'name', 'item', 'variable', 'field', 'attribute',
+ 'code', 'type', 'dtype',
+})
+
+def build_field_noise(configs: dict) -> frozenset:
+ """
+ Build the field-name boilerplate set entirely from the user-confirmed
+ column roles. For each detected column name we extract alphanumeric tokens
+ and add them to the noise set.
+
+ Examples (derived, NOT hardcoded):
+ HCP 'fullDisplayName' → {fulldisplayname}
+ HCP 'columnHeader' → {columnheader}
+ AI-MIND 'Decimal Places' → {decimal, places}
+
+ Result: the same effective filter as a hand-crafted list, but produced
+ deterministically from whatever columns the current CSV has. Datasets we
+ have never seen get an automatically-tailored noise set.
+ """
+ noise = set(_GENERIC_FIELD_NOISE)
+ for cfg in (configs or {}).values():
+ for col_list in cfg.values():
+ for col in col_list:
+ tokens = ''.join(c if c.isalnum() else ' '
+ for c in str(col)).split()
+ noise.update(t.lower() for t in tokens if len(t) >= 2)
+ return frozenset(noise)
+
+# Module-level fallback used when label_cluster is called without a configs-
+# derived noise set. Replaced at build time by the Streamlit pipeline below.
+FIELD_NAME_NOISE: frozenset = _GENERIC_FIELD_NOISE
+
+def _extract_common_prefix_phrase(cluster_texts: list,
+ min_coverage: float = 0.6) -> str:
+ """
+ Many data dictionaries write 'Concept Name: definition...' in the
+ description. If most cluster members share a concept-name prefix, that
+ prefix IS the concept label. Fully data-driven — works on any dictionary
+ using the 'name: definition' convention.
+
+ Returns a Title-cased phrase, or '' if no shared prefix is strong enough.
+ """
+ # _text is "col1: val1 | col2: val2 | ...". Find description-like field
+ # and take its prefix before the inner colon.
+ prefixes = []
+ for t in cluster_texts:
+ for chunk in str(t).split(' | '):
+ if ':' not in chunk:
+ continue
+ key, val = chunk.split(':', 1)
+ key_l = key.lower()
+ if 'descrip' in key_l or 'def' in key_l or 'full' in key_l:
+ phrase = val.split(':')[0].strip()
+ tokens = phrase.split()
+ if 2 <= len(tokens) <= 6:
+ prefixes.append(tokens)
+ break
+
+ if not prefixes:
+ return ''
+
+ n_thresh = max(1, int(min_coverage * len(prefixes)))
+ max_len = max(len(p) for p in prefixes)
+ for length in range(min(6, max_len), 1, -1):
+ starts = Counter(tuple(p[:length]) for p in prefixes if len(p) >= length)
+ if not starts:
+ continue
+ top, cnt = starts.most_common(1)[0]
+ if cnt >= n_thresh:
+ return ' '.join(top).title()
+ return ''
+
+def _bigram_preferred_terms(diff: np.ndarray, terms: np.ndarray,
+ boilerplate: set, prefix_lower: str,
+ n_terms: int) -> list:
+ """
+ Pick top-n discriminative terms, preferring bigrams and removing redundancy.
+
+ Rules:
+ 1. Skip short tokens (<3 chars), boilerplate, field-name noise, and any
+ token already in the prefix.
+ 2. When a bigram is selected, drop any previously-picked unigram that is
+ a substring of it.
+ 3. Skip unigrams that are substrings of any already-picked bigram.
+ """
+ order = np.argsort(diff)[::-1]
+ picked, picked_lower = [], []
+ for i in order:
+ t = terms[i]; tl = t.lower()
+ if (len(t) < 3 or tl in boilerplate or tl in FIELD_NAME_NOISE
+ or (prefix_lower and tl in prefix_lower)):
+ continue
+ is_unigram = ' ' not in t
+ # Rule 3: unigram already covered by a picked bigram?
+ if is_unigram and any(tl in pl for pl in picked_lower if ' ' in pl):
+ continue
+ # Rule 2: replace picked unigrams subsumed by this new bigram
+ if not is_unigram:
+ keep = [(p, pl) for p, pl in zip(picked, picked_lower)
+ if not (' ' not in pl and pl in tl)]
+ picked, picked_lower = [k[0] for k in keep], [k[1] for k in keep]
+ picked.append(t); picked_lower.append(tl)
+ if len(picked) >= n_terms:
+ break
+ return picked
+
def label_cluster(cluster_texts: list, all_texts: list,
                  tfidf: TfidfVectorizer, n_terms: int = 3,
                  cluster_groups: Optional[list] = None,
                  parent_path: str = '',
                  llm_label_fn: Optional[Callable] = None,
                  return_provenance: bool = False):
    """
    Label a cluster node using a five-stage deterministic pipeline.

    A) Description-prefix candidate: concept phrase from the 'Name: def...'
       pattern shared by cluster members (see _extract_common_prefix_phrase).
    B) Group-purity prefix: if ≥70% share a top-level _group value, prepend it.
    C) Boilerplate filter: drop terms with TF-IDF IDF ≤ 1.7 plus FIELD_NAME_NOISE.
    D) Bigram-preferred discriminative suffix [ZHU §4.3].
    E) Optional constrained LLM refinement [TopicTag, DocEng 2024].

    Args:
        cluster_texts: text objects of the cluster members.
        all_texts: text objects of the reference corpus the cluster is
            contrasted against.
        tfidf: fitted vectorizer; `get_feature_names_out()`, `idf_` and
            `transform()` are used.
        n_terms: maximum number of discriminative terms in the suffix.
        cluster_groups: per-member group paths ('A > B > C'); only the first
            path element is used for the purity prefix.
        parent_path: passed through to `llm_label_fn` as context.
        llm_label_fn: optional callable
            (candidate, top_terms, parent_path, sample_texts) → (label, meta).
        return_provenance: when True return (label, provenance_dict).

    Returns:
        The label string, or (label, provenance) when return_provenance=True.
        Provenance records which stage produced the label ('fallback' /
        'description_prefix' / 'tfidf_bigram' / 'group_anchor' / 'llm'), the
        evidence terms, the confidence, whether the LLM was consulted and
        whether its proposal was rejected. If the TF-IDF stage fails, the
        exception type is stored under 'tfidf_error'.
    """
    prov: dict = {'label_source': 'fallback',
                  'evidence_terms': [],
                  'confidence': 1.0,
                  'llm_used': False,
                  'llm_rejected': False}

    if not cluster_texts:
        return ('Group', prov) if return_provenance else 'Group'

    terms = np.array(tfidf.get_feature_names_out())
    idf_vals = tfidf.idf_
    boilerplate = set(terms[idf_vals <= 1.7].tolist())

    # ── B) group-purity prefix ───────────────────────────────────────────────
    prefix = ''
    if cluster_groups:
        top_lvl = [str(g).split(' > ')[0].strip() for g in cluster_groups]
        top_grp, cnt = Counter(top_lvl).most_common(1)[0]
        if cnt / len(top_lvl) >= 0.70 and top_grp not in ('', 'Ungrouped'):
            prefix = top_grp

    # ── A) description-prefix candidate phrase ───────────────────────────────
    phrase = _extract_common_prefix_phrase(cluster_texts)
    if phrase and prefix and phrase.lower().startswith(prefix.lower()):
        remainder = phrase[len(prefix):]
        # Only drop the group name when it ends on a word boundary; otherwise
        # prefix 'Cog' would mangle 'Cognitive Score' into 'nitive Score'.
        if not remainder or not remainder[0].isalnum():
            phrase = remainder.strip()

    # ── D) bigram-preferred discriminative terms ─────────────────────────────
    top_terms_raw, words = [], []
    try:
        X_all = tfidf.transform(all_texts).toarray()
        X_clus = tfidf.transform(cluster_texts).toarray()
        diff = X_clus.mean(axis=0) - X_all.mean(axis=0)
        top_terms_raw = [terms[i] for i in np.argsort(diff)[::-1][:20]]
        words = _bigram_preferred_terms(diff, terms, boilerplate,
                                        prefix.lower(), n_terms)
    except Exception as _tfidf_err:
        # Best-effort stage: labeling continues with the other candidates, but
        # the failure is recorded so it shows up in the audit trail.
        prov['tfidf_error'] = type(_tfidf_err).__name__

    # ── compose deterministic candidate (records which stage produced it) ────
    if phrase:
        candidate = f'{prefix} — {phrase}' if prefix else phrase
        prov['label_source'] = 'description_prefix'
        prov['evidence_terms'] = [phrase] + ([prefix] if prefix else [])
    elif words:
        suffix = ' / '.join(w.title() for w in words)
        candidate = f'{prefix} — {suffix}' if prefix else suffix
        prov['label_source'] = 'tfidf_bigram'
        prov['evidence_terms'] = list(words) + ([prefix] if prefix else [])
    elif prefix:
        candidate = prefix
        prov['label_source'] = 'group_anchor'
        prov['evidence_terms'] = [prefix]
    else:
        candidate = 'Group'

    # ── E) optional constrained LLM refinement [TopicTag] ────────────────────
    # LLM is used ONLY as a re-phraser of evidence already present in the
    # cluster. Refinement is rejected if the returned label is not grounded
    # in the evidence terms — keeping the user's contract that "labels should
    # come from the csv itself".
    if llm_label_fn and candidate != 'Group':
        prov['llm_used'] = True
        try:
            refined, llm_meta = llm_label_fn(
                candidate, top_terms_raw[:10],
                parent_path, cluster_texts[:4])
            # Always record the raw LLM proposal and the grounding-check reason
            # — even when rejected — so the Provenance tab can show "what did
            # the LLM suggest and why was it dropped?"
            prov['llm_raw_label'] = llm_meta.get('raw_label', '')
            prov['llm_reason'] = llm_meta.get('reason', '')
            if refined and refined != candidate:
                candidate = refined
                prov['label_source'] = 'llm'
                prov['confidence'] = llm_meta.get('confidence', 0.0)
                prov['evidence_terms'] = llm_meta.get('evidence_terms',
                                                      prov['evidence_terms'])
            else:
                prov['llm_rejected'] = True
        except Exception as _e:
            prov['llm_rejected'] = True
            prov['llm_reason'] = f'exception: {type(_e).__name__}'

    if return_provenance:
        return candidate, prov
    return candidate
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 5b — PHRASE-SLOT MINING (data-driven IE-style slot induction)
+#
+# Information-extraction adaptation for structured metadata dictionaries.
+# Many data dictionaries (e.g. AI-MIND cognitive tests, HCP Study Completion)
+# express each variable as a regular phrase combining several semantic
+# dimensions (measure type, statistic, condition, etc.). Document-level
+# methods (NMF, BERTopic, CTM) cannot separate these because all dimensions
+# collapse into one vector. Phrase-slot mining decomposes each description
+# into multiple phrases, identifies which phrases are mutually-exclusive
+# alternatives across the corpus, and uses each alternative-set as a slot.
+#
+# No domain hardcoding: slot phrases are discovered from n-gram co-occurrence
+# patterns in the actual descriptions. Slot names are derived from the
+# phrases themselves; if structure is too weak, the algorithm reports
+# `valid=False` and the caller falls back to NMF.
+# ──────────────────────────────────────────────────────────────────────────────
+# ── Generic dictionary markers stripped before slot mining ────────────────────
+# These are universal data-dictionary conventions (`KEY: ...`, `Note: ...`,
+# `Question: ...`) — not domain knowledge. Without stripping, they become
+# false slot phrases (e.g. AI-MIND output contained `Key` nodes promoted from
+# `KEY: DMS Percent Correct ...`).
+_LEADING_MARKERS = frozenset({
+ 'key', 'note', 'notes', 'definition', 'description', 'desc',
+ 'question', 'q', 'item', 'value', 'meaning', 'label',
+})
+
+def _strip_leading_markers(text: str) -> str:
+ """Iteratively strip leading dictionary markers like 'KEY:' or 'Note:'."""
+ out = text
+ for _ in range(4): # bounded — never more than a few stacked markers
+ if ':' not in out:
+ break
+ head, rest = out.split(':', 1)
+ if head.strip().lower() in _LEADING_MARKERS:
+ out = rest.strip()
+ else:
+ break
+ return out
+
def _split_concept_and_body(text: str) -> tuple:
    """
    Separate a 'Concept Name: definition sentence' description into its
    concept prefix and its definition body.

    Data dictionaries overwhelmingly use the convention
        Concept Name: definition sentence
    e.g. 'DMS Correct Latency Standard Deviation (SD) (0 second delay):
          The standard deviation of response latencies for trials ...'

    The concept phrase is the clean, canonical label; the body is explanatory
    prose that introduces boilerplate ('the number of times', 'a subject
    revisits a box ...') and weaker surface phrases ('response latencies').

    Leading dictionary markers ('KEY:', 'Note:', ...) are removed first. The
    part before the first remaining colon counts as the concept only when it
    is 1–14 words long — a real concept name is short, not a whole sentence.

    Returns (concept, body). If no clear concept/body boundary exists, concept
    is the whole (marker-stripped) text and body is ''. Generic — no domain
    knowledge.
    """
    cleaned = _strip_leading_markers(str(text).strip())
    head, colon, body = cleaned.partition(':')
    if not colon:
        return cleaned, ''

    head = head.strip()
    word_count = len(head.split())
    if word_count < 1 or word_count > 14:
        # Empty or sentence-length head: not a concept prefix.
        return cleaned, ''
    return head, body.strip()
+
+# Generic data-dictionary definition-prose openers. Role values starting with
+# these are explanatory fragments, not concept labels — reject them. Generic
+# English, not domain vocabulary.
+_BOILERPLATE_VALUE_PREFIXES = (
+ 'the number of', 'number of', 'the subject', 'a subject', 'the participant',
+ 'a participant', 'this measure', 'this variable', 'calculated across',
+ 'calculated as', 'the percentage of', 'the proportion of', 'the total number',
+ 'the mean of', 'the median of', 'the standard deviation of', 'the amount of',
+ 'the time', 'the length of', 'expressed as', 'defined as', 'measured as',
+)
+
+def _is_boilerplate_value(val: str) -> bool:
+ """True if a role value looks like a definition fragment rather than a label."""
+ v = str(val).strip().lower()
+ if not v:
+ return True
+ if any(v.startswith(p) for p in _BOILERPLATE_VALUE_PREFIXES):
+ return True
+ # Sentence-length values are definitions, not concept labels
+ if len(v.split()) > 6:
+ return True
+ return False
+
+# ── English stopwords for phrase-quality filtering ────────────────────────────
+# Reject phrases like 'and', 'them', 'to be', 'have a lot of' from becoming
+# slot nodes. This is general English filtering, not domain knowledge.
+_STOPWORDS = frozenset(
+ 'a an the and or but of in on at by for with about into during '
+ 'through over under above below from to as is are was were be being '
+ 'been have has had do does did this that these those they them their '
+ 'it its he she his her you your we our i my me us not no nor '
+ 'how often when where which who why what '
+ 'lot lots much many more most some any all none '
+ 'very also too just so such only even still even also '
+ 'one two three first second ' # 'second' as standalone — careful: kept in n-grams via context
+ .split()
+)
+
+def _phrase_is_meaningful(phrase: str, group_name_lower: str = '') -> bool:
+ """
+ Reject phrases that should never be a hierarchy node.
+
+ Rules (all generic, no domain knowledge):
+ • all-stopword phrases ('and', 'to be', 'have a lot of')
+ • boundary stopwords ('the response time' → 'the' boundary)
+ • phrase equals the parent group name ('DMS' inside DMS branch)
+ • single dictionary-marker words ('key', 'note')
+ • pure numeric tokens
+ """
+ p = phrase.strip().lower()
+ if not p:
+ return False
+ if p == group_name_lower:
+ return False
+ if p in _LEADING_MARKERS:
+ return False
+ tokens = p.split()
+ if not tokens:
+ return False
+ # Boundary stopwords
+ if tokens[0] in _STOPWORDS or tokens[-1] in _STOPWORDS:
+ return False
+ # Need at least one content token (non-stopword, length ≥ 2)
+ content = [t for t in tokens if t not in _STOPWORDS and len(t) >= 2]
+ if not content:
+ return False
+ # Single-token phrases must be a meaningful word, not a bare digit
+ if len(tokens) == 1 and tokens[0].isdigit():
+ return False
+ return True
+
+# ──────────────────────────────────────────────────────────────────────────────
+# OPTION D — SBERT PHRASE CLUSTERING + CONSTRAINED LLM ROLE NAMING
+#
+# Following the EDC pattern (Zhang & Soh, EMNLP 2024) and ZOES (arXiv
+# 2506.04458, 2025):
+# Extract → mine repeated phrases from each variable description
+# Define → semantically cluster phrases via SBERT + AgglomerativeClustering
+# Canonicalize → constrained-vocabulary LLM names each cluster with a role
+# (measure / statistic / condition / subtype / outcome / modifier)
+#
+# Anti-hallucination contract:
+# • Phrases are extracted verbatim from the corpus, never invented
+# • LLM only NAMES discovered clusters — cannot move phrases
+# • Role names are constrained to a fixed generic English vocabulary
+# (configurable via constrained=True/False)
+# • Anonymous fallback when LLM is unavailable or invalid (cluster_0, ...)
+#
+# Different from Approach 1: Approach 1 embeds the WHOLE description as one
+# vector; this embeds each EXTRACTED PHRASE as its own vector. Approach 1
+# clusters variables; this clusters phrases. Approach 1 produces a single
+# similarity tree; this produces a multi-role decomposition + nested tree.
+# ──────────────────────────────────────────────────────────────────────────────
+_ALLOWED_ROLES_CONSTRAINED = frozenset({
+ 'measure', 'statistic', 'condition', 'subtype', 'outcome', 'modifier',
+})
+
+# Generic role priority for hierarchy nesting (no domain knowledge).
+# Higher priority = outermost (closest to root) level inside the group.
+_ROLE_PRIORITY = {
+ 'measure': 6,
+ 'outcome': 5,
+ 'statistic': 4,
+ 'condition': 3,
+ 'modifier': 2,
+ 'subtype': 1,
+}
+
def make_llm_role_namer_fn(base_url: str = OLLAMA_URL_DEFAULT,
                           model: str = OLLAMA_MODEL_DEFAULT,
                           constrained: bool = True,
                           provider: str = 'ollama',
                           api_key: str = '') -> Optional[Callable]:
    """
    Build an LLM-backed namer that assigns one semantic role to a phrase cluster.

    The client is created through `_make_llm_client(provider, base_url,
    api_key)`; when that yields None this factory returns None as well.

    The returned callable has the signature
        (phrases, sample_descs=None) → (role_name | None, meta)
    where `meta` carries 'reason' ('accepted', 'empty_phrases', 'empty_role',
    'role_not_allowed: …', 'role_too_long', 'role_not_alpha_lower' or
    'exception: <Type>'), 'raw' (first 200 characters of the model reply) and
    'constrained'.

    With constrained=True the role must be a member of
    `_ALLOWED_ROLES_CONSTRAINED`; otherwise any 1–2 word, ≤24 character,
    lowercase alphabetic role is accepted.
    """
    client = _make_llm_client(provider, base_url, api_key)
    if client is None:
        return None

    if constrained:
        task = (
            'Choose EXACTLY ONE semantic role from this fixed list that '
            'best names what these phrases share. Pick the SINGLE best fit:\n'
            ' - measure (the base concept being measured)\n'
            ' - statistic (mean, median, standard deviation, total, ...)\n'
            ' - condition (when/where/under what circumstances)\n'
            ' - outcome (result/output type)\n'
            ' - subtype (a specific kind of measure)\n'
            ' - modifier (a qualifier)\n\n'
            'Output strict JSON only — no markdown:\n'
            '{"role": "measure"}'
        )
    else:
        task = (
            'Propose ONE generic 1–2 word lowercase English noun that names '
            'the semantic role these phrases share. Examples of good roles: '
            'measure, statistic, condition, subtype, location, time, '
            'identifier, frequency, severity, quality.\n\n'
            'Output strict JSON only:\n'
            '{"role": "..."}'
        )

    def _rejection_reason(role: str) -> str:
        """Return why `role` is unacceptable, or '' when it passes."""
        if not role:
            return 'empty_role'
        if constrained:
            if role in _ALLOWED_ROLES_CONSTRAINED:
                return ''
            return f'role_not_allowed: {role}'
        words = role.split()
        if len(words) > 2 or len(role) > 24:
            return 'role_too_long'
        if any(not (w.isalpha() and w.islower()) for w in words):
            return 'role_not_alpha_lower'
        return ''

    def _name_cluster(phrases: list, sample_descs: list = None):
        meta = {'reason': '', 'raw': '', 'constrained': constrained}
        if not phrases:
            meta['reason'] = 'empty_phrases'
            return None, meta

        context = sample_descs or []
        phrase_lines = '\n'.join(f' - {p}' for p in phrases[:20])
        context_lines = '\n'.join(f' - {str(s)[:160]}' for s in context[:3])
        prompt = ''.join((
            'Below is a cluster of semantically related phrases mined from a '
            'data-dictionary corpus. The phrases were grouped by sentence-'
            'transformer embedding similarity.\n\n'
            'Phrases in cluster:\n',
            phrase_lines,
            '\n\n'
            'Sample variable descriptions for context:\n',
            context_lines,
            '\n\n',
            task,
        ))

        try:
            # max_tokens must cover reasoning-model traces (Qwen3)
            # before the tiny JSON answer appears.
            resp = _safe_chat_completion(client, model, prompt,
                                         max_tokens=800, temperature=0.1)
            raw = (resp.choices[0].message.content or '').strip()
            meta['raw'] = raw[:200]
            parsed = _parse_json_response(raw)
            role = str(parsed.get('role', '')).strip().lower()
            problem = _rejection_reason(role)
            if problem:
                meta['reason'] = problem
                return None, meta
            meta['reason'] = 'accepted'
            return role, meta
        except Exception as e:
            meta['reason'] = f'exception: {type(e).__name__}'
            return None, meta

    return _name_cluster
+
+def _extract_phrases_for_role_clustering(texts: list,
+ text_col_names: Optional[list],
+ group_name: str,
+ min_phrase_count: int = 2
+ ) -> tuple:
+ """
+ Shared helper: extract repeated meaningful phrases from a group of
+ descriptions. Returns (phrases_list, per_row_phrase_sets, regularity).
+ Regularity = fraction of rows that contain ≥2 mined phrases.
+ """
+ text_keys = ({c.strip().lower() for c in (text_col_names or [])}
+ if text_col_names else None)
+ n = len(texts)
+
+ # 1. extract description-prefix per row
+ prefixes = []
+ for t in texts:
+ prefix = ''
+ for chunk in str(t).split(' | '):
+ if ':' not in chunk:
+ continue
+ key, val = chunk.split(':', 1)
+ key_l = key.strip().lower()
+ if text_keys is not None and key_l not in text_keys:
+ continue
+ prefix = val.split(':')[0].strip()
+ break
+ if not prefix:
+ prefix = str(t).strip()
+ prefix = _strip_leading_markers(prefix)
+ prefixes.append(prefix)
+
+ # 2. normalise
+ def _norm(p):
+ s = ''.join(ch if ch.isalnum() or ch == ' ' else ' '
+ for ch in str(p).lower())
+ return ' '.join(s.split())
+ norm = [_norm(p) for p in prefixes]
+
+ # 3. mine n-grams 1..4 ≥3 chars
+ row_phrases: list = []
+ phrase_count: Counter = Counter()
+ for text in norm:
+ tokens = text.split()
+ row_set = set()
+ for ngram_n in range(1, 5):
+ for i in range(len(tokens) - ngram_n + 1):
+ phrase = ' '.join(tokens[i:i + ngram_n])
+ if len(phrase) >= 3:
+ row_set.add(phrase)
+ row_phrases.append(row_set)
+ for p in row_set:
+ phrase_count[p] += 1
+
+ # 4. keep repeated, retain maximal per row
+ keep = {p for p, c in phrase_count.items() if c >= min_phrase_count}
+ refined = []
+ for row in row_phrases:
+ sorted_by_len = sorted(row & keep, key=lambda p: -len(p.split()))
+ kept: list = []
+ for p in sorted_by_len:
+ if not any(p != q and p in q for q in kept):
+ kept.append(p)
+ refined.append(set(kept))
+
+ phrase_count = Counter()
+ for r in refined:
+ for p in r:
+ phrase_count[p] += 1
+
+ group_lower = (group_name or '').strip().lower()
+ phrases = sorted([p for p, c in phrase_count.items()
+ if c >= min_phrase_count
+ and _phrase_is_meaningful(p, group_lower)])
+
+ regularity = sum(1 for r in refined if len(r) >= 2) / max(1, n)
+ return phrases, refined, round(regularity, 4)
+
+# ──────────────────────────────────────────────────────────────────────────────
+# PER-ROW LLM ROLE EXTRACTOR [Zhu et al. EMNLP 2025 — proper implementation]
+#
+# For each variable description, ONE LLM call extracts role values directly:
+# measure = base concept being measured (e.g. "Correct Latency")
+# statistic = statistical aggregation (e.g. "Standard Deviation")
+# condition = experimental condition (e.g. "0 second delay")
+# subtype = specific error / outcome subtype (e.g. "Incorrect Colour")
+#
+# Anti-hallucination contract:
+# • Each returned value MUST be a literal substring of the input description
+# (grounding check rejects anything else).
+# • LLM cannot invent new roles — only the four canonical roles are output.
+# • Empty string is a valid output ("this row has no statistic" etc.).
+#
+# Cost: ONE call per variable. AI-MIND ≈ 108 calls, HCP ≈ 813.
+# ──────────────────────────────────────────────────────────────────────────────
_PER_ROW_ROLES = ('measure', 'statistic', 'condition', 'subtype')


def make_per_row_role_extractor_fn(base_url: str = OLLAMA_URL_DEFAULT,
                                   model: str = OLLAMA_MODEL_DEFAULT,
                                   provider: str = 'ollama',
                                   api_key: str = '') -> Optional[Callable]:
    """
    Build a per-row LLM role extractor.

    Implements the Zhu et al. (2025, EMNLP) multi-aspect encoding pattern:
    instead of clustering phrases and naming clusters, ask the LLM to extract
    each role value directly from each row's description.

    The client is created through `_make_llm_client(provider, base_url,
    api_key)`; when that yields None this factory returns None as well.

    The returned callable has the signature
        (concept, sample_concepts=None, full_text=None) → (roles_dict, meta)

      • `concept` is the concept-name phrase (truncated to 300 characters).
      • `sample_concepts` gives up to three sibling concept names as context.
      • `full_text` is concept + definition body (truncated to 600
        characters); it defaults to `concept` and is the text that extracted
        values are grounded against.

    `roles_dict` maps a subset of `_PER_ROW_ROLES` to accepted values. A value
    is accepted when it is not definition boilerplate and is either a
    case-insensitive substring of the grounding text or all of its content
    stems occur there. `meta` carries 'reason', 'raw' and — once the model
    answered — 'rejected', 'n_extracted', 'n_rejected'.
    """
    client = _make_llm_client(provider, base_url, api_key)
    if client is None:
        return None

    def _content_stems(lowered: str) -> set:
        """Stems of the content tokens (≥3 chars, non-stopword) of `lowered`."""
        tokens = [w.strip(',.()[]{}"\'') for w in lowered.split()]
        return {_light_stem(w) for w in tokens
                if len(w) >= 3 and w not in _STOPWORDS}

    def _extract(concept: str, sample_concepts: list = None,
                 full_text: str = None):
        meta = {'reason': '', 'raw': ''}
        if not concept or len(str(concept).strip()) < 3:
            meta['reason'] = 'empty_description'
            return {}, meta

        # `concept` = the clean concept-name phrase (preferred extraction source)
        # `full_text` = concept + definition body (used for subtype + grounding)
        concept = str(concept)[:300]
        full = str(full_text or concept)[:600]
        sample_concepts = sample_concepts or []
        prompt = (
            'You are extracting semantic role values from one variable in a '
            'data dictionary.\n\n'
            f'CONCEPT NAME (use this for measure / statistic / condition):\n'
            f' {concept}\n\n'
            f'FULL DEFINITION (use ONLY for subtype, and only if needed):\n'
            f' {full}\n\n'
            'Similar concept names in the same group (context):\n'
            + '\n'.join(f' - {str(s)[:120]}' for s in sample_concepts[:3]) + '\n\n'
            'TASK: Extract values for these four semantic roles. Take '
            'measure, statistic and condition from the CONCEPT NAME wherever '
            'possible. Every value MUST be a verbatim substring of the CONCEPT '
            'NAME or FULL DEFINITION. Empty string if a role does not apply.\n\n'
            ' measure = the base quantity being measured\n'
            ' (e.g. "reaction time", "accuracy", "score")\n'
            ' statistic = a statistical aggregation operator\n'
            ' (e.g. "mean", "median", "standard deviation", "total")\n'
            ' condition = an experimental condition or scope\n'
            ' (e.g. "baseline", "follow-up", "task condition")\n'
            ' subtype = a specific subtype / kind / error type\n'
            ' (e.g. "error type", "response type", "trial type")\n\n'
            'STRICT RULES:\n'
            '1. Each value COPIED VERBATIM — do not invent, summarise, paraphrase.\n'
            '2. Prefer short concept phrases over long definition fragments.\n'
            '3. Do NOT return a value that is a sentence or starts with "the '
            'number of", "the subject", "calculated across" — those are '
            'definition prose, not labels.\n'
            '4. Empty string "" for roles that do not apply.\n'
            '5. Output strict JSON only:\n\n'
            '{"measure": "...", "statistic": "...", "condition": "...", "subtype": ""}'
        )

        try:
            resp = _safe_chat_completion(client, model, prompt,
                                         max_tokens=1500, temperature=0.1)
            raw = (resp.choices[0].message.content or '').strip()
            meta['raw'] = raw[:300]
            result = _parse_json_response(raw)
        except Exception as e:
            meta['reason'] = f'exception: {type(e).__name__}: {str(e)[:80]}'
            return {}, meta

        # A JSON array / scalar / null has no role keys; report it instead of
        # failing on `.get` outside the try block above.
        if not isinstance(result, dict):
            meta['reason'] = 'invalid_json'
            return {}, meta

        # Grounding is checked against the FULL text (concept + body) so that
        # subtype values living in the definition body still pass.
        ground_lower = full.lower()
        ground_stems = _content_stems(ground_lower)

        roles: dict = {}
        rejected: list = []
        for role in _PER_ROW_ROLES:
            val = result.get(role, '')
            if not isinstance(val, str):
                continue
            val_clean = val.strip().strip('"').strip("'")
            if not val_clean:
                continue
            # P2: reject definition-prose fragments before grounding
            if _is_boilerplate_value(val_clean):
                rejected.append((role, val_clean, ['boilerplate']))
                continue
            # Strict substring (cheapest, most common)
            if val_clean.lower() in ground_lower:
                roles[role] = val_clean
                continue
            # Token-stem grounding fallback (morphological variants)
            val_stems = _content_stems(val_clean.lower())
            if val_stems and val_stems.issubset(ground_stems):
                roles[role] = val_clean
            else:
                missing = (sorted(val_stems - ground_stems) if val_stems
                           else ['(no content tokens)'])
                rejected.append((role, val_clean, missing))

        meta['reason'] = 'accepted' if roles else 'all_rejected'
        meta['rejected'] = rejected
        meta['n_extracted'] = len(roles)
        meta['n_rejected'] = len(rejected)
        return roles, meta

    return _extract
+
def discover_roles_via_per_row_extraction(
        texts: list,
        text_col_names: Optional[list],
        per_row_extractor_fn: Callable,
        group_name: str = '',
        regularity_threshold: float = 0.40) -> dict:
    """
    Zhu et al. (EMNLP 2025) style: one LLM call per row extracts measure /
    statistic / condition / subtype values directly from each description.

    Args:
        texts: text objects in the "col1: val1 | col2: val2 | ..." form.
        text_col_names: names of the description columns; when empty/None the
            first 'key: value' chunk of each row is used.
        per_row_extractor_fn: callable
            (concept, sample_concepts, full_text=...) → (roles_dict, meta).
            Exceptions and malformed return values are treated as "no roles
            for this row".
        group_name: group label, used for progress text and echoed back.
        regularity_threshold: minimum row coverage for a valid result.

    Returns:
        A dict with 'valid', 'reason', 'roles' (role → distinct values, only
        roles with ≥2 distinct values), 'row_assignments' (accepted roles per
        row), 'coverage' / 'regularity' (fraction of rows with at least one
        value in a retained role), 'role_source', 'extractor_summary',
        'per_row_audit' and 'group_name'. With fewer than 4 rows a short
        invalid result is returned without calling the extractor. The docstring
        of the original states that the shape matches
        discover_roles_via_sbert_phrase_clustering() so it can flow into
        build_role_hierarchy().
    """
    n = len(texts)
    if n < 4:
        return {'valid': False, 'reason': 'too_few_rows',
                'regularity': 0.0, 'roles': {}, 'row_assignments': []}

    text_keys = ({c.strip().lower() for c in text_col_names}
                 if text_col_names else None)

    def _row_desc_value(t: str) -> str:
        """Return the raw description-column value (everything after 'desc:')."""
        for chunk in str(t).split(' | '):
            if ':' not in chunk:
                continue
            key, val = chunk.split(':', 1)
            if text_keys is not None and key.strip().lower() not in text_keys:
                continue
            return val.strip()
        return str(t).strip()

    # For each row split into (concept-name prefix, definition body).
    # The concept feeds measure/statistic/condition; the full text grounds
    # subtype and the grounding check.
    row_concepts: list = []
    row_fulls: list = []
    for t in texts:
        concept, body = _split_concept_and_body(_row_desc_value(t))
        row_concepts.append(concept)
        row_fulls.append((concept + ' ' + body).strip() if body else concept)

    sample_descs = row_concepts[:3]

    # Use a Streamlit progress bar so the user sees what's happening
    # (especially important for HCP). Outside Streamlit this is skipped.
    try:
        import streamlit as _st_p
        pbar = _st_p.progress(0.0, text=f'Extracting roles for "{group_name}"…')
    except Exception:
        pbar = None

    per_row_roles: list = []
    per_row_audit: list = []   # full audit trail (proposed + rejected per row)
    all_phrases_by_role: dict = defaultdict(list)
    extractor_meta_summary: dict = defaultdict(int)

    try:
        for i, (concept, full) in enumerate(zip(row_concepts, row_fulls)):
            if pbar is not None:
                try:
                    pbar.progress((i + 1) / max(1, n),
                                  text=f'[{group_name}] row {i+1}/{n}')
                except Exception:
                    pass  # progress display is cosmetic
            try:
                roles, meta = per_row_extractor_fn(
                    concept, sample_descs, full_text=full)
            except Exception as e:
                roles, meta = {}, {'reason': f'exception: {type(e).__name__}'}
            # Tolerate extractors that hand back None / non-dict values.
            if not isinstance(roles, dict):
                roles = {}
            if not isinstance(meta, dict):
                meta = {}

            per_row_roles.append(roles)
            # Audit trail: store per-row details for the Role Decomposition tab
            per_row_audit.append({
                'row_idx': i,
                'description_snippet': str(concept)[:120],
                'accepted_roles': dict(roles),
                'rejected': meta.get('rejected', []),
                'reason': meta.get('reason', ''),
                'raw': meta.get('raw', ''),
            })
            extractor_meta_summary[meta.get('reason', 'unknown')] += 1
            for role, val in roles.items():
                if val and val not in all_phrases_by_role[role]:
                    all_phrases_by_role[role].append(val)
    finally:
        # Clear the bar even if the loop is interrupted.
        if pbar is not None:
            try:
                pbar.empty()
            except Exception:
                pass

    # Keep only roles that have ≥ 2 distinct values across the corpus
    roles_final: dict = {}
    for role in _PER_ROW_ROLES:
        vals = all_phrases_by_role.get(role, [])
        if len(vals) >= 2:
            roles_final[role] = vals

    # Coverage = fraction of rows with ≥1 non-empty role assignment
    covered = sum(1 for r in per_row_roles
                  if any(r.get(rl) for rl in roles_final))
    coverage = covered / max(1, n)
    regularity = coverage   # for per-row extractor, coverage is regularity

    if len(roles_final) < 2:
        valid, reason = False, 'too_few_roles'
    elif coverage < regularity_threshold:
        valid, reason = False, f'low_coverage ({coverage:.2f})'
    else:
        valid, reason = True, 'ok'

    return {
        'roles': roles_final,
        'row_assignments': per_row_roles,
        'coverage': round(coverage, 4),
        'regularity': round(regularity, 4),
        'valid': valid,
        # Mirrors the 'reason' key of the early-exit result so callers can
        # report why a group was not decomposed.
        'reason': reason,
        'role_source': 'per_row_llm_extraction',
        'extractor_summary': dict(extractor_meta_summary),
        'per_row_audit': per_row_audit,
        'group_name': group_name,
    }
+
def discover_roles_via_sbert_phrase_clustering(
        texts: list,
        text_col_names: Optional[list],
        sbert_model,
        llm_role_namer_fn: Optional[Callable] = None,
        min_phrase_count: int = 2,
        min_role_size: int = 2,
        n_clusters_range: tuple = (2, 6),
        group_name: str = '',
        regularity_threshold: float = 0.40) -> dict:
    """
    Option D core: discover a semantic-role schema for a group via
    SBERT phrase clustering + constrained-vocab LLM cluster naming.

    Pipeline (EDC / ZOES style):
      1. Mine phrases via `_extract_phrases_for_role_clustering`, which
         yields (unique phrases, per-row phrase collections, regularity).
      2. If regularity < `regularity_threshold`, or fewer than 4 phrases were
         mined, return an invalid result (caller falls back to the existing
         slot mining / FASTopic / NMF path).
      3. SBERT-embed each unique phrase (normalised embeddings).
      4. Agglomerative-cluster the phrases (cosine metric, average linkage);
         pick K in `n_clusters_range` (inclusive, capped below the phrase
         count) by best cosine silhouette score.
      5. Drop clusters smaller than `min_role_size`; name each survivor via
         `llm_role_namer_fn(cluster_phrases, sample_descs)`, which must return
         `(name, meta)`. Falls back to the anonymous name `cluster_<id>` when
         the namer is absent, returns an empty name, or raises. Duplicate
         names get a numeric suffix.
      6. Per row: keep one phrase per role (the longest one).
      7. Return roles + row_assignments + diagnostics.

    Returns:
        On failure: {'valid': False, 'reason': str, 'regularity': float,
                     'roles': {}, 'row_assignments': []}.
        On success: a dict with 'roles' (role name -> phrases),
        'row_assignments' (one dict per row, role -> phrase), 'coverage'
        (fraction of rows with at least one assigned role), 'regularity',
        'valid' (coverage >= 0.50 and >= 2 roles), 'role_source',
        'phrase_silhouette', 'n_clusters' and 'naming_meta'.
        The success shape is what build_role_hierarchy consumes.
    """
    def _invalid(reason: str, reg: float) -> dict:
        return {'valid': False, 'reason': reason,
                'regularity': reg, 'roles': {}, 'row_assignments': []}

    n = len(texts)
    if n < 4 or sbert_model is None:
        return _invalid('too_few_rows_or_no_sbert', 0.0)

    # 1. extract phrases
    phrases, refined, regularity = _extract_phrases_for_role_clustering(
        texts, text_col_names, group_name, min_phrase_count)

    if regularity < regularity_threshold:
        return _invalid(f'low_regularity ({regularity:.2f})', regularity)
    if len(phrases) < 4:
        return _invalid('too_few_phrases', regularity)

    # 2. SBERT embed
    try:
        embs = sbert_model.encode(phrases, normalize_embeddings=True,
                                  show_progress_bar=False, batch_size=64)
    except Exception as e:
        return _invalid(f'sbert_failed: {type(e).__name__}', regularity)

    # 3. agglomerative + silhouette K selection
    best_score, best_labels, best_k = -1.0, None, 2
    k_stop = min(n_clusters_range[1] + 1, len(phrases))
    for k in range(n_clusters_range[0], k_stop):
        try:
            ac = AgglomerativeClustering(n_clusters=k, metric='cosine',
                                         linkage='average')
            labels = ac.fit_predict(embs)
            if len(set(labels)) < 2:
                continue
            sil = float(silhouette_score(embs, labels, metric='cosine'))
            if sil > best_score:
                best_score, best_labels, best_k = sil, labels, k
        except Exception:
            # A K that cannot be clustered/scored is simply not a candidate.
            continue

    if best_labels is None:
        return _invalid('no_clusters', regularity)

    # 4. group phrases by cluster id, keep clusters that are large enough
    clusters_by_id: dict = defaultdict(list)
    for p, lbl in zip(phrases, best_labels):
        clusters_by_id[int(lbl)].append(p)
    valid_clusters = {cid: ps for cid, ps in clusters_by_id.items()
                      if len(ps) >= min_role_size}
    if len(valid_clusters) < 2:
        return _invalid('too_few_valid_clusters', regularity)

    # 5. name each cluster
    sample_descs = [str(t)[:200] for t in texts[:3]]
    roles: dict = {}
    naming_meta: dict = {}
    used_names: set = set()
    for cid, cluster_phrases in valid_clusters.items():
        role_name = None
        if llm_role_namer_fn:
            # Naming is best-effort: a failing LLM call must not abort role
            # discovery for the whole group; fall back to the anonymous name
            # and record the failure so it stays visible in the audit trail.
            try:
                role_name, name_meta = llm_role_namer_fn(cluster_phrases,
                                                         sample_descs)
            except Exception as e:
                role_name = None
                name_meta = {'reason': f'namer_failed: {type(e).__name__}'}
            naming_meta[cid] = name_meta
        if not role_name:
            role_name = f'cluster_{cid}'
        # Disambiguate if two clusters ended up with the same name
        original = role_name
        suffix = 2
        while role_name in used_names:
            role_name = f'{original}_{suffix}'
            suffix += 1
        used_names.add(role_name)
        roles[role_name] = cluster_phrases

    # 6. per-row role assignment (longest phrase per role per row)
    phrase_to_role = {p: r for r, ps in roles.items() for p in ps}
    row_assignments: list = []
    covered = 0
    for r in refined:
        assignment: dict = {}
        for p in r:
            role = phrase_to_role.get(p)
            if not role:
                continue
            if role not in assignment or len(p) > len(assignment[role]):
                assignment[role] = p
        if assignment:
            covered += 1
        row_assignments.append(assignment)

    coverage = covered / n
    valid = (coverage >= 0.50 and len(roles) >= 2)

    return {
        'roles': roles,
        'row_assignments': row_assignments,
        'coverage': round(coverage, 4),
        'regularity': regularity,
        'valid': valid,
        'role_source': 'sbert_phrase_clustering',
        'phrase_silhouette': round(float(best_score), 4),
        'n_clusters': best_k,
        'naming_meta': naming_meta,
    }
+
def build_role_hierarchy(vi_list: list,
                         role_result: dict,
                         can: pd.DataFrame,
                         parent_id: int,
                         parent_path: str,
                         nodes: list,
                         node_map: dict,
                         var_nodes: dict,
                         max_depth_remaining: int = 4,
                         post_split_fn: Optional[Callable] = None,
                         min_post_split_size: int = 4) -> bool:
    """
    Build a role-nested hierarchy from the Option D role decomposition.

    Roles are visited in descending `_ROLE_PRIORITY` order (roles missing
    from that table rank 0); the intended generic ordering is
        measure > outcome > statistic > condition > modifier > subtype
    Each role that splits the current subset into >= 2 values becomes one
    tree level. Variables without a value for a role skip that level and are
    attached at the current parent.

    Args:
        vi_list: variable indices; position i corresponds to
            role_result['row_assignments'][i].
        role_result: output of a role-discovery step ('valid', 'roles',
            'row_assignments', optionally 'role_source', 'naming_meta',
            'phrase_silhouette', 'regularity', 'n_clusters').
        can, parent_path: accepted for signature parity; not read here.
        parent_id: id of the node the new sub-tree hangs under.
        nodes, node_map: mutated in place with the new aggregation nodes.
        var_nodes: variable index -> leaf node id.
        max_depth_remaining: maximum number of role levels to add.
        post_split_fn: optional `fn(vi_grp, parent, depth_left) -> int`
            used to deepen terminal groups; its return value is counted as
            the number of aggregations it added.
        min_post_split_size: minimum group size to try `post_split_fn`.

    Returns True on success (>= 2 aggregation nodes added), else False.
    """
    if not role_result.get('valid'):
        return False
    roles = role_result['roles']
    row_assigns = role_result['row_assignments']
    if len(vi_list) != len(row_assigns):
        return False

    role_names = sorted(roles.keys(),
                        key=lambda r: _ROLE_PRIORITY.get(r, 0),
                        reverse=True)
    if not role_names:
        return False

    # Provenance is constant for the whole build, so derive it once.
    # role_source comes from the result so the same builder correctly labels
    # per-row LLM extraction nodes vs SBERT clustering nodes.
    route_src = role_result.get('role_source', 'sbert_phrase_clustering')
    is_per_row_llm = route_src == 'per_row_llm_extraction'
    label_src = 'per_row_llm_role' if is_per_row_llm else 'sbert_phrase_role'
    node_source_str = ('per-row LLM extraction (Zhu et al. 2025)'
                       if is_per_row_llm
                       else 'SBERT phrase cluster + LLM role naming')
    # An LLM took part only on the per-row extraction route, or when the
    # clustering route actually invoked a namer (which is what populates
    # 'naming_meta'). Reporting True unconditionally would misstate the
    # exported provenance for SBERT-only runs.
    llm_used = is_per_row_llm or bool(role_result.get('naming_meta'))

    aggregations_made = 0
    vi_to_local = {vi: idx for idx, vi in enumerate(vi_list)}

    def _terminal_attach(vi_grp: list, parent: int, depth_left: int):
        nonlocal aggregations_made
        if (post_split_fn is not None
                and len(vi_grp) >= min_post_split_size
                and depth_left > 0):
            try:
                added = post_split_fn(vi_grp, parent, depth_left)
                if added > 0:
                    aggregations_made += added
                    return
            except Exception:
                # Best-effort deepening: on failure fall through and attach
                # the variables directly so none are lost.
                pass
        for vi in vi_grp:
            _add_child(node_map, parent, var_nodes[vi])

    def _split(vi_subset: list, role_idx: int, current_parent: int,
               depth_remaining: int):
        nonlocal aggregations_made
        if (depth_remaining <= 0 or role_idx >= len(role_names)
                or len(vi_subset) <= 1):
            _terminal_attach(vi_subset, current_parent, depth_remaining)
            return

        role = role_names[role_idx]
        groups: dict = defaultdict(list)
        unassigned: list = []
        for vi in vi_subset:
            local = vi_to_local.get(vi)
            val = row_assigns[local].get(role) if local is not None else None
            if val:
                groups[val].append(vi)
            else:
                unassigned.append(vi)

        # This role does not partition the subset: try the next role.
        if len(groups) <= 1:
            _split(vi_subset, role_idx + 1, current_parent, depth_remaining)
            return

        # Largest groups first; singletons attach as plain leaves.
        for val, vi_grp in sorted(groups.items(), key=lambda x: -len(x[1])):
            if len(vi_grp) == 1:
                _add_child(node_map, current_parent, var_nodes[vi_grp[0]])
                continue
            nid = _next_id(nodes)
            nd = _make_node(nid, val.title(),
                            desc=(f'Role: {role} | Value: "{val}" | '
                                  f'Variables: {len(vi_grp)} | '
                                  f'Source: {node_source_str}'))
            nd['label_provenance'] = {
                'label_source': label_src,
                'evidence_terms': [val],
                'confidence': 1.0,
                'llm_used': llm_used,
                'llm_rejected': False,
                'role': role,
            }
            nd['structure_provenance'] = {
                'route': route_src,
                'aspect_method': route_src,
                'slot_role': role,
                'phrase_silhouette': role_result.get('phrase_silhouette'),
                'regularity': role_result.get('regularity'),
                'n_clusters': role_result.get('n_clusters'),
            }
            nodes.append(nd)
            node_map[nid] = nd
            _add_child(node_map, current_parent, nid)
            aggregations_made += 1
            _split(vi_grp, role_idx + 1, nid, depth_remaining - 1)

        if unassigned:
            _terminal_attach(unassigned, current_parent, depth_remaining)

    _split(vi_list, 0, parent_id, max_depth_remaining)
    return aggregations_made >= 2
+
def strip_group_prefix_from_labels(nodes: list) -> int:
    """
    Post-build pass: for every aggregation node, if its name starts with the
    parent's name (case-insensitive) followed by a separator (space, '—',
    '-', ':' or '/'), strip that prefix.

    Effect: 'DMS — Total Errors' under DMS becomes 'Total Errors'.
            'Pal Total Errors' under PAL becomes 'Total Errors'.

    The walk starts at node id 0 and follows 'related' links. A rename is
    applied only when at least 2 characters remain; children of a renamed
    node are compared against its new (stripped) name. Node names are
    modified in place.

    Returns the number of labels modified.
    """
    node_map = {int(n['id']): n for n in nodes}
    modified = 0
    separators = (' ', '—', '-', ':', '/')

    def _walk(nid: int, parent_name: str):
        nonlocal modified
        n = node_map.get(int(nid))
        if not n:
            return
        if n.get('type') == 'aggregation' and parent_name:
            current = str(n.get('name', '')).strip()
            cn_lower = current.lower()
            # Match AND slice with the same (whitespace-stripped) parent
            # name; slicing with the raw length would cut into the child
            # label whenever the parent name carries surrounding whitespace.
            parent_clean = parent_name.strip()
            pn_lower = parent_clean.lower()
            if pn_lower and cn_lower.startswith(
                    tuple(pn_lower + sep for sep in separators)):
                stripped = current[len(parent_clean):].lstrip(' —-—:/').strip()
                if stripped and len(stripped) >= 2:
                    n['name'] = stripped
                    modified += 1
        # Only aggregation/root nodes pass their own name down as the prefix.
        new_parent = (n.get('name', '') if n.get('type') in ('aggregation', 'root')
                      else parent_name)
        for cid in n.get('related', []):
            _walk(int(cid), new_parent)

    _walk(0, '')
    return modified
+
def enforce_single_parent(nodes: list) -> int:
    """
    POST-BUILD PASS 4 — make the structure a tree: every node ends up
    listed in the 'related' list of at most one parent.

    A node can end up attached both directly to a group and again under a
    sub-aggregation of that group, which turns the branch into a DAG. For
    each child with several parents, the parent with the greatest depth is
    kept (depth = shortest distance from node id 0 along 'related' links;
    parents unreachable from the root count as depth 0) and the child is
    removed from every other parent's 'related' list. Keeping the deepest
    parent preserves the finer sub-grouping and drops the redundant direct
    attachment. Fully generic — no domain knowledge, no hardcoded labels.

    Nodes are modified in place.

    Returns the number of redundant parent links removed.
    """
    from collections import deque

    by_id = {int(node['id']): node for node in nodes}

    # Breadth-first sweep from the root gives each reachable node its level.
    level = {0: 0}
    frontier = deque((0,))
    while frontier:
        here = frontier.popleft()
        for raw in by_id.get(here, {}).get('related', []):
            kid = int(raw)
            if kid in level:
                continue
            level[kid] = level[here] + 1
            frontier.append(kid)

    # Reverse index: child id -> ids of every node listing it as a child.
    owners = defaultdict(list)
    for node in nodes:
        for raw in node.get('related', []):
            owners[int(raw)].append(int(node['id']))

    dropped = 0
    for kid, owner_ids in owners.items():
        if len(owner_ids) < 2:
            continue
        # deepest = most specific; ties resolve to the first listed owner
        winner = max(owner_ids, key=lambda oid: level.get(oid, 0))
        for oid in owner_ids:
            if oid == winner:
                continue
            owner = by_id.get(oid)
            if not owner or kid not in owner['related']:
                continue
            owner['related'] = [x for x in owner['related'] if int(x) != kid]
            dropped += 1
    return dropped
+
def prune_empty_aggregations(nodes: list) -> int:
    """
    POST-BUILD PASS 5 — drop aggregation nodes that ended up with no children.

    `enforce_single_parent` can empty a shallow aggregation when all of its
    variables were kept under a deeper/sibling parent. An empty category
    node is export noise — it has no members.

    Repeats until no childless aggregation node is left: each pass removes
    every aggregation node with an empty/missing 'related' list from `nodes`
    (in place) and deletes its id from the remaining nodes' 'related' lists,
    which may empty a parent for the next pass. Only nodes whose type is
    'aggregation' are ever removed.

    Returns the number of nodes removed.
    """
    removed = 0
    while True:
        empties = {int(n['id']) for n in nodes
                   if n.get('type') == 'aggregation' and not n.get('related')}
        if not empties:
            break
        nodes[:] = [n for n in nodes if int(n['id']) not in empties]
        for n in nodes:
            # Only rewrite lists that actually reference a removed node.
            if any(int(c) in empties for c in n.get('related', [])):
                n['related'] = [int(c) for c in n['related']
                                if int(c) not in empties]
        removed += len(empties)
    return removed
+
def mine_phrase_slots(texts: list,
                      text_col_names: Optional[list] = None,
                      min_phrase_count: int = 2,
                      min_slot_size: int = 2,
                      coverage_threshold: float = 0.55,
                      excl_threshold: float = 0.75,
                      group_name: str = '',
                      llm_role_classifier_fn: Optional[Callable] = None) -> dict:
    """
    Discover phrase slots in a group of variable descriptions.

    Algorithm:
      1. Extract the concept-prefix from each description: the text before
         the next ':' inside the first 'KEY: value' chunk (chunks are split
         on ' | ') whose key is in `text_col_names`; any key is accepted
         when `text_col_names` is empty. Falls back to the full text.
      2. Lowercase; replace every non-alphanumeric character with a space.
      3. Generate n-grams (1–4 tokens, ≥3 chars) per row.
      4. Keep n-grams that appear in ≥ min_phrase_count rows; per row retain
         only maximal phrases (drop phrases that are a contiguous token
         sub-sequence of a longer phrase kept for the same row).
      5. Compute the mutual-exclusion score per phrase pair:
             M[a,b] = 1 − cooc[a,b] / min(count[a], count[b])
         Phrases with M[a,b] ≥ excl_threshold are 'alternatives' (rarely
         appear together → likely fill the same slot in different rows).
      6. Build slots: from `llm_role_classifier_fn(phrases, texts,
         group_name) -> (role -> phrases, meta)` when it yields ≥ 2 usable
         roles, otherwise via greedy mutual-exclusion BFS seeded from the
         most frequent phrases.
      7. A slot is valid if it has ≥ min_slot_size distinct phrases.
      8. Coverage: fraction of rows that contain ≥1 phrase from ≥1 slot.
         If coverage < threshold or < 2 slots survive → valid=False.

    Returns:
        {
          'slots': list[ {phrases: set[str], best: str, role_name: str} ],
          'row_assignments': list[ dict[slot_id → phrase] ] for each row,
          'coverage': float in [0,1],
          'valid': bool — True if slot structure is strong enough,
          'slot_source': 'llm_role_classification' | 'mutual_exclusion'
                         (absent from the empty early-exit result).
        }
    """
    def _no_slots() -> dict:
        return {'slots': [], 'row_assignments': [], 'coverage': 0.0,
                'valid': False}

    n = len(texts)
    if n < 4:
        return _no_slots()

    text_keys = ({c.strip().lower() for c in text_col_names}
                 if text_col_names else None)

    # ── 1. extract concept prefixes ──────────────────────────────────────────
    prefixes = []
    for t in texts:
        prefix = ''
        for chunk in str(t).split(' | '):
            if ':' not in chunk:
                continue
            key, val = chunk.split(':', 1)
            if text_keys is not None and key.strip().lower() not in text_keys:
                continue
            prefix = val.split(':')[0].strip()
            break
        if not prefix:
            prefix = str(t).strip()
        # Leading markers ('KEY:', 'Note:', …) are removed by the shared
        # helper before tokenising.
        prefixes.append(_strip_leading_markers(prefix))

    # ── 2. normalise: lowercase, keep alphanumerics, everything else → space ─
    def _normalize(p: str) -> str:
        s = ''.join(ch if ch.isalnum() or ch == ' ' else ' '
                    for ch in str(p).lower())
        return ' '.join(s.split())

    norm = [_normalize(p) for p in prefixes]

    # ── 3. extract n-grams (1..4) per row, keep ≥3 chars ─────────────────────
    row_phrases: list = []
    phrase_count: Counter = Counter()
    for text in norm:
        tokens = text.split()
        row_set = set()
        for ngram_n in range(1, 5):
            for i in range(len(tokens) - ngram_n + 1):
                phrase = ' '.join(tokens[i:i + ngram_n])
                if len(phrase) >= 3:
                    row_set.add(phrase)
        row_phrases.append(row_set)
        phrase_count.update(row_set)

    # ── 4. keep repeated phrases, retain only maximal phrases per row ────────
    keep = {p for p, c in phrase_count.items() if c >= min_phrase_count}
    refined = []
    for row in row_phrases:
        kept: list = []
        for p in sorted(row & keep, key=lambda ph: -len(ph.split())):
            # Containment is tested on token boundaries: a plain substring
            # test would drop e.g. 'rate' because of 'accurate response',
            # and would make the outcome depend on set iteration order.
            padded = f' {p} '
            if not any(p != q and padded in f' {q} ' for q in kept):
                kept.append(p)
        refined.append(set(kept))

    # Recount after refinement, then drop phrases the shared filter rejects
    # (`_phrase_is_meaningful` also receives the lower-cased group name)
    # before they enter mutual-exclusion clustering.
    phrase_count = Counter(p for r in refined for p in r)
    group_lower = (group_name or '').strip().lower()
    phrases = sorted(p for p, c in phrase_count.items()
                     if c >= min_phrase_count
                     and _phrase_is_meaningful(p, group_lower))
    if len(phrases) < 2 * min_slot_size:
        return _no_slots()

    p_idx = {p: i for i, p in enumerate(phrases)}
    n_p = len(phrases)
    counts = np.array([phrase_count[p] for p in phrases])

    # ── 5. co-occurrence + mutual-exclusion matrix ───────────────────────────
    cooc = np.zeros((n_p, n_p), dtype=int)
    for r in refined:
        idxs = [p_idx[p] for p in r if p in p_idx]
        for i in idxs:
            for j in idxs:
                if i != j:
                    cooc[i, j] += 1
    min_counts = np.minimum.outer(counts, counts).astype(float)
    min_counts[min_counts == 0] = 1.0   # avoid division by zero
    mut_excl = 1.0 - cooc / min_counts
    np.fill_diagonal(mut_excl, 0)

    # ── 6a. UPSTREAM ROUTE: ask the LLM to classify phrases into roles ───────
    # The classifier proposes a role schema (role name → phrases). Only
    # phrases that are part of the mined vocabulary are used, so anything
    # the model invents is ignored here.
    slot_source = 'mutual_exclusion'
    role_names: list = []
    slots: list = []          # list[set[int]] — phrase indices per slot

    if llm_role_classifier_fn is not None:
        try:
            classified, _ = llm_role_classifier_fn(phrases, texts, group_name)
        except Exception:
            # Best-effort: any classifier failure falls back to 6b.
            classified = None
        if classified:
            # Build slots in the order the LLM proposed them. Each role is
            # one slot containing the phrase-index set.
            for role_name, role_phrases in classified.items():
                idx_set = {p_idx[p] for p in role_phrases if p in p_idx}
                if len(idx_set) >= min_slot_size:
                    slots.append(idx_set)
                    role_names.append(role_name)
            if len(slots) >= 2:
                slot_source = 'llm_role_classification'

    # ── 6b. FALLBACK: greedy mutual-exclusion BFS ────────────────────────────
    if slot_source == 'mutual_exclusion':
        visited: set = set()
        slots = []
        for seed in np.argsort(-counts):
            if seed in visited:
                continue
            slot = {int(seed)}
            queue = [int(seed)]
            while queue:
                cur = queue.pop()
                for j in np.where(mut_excl[cur] >= excl_threshold)[0]:
                    j = int(j)
                    if j in slot:
                        continue
                    # A candidate must be (nearly) exclusive with every
                    # phrase already in the slot, not just with `cur`.
                    if all(mut_excl[j, k] >= excl_threshold - 0.15
                           for k in slot):
                        slot.add(j)
                        queue.append(j)
            if len(slot) >= min_slot_size:
                slots.append(slot)
                # NOTE(review): the reviewed diff had lost its indentation;
                # marking only accepted slots as visited is assumed here
                # (identical to the alternative for the default
                # min_slot_size=2) — confirm against the original file.
                visited |= slot
        # Synthesise anonymous role names from the most-frequent phrase in
        # each slot — these become the visible slot tags in provenance.
        role_names = [phrases[max(s, key=lambda i: counts[i])] for s in slots]

    if len(slots) < 2:
        return _no_slots()

    # ── 7. assign per-row phrase per slot (most frequent match wins) ─────────
    row_assignments: list = []
    covered = 0
    for r in refined:
        row_idx = {p_idx[p] for p in r if p in p_idx}
        assignment: dict = {}
        for slot_id, slot in enumerate(slots):
            matched = row_idx & slot
            if matched:
                best = max(matched, key=lambda i: counts[i])
                assignment[slot_id] = phrases[best]
        if assignment:
            covered += 1
        row_assignments.append(assignment)

    coverage = covered / n
    valid = (coverage >= coverage_threshold and len(slots) >= 2)

    return {
        'slots': [{'phrases': {phrases[i] for i in s},
                   'best': phrases[max(s, key=lambda i: counts[i])],
                   'role_name': role_names[idx] if idx < len(role_names) else ''}
                  for idx, s in enumerate(slots)],
        'row_assignments': row_assignments,
        'coverage': round(coverage, 4),
        'valid': valid,
        'slot_source': slot_source,   # 'llm_role_classification' or 'mutual_exclusion'
    }
+
+
def build_slot_hierarchy(vi_list: list,
                         slot_result: dict,
                         can: pd.DataFrame,
                         parent_id: int,
                         parent_path: str,
                         nodes: list,
                         node_map: dict,
                         var_nodes: dict,
                         max_depth_remaining: int = 4,
                         post_slot_split_fn: Optional[Callable] = None,
                         min_post_slot_size: int = 4) -> bool:
    """
    Turn inferred phrase slots into tree levels for the variables in
    `vi_list`.

    Slots are ranked (best first = shallowest level) by a generic partition
    score: many distinct values, few singleton values, high row coverage and
    multi-word phrases raise it; phrases that look like pure conditions
    (a number / 'all' / 'none' / 'simultaneous' plus a temporal or quantity
    word) lower it. When the slots came from LLM role classification the
    role family (measure > statistic > condition > subtype > unknown) is the
    primary sort key and the score breaks ties. Slots scoring ≤ 0 are not
    used. A variable with no phrase for a slot skips that level.

    `nodes` and `node_map` are extended in place. `can` and `parent_path`
    are accepted for signature parity and not read here.

    Returns True when at least 2 aggregation nodes were added; False means
    the result is too shallow and the caller should fall back to NMF/GMM.
    """
    if not slot_result.get('valid'):
        return False

    slots = slot_result['slots']
    row_assigns = slot_result['row_assignments']
    if len(row_assigns) != len(vi_list):
        return False

    # global variable index → row position inside row_assigns
    position_of = {vi: pos for pos, vi in enumerate(vi_list)}

    # ── slot ranking (all heuristics generic, no domain knowledge) ──────────
    condition_hints = {'second', 'seconds', 'minute', 'minutes', 'hour',
                       'hours', 'day', 'days', 'month', 'months', 'year',
                       'years', 'week', 'weeks', 'box', 'boxes', 'token',
                       'tokens', 'pattern', 'patterns', 'trial', 'trials'}
    quantifier_words = {'all', 'none', 'simultaneous'}

    def _looks_like_condition(phrase: str) -> bool:
        words = phrase.split()
        if not words:
            return False
        numeric = any(w.isdigit() or w in quantifier_words for w in words)
        hinted = any(w in condition_hints for w in words)
        return numeric and hinted

    def _slot_score(slot_id: int) -> float:
        tally = Counter(v for v in (a.get(slot_id) for a in row_assigns) if v)
        if not tally:
            return -1.0
        n_distinct = len(tally)
        n_singletons = sum(1 for c in tally.values() if c == 1)
        coverage = sum(tally.values()) / max(1, len(row_assigns))
        avg_tokens = float(np.mean([len(p.split()) for p in tally]))
        condition_frac = (sum(1 for p in tally if _looks_like_condition(p))
                          / n_distinct)
        return (n_distinct - 0.6 * n_singletons
                + 0.4 * coverage
                + 0.3 * (avg_tokens - 1)
                - 0.8 * condition_frac)

    # Role families, highest priority first. Generic English vocabulary:
    # base concepts ("measure-like") outrank modifiers.
    role_tiers = (
        (4, {'measure', 'outcome', 'metric', 'variable', 'quantity'}),
        (3, {'statistic', 'stat', 'aggregate', 'summary'}),
        (2, {'condition', 'modifier', 'context', 'setting'}),
        (1, {'subtype', 'type', 'kind', 'category'}),
    )

    def _role_priority(role: str) -> int:
        r = (role or '').strip().lower()
        for rank, vocab in role_tiers:
            if any(k in r for k in vocab):
                return rank
        return 0   # unknown role — the data-driven score decides

    slot_source = slot_result.get('slot_source', 'mutual_exclusion')
    slot_ids = range(len(slots))
    # Each slot's score is needed for sorting and filtering; compute it once.
    scores = [_slot_score(i) for i in slot_ids]

    if slot_source == 'llm_role_classification':
        def _rank(i):
            return (_role_priority(slots[i].get('role_name', '')), scores[i])
    else:
        def _rank(i):
            return scores[i]

    ordered_slots = [i for i in sorted(slot_ids, key=_rank, reverse=True)
                     if scores[i] > 0]
    if not ordered_slots:
        return False

    aggregations_made = 0   # drives the final accept/reject decision

    def _attach_or_sub_recurse(vi_grp: list, parent: int, depth_left: int):
        """
        Leaf attach point. If a post-slot splitter is available, the group
        is big enough and depth budget remains, let the splitter deepen the
        tree; if it adds nothing (or fails), attach the variables directly.
        """
        nonlocal aggregations_made
        if (post_slot_split_fn is not None
                and len(vi_grp) >= min_post_slot_size
                and depth_left > 0):
            try:
                added = post_slot_split_fn(vi_grp, parent, depth_left)
                if added > 0:
                    aggregations_made += added
                    return
            except Exception:
                pass
        for vi in vi_grp:
            _add_child(node_map, parent, var_nodes[vi])

    def _split(vi_subset: list, slot_ord_pos: int, current_parent: int,
               depth_remaining: int):
        nonlocal aggregations_made
        exhausted = (depth_remaining <= 0
                     or slot_ord_pos >= len(ordered_slots)
                     or len(vi_subset) <= 1)
        if exhausted:
            _attach_or_sub_recurse(vi_subset, current_parent, depth_remaining)
            return

        slot_id = ordered_slots[slot_ord_pos]
        by_value: dict = defaultdict(list)
        without_value: list = []
        for vi in vi_subset:
            pos = position_of.get(vi)
            val = row_assigns[pos].get(slot_id) if pos is not None else None
            if val:
                by_value[val].append(vi)
            else:
                without_value.append(vi)

        # A slot that does not partition this subset is skipped.
        if len(by_value) <= 1:
            _split(vi_subset, slot_ord_pos + 1, current_parent, depth_remaining)
            return

        # Role name of the slot (set by LLM classification, when applicable)
        slot_role = slots[slot_id].get('role_name', '')
        largest_first = sorted(by_value.items(),
                               key=lambda kv: len(kv[1]), reverse=True)
        for val, vi_grp in largest_first:
            if len(vi_grp) == 1:
                _add_child(node_map, current_parent, var_nodes[vi_grp[0]])
                continue
            nid = _next_id(nodes)
            nd = _make_node(nid, val.title(),
                            desc=(f'Role: {slot_role or "—"} | '
                                  f'Slot phrase: "{val}" | '
                                  f'Variables: {len(vi_grp)} | '
                                  f'Source: phrase-slot mining ({slot_source})'))
            nd['label_provenance'] = {
                'label_source': 'phrase_slot',
                'evidence_terms': [val],
                'confidence': 1.0,
                'llm_used': slot_source == 'llm_role_classification',
                'llm_rejected': False,
                'role': slot_role,
            }
            nd['structure_provenance'] = {
                'route': 'slot_mining',
                'aspect_method': slot_source,
                'silhouette': None,
                'slot_coverage': round(float(slot_result.get('coverage', 0)), 3),
                'slot_role': slot_role,
            }
            nodes.append(nd)
            node_map[nid] = nd
            _add_child(node_map, current_parent, nid)
            aggregations_made += 1
            _split(vi_grp, slot_ord_pos + 1, nid, depth_remaining - 1)

        if without_value:
            _attach_or_sub_recurse(without_value, current_parent,
                                   depth_remaining)

    _split(vi_list, 0, parent_id, max_depth_remaining)

    # A tree that gained almost no structure means the slots were not
    # actually useful for this group.
    return aggregations_made >= 2
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 6 — DYNAMIC TOP-DOWN LOD TREE [ZHU §3.3 adapted]
+# ──────────────────────────────────────────────────────────────────────────────
def _next_id(nodes: list) -> int:
    """Return one more than the largest integer node id (1 for an empty list)."""
    ids = [int(node['id']) for node in nodes]
    highest = max(ids) if ids else 0
    return highest + 1
+
def _add_child(node_map: dict, parent_id: int, child_id: int):
    """
    Append `child_id` (as int) to the parent's 'related' list.

    No-op when the parent id is not in `node_map` or the child is already
    listed.
    """
    parent = node_map.get(int(parent_id))
    if not parent:
        return
    child = int(child_id)
    if child in parent['related']:
        return
    parent['related'].append(child)
+
def _make_node(nid, name, ntype='aggregation', desc='', dtype='determine') -> dict:
    """
    Create a fresh node record with no children.

    `nid` is coerced to int, `name` and `desc` to str; the node starts with
    an empty 'related' list and 'isShown' set to True.
    """
    return dict(id=int(nid), name=str(name), related=[],
                type=ntype, desc=str(desc), dtype=dtype, isShown=True)
+
+# ──────────────────────────────────────────────────────────────────────────────
+# POST-BUILD PASS 1 — SIBLING COMMON-PREFIX FACTORING
+# ──────────────────────────────────────────────────────────────────────────────
def factor_sibling_common_prefixes(nodes: list,
                                   min_siblings: int = 3,
                                   min_prefix_tokens: int = 2) -> int:
    """
    For each parent whose ≥`min_siblings` aggregation children share a
    common multi-token title prefix, insert a new intermediate parent named
    by that prefix and re-attach the matching siblings under it (with the
    prefix stripped from each name).

    The longest qualifying prefix is factored first. Prefixes are compared
    case-insensitively, must leave at least one token in every child name,
    and are rejected when they start or end with a stopword. A parent is
    re-examined after each insertion, and the inserted node is examined too,
    so several prefix groups and nested prefixes are handled.

    Generic, no domain knowledge. Inspired by sibling-label factoring
    common in faceted-classification systems (Stoica & Hearst, NAACL 2007,
    'Castanet') — collapsing redundant repeated tokens in sibling names.

    `nodes` is extended in place and the affected nodes are renamed/rewired.

    Returns the number of factor-parents inserted.
    """
    from collections import deque

    node_map = {int(n['id']): n for n in nodes}
    inserted = 0
    # Work list seeded with a snapshot of the current aggregation/root nodes.
    queue = deque(int(n['id']) for n in nodes
                  if n.get('type') in ('aggregation', 'root'))
    while queue:
        parent_id = queue.popleft()
        parent = node_map.get(parent_id)
        if not parent:
            continue

        # Aggregation children whose name is long enough to carry a prefix
        agg_children = []
        for cid in parent.get('related', []):
            child = node_map.get(int(cid))
            if not child or child.get('type') != 'aggregation':
                continue
            toks = str(child.get('name', '')).split()
            if len(toks) >= min_prefix_tokens:
                agg_children.append((int(cid), toks))
        if not agg_children or len(agg_children) < min_siblings:
            continue

        # Sort children by name tokens for stable grouping
        agg_children.sort(key=lambda x: x[1])

        # Greedy: longest prefix shared by ≥ min_siblings children wins
        best_prefix: list = []
        best_group: list = []
        max_len = max(len(t) for _, t in agg_children)
        for length in range(max_len, min_prefix_tokens - 1, -1):
            # `len(toks) > length`: something must remain AFTER the prefix
            prefix_counts: Counter = Counter(
                tuple(t.lower() for t in toks[:length])
                for _, toks in agg_children if len(toks) > length)
            for pfx, cnt in prefix_counts.most_common():
                if cnt < min_siblings:
                    break   # most_common() is sorted by descending count
                # Reject low-quality prefixes: empty, or starting/ending
                # with a stopword (this also covers all-stopword prefixes
                # such as "the number of"). A good factored parent is a
                # real concept phrase.
                if (not pfx or pfx[0] in _STOPWORDS
                        or pfx[-1] in _STOPWORDS):
                    continue
                best_prefix = list(pfx)
                best_group = [(cid, toks) for cid, toks in agg_children
                              if len(toks) > length
                              and tuple(t.lower() for t in toks[:length]) == pfx]
                break
            if best_prefix:
                break
        if not best_prefix:
            continue

        # Build the new intermediate parent. Its title keeps the original
        # capitalisation of the first matched child.
        new_id = max(node_map) + 1
        prefix_name = ' '.join(best_group[0][1][:len(best_prefix)])
        member_ids = [cid for cid, _ in best_group]
        new_node = _make_node(new_id, prefix_name, ntype='aggregation',
                              desc=(f'Factored common prefix: "{prefix_name}" | '
                                    f'Siblings: {len(best_group)} | '
                                    f'Source: sibling factoring [Castanet 2007]'))
        new_node['label_provenance'] = {
            'label_source': 'factored_common_prefix',
            # Textual evidence for the label: the sibling titles (before
            # renaming) that the prefix was taken from. The node ids are
            # recorded separately under structure_provenance.factored_from.
            'evidence_terms': [' '.join(toks) for _, toks in best_group],
            'confidence': 1.0,
            'llm_used': False,
            'llm_rejected': False,
        }
        new_node['structure_provenance'] = {
            'route': 'sibling_factoring',
            'aspect_method': None,
            'silhouette': None,
            'slot_coverage': None,
            'factored_from': list(member_ids),
            'common_prefix_tokens': len(best_prefix),
        }

        # Rename the factored siblings (strip the prefix from their names)
        for cid, toks in best_group:
            new_name = ' '.join(toks[len(best_prefix):]).strip()
            if new_name:
                node_map[cid]['name'] = new_name

        # Rewire parent → new_node → factored siblings
        moved_ids = set(member_ids)
        parent['related'] = [c for c in parent['related']
                             if int(c) not in moved_ids]
        parent['related'].append(new_id)
        new_node['related'] = list(member_ids)
        nodes.append(new_node)
        node_map[new_id] = new_node
        inserted += 1

        # Re-examine this parent in case multiple prefix groups exist,
        # and the new parent for further nesting.
        queue.append(parent_id)
        queue.append(new_id)
    return inserted
+
+# ──────────────────────────────────────────────────────────────────────────────
+# POST-BUILD PASS 2 — TRACO-INSPIRED LOW-QUALITY NODE PRUNING
+# ──────────────────────────────────────────────────────────────────────────────
def prune_low_quality_aggregations(nodes: list,
                                   tfidf: TfidfVectorizer = None,
                                   min_coherence: float = 0.0,
                                   max_child_ratio: float = 1.0) -> int:
    """
    Conservative noise pruning: dissolve aggregation nodes whose name is
    pure noise.

    A name counts as noise when it is empty, or when every whitespace-
    separated token of the lower-cased name is a stopword (``_STOPWORDS``),
    a generic marker ('key', 'note', 'item', 'label', 'group', 'cluster'),
    or a single non-digit character.  Examples: 'And', 'Them', 'Key',
    'To Be'.  Single-digit names ('2', '4', ...) are kept because they may
    be meaningful.

    Never dissolved:
      • nodes whose type is not 'aggregation';
      • nodes that no other node lists as a child (e.g. the root);
      • nodes whose ``structure_provenance['route']`` is 'group_anchor' or
        'sibling_factoring' — they are structural and legitimately have
        short names.

    Dissolving a node removes it from its parent's ``related`` list, appends
    its children to that parent (skipping ids already present), and marks
    the node ``isShown=False`` / ``type='dissolved'`` with
    ``structure_provenance['dissolved_reason'] = 'noise_label_only'``.
    The node itself stays in ``nodes`` and keeps its own ``related`` list.

    Inspired by TraCo (Wu et al., AAAI 2024): affinity / rationality /
    diversity failures are common in hierarchical topic models; only the
    most conservative check (obviously-noise names) is enforced here.

    Parameters
    ----------
    nodes : list of node dicts; modified in place.
    tfidf, min_coherence, max_child_ratio :
        Accepted for interface compatibility but NOT read by this
        implementation.  The parent-child-coherence and lopsided-split
        rules they were meant to configure are not implemented here, so
        passing them has no effect.

    Returns
    -------
    int : the number of nodes dissolved.
    """
    node_map = {int(n['id']): n for n in nodes}
    # First parent wins when a child is listed by more than one node.
    parent_of: dict = {}
    for n in nodes:
        for c in n.get('related', []):
            parent_of.setdefault(int(c), int(n['id']))

    FORBIDDEN = frozenset(_STOPWORDS) | frozenset({
        'key', 'note', 'item', 'label', 'group', 'cluster',
    })
    # Routes that are ALWAYS structural — never prune these even if the name
    # is short / stopword-like (e.g. 'MOT' is a group anchor, not noise).
    PROTECTED_ROUTES = frozenset({'group_anchor', 'sibling_factoring'})

    def _is_noise_tok(t: str) -> bool:
        # Stopword/marker, or a single character that is not a digit.
        return t in FORBIDDEN or (len(t) < 2 and not t.isdigit())

    to_dissolve: list = []
    for n in nodes:
        if n.get('type') != 'aggregation':
            continue
        nid = int(n['id'])
        if nid not in parent_of:
            continue
        if n.get('structure_provenance', {}).get('route') in PROTECTED_ROUTES:
            continue

        name_lower = str(n.get('name', '')).strip().lower()
        if not name_lower:
            to_dissolve.append(nid)
            continue

        toks = name_lower.split()
        all_noise = bool(toks) and all(_is_noise_tok(t) for t in toks)
        if name_lower in FORBIDDEN or all_noise:
            to_dissolve.append(nid)

    # Dissolve: promote children to the grandparent.
    dissolved = 0
    for nid in to_dissolve:
        n = node_map.get(nid)
        if not n:
            continue
        gp_id = parent_of.get(nid)
        if gp_id is None:
            continue
        gp = node_map.get(gp_id)
        if not gp:
            continue
        gp['related'] = [c for c in gp['related'] if int(c) != nid]
        for cid in n.get('related', []):
            if int(cid) not in gp['related']:
                gp['related'].append(int(cid))
            # Always re-point the child, so that a child which is itself
            # queued for dissolution is later promoted into the live
            # grandparent rather than into this dissolved node.
            parent_of[int(cid)] = gp_id
        n['isShown'] = False
        n['type'] = 'dissolved'
        n.setdefault('structure_provenance', {})['dissolved_reason'] = \
            'noise_label_only'
        dissolved += 1
    return dissolved
+
def build_dynamic_lod_tree(can: pd.DataFrame,
                           aspect_reprs: list,
                           aspect_labels: list,
                           tfidf: TfidfVectorizer,
                           max_depth: int = 6,
                           min_cluster_size: int = 2,
                           sil_threshold: float = 0.04,
                           max_clusters_per_split: int = 8,
                           project: str = 'project',
                           local_nmf: bool = True,
                           min_local_nmf_size: int = 8,
                           max_aspects: int = 10,
                           sbert_model=None,
                           llm_label_fn: Optional[Callable] = None,
                           use_slot_mining: bool = True,
                           text_col_names: Optional[list] = None,
                           use_fastopic: bool = True,
                           fastopic_min_size: int = 8,
                           llm_role_classifier_fn: Optional[Callable] = None,
                           use_role_decomposition: bool = True,
                           llm_role_namer_fn: Optional[Callable] = None,
                           role_regularity_threshold: float = 0.40,
                           per_row_role_extractor_fn: Optional[Callable] = None,
                           use_per_row_role_extraction: bool = True) -> list:
    """
    Build a dynamic top-down LoD tree and return it as a flat list of nodes.

    Entry strategy (fully data-driven, no hardcoding):
      • If the `_group` column contains any value other than 'ungrouped',
        every ' > '-separated path segment is materialised as an aggregation
        node (route='group_anchor') below the root.  Each terminal group is
        then structured locally by the first route that succeeds:
          1A. per-row LLM role extraction,
          1B. SBERT phrase clustering,
          2.  phrase-slot mining,
          3.  aspect clustering (FASTopic, else local NMF, else slices of
              the global `aspect_reprs`).
        The route taken is recorded on the group anchor's
        `structure_provenance` ('route_used', 'phrase_regularity').
      • Otherwise the recursive aspect splitter runs once from the root on
        the global `aspect_reprs`.

    Within each recursive split [ZHU §3.3 adapted]:
      • Every aspect is clustered and the one with the highest silhouette
        wins (simplified best-aspect split; not the full probabilistic
        search of Zhu Eq.6/7).
      • Splits below `sil_threshold`, dominated by one cluster (> 85 %), or
        consisting mostly of singletons are rejected; the variables are then
        attached directly to the current parent.
      • Single-variable clusters are attached directly without an
        aggregation wrapper (singleton prevention).

    After the tree is built, five post-build passes run in order (sibling
    prefix factoring, noise-label pruning, group-prefix stripping, single-
    parent enforcement, empty-aggregation pruning).  Each pass is best-
    effort: if it raises, its counter is reported as 0.  The counters are
    stored on the root node under 'post_build_stats'.

    Parameters
    ----------
    can : DataFrame with columns `_text`, `_label`, `_id`, `_row`, `_group`;
        one row per variable.  Rows are addressed by position throughout.
    aspect_reprs : list of arrays, one per global aspect, indexable by
        positional row index.
    aspect_labels : labels for `aspect_reprs`.
    tfidf : global vectorizer handed to `label_cluster` when no local one
        was fitted.
    max_depth : recursion depth at which variables are attached as leaves.
    min_cluster_size : groups smaller than this are attached without
        splitting.
    sil_threshold : minimum silhouette for a split to be accepted.
    max_clusters_per_split : upper bound on clusters per split.
    project : name of the root node.
    local_nmf, min_local_nmf_size, max_aspects : control local NMF aspect
        discovery inside a group.
    sbert_model : passed to the representation / phrase-clustering helpers;
        route 1B is skipped when it is None.
    llm_label_fn : optional callable forwarded to `label_cluster`.
    use_slot_mining, text_col_names, llm_role_classifier_fn :
        control route 2.
    use_fastopic, fastopic_min_size : control FASTopic at a group's top
        level.
    use_role_decomposition, llm_role_namer_fn, role_regularity_threshold :
        control route 1B (the threshold is also used by route 1A).
    per_row_role_extractor_fn, use_per_row_role_extraction :
        control route 1A.

    Returns
    -------
    list : node dicts; index 0 is the root, followed by one attribute node
        per row of `can` (ids 1..n), followed by aggregation nodes.
    """
    texts = can['_text'].fillna('').astype(str).tolist()
    n_vars = len(can)

    # ── build leaf attribute nodes (ids 1..n_vars) ───────────────────────────
    nodes: list = [_make_node(0, project, ntype='root', desc='Root node')]
    # Positional index in `can` → node id.  Keyed by POSITION (not by the
    # DataFrame index label) because every consumer below addresses rows by
    # position (texts[vi], aspect_reprs[k][vi], enumerate order).
    var_nodes: dict = {}
    # Positional index in `can` → str(_group); computed once instead of
    # calling can.iloc[...] inside the recursion.
    group_of: list = []
    for pos, (_, row) in enumerate(can.iterrows()):
        leaf_node_id = pos + 1
        nd = _make_node(leaf_node_id, row['_label'], ntype='attribute',
                        desc=row['_text'], dtype='determine')
        nd['metadata'] = {
            'leaf_id': str(row['_id']),
            'row_index': int(row['_row']),
            'group': row['_group'],
        }
        nodes.append(nd)
        var_nodes[pos] = leaf_node_id
        group_of.append(str(row['_group']))
    node_map: dict = {int(n['id']): n for n in nodes}

    def _attach_leaves(parent_id: int, vi_iter) -> None:
        """Attach the leaf nodes of the given variables directly to parent_id."""
        for vi in vi_iter:
            _add_child(node_map, parent_id, var_nodes[vi])

    # ── recursive splitter ────────────────────────────────────────────────────
    def _recurse(vi_global: list,
                 cur_reprs: list,
                 cur_labels: list,
                 cur_tfidf: TfidfVectorizer,
                 parent_id: int,
                 depth: int,
                 parent_path: str,
                 aspect_method_tag: str = 'nmf'):
        """
        vi_global  : global positional indices into `can` for this node's variables.
        cur_reprs  : list of K arrays, each with len(vi_global) rows.
                     Rows correspond positionally to vi_global — no global indexing.
        cur_labels : aspect labels for cur_reprs.
        cur_tfidf  : TF-IDF vectorizer handed to label_cluster for this scope.
        """
        if depth >= max_depth or len(vi_global) < min_cluster_size:
            _attach_leaves(parent_id, vi_global)
            return

        # Shortcut for tiny homogeneous clusters — same _group, ≤3 vars.
        # Avoids spending GMM/silhouette evaluation on already-meaningful leaves.
        if len(vi_global) <= 3 and len({group_of[vi] for vi in vi_global}) == 1:
            _attach_leaves(parent_id, vi_global)
            return

        sub_texts = [texts[vi] for vi in vi_global]

        # Evaluate every aspect — pick the one with highest silhouette [ZHU §3.3]
        best_sil, best_k_idx, best_lbls = -1.0, -1, None
        max_k = min(max_clusters_per_split, len(vi_global) // 2)
        for k_idx, rep in enumerate(cur_reprs):
            if rep.shape[0] < 3:
                continue
            lbls, _, sil = cluster_aspect_gmm(rep, max_k=max_k)
            if sil > best_sil:
                best_sil, best_k_idx, best_lbls = sil, k_idx, lbls

        if best_k_idx == -1 or best_sil < sil_threshold or best_lbls is None:
            _attach_leaves(parent_id, vi_global)
            return

        # Split-quality rejection (TraCo-inspired structural check).
        # Reject splits that are extremely imbalanced or mostly singletons —
        # silhouette can be high even when one cluster swallows most variables.
        cluster_sizes = list(Counter(best_lbls).values())
        max_ratio = max(cluster_sizes) / sum(cluster_sizes)
        n_singletons = sum(1 for s in cluster_sizes if s == 1)
        if max_ratio > 0.85 or n_singletons > len(cluster_sizes) // 2:
            _attach_leaves(parent_id, vi_global)
            return

        # Group variables by cluster assignment
        cluster_to_global: dict = defaultdict(list)
        for vi, cl in zip(vi_global, best_lbls):
            cluster_to_global[int(cl)].append(vi)

        aspect_name = cur_labels[best_k_idx]
        global_to_local = {vi: idx for idx, vi in enumerate(vi_global)}

        for cl_id, cl_global in cluster_to_global.items():
            if not cl_global:
                continue

            # Singleton prevention: attach single-variable clusters directly
            if len(cl_global) == 1:
                _add_child(node_map, parent_id, var_nodes[cl_global[0]])
                continue

            cl_texts = [texts[vi] for vi in cl_global]
            cl_groups = [group_of[vi] for vi in cl_global]
            lbl, prov = label_cluster(cl_texts, sub_texts, cur_tfidf,
                                      cluster_groups=cl_groups,
                                      parent_path=parent_path,
                                      llm_label_fn=llm_label_fn,
                                      return_provenance=True)
            desc = (f'Aspect: {aspect_name} | '
                    f'Silhouette: {best_sil:.3f} | '
                    f'Variables: {len(cl_global)}')
            nid = _next_id(nodes)
            nd = _make_node(nid, lbl, desc=desc)
            nd['label_provenance'] = prov
            nd['structure_provenance'] = {
                'route': 'aspect_clustering',
                'aspect_method': aspect_method_tag,
                'silhouette': round(float(best_sil), 4),
                'slot_coverage': None,
            }
            nodes.append(nd)
            node_map[nid] = nd
            _add_child(node_map, parent_id, nid)

            # Slice embeddings to this cluster's local positions and recurse
            cl_pos = [global_to_local[vi] for vi in cl_global]
            sub_reprs = [rep[cl_pos] for rep in cur_reprs]
            _recurse(cl_global, sub_reprs, cur_labels, cur_tfidf,
                     nid, depth + 1, f'{parent_path} > {lbl}',
                     aspect_method_tag=aspect_method_tag)

    # ── group-anchored entry ─────────────────────────────────────────────────
    # The _group column is built from user-confirmed group_cols — fully data-driven.
    # We materialise each path segment as an aggregation node, then run the
    # routing cascade only inside each terminal group.

    group_node_ids: dict = {}   # path string → node id

    def _get_or_create_group_node(path: str, parent_id: int) -> int:
        """Return the anchor node id for `path`, creating it under parent_id once."""
        if path in group_node_ids:
            return group_node_ids[path]
        seg_name = path.split(' > ')[-1].strip()
        nid = _next_id(nodes)
        nd = _make_node(nid, seg_name, ntype='aggregation', desc=f'Group: {path}')
        nd['structure_provenance'] = {
            'route': 'group_anchor',
            'aspect_method': None,
            'silhouette': None,
            'slot_coverage': None,
        }
        nodes.append(nd)
        node_map[nid] = nd
        _add_child(node_map, parent_id, nid)
        group_node_ids[path] = nid
        return nid

    # Collect global variable positions per terminal group path
    terminal_groups: dict = defaultdict(list)
    for vi, grp in enumerate(group_of):
        terminal_groups[grp].append(vi)

    non_ungrouped = [g for g in terminal_groups if g.strip().lower() != 'ungrouped']

    if not non_ungrouped:
        # No group structure detected → fall back to global aspects from root
        full_reprs = [rep[list(range(n_vars))] for rep in aspect_reprs]
        _recurse(list(range(n_vars)), full_reprs, aspect_labels, tfidf,
                 0, 0, project)
    else:
        # Optional Streamlit progress bar — visible feedback for large datasets
        # where the per-group loop dominates runtime.  Any failure (Streamlit
        # missing, no script context) simply disables the bar.
        try:
            import streamlit as _st_progress
            _pbar = _st_progress.progress(0.0, text='Building groups…')
            _show_progress = True
        except Exception:
            _pbar = None
            _show_progress = False

        sorted_groups = sorted(terminal_groups.items())
        n_groups_total = len(sorted_groups)

        for _g_idx, (group_path, vi_list) in enumerate(sorted_groups):
            if _show_progress and _pbar is not None:
                try:
                    short_path = group_path[:60] + ('…' if len(group_path) > 60 else '')
                    _pbar.progress((_g_idx + 1) / max(1, n_groups_total),
                                   text=f'[{_g_idx + 1}/{n_groups_total}] '
                                        f'{short_path} ({len(vi_list)} vars)')
                except Exception:
                    # Progress display is cosmetic; never let it abort the build.
                    pass
            # Build L1/L2 path nodes from detected group metadata
            segments = [s.strip() for s in group_path.split(' > ') if s.strip()]
            pid = 0
            for depth_seg in range(len(segments)):
                cumpath = ' > '.join(segments[:depth_seg + 1])
                pid = _get_or_create_group_node(cumpath, pid)
            existing_depth = len(segments)

            if len(vi_list) < min_cluster_size:
                _attach_leaves(pid, vi_list)
                continue

            # ── Aspect-discovery callback for this group ──
            # Top-level: optionally uses FASTopic (slow, transformer-based,
            # semantic).  Sub-recursion: NMF only (fast, lexical).  This split
            # gives rich top-level structure plus quick deeper splits.
            def _aspect_recurse_for_group(vi_sub: list, parent: int,
                                          depth_left: int = 99,
                                          is_top_level: bool = False) -> int:
                """Split vi_sub under `parent`; return the number of aggregation nodes added."""
                local_texts = [texts[vi] for vi in vi_sub]
                use_reprs, use_labels, use_tfidf = None, None, None
                aspect_method_tag = 'sliced_global'

                if (is_top_level and use_fastopic and _FASTOPIC_AVAILABLE
                        and len(vi_sub) >= fastopic_min_size):
                    try:
                        f_tfidf, _, _, _, f_H, _, f_labels = discover_aspects_fastopic(
                            local_texts, max_aspects=max_aspects)
                        f_reprs = per_aspect_representations(
                            local_texts, f_H, f_tfidf, sbert_model)
                        use_reprs, use_labels, use_tfidf = f_reprs, f_labels, f_tfidf
                        aspect_method_tag = 'fastopic'
                    except Exception:
                        # Fall through to NMF / global slices.
                        use_reprs = None
                if use_reprs is None and local_nmf and len(vi_sub) >= min_local_nmf_size:
                    try:
                        l_tfidf, _, _, _, l_H, _, l_labels = discover_aspects(
                            local_texts, max_aspects)
                        l_reprs = per_aspect_representations(
                            local_texts, l_H, l_tfidf, sbert_model)
                        use_reprs, use_labels, use_tfidf = l_reprs, l_labels, l_tfidf
                        aspect_method_tag = 'nmf'
                    except Exception:
                        # Fall through to global slices.
                        use_reprs = None
                if use_reprs is None:
                    use_reprs = [rep[vi_sub] for rep in aspect_reprs]
                    use_labels, use_tfidf = aspect_labels, tfidf

                before = sum(1 for n in nodes if n.get('type') == 'aggregation')
                effective_depth = max(0, min(depth_left, max_depth))
                _recurse(vi_sub, use_reprs, use_labels, use_tfidf,
                         parent, max(0, max_depth - effective_depth), group_path,
                         aspect_method_tag=aspect_method_tag)
                after = sum(1 for n in nodes if n.get('type') == 'aggregation')
                return max(0, after - before)

            def _post_split(vi_grp, p, d):
                """Callback handed to the role/slot builders for residual splitting."""
                return _aspect_recurse_for_group(vi_grp, p, d, is_top_level=False)

            local_texts = [texts[vi] for vi in vi_list]
            _top_grp = group_path.split(' > ')[0].strip()
            depth_budget = max(2, max_depth - existing_depth)
            post_split_min = max(6, min_cluster_size + 4)

            role_built = False
            role_regularity = None
            route_label = None

            # ── ROUTING 1A: PER-ROW LLM ROLE EXTRACTION (Zhu et al. EMNLP 2025) ──
            # One extractor call per row proposes role values taken from the
            # description text; bypasses SBERT phrase clustering entirely.
            if (use_per_row_role_extraction
                    and per_row_role_extractor_fn is not None
                    and len(vi_list) >= 4):
                try:
                    role_result = discover_roles_via_per_row_extraction(
                        local_texts,
                        text_col_names=text_col_names,
                        per_row_extractor_fn=per_row_role_extractor_fn,
                        group_name=_top_grp,
                        regularity_threshold=role_regularity_threshold,
                    )
                    role_regularity = role_result.get('regularity')
                    # Always capture audit data, even when valid=False, so the
                    # UI can show what was proposed and why it was rejected.
                    # group_path + vi_list let the display map each audit row
                    # back to its variable.
                    try:
                        st.session_state.a2_per_row_audit.append({
                            'group_name': _top_grp,
                            'group_path': group_path,
                            'vi_list': [int(v) for v in vi_list],
                            'n_rows': len(vi_list),
                            'coverage': role_result.get('coverage'),
                            'valid': role_result.get('valid'),
                            'roles_final': list(role_result.get('roles', {}).keys()),
                            'summary': role_result.get('extractor_summary', {}),
                            'per_row_audit': role_result.get('per_row_audit', []),
                        })
                    except Exception:
                        # Audit capture is optional (no Streamlit session, or
                        # the audit list was never initialised).
                        pass
                    if role_result['valid']:
                        role_built = build_role_hierarchy(
                            vi_list, role_result, can,
                            pid, group_path, nodes, node_map, var_nodes,
                            max_depth_remaining=depth_budget,
                            post_split_fn=_post_split,
                            min_post_split_size=post_split_min)
                        if role_built:
                            route_label = 'per_row_llm_extraction'
                except Exception:
                    role_built = False

            # ── ROUTING 1B: SBERT phrase clustering fallback ──
            if not role_built and (use_role_decomposition and sbert_model is not None
                                   and len(vi_list) >= 6):
                try:
                    role_result = discover_roles_via_sbert_phrase_clustering(
                        local_texts,
                        text_col_names=text_col_names,
                        sbert_model=sbert_model,
                        llm_role_namer_fn=llm_role_namer_fn,
                        group_name=_top_grp,
                        regularity_threshold=role_regularity_threshold,
                    )
                    role_regularity = role_result.get('regularity')
                    if role_result['valid']:
                        role_built = build_role_hierarchy(
                            vi_list, role_result, can,
                            pid, group_path, nodes, node_map, var_nodes,
                            max_depth_remaining=depth_budget,
                            post_split_fn=_post_split,
                            min_post_split_size=post_split_min)
                        if role_built:
                            route_label = 'sbert_phrase_clustering'
                except Exception:
                    role_built = False
            if role_built:
                anchor = node_map.get(pid)
                if anchor is not None:
                    anchor.setdefault('structure_provenance', {})
                    anchor['structure_provenance']['phrase_regularity'] = role_regularity
                    anchor['structure_provenance']['route_used'] = route_label
                continue

            # ── ROUTING 2: phrase-slot mining (IE / mutual-exclusion) ──
            slot_built = False
            if use_slot_mining and len(vi_list) >= 6:
                try:
                    slot_result = mine_phrase_slots(
                        local_texts,
                        text_col_names=text_col_names,
                        group_name=_top_grp,
                        llm_role_classifier_fn=llm_role_classifier_fn,
                    )
                    if slot_result['valid']:
                        slot_built = build_slot_hierarchy(
                            vi_list, slot_result, can,
                            pid, group_path, nodes, node_map, var_nodes,
                            max_depth_remaining=depth_budget,
                            post_slot_split_fn=_post_split,
                            min_post_slot_size=post_split_min)
                except Exception:
                    slot_built = False
            if slot_built:
                anchor = node_map.get(pid)
                if anchor is not None:
                    anchor.setdefault('structure_provenance', {})
                    anchor['structure_provenance']['phrase_regularity'] = role_regularity
                    anchor['structure_provenance']['route_used'] = 'slot_mining_fallback'
                continue

            # ── ROUTING 3: aspect discovery (FASTopic / NMF) ──
            anchor = node_map.get(pid)
            if anchor is not None:
                anchor.setdefault('structure_provenance', {})
                anchor['structure_provenance']['phrase_regularity'] = role_regularity
                anchor['structure_provenance']['route_used'] = 'aspect_clustering_fallback'
            _aspect_recurse_for_group(vi_list, pid, is_top_level=True)

        # Clear the progress bar
        try:
            if _show_progress and _pbar is not None:
                _pbar.empty()
        except Exception:
            pass

    # ── POST-BUILD PASS 1 — sibling common-prefix factoring [Castanet 2007] ──
    try:
        n_factored = factor_sibling_common_prefixes(
            nodes, min_siblings=3, min_prefix_tokens=2)
    except Exception:
        n_factored = 0

    # ── POST-BUILD PASS 2 — conservative noise-label pruning ──────────────────
    try:
        n_dissolved = prune_low_quality_aggregations(nodes)
    except Exception:
        n_dissolved = 0

    # ── POST-BUILD PASS 3 — strip group prefix from child labels ──────────────
    # 'DMS — Total Errors' under DMS → 'Total Errors'.  Reduces visual
    # redundancy without altering tree structure.
    try:
        n_stripped = strip_group_prefix_from_labels(nodes)
    except Exception:
        n_stripped = 0

    # ── POST-BUILD PASS 4 — enforce single parent (collapse DAG → tree) ───────
    # Keeps each variable under one parent so the hierarchy is a true tree,
    # matching the viewer/evaluator assumptions.
    try:
        n_reparented = enforce_single_parent(nodes)
    except Exception:
        n_reparented = 0

    # ── POST-BUILD PASS 5 — drop aggregation nodes left childless by PASS 4 ───
    try:
        n_empty_pruned = prune_empty_aggregations(nodes)
    except Exception:
        n_empty_pruned = 0

    # Annotate the root with post-build statistics
    if nodes and nodes[0].get('type') == 'root':
        nodes[0]['post_build_stats'] = {
            'sibling_factor_nodes_inserted': int(n_factored),
            'low_quality_nodes_dissolved': int(n_dissolved),
            'group_prefix_labels_stripped': int(n_stripped),
            'dag_links_removed': int(n_reparented),
            'empty_aggregations_pruned': int(n_empty_pruned),
        }

    # Deduplicate children (order-preserving) and normalise ids to int
    for nd in nodes:
        nd['related'] = list(dict.fromkeys(int(x) for x in nd['related']))

    return nodes
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STEP 9 — EVALUATION [ZHU Table 2 / TraCo / TICL §3.4]
+# ──────────────────────────────────────────────────────────────────────────────
def purity_score(true_labels, pred_labels) -> float:
    """
    Cluster purity: the fraction of items whose true label equals the
    majority true label of the predicted cluster they were assigned to.

    Labels may be of any type NumPy can sort and compare (non-negative or
    negative integers, strings, ...); they are counted as distinct values
    rather than being cast to int.

    Parameters
    ----------
    true_labels : sequence of reference labels.
    pred_labels : sequence of predicted cluster ids, same length.

    Returns
    -------
    float in [0, 1]; 0.0 for empty input.

    Raises
    ------
    ValueError : if the two sequences differ in length.
    """
    true = np.asarray(true_labels)
    pred = np.asarray(pred_labels)
    total = len(true)
    if total == 0:
        return 0.0
    if len(pred) != total:
        raise ValueError(
            f'true_labels and pred_labels differ in length: {total} vs {len(pred)}')

    matched = 0
    for cl in np.unique(pred):
        members = true[pred == cl]
        if members.size == 0:
            # Only reachable for values that do not compare equal to
            # themselves (e.g. NaN cluster ids).
            continue
        # Size of the majority true class inside this predicted cluster.
        _, counts = np.unique(members, return_counts=True)
        matched += int(counts.max())
    return matched / total
+
def evaluate(true_labels, pred_labels) -> dict:
    """
    Score a flat clustering against reference labels.

    The reference labels are stringified and label-encoded; the predicted
    labels are converted to an integer array.  Returns a dict with the keys
    'NMI', 'ARI' and 'Purity', each rounded to 4 decimals.
    """
    truth_as_text = [str(label) for label in true_labels]
    truth_codes = LabelEncoder().fit_transform(truth_as_text)
    pred_codes = np.array(pred_labels, dtype=int)

    nmi = float(normalized_mutual_info_score(truth_codes, pred_codes))
    ari = float(adjusted_rand_score(truth_codes, pred_codes))
    purity = purity_score(truth_codes, pred_codes)

    scores = {}
    scores['NMI'] = round(nmi, 4)
    scores['ARI'] = round(ari, 4)
    scores['Purity'] = round(purity, 4)
    return scores
+
def hierarchy_quality_metrics(nodes: list, tfidf: TfidfVectorizer) -> dict:
    """
    Compute lightweight structural quality metrics inspired by TraCo (AAAI 2024).

    [TraCo] Wu et al. identify three failure modes in hierarchical topic models:
    low affinity (children unrelated to parent), low rationality (children not
    more specific than parent), and low diversity (sibling topics too similar).

    For every aggregation node that has at least two aggregation children
    (children named 'Group' are ignored) the labels are vectorised with
    `tfidf` and two scores are taken:
      • parent-child coherence: mean cosine(parent_label, child_label)
        — proxy for affinity.
      • sibling diversity: 1 − mean cosine over all ordered pairs of
        DISTINCT siblings — proxy for diversity.  Self-similarities on the
        diagonal are excluded from the average.

    Nodes for which vectorisation or similarity computation raises are
    skipped (best-effort metric).

    Returns
    -------
    dict with
      'mean_parent_child_coherence' : mean of the per-node coherence scores
                                      (0.0 when no node qualified),
      'mean_sibling_diversity'      : mean of the per-node diversity scores
                                      (0.0 when no node qualified),
      'n_aggregation_nodes'         : number of aggregation nodes,
      'n_singleton_splits'          : aggregation nodes with exactly one child.
    """
    # Normalise ids to int so lookups agree with the other tree helpers,
    # which all index nodes by int(id).
    node_map = {int(n['id']): n for n in nodes}
    aggregations = [n for n in nodes if n.get('type') == 'aggregation']
    coherence_scores, diversity_scores = [], []

    for n in aggregations:
        children = [node_map[int(c)] for c in n.get('related', [])
                    if int(c) in node_map]
        child_labels = [c.get('name', '') for c in children
                        if c.get('type') == 'aggregation'
                        and c.get('name') != 'Group']
        if len(child_labels) < 2:
            continue

        try:
            parent_vec = tfidf.transform([n.get('name', '')])
            child_vecs = tfidf.transform(child_labels)
            # Parent-child coherence
            coh = float(cosine_similarity(parent_vec, child_vecs).mean())
            # Sibling diversity: average over the k*(k-1) off-diagonal
            # entries only; averaging over all k*k entries would count the
            # zeroed diagonal and overstate diversity.
            sib_sims = cosine_similarity(child_vecs)
            k = sib_sims.shape[0]
            np.fill_diagonal(sib_sims, 0)
            div = float(1 - sib_sims.sum() / (k * (k - 1)))
        except Exception:
            # Best-effort: a label the vectorizer cannot handle must not
            # abort the whole diagnostic.
            continue
        coherence_scores.append(coh)
        diversity_scores.append(div)

    return {
        'mean_parent_child_coherence':
            round(float(np.mean(coherence_scores)), 4) if coherence_scores else 0.0,
        'mean_sibling_diversity':
            round(float(np.mean(diversity_scores)), 4) if diversity_scores else 0.0,
        'n_aggregation_nodes': len(aggregations),
        'n_singleton_splits': sum(1 for n in aggregations
                                  if len(n.get('related', [])) == 1),
    }
+
+# ──────────────────────────────────────────────────────────────────────────────
+# DISPLAY-TIME ONE-CHILD CHAIN COMPRESSION (visualization only — structure preserved in JSON)
+# ──────────────────────────────────────────────────────────────────────────────
def compress_one_child_chains(nodes: list) -> list:
    """
    Display-only transformation: collapse chains where an aggregation node has
    exactly one child and that child is itself an aggregation node
    (e.g. 'DMS → DMS Recommended Standard').

    The merged node keeps the PARENT's id, takes every other field from the
    child (including its children), and is named 'parent / child'; the
    descriptions are joined with ' | '.  Any remaining reference to the
    removed child id is re-pointed to the merged node.  Merging repeats
    until no chain link is left, so longer chains collapse into one node.

    Dissolved/hidden nodes are dropped first.  The input node dicts are not
    modified: the function works on shallow copies and only ever REPLACES
    their 'related' list — nested values are shared with the input.

    Returns a new node list intended for visualisation only.
    """
    nodes = _filter_dissolved(nodes)
    nm = {int(n['id']): dict(n) for n in nodes}   # shallow per-node copies

    def _is_chain_link(n):
        if n.get('type') != 'aggregation':
            return False
        children = n.get('related', [])
        return (len(children) == 1
                and nm.get(int(children[0]), {}).get('type') == 'aggregation')

    while True:
        # Rescan from the start after every merge: a merge can create a new
        # chain link at the same id.
        link_id = next((nid for nid, n in nm.items() if _is_chain_link(n)), None)
        if link_id is None:
            break

        parent = nm[link_id]
        child_id = int(parent['related'][0])
        child = nm[child_id]

        # Merge: child takes parent's id and slot, with a combined name.
        merged = dict(child)
        merged['id'] = link_id
        merged['name'] = f"{parent['name']} / {child['name']}"
        merged['desc'] = f"{parent.get('desc','')} | {child.get('desc','')}"
        nm[link_id] = merged

        # Remove the original child node
        if child_id in nm:
            del nm[child_id]

        # Re-point any references to child_id → link_id
        for other in nm.values():
            other['related'] = [link_id if int(c) == child_id else int(c)
                                for c in other.get('related', [])]

    return list(nm.values())
+
+# ──────────────────────────────────────────────────────────────────────────────
+# VISUALISATION HELPERS
+# ──────────────────────────────────────────────────────────────────────────────
+def _filter_dissolved(nodes: list) -> list:
+ """
+ Drop dissolved/hidden nodes for visualisation. Children of dissolved
+ nodes were already promoted to the grandparent by the pruning pass, so
+ dropping the dissolved wrapper here makes the tree render cleanly.
+ """
+ drop_ids = {int(n['id']) for n in nodes
+ if n.get('type') == 'dissolved' or n.get('isShown') is False}
+ if not drop_ids:
+ return nodes
+ out = []
+ for n in nodes:
+ if int(n['id']) in drop_ids:
+ continue
+ m = dict(n)
+ m['related'] = [int(c) for c in n.get('related', [])
+ if int(c) not in drop_ids]
+ out.append(m)
+ return out
+
+def _leaf_ids(nodes: list, nid: int) -> list:
+ m = {int(n['id']): n for n in nodes}
+ out = []
+ def rec(x):
+ n = m.get(int(x))
+ if not n: return
+ if n.get('type') == 'attribute': out.append(int(x)); return
+ for c in n.get('related', []): rec(int(c))
+ rec(nid)
+ return list(dict.fromkeys(out))
+
+def _parent_map(nodes: list) -> dict:
+ pm = {}
+ for n in nodes:
+ for c in n.get('related', []):
+ if int(c) not in pm:
+ pm[int(c)] = int(n['id'])
+ return pm
+
+def _tree_value_map(nodes: list, pm: dict) -> dict:
+ """
+ Leaf count per node measured along the *rendered* tree (each node has
+ exactly one parent, per `pm`). Plotly draws sectors using that same
+ single-parent structure, so values built this way always satisfy
+ parent == sum(children) — which is what branchvalues='total' requires.
+
+ The full hierarchy can be a DAG (a variable promoted under more than one
+ role branch), in which case `_leaf_ids` double-counts a shared leaf and a
+ parent's unique-leaf count comes out *less* than the sum of its children's
+ counts. Feeding those numbers to a 'total' chart blanks it. Counting on
+ the rendered tree instead avoids that without changing the hierarchy.
+ """
+ kids = {}
+ for child, par in pm.items():
+ kids.setdefault(int(par), []).append(int(child))
+ nodemap = {int(n['id']): n for n in nodes}
+ memo = {}
+ def count(nid: int) -> int:
+ if nid in memo:
+ return memo[nid]
+ memo[nid] = 1 # guard against cycles while recursing
+ n = nodemap.get(nid)
+ if n is not None and n.get('type') == 'attribute':
+ memo[nid] = 1
+ return 1
+ ch = kids.get(nid, [])
+ v = sum(count(c) for c in ch) if ch else 1
+ memo[nid] = max(1, v)
+ return memo[nid]
+ return {nid: count(nid) for nid in nodemap}
+
+def _wrap_hover(text: str, width: int = 80) -> str:
+ """Soft-wrap long descriptions onto multiple
-separated lines so the
+ Plotly hover tooltip shows the full text instead of being cut off."""
+ import textwrap as _tw
+ s = str(text or '')
+ if not s:
+ return ''
+ lines = []
+ for raw_line in s.split('\n'):
+ lines.extend(_tw.wrap(raw_line, width=width) or [''])
+ return '
'.join(lines)
+
def plot_sunburst(nodes: list, max_depth: int = 4):
    """
    Render the hierarchy as a Plotly sunburst figure.

    Dissolved/hidden nodes are dropped first.  Each remaining node becomes
    one sector whose parent is its first parent in the node list; a non-root
    node that nobody lists as a child is drawn directly under the root
    (id 0).  Sector values are leaf counts measured on that same single-
    parent tree, so the root's value includes such re-attached nodes — the
    chart uses branchvalues='total', which goes blank when a parent's value
    is smaller than the sum of its children.

    The hover text shows name, type, the number of variables reachable from
    the node, and the wrapped description.  Labels are cut to 40 characters.

    Parameters
    ----------
    nodes : flat node list; node id 0 is treated as the root.
    max_depth : number of levels shown at once (Plotly `maxdepth`).

    Returns the Plotly figure.
    """
    nodes = _filter_dissolved(nodes)
    pm = dict(_parent_map(nodes))
    # Make the parent map match what is drawn: parentless non-root nodes are
    # shown under the root, so they must also be counted under the root.
    for n in nodes:
        nid = int(n['id'])
        if nid != 0:
            pm.setdefault(nid, 0)
    vm = _tree_value_map(nodes, pm)

    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n['id'])
        lc = len(_leaf_ids(nodes, nid))
        ids.append(str(nid))
        labels.append(str(n.get('name', ''))[:40])
        parents.append('' if nid == 0 else str(pm.get(nid, 0)))
        values.append(vm.get(nid, 1))
        hover.append(f"{n.get('name','')}<br>Type: {n.get('type','')}<br>"
                     f"Variables: {lc}<br>{_wrap_hover(n.get('desc',''))}")
    fig = go.Figure(go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        maxdepth=max_depth, insidetextorientation='radial',
        marker=dict(colorscale='Viridis', line=dict(width=1, color='white'))))
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
                      title=dict(text='Click sector to drill down — click centre to go back',
                                 font=dict(size=13), x=0.5))
    return fig
+
def plot_treemap(nodes: list):
    """
    Render the hierarchy as a Plotly treemap figure.

    Dissolved/hidden nodes are dropped first.  Each remaining node becomes
    one tile whose parent is its first parent in the node list; a non-root
    node that nobody lists as a child is drawn directly under the root
    (id 0).  Tile values are leaf counts measured on that same single-parent
    tree, so the root's value includes such re-attached nodes — the chart
    uses branchvalues='total', which goes blank when a parent's value is
    smaller than the sum of its children.

    The hover text shows name, the number of variables reachable from the
    node, and the wrapped description.  Labels are cut to 40 characters.

    Returns the Plotly figure.
    """
    nodes = _filter_dissolved(nodes)
    pm = dict(_parent_map(nodes))
    # Make the parent map match what is drawn: parentless non-root nodes are
    # shown under the root, so they must also be counted under the root.
    for n in nodes:
        nid = int(n['id'])
        if nid != 0:
            pm.setdefault(nid, 0)
    vm = _tree_value_map(nodes, pm)

    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n['id'])
        lc = len(_leaf_ids(nodes, nid))
        ids.append(str(nid))
        labels.append(str(n.get('name', ''))[:40])
        parents.append('' if nid == 0 else str(pm.get(nid, 0)))
        values.append(vm.get(nid, 1))
        hover.append(f"{n.get('name','')}<br>Variables: {lc}<br>"
                     f"{_wrap_hover(n.get('desc',''))}")
    fig = go.Figure(go.Treemap(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        textinfo='label+value',
        marker=dict(colorscale='Viridis', line=dict(width=1, color='white'))))
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
    return fig
+
+# ──────────────────────────────────────────────────────────────────────────────
+# NODE-LINK TREE — Reingold-Tilford layout (matches Approach 1.1 interface)
+# ──────────────────────────────────────────────────────────────────────────────
+def _a2_node_color(n: dict) -> str:
+ t = n.get('type', '')
+ if t == 'root': return '#c44e52'
+ if t == 'attribute': return '#4C72B0'
+ if t == 'collapsed': return '#bbbbbb'
+ return '#8C8C8C'
+
+def _display_graph(nodes: list, max_depth: int = 4, show_hidden: bool = False):
+ """Walk tree to chosen depth, inserting 'collapsed' placeholders for cut-off branches."""
+ m = {int(n['id']): n for n in nodes}
+ dnodes: dict = {}
+ edges: list = []
+ counter = 10 ** 9
+
+ def rec(nid, depth):
+ nonlocal counter
+ n = m.get(int(nid))
+ if not n:
+ return
+ if not show_hidden and n.get('isShown') is False and depth > 0:
+ return
+ dnodes[int(nid)] = n
+ if depth >= max_depth and n.get('related'):
+ counter += 1
+ cid = counter
+ n_leaves = len(_leaf_ids(nodes, nid))
+ dnodes[cid] = {'id': cid,
+ 'name': f'… {n_leaves} variables',
+ 'type': 'collapsed', 'dtype': 'determine',
+ 'related': [], 'desc': f"Collapsed: {n.get('name')}",
+ 'isShown': True}
+ edges.append((int(nid), cid))
+ return
+ for c in n.get('related', []):
+ ch = m.get(int(c))
+ if not ch:
+ continue
+ if not show_hidden and ch.get('isShown') is False:
+ continue
+ edges.append((int(nid), int(c)))
+ rec(int(c), depth + 1)
+
+ rec(0, 0)
+ return list(dnodes.values()), edges
+
+def _positions(dnodes: list, edges: list):
+ """Reingold-Tilford style positions: x=depth, y=subtree-aware vertical."""
+ H_SCALE = 3.0
+ V_SPACE = 1.8
+ children: dict = defaultdict(list)
+ for p, c in edges:
+ children[p].append(c)
+ pos: dict = {}
+ counter = {'v': 0}
+
+ def rec(nid, depth):
+ ch = children.get(nid, [])
+ if not ch:
+ y_pos = counter['v'] * V_SPACE
+ counter['v'] += 1
+ pos[nid] = (depth * H_SCALE, y_pos)
+ return y_pos
+ child_ys = [rec(c, depth + 1) for c in ch]
+ y_pos = float(np.mean(child_ys))
+ pos[nid] = (depth * H_SCALE, y_pos)
+ return y_pos
+
+ rec(0, 0)
+ return pos
+
+def plot_node_link(nodes: list, max_depth: int = 4,
+                   show_hidden: bool = False, show_leaf_labels: bool = False):
+    """
+    Node-link tree with elbow edges (matches Approach 1.1 layout).
+    Best for exploring structure at moderate depth — Sunburst remains
+    recommended for large hierarchies per Taxonomizer (Bian et al. 2020).
+
+    Args:
+        nodes: hierarchy node dicts; passed through `_filter_dissolved` first.
+        max_depth: forwarded to `_display_graph`, which replaces deeper
+            subtrees with a single 'collapsed' placeholder node.
+        show_hidden: forwarded to `_display_graph`; when False, children whose
+            `isShown` is False are left out.
+        show_leaf_labels: when True, non-aggregation markers get a text label
+            (first 40 characters of the node name).
+
+    Returns:
+        The plotly Figure holding the edge trace and up to two marker traces.
+    """
+    nodes = _filter_dissolved(nodes)
+    dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
+    pos = _positions(dnodes, edges)
+
+    # Elbow edges: horizontal → vertical → horizontal through the midpoint x.
+    # The trailing None ends the polyline so consecutive edges are not joined.
+    ex, ey = [], []
+    for p, c in edges:
+        if p not in pos or c not in pos:
+            continue
+        x0, y0 = pos[p]
+        x1, y1 = pos[c]
+        xm = (x0 + x1) / 2
+        ex += [x0, xm, xm, x1, None]
+        ey += [y0, y0, y1, y1, None]
+    traces = [go.Scatter(x=ex, y=ey, mode='lines',
+                         line=dict(width=1, color='#c8c8c8'),
+                         hoverinfo='skip', showlegend=False)]
+
+    agg_xs, agg_ys, agg_labels, agg_colors, agg_hover = [], [], [], [], []
+    lf_xs, lf_ys, lf_labels, lf_colors, lf_hover = [], [], [], [], []
+
+    for n in dnodes:
+        nid = int(n['id'])
+        if nid not in pos:
+            continue
+        x, y = pos[nid]
+        lc = len(_leaf_ids(nodes, nid))
+        lab = n.get('name', str(nid))
+        # Hover text is rendered as HTML by plotly, so lines are separated
+        # with <br>; a raw line break inside the literal does not parse.
+        htxt = (f"{n.get('name','')}<br>"
+                f"Type: {n.get('type','')}<br>"
+                f"Variables: {lc}<br>{_wrap_hover(n.get('desc',''))}")
+        col = _a2_node_color(n)
+
+        if n.get('type') in ('root', 'aggregation', 'collapsed'):
+            display_lab = (lab + (f' ({lc})' if lc else ''))[:50]
+            agg_xs.append(x); agg_ys.append(y)
+            agg_labels.append(display_lab); agg_colors.append(col); agg_hover.append(htxt)
+        else:
+            display_lab = lab[:40] if show_leaf_labels else ''
+            lf_xs.append(x); lf_ys.append(y)
+            lf_labels.append(display_lab); lf_colors.append(col); lf_hover.append(htxt)
+
+    if agg_xs:
+        traces.append(go.Scatter(
+            x=agg_xs, y=agg_ys, mode='markers+text',
+            text=agg_labels, textposition='middle right',
+            hovertext=agg_hover, hoverinfo='text',
+            marker=dict(size=16, color=agg_colors,
+                        line=dict(color='white', width=2)),
+            showlegend=False))
+    if lf_xs:
+        traces.append(go.Scatter(
+            x=lf_xs, y=lf_ys, mode='markers+text',
+            text=lf_labels, textposition='middle right',
+            hovertext=lf_hover, hoverinfo='text',
+            marker=dict(size=7, color=lf_colors, symbol='circle',
+                        opacity=0.75, line=dict(color='white', width=1)),
+            showlegend=False))
+
+    # `_positions` gives one vertical row to every node without outgoing
+    # display edges — attribute leaves, but also collapsed placeholders and
+    # childless aggregation nodes. Size the figure from those rows; counting
+    # attribute markers alone squeezes a mostly-collapsed tree into the
+    # minimum height.
+    parents = {p for p, _ in edges}
+    n_rows = max(12, sum(1 for nid in pos if nid not in parents))
+    fig = go.Figure(traces)
+    fig.update_layout(
+        height=max(700, min(4000, int(n_rows * 32))),
+        margin=dict(l=20, r=220, t=40, b=20),
+        plot_bgcolor='white', paper_bgcolor='white',
+        xaxis=dict(visible=False, fixedrange=False),
+        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
+        dragmode='pan',
+        annotations=[dict(
+            text='Tip: Sunburst is better for large hierarchies [Taxonomizer 2020]',
+            xref='paper', yref='paper', x=0.0, y=1.01,
+            showarrow=False, font=dict(size=11, color='grey'), align='left')]
+    )
+    return fig
+
+# ──────────────────────────────────────────────────────────────────────────────
+# STREAMLIT APP
+# ──────────────────────────────────────────────────────────────────────────────
+st.set_page_config(layout='wide',
+                   page_title='Approach 2 — Multi-Aspect Hierarchy')
+st.title('Approach 2 — Role-Decomposed Metadata Hierarchy')
+st.caption('Group anchoring → LLM role extraction → role-nested LoD tree. '
+           'Full method details and citations in the Method tab.')
+
+# Session-state init: setdefault keeps whatever an earlier rerun stored and
+# only fills in a safe default when the key is missing, so it works on both
+# new and existing sessions.
+for _state_key, _state_default in (
+        ('a2_nodes', None),
+        ('a2_can', None),
+        ('a2_meta', {}),
+        ('a2_per_row_audit', []),  # list of group audits
+):
+    st.session_state.setdefault(_state_key, _state_default)
+
+# Local-LLM auto-detection: the Ollama server is probed once per script run.
+# OLLAMA_URL / OLLAMA_MODEL override the built-in defaults; a value that is
+# blank after stripping falls back to the default as well.
+_ollama_url = os.environ.get('OLLAMA_URL', OLLAMA_URL_DEFAULT).strip()
+if not _ollama_url:
+    _ollama_url = OLLAMA_URL_DEFAULT
+_ollama_model = os.environ.get('OLLAMA_MODEL', OLLAMA_MODEL_DEFAULT).strip()
+if not _ollama_model:
+    _ollama_model = OLLAMA_MODEL_DEFAULT
+_ollama_reachable = _ping_ollama(_ollama_url)
+
+# Groq detection: a non-empty GROQ_API_KEY env var makes the cloud option
+# available; GROQ_URL / GROQ_MODEL override the defaults the same way.
+_groq_url = os.environ.get('GROQ_URL', GROQ_URL_DEFAULT).strip()
+if not _groq_url:
+    _groq_url = GROQ_URL_DEFAULT
+_groq_model = os.environ.get('GROQ_MODEL', GROQ_MODEL_DEFAULT).strip()
+if not _groq_model:
+    _groq_model = GROQ_MODEL_DEFAULT
+_groq_key_env = os.environ.get('GROQ_API_KEY', '').strip()
+
+# Groq is pre-selected only when both a key and the client library are present.
+if _groq_key_env and _LLM_CLIENT_AVAILABLE:
+    _default_provider = 'groq'
+else:
+    _default_provider = 'ollama'
+_default_llm_on = (
+    (_ollama_reachable or bool(_groq_key_env)) and _LLM_CLIENT_AVAILABLE)
+
+# Sidebar: every control that parameterises the build. The names assigned
+# here (sliders, checkboxes, llm_* settings, project_name) are read by the
+# build and export sections further down.
+with st.sidebar:
+    st.header('1 · Input')
+    uploads = st.file_uploader('Metadata / data-dictionary file(s)',
+                               type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
+                               accept_multiple_files=True)
+
+    st.header('2 · Algorithm')
+    max_aspects = st.slider('Max aspects (K upper bound)', 3, 15, 6)
+    max_depth = st.slider('Max tree depth', 2, 10, 6)
+    min_cluster_sz = st.slider('Min variables per cluster', 1, 10, 2)
+    sil_thresh = st.slider('Silhouette threshold',
+                           0.01, 0.30,
+                           value=0.04 if not _SBERT_AVAILABLE else 0.05,
+                           step=0.01)
+    max_k_split = st.slider('Max child clusters per split', 2, 12, 5)
+    use_sbert = st.checkbox('SBERT embeddings',
+                            value=_SBERT_AVAILABLE,
+                            disabled=not _SBERT_AVAILABLE)
+    local_nmf = st.checkbox('Local NMF fallback', value=True)
+    min_local_nmf = st.slider('Min group size for NMF', 5, 30, 6)
+    use_slot_mining = st.checkbox('Phrase-slot mining', value=True)
+    use_fastopic = st.checkbox(
+        'FASTopic aspects',
+        value=_FASTOPIC_AVAILABLE,
+        disabled=not _FASTOPIC_AVAILABLE,
+    )
+    fastopic_min_sz = st.slider('Min group size for FASTopic', 6, 100, 40)
+    if not _FASTOPIC_AVAILABLE:
+        st.warning('FASTopic not installed — using NMF.')
+
+    st.header('3 · LLM provider')
+    provider_options = ['Ollama (local)', 'Groq (cloud)']
+    provider_default_idx = 1 if _default_provider == 'groq' else 0
+    provider_label = st.radio('Provider', provider_options,
+                              index=provider_default_idx, horizontal=True)
+    llm_provider = 'groq' if provider_label.startswith('Groq') else 'ollama'
+
+    if llm_provider == 'groq':
+        if not _LLM_CLIENT_AVAILABLE:
+            st.warning('`openai` package not installed.')
+        elif _groq_key_env:
+            st.success('GROQ_API_KEY detected → ready')
+        groq_key_in = st.text_input(
+            'Groq API key', value=_groq_key_env, type='password',
+            help='Free key at console.groq.com/keys, or set GROQ_API_KEY env var.')
+        groq_model_in = st.text_input('Groq model', value=_groq_model)
+        llm_base_url = _groq_url
+        llm_model = groq_model_in
+        llm_api_key = groq_key_in
+        llm_ready = bool(groq_key_in and _LLM_CLIENT_AVAILABLE)
+    else:
+        # Reserve the status slot above the inputs; it is filled once the
+        # URL the user actually entered is known.
+        _ollama_status = st.empty()
+        ollama_url_in = st.text_input('Ollama URL', value=_ollama_url)
+        ollama_model_in = st.text_input('Ollama model', value=_ollama_model)
+        _ollama_target = ollama_url_in.strip() or _ollama_url
+        # The start-up probe only covered the env/default URL. Re-probe when
+        # the user points the app at a different server so that readiness
+        # reflects the endpoint the build will call.
+        if _ollama_target == _ollama_url:
+            _ollama_ok = _ollama_reachable
+        else:
+            _ollama_ok = _ping_ollama(_ollama_target)
+        if not _LLM_CLIENT_AVAILABLE:
+            _ollama_status.warning('`openai` package not installed.')
+        elif _ollama_ok:
+            _ollama_status.success(f'Ollama reachable at `{_ollama_target}`')
+        else:
+            _ollama_status.warning(f'Ollama not reachable at `{_ollama_target}`.')
+        llm_base_url = _ollama_target
+        llm_model = ollama_model_in
+        llm_api_key = ''
+        llm_ready = bool(_LLM_CLIENT_AVAILABLE and _ollama_ok)
+
+    # All LLM-backed features default to ON when the provider is ready and
+    # are disabled (greyed out) when it is not.
+    st.header('4 · LLM features')
+    use_per_row_role_extraction = st.checkbox(
+        'Per-row role extraction (primary route)',
+        value=llm_ready,
+        disabled=not llm_ready,
+        help='One LLM call per variable extracts measure / statistic / '
+             'condition / subtype, grounded to the description text.'
+    )
+    use_llm = st.checkbox(
+        'LLM label refinement',
+        value=llm_ready,
+        disabled=not llm_ready,
+    )
+    use_llm_roles = st.checkbox(
+        'LLM phrase-role classifier (fallback)',
+        value=llm_ready,
+        disabled=not llm_ready,
+    )
+    use_role_decomposition = st.checkbox(
+        'SBERT phrase clustering (fallback)',
+        value=_SBERT_AVAILABLE and llm_ready,
+        disabled=not (_SBERT_AVAILABLE and llm_ready),
+    )
+    role_namer_constrained = st.checkbox(
+        'Constrained role vocabulary',
+        value=True,
+    )
+    role_regularity_threshold = st.slider(
+        'Min phrase regularity', 0.05, 0.80, 0.20, 0.05,
+    )
+
+    st.header('5 · Project')
+    project_name = st.text_input('Project name', value='project')
+
+# ── load and configure files ──────────────────────────────────────────────────
+if uploads:
+    import shutil
+    import tempfile
+    raw_by = {}
+    cfg_by = {}
+
+    st.subheader('Step 1 — Inspect metadata')
+    # Streamlit re-executes this script on every widget interaction, so the
+    # scratch directory is deleted as soon as the uploads have been parsed;
+    # otherwise every rerun would leave another copy of each file on disk.
+    # NOTE(review): assumes load_any returns data that no longer needs the
+    # file on disk — confirm if a lazy loader is ever introduced.
+    tmp = Path(tempfile.mkdtemp())
+    try:
+        for f in uploads:
+            p = tmp / safe_name(f.name)
+            p.write_bytes(f.getbuffer())
+            try:
+                df = load_any(p)
+                raw_by[f.name] = df
+                cfg_by[f.name] = detect_roles(df)
+                with st.expander(f'{f.name}', expanded=False):
+                    st.write(f'Rows: **{len(df):,}** Columns: **{len(df.columns)}**')
+                    st.dataframe(df.head(8), use_container_width=True)
+            except Exception as e:
+                # One unreadable file must not block the others.
+                st.error(f'Could not load {f.name}: {e}')
+    finally:
+        shutil.rmtree(tmp, ignore_errors=True)
+
+    st.subheader('Step 2 — Confirm column roles')
+    configs = {}
+    for name, df in raw_by.items():
+        cols = list(df.columns)
+        auto = cfg_by[name]
+        with st.expander(f'{name}', expanded=True):
+            c1, c2 = st.columns(2)
+            with c1:
+                leaf = st.multiselect('Leaf variable column(s)', cols,
+                                      default=[c for c in auto['leaf_cols'] if c in cols],
+                                      key=f'lf_{name}')
+                group = st.multiselect('Group / task column(s)', cols,
+                                       default=[c for c in auto['group_cols'] if c in cols],
+                                       key=f'gr_{name}')
+            with c2:
+                text = st.multiselect('Description column(s)', cols,
+                                      default=[c for c in auto['text_cols'] if c in cols],
+                                      key=f'tx_{name}')
+                meta = st.multiselect('Type / unit column(s)', cols,
+                                      default=[c for c in auto['meta_cols'] if c in cols],
+                                      key=f'mt_{name}')
+        configs[name] = {'leaf_cols': leaf, 'group_cols': group,
+                         'text_cols': text, 'meta_cols': meta}
+
+    if st.button('Build Approach 2 Hierarchy', type='primary'):
+        try:
+            # Clear per-build state left over from any previous build: both
+            # the audit list and the cached Evaluation-tab metrics describe
+            # the old hierarchy.
+            st.session_state.a2_per_row_audit = []
+            st.session_state.pop('a2_eval_cache', None)
+
+            if not raw_by:
+                raise ValueError('None of the uploaded files could be loaded.')
+
+            cans = [build_canonical(df, configs[name], name)
+                    for name, df in raw_by.items()]
+            can = pd.concat(cans, ignore_index=True)
+            st.session_state.a2_can = can
+            texts = can['_text'].fillna('').astype(str).tolist()
+
+            # Global NMF: H feeds the global per-aspect representations
+            # below, W is read by the Evaluation tab diagnostics.
+            with st.spinner('Step 3 — Discovering global aspects via NMF …'):
+                tfidf, X, nmf, W, H, K, alabels = discover_aspects(texts, max_aspects)
+            st.session_state.a2_meta = {
+                'K': K, 'aspect_labels': alabels, 'tfidf': tfidf, 'W': W, 'H': H
+            }
+            st.info(f'Global aspects discovered: **{K}** — {" | ".join(alabels)}')
+
+            sbert_model = None
+            if use_sbert and _SBERT_AVAILABLE:
+                with st.spinner('Loading SBERT model …'):
+                    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+            with st.spinner('Step 4 — Building global per-aspect representations …'):
+                reprs = per_aspect_representations(texts, H, tfidf, sbert_model)
+
+            # Build the LLM helper functions — same provider + endpoint for all.
+            llm_fn = None
+            if use_llm:
+                llm_fn = make_llm_label_fn(llm_base_url, llm_model,
+                                           provider=llm_provider,
+                                           api_key=llm_api_key)
+                if llm_fn is None:
+                    st.warning('LLM unreachable — deterministic labels only.')
+
+            llm_role_namer = None
+            if use_role_decomposition:
+                llm_role_namer = make_llm_role_namer_fn(
+                    llm_base_url, llm_model,
+                    constrained=role_namer_constrained,
+                    provider=llm_provider, api_key=llm_api_key)
+
+            llm_role_fn = None
+            if use_llm_roles:
+                llm_role_fn = make_llm_role_classifier_fn(
+                    llm_base_url, llm_model,
+                    provider=llm_provider, api_key=llm_api_key)
+
+            # Per-row role extractor (Zhu et al. EMNLP 2025) — primary route
+            per_row_extractor = None
+            if use_per_row_role_extraction:
+                per_row_extractor = make_per_row_role_extractor_fn(
+                    llm_base_url, llm_model,
+                    provider=llm_provider, api_key=llm_api_key)
+                if per_row_extractor:
+                    st.info(f'Per-row role extraction: **{llm_provider}** · `{llm_model}`')
+                else:
+                    st.warning('LLM unreachable for per-row role extraction.')
+
+            # Collect detected text-column names across all uploaded configs —
+            # used by phrase-slot mining to identify description-like fields.
+            all_text_cols: list = []
+            for cfg in configs.values():
+                for c in cfg.get('text_cols', []):
+                    if c not in all_text_cols:
+                        all_text_cols.append(c)
+
+            # Replace the module-level FIELD_NAME_NOISE with a noise set DERIVED
+            # from the actual detected column names — zero hardcoding. All
+            # downstream calls (label_cluster, _bigram_preferred_terms) read
+            # the module-level name so this swap propagates everywhere.
+            import sys as _sys
+            _sys.modules[__name__].FIELD_NAME_NOISE = build_field_noise(configs)
+
+            with st.spinner('Step 6 — Building group-anchored LoD tree '
+                            '(slot mining → FASTopic → NMF) …'):
+                nodes = build_dynamic_lod_tree(
+                    can, reprs, alabels, tfidf,
+                    max_depth=max_depth,
+                    min_cluster_size=min_cluster_sz,
+                    sil_threshold=sil_thresh,
+                    max_clusters_per_split=max_k_split,
+                    project=project_name,
+                    local_nmf=local_nmf,
+                    min_local_nmf_size=min_local_nmf,
+                    max_aspects=max_aspects,
+                    sbert_model=sbert_model,
+                    llm_label_fn=llm_fn,
+                    use_slot_mining=use_slot_mining,
+                    text_col_names=all_text_cols,
+                    use_fastopic=use_fastopic,
+                    fastopic_min_size=fastopic_min_sz,
+                    llm_role_classifier_fn=llm_role_fn,
+                    use_role_decomposition=use_role_decomposition,
+                    llm_role_namer_fn=llm_role_namer,
+                    role_regularity_threshold=role_regularity_threshold,
+                    per_row_role_extractor_fn=per_row_extractor,
+                    use_per_row_role_extraction=use_per_row_role_extraction,
+                )
+            st.session_state.a2_nodes = nodes
+
+            # Concise build summary — per-route node counts
+            route_counts: dict = Counter()
+            for n in nodes:
+                if n.get('type') == 'aggregation':
+                    route_counts[
+                        n.get('structure_provenance', {}).get('route', '—')] += 1
+            n_leaves = len([n for n in nodes if n.get('type') == 'attribute'])
+            n_internal = len([n for n in nodes if n.get('type') == 'aggregation'])
+            route_str = ' · '.join(f'{r}: {c}' for r, c in route_counts.most_common())
+            st.success(f'Done — {n_leaves} variables · {n_internal} internal nodes '
+                       f'({route_str})')
+
+            # If any LLM call hit a rate-limit (429), the model ran out of
+            # tokens — tell the user to switch model in the sidebar and rebuild.
+            # The check scans the reason keys of each group's audit summary.
+            ran_out = any(
+                ('RateLimit' in str(r) or '429' in str(r))
+                for a in (st.session_state.get('a2_per_row_audit') or [])
+                for r in (a.get('summary') or {})
+            )
+            if ran_out:
+                st.error(f'Ran out of tokens on `{llm_model}`. '
+                         f'Switch to another Groq model in the sidebar '
+                         f'(e.g. llama-3.1-8b-instant) and rebuild.')
+        except Exception as e:
+            # Top-level UI boundary: surface the failure instead of crashing the app.
+            st.error(f'Build failed: {e}')
+            import traceback; st.code(traceback.format_exc())
+
+# ── display ───────────────────────────────────────────────────────────────────
+# Robust session-state reads — use .get() so a partial/incomplete build that
+# wrote some keys but not others doesn't crash the display layer.
+nodes = st.session_state.get('a2_nodes')
+if nodes is None:
+    # Nothing has been built in this session yet: show the hint and end the
+    # script run here, so none of the tabs below are rendered.
+    st.info('Upload a metadata file and click **Build Approach 2 Hierarchy** to start.')
+    st.stop()
+
+can = st.session_state.get('a2_can')
+meta = st.session_state.get('a2_meta') or {}
+
+_tab_titles = ['LoD Tree', 'Evaluation', 'Role Decomposition',
+               'Label Provenance', 'Metadata', 'Export', 'Method']
+tabs = st.tabs(_tab_titles)
+
+with tabs[0]:
+    # ── View controls, placed above the chart (matches Approach 1.1) ──────────
+    vc1, vc2, vc3, vc4, vc5 = st.columns([2, 2, 1, 1, 1])
+    with vc1:
+        viz_mode = st.radio(
+            'View mode',
+            ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
+            horizontal=True, index=0,
+            help='Sunburst best for large hierarchies [Taxonomizer]. '
+                 'Node-link best for moderate depth structure inspection.'
+        )
+    with vc2:
+        depth_display = st.slider('Depth (Level of Detail)', 1, 8, 4, 1)
+    with vc3:
+        show_leaf_labels = st.checkbox('Leaf labels', value=False)
+    with vc4:
+        show_hidden = st.checkbox('Hidden nodes', value=False)
+    with vc5:
+        compress_chains = st.checkbox('Compress chains', value=True,
+                                      help='Merge one-child aggregation chains '
+                                           '(e.g. "DMS → DMS Recommended Standard") '
+                                           'for display. Export JSON keeps original structure.')
+    st.divider()
+
+    # Chain compression only affects what is drawn; `nodes` stays untouched.
+    display_nodes = nodes
+    if compress_chains:
+        display_nodes = compress_one_child_chains(nodes)
+
+    # Build exactly one figure for the selected view, then render it.
+    if viz_mode == 'Treemap':
+        _lod_fig = plot_treemap(display_nodes)
+    elif viz_mode == 'Sunburst (drill-down)':
+        _lod_fig = plot_sunburst(display_nodes, max_depth=depth_display)
+    else:
+        _lod_fig = plot_node_link(display_nodes, depth_display,
+                                  show_hidden, show_leaf_labels)
+    st.plotly_chart(_lod_fig, use_container_width=True)
+
+    # Headline counts come from the uncompressed hierarchy.
+    n_l = sum(1 for n in nodes if n.get('type') == 'attribute')
+    n_i = sum(1 for n in nodes if n.get('type') == 'aggregation')
+    pm = _parent_map(nodes)
+
+    def _node_depth(nid):
+        """Count the parent links between `nid` and a node that has no parent."""
+        steps = 0
+        while nid in pm:
+            nid = pm[nid]
+            steps += 1
+        return steps
+
+    max_d = max((_node_depth(n['id']) for n in nodes), default=0)
+    c1, c2, c3, c4 = st.columns(4)
+    c1.metric('Variables', n_l)
+    c2.metric('Internal nodes', n_i)
+    c3.metric('Global aspects', meta.get('K', '?'))
+    c4.metric('Max depth', max_d)
+
+with tabs[1]:
+    import hierarchy_eval as he
+
+    st.markdown('### Evaluation')
+    if can is None or meta.get('W') is None:
+        st.info('Run the builder first.')
+    else:
+        st.caption(
+            'No manually curated reference taxonomy is available for this experiment. '
+            'The primary metrics below are **reference-free** — they assess the hierarchy '
+            'itself. Group-column figures later are descriptive self-consistency checks only. '
+            'Identical definitions to the Baseline and Approach 1 apps make the primary '
+            'diagnostics directly comparable.'
+        )
+
+        # ── PRIMARY: reference-free hierarchy quality (compute on demand) ──────
+        # These use SBERT, which is slow to load. Computing them only on a button
+        # click keeps the tree, sliders and Save button instant.
+        st.markdown('#### Primary — reference-free hierarchy quality')
+        # Fingerprint of the hierarchy on display. It is stored next to the
+        # cached metrics so that numbers computed for an earlier build are
+        # never shown for a different tree.
+        _nodes_sig = hash(tuple((str(n.get('id')), str(n.get('name')))
+                                for n in nodes))
+        if st.button('▶Compute reference-free metrics', key='a2_eval_btn'):
+            with st.spinner('Computing reference-free metrics (loads SBERT once)…'):
+                tm = he.traco_metrics(nodes)
+                npmi = he.npmi_coherence(nodes, can['_text'].tolist())
+            st.session_state['a2_eval_cache'] = {
+                'tm': tm, 'npmi': npmi, 'nodes_sig': _nodes_sig}
+
+        _ev = st.session_state.get('a2_eval_cache')
+        if _ev and _ev.get('nodes_sig') != _nodes_sig:
+            _ev = None  # cache belongs to a previous hierarchy
+        if _ev:
+            tm, npmi = _ev['tm'], _ev['npmi']
+            p1, p2, p3 = st.columns(3)
+            p1.metric('Parent–child coherence', tm['pc_coherence'],
+                      help='TraCo (Wu et al., AAAI 2024). Children nest under parent theme.')
+            p2.metric('Sibling diversity', tm['sibling_diversity'],
+                      help='TraCo (Wu et al., AAAI 2024). Higher = distinct siblings; LOW = redundant.')
+            p3.metric('NPMI label coherence', npmi,
+                      help='Lau et al., EACL 2014. Label terms genuinely co-occur in the data.')
+            st.caption(f'Embedding backend: **{tm["encoder"]}**.')
+        else:
+            st.info('Click the button above to compute coherence / diversity / NPMI '
+                    '(takes a few seconds the first time while SBERT loads).')
+
+        # ── Label-quality proxies (interpretability) ──────────────────────────
+        st.markdown('#### Label quality *(interpretability — reference-free)*')
+        lq = he.label_quality(nodes)
+        l1, l2, l3 = st.columns(3)
+        l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%",
+                  help='% of internal labels that read as a real concept (short noun '
+                       'phrase, WordNet head) rather than a "/"-joined term fragment.')
+        l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%",
+                  help='% of internal labels duplicating a sibling label (lower is better).')
+        l3.metric('Avg label words', lq['avg_label_words'],
+                  help='Mean label length in words.')
+
+        # ── Structural statistics ─────────────────────────────────────────────
+        st.markdown('#### Structural statistics')
+        sm = he.structural_stats(nodes)
+        s1, s2, s3, s4, s5 = st.columns(5)
+        s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
+        s2.metric('Max leaf depth', sm['max_depth'])
+        s3.metric('Avg leaf depth', sm['avg_leaf_depth'])
+        s4.metric('Avg branching', sm['avg_branching_factor'])
+        s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%")
+
+        # ── Group-structure self-consistency (descriptive, NOT accuracy) ───────
+        st.markdown('#### Group-structure self-consistency *(descriptive — not accuracy)*')
+        st.caption(
+            'The group column is a **construction input** (group-anchored L1/L2), so this '
+            'only confirms the NMF aspect partition reflects its own input — expected high, '
+            'NOT a quality signal and NOT comparable across methods as an accuracy metric.'
+        )
+        # Reference labels: first segment of each ' > '-separated group path.
+        true_labels = can['_group'].apply(
+            lambda x: str(x).split(' > ')[0].strip()).tolist()
+        W = meta['W']
+        # Predicted label: the aspect with the largest weight in each row of W.
+        pred_nmf = np.argmax(W, axis=1).tolist()
+        metrics = evaluate(true_labels, pred_nmf)
+        st.metric('ARI (self-consistency)', metrics['ARI'])
+
+        # ── legacy global aspect table (diagnostic only) ─────────────────────
+        with st.expander('Legacy global NMF aspect table (diagnostic — not the '
+                         'primary result)'):
+            st.caption('Global NMF aspects are a coarse lexical fallback. The '
+                       'hierarchy is built from per-variable role extraction — '
+                       'see the Role Decomposition tab for the actual result.')
+            alabs = meta.get('aspect_labels', [])
+            # Tolerate a label list shorter than the number of W columns.
+            W_df = pd.DataFrame(
+                W, columns=[f'Aspect {k+1}: {(alabs[k] if k < len(alabs) else "")[:30]}'
+                            for k in range(W.shape[1])])
+            W_df.insert(0, 'Variable', can['_label'].tolist())
+            st.dataframe(W_df.round(4), use_container_width=True)
+
+with tabs[2]:
+    st.markdown('### Role decomposition')
+    st.caption('Each variable decomposed into measure / statistic / condition / subtype.')
+
+    if can is None:
+        st.info('Build a hierarchy first.')
+    else:
+        # ── Per-group summary ─────────────────────────────────────────────────
+        st.markdown('#### Per-group routing')
+        reg_rows = []
+        for n in nodes:
+            sp = n.get('structure_provenance', {})
+            if sp.get('route') == 'group_anchor' and 'phrase_regularity' in sp:
+                reg_rows.append({
+                    'Group': n.get('name', ''),
+                    'Regularity': sp.get('phrase_regularity'),
+                    'Route used': sp.get('route_used', '—'),
+                })
+        if reg_rows:
+            reg_df = pd.DataFrame(reg_rows).sort_values(
+                'Regularity', ascending=False, na_position='last')
+            st.dataframe(reg_df, use_container_width=True, hide_index=True)
+
+        # ── Per-variable role table ───────────────────────────────────────────
+        st.markdown('#### Per-variable role table')
+
+        # Primary source: raw per-row LLM extractions captured in the audit
+        # (complete — includes roles that didn't become tree levels, e.g.
+        # condition values skipped by singleton prevention).
+        # Fallback per variable: roles collected from tree-ancestor nodes.
+        audit_roles_by_vi: dict = {}
+        for a in (st.session_state.get('a2_per_row_audit') or []):
+            vi_list_a = a.get('vi_list') or []
+            for r in a.get('per_row_audit', []):
+                ridx = r.get('row_idx')
+                accepted = r.get('accepted_roles') or {}
+                # row_idx is a position inside the group; vi_list maps it to
+                # the variable's index in `can`.
+                if ridx is not None and ridx < len(vi_list_a) and accepted:
+                    audit_roles_by_vi[int(vi_list_a[ridx])] = accepted
+
+        # Tree-walk fallback (roles that became hierarchy levels)
+        node_map_disp = {int(n['id']): n for n in nodes}
+        parent_lookup: dict = {}
+        for n in nodes:
+            for c in n.get('related', []):
+                # First parent listing a child wins.
+                parent_lookup.setdefault(int(c), int(n['id']))
+
+        def _tree_roles_for_attr(node_id: int) -> dict:
+            """Collect {role: ancestor name} walking up from `node_id`; the nearest ancestor wins per role."""
+            roles_here: dict = {}
+            cur = node_id
+            while cur in parent_lookup:
+                cur = parent_lookup[cur]
+                cur_node = node_map_disp.get(cur)
+                if not cur_node:
+                    break
+                lp = cur_node.get('label_provenance', {})
+                role = (lp.get('role')
+                        or cur_node.get('structure_provenance', {}).get('slot_role'))
+                if role and role not in roles_here:
+                    roles_here[role] = cur_node.get('name', '')
+            return roles_here
+
+        role_rows = []
+        for vi, (_, crow) in enumerate(can.iterrows()):
+            # Audit roles win; tree roles fill any gaps
+            roles_here = dict(_tree_roles_for_attr(vi + 1))  # attribute ids = 1..n
+            for r, v in (audit_roles_by_vi.get(vi) or {}).items():
+                if v:
+                    roles_here[r] = v
+            row = {
+                'Group': str(crow.get('_group', '')).split(' > ')[0].strip(),
+                'Variable': str(crow.get('_label', '')),
+            }
+            for std_role in ('measure', 'statistic', 'condition',
+                             'subtype', 'outcome', 'modifier'):
+                row[std_role.title()] = roles_here.pop(std_role, '')
+            # Whatever is left after popping the standard roles is non-standard.
+            if roles_here:
+                row['Other roles'] = '; '.join(
+                    f'{r}: {v}' for r, v in roles_here.items())
+            role_rows.append(row)
+
+        if role_rows:
+            role_df = pd.DataFrame(role_rows)
+            st.dataframe(role_df, use_container_width=True, hide_index=True)
+            st.download_button(
+                'Download per-variable role CSV',
+                data=role_df.to_csv(index=False).encode('utf-8'),
+                file_name=f'{safe_name(project_name)}_approach2_role_decomposition.csv',
+                mime='text/csv',
+            )
+
+        # ── Per-row LLM extractor audit ───────────────────────────────────────
+        audits = st.session_state.get('a2_per_row_audit', []) or []
+        if audits:
+            st.markdown('#### Extraction audit')
+
+            # Summary table per group
+            sum_rows = []
+            for a in audits:
+                sum_rows.append({
+                    'Group': a.get('group_name', ''),
+                    'Rows': a.get('n_rows', 0),
+                    'Coverage': a.get('coverage'),
+                    'Valid': a.get('valid'),
+                    'Roles found': ', '.join(a.get('roles_final', []))[:60],
+                    'Reasons': ', '.join(f'{k}:{v}' for k, v in
+                                         (a.get('summary', {}) or {}).items()),
+                })
+            st.dataframe(pd.DataFrame(sum_rows), use_container_width=True,
+                         hide_index=True)
+
+            # Drill-down per group. The selection is made by position, not by
+            # name, so audits that share a group name (or lack one) each stay
+            # reachable.
+            grp_names = [a.get('group_name', '?') for a in audits]
+            sel_pos = st.selectbox(
+                'Drill into a group to see per-row proposals + rejections:',
+                range(len(audits)),
+                format_func=lambda i: grp_names[i])
+            sel_audit = audits[sel_pos]
+            sel_grp = grp_names[sel_pos]
+            if sel_audit:
+                max_shown = 60  # on-screen preview limit; the CSV is complete
+                row_rows = []
+                for r in sel_audit.get('per_row_audit', []):
+                    accepted = r.get('accepted_roles', {}) or {}
+                    rejected = r.get('rejected', []) or []
+                    row_rows.append({
+                        'Row #': r.get('row_idx', ''),
+                        'Description': r.get('description_snippet', ''),
+                        'Accepted': '; '.join(f'{k}={v}'
+                                              for k, v in accepted.items())[:140],
+                        'Rejected': '; '.join(
+                            f'{x[0]}={x[1]!r} (missing stems: {x[2]})'
+                            if isinstance(x, (list, tuple)) and len(x) >= 3
+                            else str(x) for x in rejected)[:200],
+                        'Reason': r.get('reason', ''),
+                    })
+                if row_rows:
+                    audit_df = pd.DataFrame(row_rows)
+                    st.dataframe(audit_df.head(max_shown),
+                                 use_container_width=True, hide_index=True)
+                    if len(audit_df) > max_shown:
+                        st.caption(f'Showing the first {max_shown} of '
+                                   f'{len(audit_df)} rows — the download '
+                                   f'contains all of them.')
+                    # Download as CSV for offline analysis (all rows, not
+                    # just the preview).
+                    csv_bytes = audit_df.to_csv(index=False).encode('utf-8')
+                    st.download_button(
+                        'Download per-row audit for this group',
+                        data=csv_bytes,
+                        file_name=f'{safe_name(project_name)}_audit_{safe_name(sel_grp)}.csv',
+                        mime='text/csv',
+                    )
+        else:
+            st.info('No role assignments recorded yet — Option D may have '
+                    'fallen back to slot mining or aspect clustering for all '
+                    'groups in this dataset.')
+
+with tabs[3]:
+    st.markdown('### Label provenance')
+    st.caption('Audit trail: which stage produced each node label.')
+    rows = []
+    for n in nodes:
+        if n.get('type') != 'aggregation':
+            continue
+        p = n.get('label_provenance', {})
+        s = n.get('structure_provenance', {})
+        # Provenance fields may be present but None; normalise before
+        # slicing / converting so one sparse node cannot break the tab.
+        _conf = p.get('confidence')
+        rows.append({
+            'Node': n.get('name', ''),
+            'Source': p.get('label_source', '—'),
+            'Route': s.get('route', '—'),
+            'Aspect method': s.get('aspect_method') or '—',
+            'Silhouette': s.get('silhouette') if s.get('silhouette') is not None else '—',
+            'LLM used': p.get('llm_used', False),
+            'LLM rejected': p.get('llm_rejected', False),
+            'LLM proposed': p.get('llm_raw_label', ''),
+            'LLM reason': str(p.get('llm_reason') or '')[:60],
+            'Confidence': round(float(1.0 if _conf is None else _conf), 3),
+            'Evidence': ', '.join(
+                str(t) for t in (p.get('evidence_terms') or []))[:120],
+        })
+    if not rows:
+        st.info('No internal nodes yet — build a hierarchy first.')
+    else:
+        prov_df = pd.DataFrame(rows)
+
+        # ── Labels by source ──────────────────────────────────────────────────
+        source_counts = prov_df['Source'].value_counts()
+        st.write('**Labels by source**')
+        # Between 2 and 5 columns; additional sources wrap around.
+        cols_src = st.columns(min(5, max(2, len(source_counts))))
+        for i, (src, cnt) in enumerate(source_counts.items()):
+            cols_src[i % len(cols_src)].metric(str(src), int(cnt))
+
+        # ── Structure routes ──────────────────────────────────────────────────
+        am_counts = prov_df['Aspect method'].value_counts()
+        st.write('**Structure routes used**')
+        cols_am = st.columns(min(5, max(2, len(am_counts))))
+        for i, (am, cnt) in enumerate(am_counts.items()):
+            cols_am[i % len(cols_am)].metric(str(am), int(cnt))
+
+        # ── LLM usage — split per-row extraction from the downstream refiner ──
+        # Per-row nodes are LLM-BUILT (source 'per_row_llm_role'); the refiner
+        # only renames deterministically-labeled nodes (source 'llm' when its
+        # proposal is accepted). Counting them together made the panel read
+        # "N calls, 0 accepted" even on a fully successful build.
+        n_per_row = int((prov_df['Source'] == 'per_row_llm_role').sum())
+        refiner_accepted = int((prov_df['Source'] == 'llm').sum())
+        # One mask drives both the counter and the detail table below, so the
+        # two always describe the same set of nodes.
+        refiner_rejected_mask = (
+            (prov_df['LLM rejected'] == True)  # noqa: E712
+            & (prov_df['Source'] != 'per_row_llm_role'))
+        refiner_rejected = int(refiner_rejected_mask.sum())
+        st.write('**LLM usage**')
+        cL1, cL2, cL3 = st.columns(3)
+        cL1.metric('Per-row extraction nodes', n_per_row)
+        cL2.metric('Refiner accepted', refiner_accepted)
+        cL3.metric('Refiner rejected', refiner_rejected)
+        if refiner_accepted == 0 and refiner_rejected == 0 and n_per_row > 0:
+            st.caption('Label refiner did not run — the tree was built entirely '
+                       'by per-row extraction, leaving no deterministic labels '
+                       'to refine.')
+        if refiner_rejected > 0:
+            with st.expander('Rejected refiner proposals'):
+                # Only rows that actually carry a proposed label are listed.
+                rej = prov_df[refiner_rejected_mask
+                              & (prov_df['LLM proposed'].astype(str).str.len() > 0)]
+                if len(rej):
+                    st.dataframe(rej[['Node', 'LLM proposed', 'LLM reason']],
+                                 use_container_width=True, hide_index=True)
+
+        # ── Full provenance table ─────────────────────────────────────────────
+        st.write('**Full per-node provenance**')
+        st.dataframe(prov_df, use_container_width=True, hide_index=True)
+
+with tabs[4]:
+    # Show the canonical table without its '_row' column (when present).
+    if can is not None:
+        _can_view = can.drop(columns=['_row'], errors='ignore')
+        st.dataframe(_can_view, use_container_width=True)
+
+with tabs[5]:
+    # ── derive a per-CSV base name from the uploaded files ────────────────────
+    # The '_source' values of the canonical table are reduced to their stems,
+    # sanitised, and joined with '+', so different uploads export under
+    # different names. The project name is the fallback.
+    csv_basis = ''
+    if can is not None and '_source' in can.columns:
+        bases = [safe_name(Path(str(src)).stem)
+                 for src in can['_source'].dropna().unique().tolist()]
+        csv_basis = '+'.join(bases) if bases else safe_name(project_name)
+    if not csv_basis:
+        csv_basis = safe_name(project_name)
+
+    lod_fname = f'{csv_basis}_approach2_lod.json'
+    _canon_fname = f'{csv_basis}_approach2_canonical.csv'
+    # Serialise the tree once; both the download and the save button use it.
+    _lod_json = json.dumps(nodes, indent=2, ensure_ascii=False) if nodes else None
+
+    st.caption(f'Filename basis: **{csv_basis}** '
+               f'(taken from the uploaded CSV — different CSVs export under different names)')
+
+    col1, col2 = st.columns(2)
+    with col1:
+        if nodes:
+            st.download_button(
+                'LoD tree JSON',
+                data=_lod_json.encode(),
+                file_name=lod_fname,
+                mime='application/json',
+                use_container_width=True,
+            )
+    with col2:
+        if can is not None:
+            st.download_button(
+                'Canonical CSV',
+                data=can.to_csv(index=False).encode('utf-8'),
+                file_name=_canon_fname,
+                mime='text/csv',
+                use_container_width=True,
+            )
+
+    st.divider()
+    # ── Save directly into the outputs/approach_2/ folder next to this file ────
+    _out_dir = Path(__file__).resolve().parent / 'outputs' / 'approach_2'
+    st.markdown('### Save to project folder')
+    st.caption(
+        'The download buttons above go to your browser’s Downloads folder (a browser '
+        f'restriction). This button instead writes the files into `{_out_dir}` with the '
+        'dataset name — convenient for `evaluate_all.py`.'
+    )
+    if st.button('Save all to outputs/approach_2/', type='primary',
+                 use_container_width=True):
+        try:
+            _out_dir.mkdir(parents=True, exist_ok=True)
+            saved = []
+            if nodes:
+                (_out_dir / lod_fname).write_text(_lod_json, encoding='utf-8')
+                saved.append(lod_fname)
+            if can is not None:
+                can.to_csv(_out_dir / _canon_fname, index=False)
+                saved.append(_canon_fname)
+            st.success(f'Saved to `{_out_dir}`:\n\n- ' + '\n- '.join(saved))
+        except Exception as _e:
+            # Report filesystem problems in the UI instead of aborting the run.
+            st.error(f'Could not save: {_e}')
+
+with tabs[6]:
+ st.markdown("""
+## Approach 2 — Role-Decomposed Hierarchy via SBERT Phrase Clustering
+### Option D primary route + slot mining + FASTopic + constrained LLM labels
+
+### Algorithm
+
+```
+Step 1 Build metadata text objects (variable name + description + group)
+ [GON §3] — Gonçalves et al. (2019)
+
+Step 2 Group-anchored L1/L2 structure [NEW]
+ — detected group columns → path nodes (no hardcoding)
+ — e.g. category > assessment, or task > variant
+ — falls back to global NMF if no groups detected
+
+Step 3a Phrase-slot mining (slot-first routing) [IE / slot induction]
+ — For each terminal group, attempt deterministic phrase-slot
+ decomposition of variable descriptions:
+ • extract concept-prefix per row
+ • mine repeated n-grams (1–4 tokens, ≥2 rows)
+ • compute mutual-exclusion across phrase pairs:
+ M[a,b] = 1 − cooc[a,b] / min(count[a], count[b])
+ • cluster phrases by mutual exclusion → slots
+ — A slot is a set of phrases that rarely co-occur within a row but
+ each co-occur with phrases from other slots (= alternatives at
+ the same semantic position).
+ — Activates only when slot structure is statistically strong:
+ • ≥ 2 slots discovered
+ • ≥ 55% row coverage
+ • each slot ≥ 2 distinct phrases
+ — No domain hardcoding: phrase content is discovered from the data;
+ slot names = the highest-coverage phrase in each slot.
+ — When valid, the local hierarchy is built directly from slot values.
+ — When invalid (free-form descriptions, e.g. parts of HCP), routing
+ falls through to aspect discovery (Step 3b: FASTopic, else NMF).
+
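+Step 3b Local aspect discovery per terminal group [FASTopic / ZHU §3.1 adapted]
+ — Fallback path when slot mining does not apply.
+ — FASTopic is the primary method; the NMF details below describe the
+ lexical fallback (small groups, or FASTopic / SBERT unavailable).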
+ — NMF runs inside each group, not across all variables
+ — prevents globally-dominant terms from polluting local aspects
+ — K selected by reconstruction-error elbow (deterministic)
+ — falls back to global embeddings for small groups (< min_local_nmf_size)
+
+Step 4 Per-aspect variable representations [ZHU §3.1]
+ — for each aspect k: filter text to top-T terms → SBERT or masked TF-IDF
+
+Step 5 Independent per-aspect GMM clustering [ZHU §3.2]
+ — GMM with diagonal covariance + BIC for stable k selection
+ — runs inside each group's aspect space
+
+Step 6 Simplified best-aspect split [ZHU §3.3 adapted]
+ — at each node: evaluate all K aspects by silhouette score
+ — highest silhouette → GMM split → child aggregation nodes
+ — NOTE: this is a silhouette-based greedy split, not the full
+ probabilistic search of Zhu et al. Eq. 6/7
+ — singleton prevention: 1-variable clusters attach directly (no wrapper node)
+
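+Step 5a UPSTREAM LLM phrase-role classification [TopicGPT, NAACL 2024 adapted]
+ — NOTE: numbered 5a but operates upstream, on the phrases mined in
+ Step 3a (alternative to the mutual-exclusion slot clustering).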
+ — One LLM call per terminal group:
+ input : (i) all repeated mined phrases (verbatim from the corpus)
+ (ii) 2–4 sample variable descriptions for context
+ (iii) the group name
+ output : {role_name: [phrases]} e.g. {measure: [...], statistic: [...],
+ condition: [...]}
+ — Anti-hallucination:
+ • every phrase in the returned roles MUST match an input phrase
+ verbatim (validator drops anything else)
+ • role names must be 1–2 generic English words (drops fancy/long names)
+ • ≥ 2 valid roles required; else falls back to mutual-exclusion
+ — Phrase ↔ role mapping drives the slot hierarchy in Step 6:
+ measure-like roles become outer levels, condition-like roles inner.
+ — STRUCTURAL use of the LLM (TopicGPT discovers topics). Distinct
+ from the downstream label-refiner step (TopicTag, see Step 7e).
+
+Step 7 Node labeling [ZHU §4.3 / TopicTag DocEng 2024]
+ a) description-prefix phrase shared by ≥60% of cluster
+ b) group-purity prefix: if ≥70% share one _group top-level value
+ c) data-driven boilerplate + FIELD_NAME_NOISE filter
+ d) bigram-preferred discriminative TF-IDF suffix
+ e) OPTIONAL downstream LLM refinement [TopicTag]:
+ — receives only evidence terms + parent path + sample descriptions
+ — strict grounding check: every label word must appear in evidence
+ — rejected proposals fall back to deterministic label
+ — provenance stored on each node (label_source, confidence, evidence_terms)
+
+Step 8 Evaluation [TraCo AAAI 2024 / Lau EACL 2014 / TICL §3.4]
+ — reference-free parent-child coherence and sibling diversity
+ — reference-free NPMI label coherence and label-quality proxies
+ — optional group-structure self-consistency, explicitly not manual-reference accuracy
+ — label and structure provenance audit tables
+```
+
+### Key design decisions
+
+| Decision | Rationale |
+|---|---|
+| FASTopic replaces NMF as primary aspect discovery | NMF (1999) is lexical only; FASTopic (NeurIPS 2024) uses pretrained Transformer + Dual Semantic-relation Reconstruction → semantic, not lexical. |
+| NMF kept as fallback | Required for very small groups or when FASTopic / SBERT model is unavailable. |
+| Slot mining tried first | Decomposes variables along multiple semantic dimensions before any topic model. No document-level method (NMF, BERTopic, FASTopic) can do this — they all collapse one variable into one vector. |
+| No facet trees | Removed: a single coherent LoD tree is easier to defend than parallel views of one clustering. |
+| Deterministic labels = default thesis result | Reproducible without API access. LLM re-phrasing is an optional layer that can be switched off. |
+| LLM via local Ollama | Localhost OpenAI-compatible endpoint (`http://localhost:11434/v1`) → LLM ON by default whenever Ollama is reachable; easy to disable. Override `OLLAMA_URL` / `OLLAMA_MODEL` env vars for non-default deployments. No external API, no key management, fully reproducible from a known model checkpoint. |
+| Strict LLM grounding | Every label word must appear in evidence — labels come from the CSV, LLM only rewords. |
+| Per-node provenance | Audit trail: `label_source ∈ {description_prefix, tfidf_bigram, group_anchor, phrase_slot, llm, fallback}`. |
+
+### Thesis wording (defense-safe)
+
+*Approach 2 is a dataset-constrained multi-aspect hierarchy with strict separation
+between structural decisions and label generation. The hierarchy topology is produced
+deterministically: detected group metadata anchors the upper levels, IE-style phrase-slot
+mining decomposes variables along multiple semantic dimensions when description structure
+permits, and FASTopic (NeurIPS 2024) discovers latent semantic aspects in the remaining
+groups, with NMF retained as a lexical fallback for small groups. Concept labels are
+generated by a deterministic four-stage pipeline (Steps 7a–d) whose evidence comes exclusively from
+the dataset itself. An optional TopicTag-style LLM refinement layer may re-phrase these
+labels, but every LLM proposal must pass a grounding check — each word in the proposed
+label must appear in the extracted evidence — and every label records its provenance
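+(source stage, confidence, evidence terms). The label-refinement LLM can neither alter the
+tree structure nor introduce vocabulary absent from the input CSV; the separate upstream
+phrase-role step (Step 5a) only assigns roles to phrases taken verbatim from the corpus,
+although those roles do drive slot ordering.*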
+
+### Papers used
+
+| Ref | Citation | Role in this method |
+|---|---|---|
+| [ZHU] | Zhu et al. (2025). *Context-Aware Hierarchical Taxonomy Generation via LLM-Guided Multi-Aspect Clustering.* EMNLP 2025. | Main scaffold — adapted (FASTopic+NMF replace LLM aspect generation; greedy silhouette replaces Eq. 6/7 search). |
+| [FASTopic] | Wu et al. (2024). *FASTopic: Pretrained Transformer is a Fast, Adaptive, Stable, and Transferable Topic Model.* NeurIPS 2024 (arXiv:2405.17978). | Recent SOTA replacement for NMF — semantic topic discovery via Dual Semantic-relation Reconstruction with optimal transport. |
+| [IE-Slot] | Established IE literature on slot induction (surveyed *ACM Computing Surveys* 2022). | Phrase-slot mining adaptation — decomposes one variable into multiple alternative-phrase signals before clustering. |
+| [GON] | Gonçalves et al. (2019). ESWC 2019. | Canonical metadata text-object construction. |
+| [TopicGPT] | Pham et al. (2024). *TopicGPT: A Prompt-based Topic Modeling Framework.* NAACL 2024 (arXiv:2311.01449). | **STRUCTURAL** upstream LLM use — discovers semantic-role schema from mined phrases (one call per group); drives slot ordering in the hierarchy. Anti-hallucination contract: every phrase verbatim from corpus. |
+| [TopicTag] | Eren et al. (2024). DocEng 2024 (arXiv:2407.19616). | Constrained LLM label-refinement pattern — LLM only names existing clusters, never modifies structure. Downstream use only. |
+| [Qwen2.5] | Qwen Team (2024). *Qwen 2.5 Technical Report.* arXiv:2412.15115. | Open instruction-tuned model used as the local LLM (via Ollama) for label refinement. Replaces a hosted LLM for full offline reproducibility. |
+| [TraCo] | Wu et al. (2024). AAAI 2024 (arXiv:2401.14113). | Diagnostic metrics: parent-child coherence + sibling diversity. |
+| [TaxoAdapt] | Kargupta et al. (2025). ACL 2025 (arXiv:2506.10737). | Multidimensional taxonomy motivation. |
+| [SC-Taxo] | (2026). arXiv:2605.00620. | Future work — bidirectional semantic consistency. |
+| [BISE-26] | Motamedi, Novalija, Rei (2026). Springer BISE. | Validates group-anchored entry strategy. |
+| [TICL] | Kejriwal et al. (2022). EAAI 108, 104548. | Concept-label evaluation framework. |
+
+### Known limitations (honest)
+
+* **FASTopic is still document-level** — better than NMF semantically, but a single variable
+ is still one vector. Multi-dimension decomposition relies on phrase-slot mining.
+* **Greedy silhouette split** — not the full probabilistic search of Zhu et al. Eq. 6/7.
+* **TraCo metrics are diagnostic only** — measured, not enforced (no neural transport plan).
+* **LLM labels depend on the local Ollama model** — model choice (e.g. Qwen 2.5) affects reproducibility;
+ the deterministic pipeline is the canonical thesis result.
+""")