# approach_2.py — Approach 2: Dataset-Constrained Multi-Aspect Hierarchy
#
# ALGORITHM (no domain hardcoding):
#
# Step 1 Build metadata text objects [GON §3]
# Step 2 Group-anchored L1/L2 from detected _group metadata [BISE-26]
# Step 3 Per terminal group — routing in priority order:
# (a) Phrase-slot mining [IE / slot induction]
# decomposes one variable into multiple alternative-phrase
# signals before clustering — fixes the document-level ceiling
# that NMF/BERTopic/FASTopic share.
# (b) FASTopic semantic aspect discovery [Wu et al. NeurIPS 2024]
# transformer-based Dual Semantic-relation Reconstruction with
# optimal transport — recent SOTA replacement for NMF.
# (c) NMF lexical fallback [ZHU §3.1 adapted]
# retained for small groups or when FASTopic is unavailable.
# Step 4 Per-aspect variable representations [ZHU §3.1]
# Step 5 Independent per-aspect clustering: GMM+BIC small / KMeans large [ZHU §3.2]
# Step 6 Top-down LoD tree (simplified silhouette best-aspect split) [ZHU §3.3 adapted]
# Step 7 Node labeling — deterministic by default:
# description-prefix → group anchor → IDF + FIELD_NAME filter →
# bigram-preferred discriminative TF-IDF suffix
# OPTIONAL: constrained LLM re-phrasing [TopicTag, DocEng 2024]
# — every label word must appear in evidence (grounding check)
# — provenance stored per node (audit trail)
# Step 8 Reference-free diagnostics: parent-child coherence, sibling diversity,
# label coherence, label quality, structure/provenance audit
# [TraCo, AAAI 2024]
#
# Facet trees (Castanet, 2007) removed in this version — a single coherent LoD tree.
#
# PAPERS:
# [ZHU] Zhu et al. (2025). EMNLP 2025. Main scaffold (adapted)
# [FASTopic] Wu et al. (2024). NeurIPS 2024. Semantic aspect discovery
# [GON] Gonçalves et al. (2019). ESWC. Canonical text objects
# [TopicTag] Eren et al. (2024). DocEng. Constrained LLM label refinement
# [TraCo] Wu et al. (2024). AAAI. Affinity + diversity metrics
# [TICL] Kejriwal et al. (2022). EAAI. Optional clustering diagnostics
# [BISE-26] Motamedi et al. (2026). BISE. Group-anchored entry validation
# [IE-Slot] IE / slot-induction literature (surveyed Xu et al., FCS 2024).
from __future__ import annotations
import json
import os
import warnings
from collections import Counter, defaultdict
from pathlib import Path
from typing import Callable, Optional
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import streamlit as st
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
# Suppress every warning category for the whole process.
warnings.filterwarnings('ignore')

# Optional dependency: sentence-transformers. _SBERT_AVAILABLE records whether
# the import succeeded so the rest of the module can test the flag instead of
# re-importing.
try:
    from sentence_transformers import SentenceTransformer
    _SBERT_AVAILABLE = True
except Exception:  # broken torch raises OSError (WinError 127), not ImportError
    _SBERT_AVAILABLE = False

# Optional dependency: the `openai` client package, bound to the private alias
# _OpenAIClient. _LLM_CLIENT_AVAILABLE gates every LLM helper below.
try:
    from openai import OpenAI as _OpenAIClient
    _LLM_CLIENT_AVAILABLE = True
except ImportError:
    _LLM_CLIENT_AVAILABLE = False
# Ollama defaults (overridable via env vars OLLAMA_URL / OLLAMA_MODEL).
# NOTE(review): the env-var override is not applied in this part of the file —
# confirm it happens where these constants are consumed.
OLLAMA_URL_DEFAULT = 'http://localhost:11434/v1'
OLLAMA_MODEL_DEFAULT = 'qwen2.5:3b-instruct'
# Groq defaults: base URL of Groq's OpenAI-compatible endpoint and the model
# requested from it.
GROQ_URL_DEFAULT = 'https://api.groq.com/openai/v1'
GROQ_MODEL_DEFAULT = 'qwen/qwen3-32b'
def _ping_ollama(base_url: str = OLLAMA_URL_DEFAULT, timeout: float = 1.5) -> bool:
    """
    Quick reachability check for the local Ollama server.

    Requests ``<server root>/api/tags`` and reports whether it answered with
    HTTP 200. The server root is *base_url* with a trailing ``/v1`` path
    segment removed.

    Args:
        base_url: OpenAI-compatible base URL of the server (normally ends in
            ``/v1``; a trailing slash is tolerated).
        timeout: Socket timeout in seconds for the probe.

    Returns:
        True when the probe succeeded; False when the openai client package
        is missing or the request failed for any reason.
    """
    if not _LLM_CLIENT_AVAILABLE:
        return False
    try:
        import urllib.request as _urlreq
        # Remove the '/v1' SUFFIX only. str.rstrip('/v1') would strip any run
        # of the characters '/', 'v' and '1', corrupting URLs such as
        # 'http://host:11431/v1' (port digits) or 'http://srv/v1' (host name).
        root = base_url.rstrip('/')
        if root.endswith('/v1'):
            root = root[:-len('/v1')]
        # /v1/models is OpenAI-compat; Ollama also exposes /api/tags
        with _urlreq.urlopen(root + '/api/tags', timeout=timeout) as r:
            return r.status == 200
    except Exception:
        # Best-effort probe: any failure simply means "not reachable".
        return False
def _make_llm_client(provider: str, base_url: str, api_key: str = '') -> Optional[object]:
    """
    Create an OpenAI-compatible client for cloud Groq or a local Ollama server.

    Both back-ends speak the OpenAI REST dialect, so one client class serves
    both; they differ only in base URL and credentials.

    Returns None when the openai package is missing, when provider is 'groq'
    and no API key was given, or when the Ollama server does not answer the
    reachability probe.
    """
    if not _LLM_CLIENT_AVAILABLE:
        return None
    if provider == 'groq':
        key, usable = api_key, bool(api_key)
    else:
        # Ollama ignores the key but the SDK requires a non-empty string
        key, usable = 'ollama-local', _ping_ollama(base_url)
    return _OpenAIClient(base_url=base_url, api_key=key) if usable else None
def _parse_json_response(raw: str) -> dict:
"""
Robustly extract a JSON object from an LLM response.
Handles the response shapes seen across providers/models:
• plain JSON: {"role": "measure"}
• markdown-fenced: ```json\n{...}\n```
• reasoning-model preamble: ...\n{...}
(Qwen3 / DeepSeek-R1 style models emit thinking traces before the
answer when JSON mode is unavailable — e.g. qwen/qwen3-32b on Groq)
• prose then JSON: Here is the result: {...}
Strategy: strip blocks, then take the substring from the first
'{' to the last '}' and parse it. Raises ValueError if no JSON found.
"""
s = (raw or '').strip()
if not s:
raise ValueError('empty response')
# Strip reasoning blocks (Qwen3 / R1 style)
while '' in s:
start = s.find('')
end = s.find('', start)
if end == -1:
# Unclosed think block — drop everything from onward,
# the JSON (if any) would be before it
s = s[:start]
break
s = s[:start] + s[end + len(''):]
s = s.strip()
# Take first '{' .. last '}' — covers fences and prose prefixes/suffixes
i, j = s.find('{'), s.rfind('}')
if i == -1 or j == -1 or j <= i:
raise ValueError('no JSON object in response')
return json.loads(s[i:j + 1])
def _safe_chat_completion(client, model: str, prompt: str,
max_tokens: int = 200, temperature: float = 0.1):
"""
Call an OpenAI-compatible chat completion endpoint with automatic
fallback for providers that don't support JSON mode on a given model.
Some models on Groq (notably some Qwen 3 variants) reject
`response_format={"type":"json_object"}` with HTTP 400 BadRequestError.
This wrapper first tries WITH JSON mode (better reliability when
supported), and if the provider rejects it with a bad-request error,
retries WITHOUT. Prompts in this codebase already say 'Output JSON only'
and we strip ```json fences after parsing, so the retry path still
works deterministically.
"""
base_args = {
'model': model,
'messages': [{'role': 'user', 'content': prompt}],
'temperature': temperature,
'max_tokens': max_tokens,
}
try:
return client.chat.completions.create(
**base_args, response_format={'type': 'json_object'})
except Exception as e:
# Retry without JSON mode on bad-request / unsupported-feature errors
err_name = type(e).__name__
err_text = str(e)
if ('BadRequest' in err_name or '400' in err_text
or 'response_format' in err_text):
return client.chat.completions.create(**base_args)
raise
# Optional dependency: FASTopic. _FASTOPIC_AVAILABLE lets
# discover_aspects_fastopic fall back to NMF when the import fails.
try:
    from fastopic import FASTopic  # type: ignore[import-not-found]
    _FASTOPIC_AVAILABLE = True
except Exception:  # fastopic pulls in torch; broken torch raises OSError, not ImportError
    _FASTOPIC_AVAILABLE = False
# ──────────────────────────────────────────────────────────────────────────────
# ROLE KEYS (domain-agnostic)
# ──────────────────────────────────────────────────────────────────────────────
# Keyword lists matched (as substrings) against normalised column names when
# scoring column roles. Order is preserved from the original definitions.
LEAF_KEYS = [
    'variable', 'var', 'field', 'column', 'attribute', 'name', 'code', 'id',
    'item', 'indicator', 'question', 'measure', 'concept',
]
GROUP_KEYS = [
    'task', 'category', 'domain', 'module', 'section', 'table', 'dataset',
    'assessment', 'test', 'variant', 'group', 'topic', 'instrument', 'form',
    'subscale', 'construct',
]
TEXT_KEYS = [
    'description', 'definition', 'desc', 'label', 'title', 'question',
    'meaning', 'note', 'notes', 'text', 'display', 'full', 'details',
    'explanation',
]
META_KEYS = [
    'type', 'dtype', 'data_type', 'datatype', 'unit', 'units', 'format',
    'decimal', 'precision', 'values', 'value', 'coding', 'range', 'min',
    'max', 'scale',
]
# ──────────────────────────────────────────────────────────────────────────────
# FILE LOADING
# ──────────────────────────────────────────────────────────────────────────────
def safe_name(n: str) -> str:
    """Return *n* with every character other than alphanumerics, '-', '_' and '.' replaced by '_'."""
    keep_punct = '-_.'
    cleaned = []
    for ch in n:
        if ch.isalnum() or ch in keep_punct:
            cleaned.append(ch)
        else:
            cleaned.append('_')
    return ''.join(cleaned)
def try_read_csv(path: Path) -> pd.DataFrame:
    """
    Read a delimited text file by trying several encodings and separators.

    Every (encoding, separator) combination is parsed with the Python engine;
    the parse with the highest score ``10 * n_columns - mean NaN fraction``
    wins, the first one tried winning ties. Results with fewer than two
    columns are treated as parsing failures and ignored.

    Args:
        path: File to read.

    Returns:
        The best-scoring DataFrame, with column names stripped of surrounding
        whitespace and of ';' characters.

    Raises:
        ValueError: if no combination produced at least two columns.
    """
    best, best_score = None, -1
    # Try explicit comma first (most common), then other separators, and let
    # pandas sniff the delimiter last.
    for enc in ['utf-8-sig', 'utf-8', 'latin1']:
        for sep in [',', '\t', ';', '|', None]:
            try:
                df = pd.read_csv(path, sep=sep, engine='python', encoding=enc,
                                 on_bad_lines='skip')
                if df.shape[1] < 2:
                    continue
                # A header-only file has no cells, so the mean NaN fraction is
                # NaN; NaN never compares greater than the running best, which
                # made such files unreadable. Treat "no cells" as "no gaps".
                nan_frac = float(df.isna().mean().mean()) if len(df) else 0.0
                s = df.shape[1] * 10 - nan_frac
                if s > best_score:
                    best, best_score = df, s
            except Exception:
                # Deliberate best-effort: a failing combination is skipped.
                pass
    if best is None:
        raise ValueError(f'Could not read {path.name}')
    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
    return best
def load_any(path: Path) -> pd.DataFrame:
    """
    Load a tabular file into a DataFrame, dispatching on the file suffix.

    Supported suffixes: .csv/.tsv/.txt (via try_read_csv), .xlsx/.xls
    (via pandas.read_excel) and .json. For JSON, a top-level list is
    normalised directly; for a top-level object the first value that is a
    non-empty list of objects is used, falling back to the first list value.

    Raises:
        ValueError: for an unsupported suffix, or for a JSON document that
            contains no list to build rows from.
    """
    s = path.suffix.lower()
    if s in ['.csv', '.tsv', '.txt']:
        return try_read_csv(path)
    if s in ['.xlsx', '.xls']:
        return pd.read_excel(path)
    if s == '.json':
        obj = json.loads(path.read_text(encoding='utf-8', errors='replace'))
        if isinstance(obj, list):
            return pd.json_normalize(obj)
        if isinstance(obj, dict):
            list_values = [v for v in obj.values() if isinstance(v, list)]
            # Prefer a real record list: a leading list of scalars (e.g. a
            # "tags" array) cannot be normalised into rows.
            for v in list_values:
                if v and all(isinstance(rec, dict) for rec in v):
                    return pd.json_normalize(v)
            if list_values:
                return pd.json_normalize(list_values[0])
        # The suffix IS supported here; the document just has no rows.
        raise ValueError(f'No record list found in JSON file {path.name}')
    raise ValueError(f'Unsupported: {s}')
# ──────────────────────────────────────────────────────────────────────────────
# COLUMN ROLE DETECTION (domain-agnostic heuristic scoring)
# ──────────────────────────────────────────────────────────────────────────────
def _norm(c: str) -> str:
return ''.join(ch if ch.isalnum() else ' ' for ch in str(c).lower())
def _ks(c: str, keys: list) -> int:
    """
    Count how many entries of *keys* occur as substrings of the normalised
    column name *c*.

    The column name is normalised once up front rather than once per key.
    """
    normed = _norm(c)
    return sum(1 for k in keys if k in normed)
def detect_roles(df: pd.DataFrame) -> dict:
    """
    Guess which columns of *df* act as leaf, group, text and metadata fields.

    Each column gets four heuristic scores combining keyword hits in its name
    (via _ks) with simple statistics: uniqueness ratio, distinct-value count
    and average string length.

    Returns:
        dict with keys 'leaf_cols' (top-scoring column), 'group_cols' (up to
        3), 'text_cols' (all columns scoring >= 4, else the leaf column) and
        'meta_cols' (up to 4, excluding columns already assigned). All four
        lists are empty when *df* has no columns.
    """
    if df.shape[1] == 0:
        # Without columns the profile frame has no score columns and the
        # sort_values calls below would raise KeyError.
        return {'leaf_cols': [], 'group_cols': [], 'text_cols': [], 'meta_cols': []}

    n = max(len(df), 1)
    out = []
    for col in df.columns:
        s = df[col]
        nun = int(s.nunique(dropna=True))
        ur = nun / n
        avg = float(s.dropna().astype(str).map(len).mean()) if s.notna().any() else 0
        # Raised group uniqueness ceiling from 80 → 300 so large datasets (e.g. HCP
        # with 100+ assessment categories) are not excluded.
        out.append({
            'column': str(col),
            'leaf_score': 4 * _ks(col, LEAF_KEYS) + (3 if 0.5 <= ur <= 1 else 0),
            'group_score': 4 * _ks(col, GROUP_KEYS) + (3 if 1 < nun < min(n * 0.5, 300) else 0),
            'text_score': 5 * _ks(col, TEXT_KEYS) + (4 if avg > 50 else 0),
            'meta_score': 4 * _ks(col, META_KEYS) + (2 if 1 < nun < min(n * 0.8, 100) else 0),
        })
    prof = pd.DataFrame(out)

    leaf = prof.sort_values('leaf_score', ascending=False).head(1)['column'].tolist()
    # Fall back to the leaf column when nothing looks like free text.
    text = (prof[prof.text_score >= 4]
            .sort_values('text_score', ascending=False)['column'].tolist()) or leaf[:]
    group = (prof[(prof.group_score >= 4) & (~prof.column.isin(leaf))]
             .sort_values('group_score', ascending=False).head(3)['column'].tolist())
    meta = (prof[(prof.meta_score >= 4) & (~prof.column.isin(text + leaf + group))]
            .sort_values('meta_score', ascending=False).head(4)['column'].tolist())
    return {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'meta_cols': meta}
def sv(x) -> str:
    """
    Return *x* as a stripped string, mapping missing values to ''.

    Missing means whatever pandas.isna reports for a scalar (None, NaN, NA,
    NaT). Containers such as lists (e.g. JSON array cells) are stringified
    rather than raising "truth value of an array is ambiguous".
    """
    try:
        missing = bool(pd.isna(x))
    except (TypeError, ValueError):
        # pd.isna returned an array for a non-scalar value.
        missing = False
    return '' if missing else str(x).strip()
def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
    """
    Build normalised per-variable rows with a unified _text field [GON §3].

    Args:
        df: Raw metadata table, one row per variable.
        cfg: Column-role mapping with optional keys 'leaf_cols', 'group_cols',
            'text_cols' and 'meta_cols' (lists of column names).
        source: Identifier stored in the '_source' column of every row.

    Returns:
        DataFrame with columns _source, _row, _label, _id, _group, _text.
        '_id' is '<group>.<label>'; repeated ids get a '__<n>' suffix from
        the second occurrence on. An empty *df* yields an empty frame that
        still has all six columns.
    """
    leaf_cols = list(cfg.get('leaf_cols') or [])
    group_cols = list(cfg.get('group_cols') or [])
    text_cols = list(cfg.get('text_cols') or [])
    meta_cols = list(cfg.get('meta_cols') or [])
    # Loop-invariant: ordered, de-duplicated list of columns that feed _text.
    all_cols = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))
    out_cols = ['_source', '_row', '_label', '_id', '_group', '_text']

    rows = []
    for i, row in df.iterrows():
        label = (' / '.join(p for p in [sv(row.get(c, '')) for c in leaf_cols] if p)
                 or f'var_{i}')
        group = (' > '.join(p for p in [sv(row.get(c, '')) for c in group_cols]
                            if p and p.lower() not in ['nan', 'none'])
                 or 'Ungrouped')
        text = ' | '.join(f'{c}: {sv(row.get(c, ""))}' for c in all_cols
                          if sv(row.get(c, '')))
        rows.append({'_source': source, '_row': int(i), '_label': label,
                     '_id': f'{group}.{label}', '_group': group, '_text': text})

    # Passing the column list keeps the schema even when there are no rows;
    # otherwise the '_id' lookup below raises KeyError on empty input.
    can = pd.DataFrame(rows, columns=out_cols)

    cnt: dict = defaultdict(int)
    ids = []
    for lid in can['_id']:
        cnt[lid] += 1
        ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
    can['_id'] = ids
    return can
# ──────────────────────────────────────────────────────────────────────────────
# STEP 3 — ASPECT DISCOVERY VIA NMF [ZHU §3.1 adapted]
# ──────────────────────────────────────────────────────────────────────────────
def _elbow_k(errors: list, k_range: range) -> int:
if len(errors) <= 2:
return list(k_range)[0]
diffs = np.diff(errors)
drops = np.diff(diffs)
if drops.max() - drops.min() < 1e-8:
return max(2, int(np.sqrt(len(errors))))
elbow_idx = int(np.argmax(drops)) + 1
return list(k_range)[min(elbow_idx, len(k_range) - 1)]
def discover_aspects(texts: list, max_aspects: int = 10):
    """
    Discover K latent semantic aspects via NMF on TF-IDF [ZHU §3.1 adapted].

    Replaces Zhu et al.'s LLM aspect generation with NMF (deterministic, no
    hallucination). K is selected by reconstruction-error elbow over
    candidate values starting at 2 and bounded by max_aspects, half the
    number of texts, and 15.

    Each candidate model is fitted once and kept, so the model for the chosen
    K is reused instead of refitted (same seed and settings give the same
    factorisation).

    Returns:
        (tfidf, X, nmf, W, H, K, labels) — the fitted vectoriser, the TF-IDF
        matrix, the fitted NMF model, its document-aspect matrix W, its
        aspect-term matrix H, the chosen K, and one label per aspect made of
        its top-4 terms joined by ' / '.
    """
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                            max_features=3000, min_df=1)
    X = tfidf.fit_transform(texts)
    terms = np.array(tfidf.get_feature_names_out())
    n_vars = X.shape[0]

    k_range = range(2, min(max_aspects + 1, n_vars // 2 + 1, 16))
    if len(k_range) < 2:
        k_range = range(2, 3)

    def _new_model(k: int):
        return NMF(n_components=k, random_state=42, max_iter=400, init='nndsvda')

    fitted: dict = {}
    errors = []
    for k in k_range:
        m = _new_model(k)
        fitted[k] = (m, m.fit_transform(X))
        errors.append(m.reconstruction_err_)

    K = _elbow_k(errors, k_range)
    if K in fitted:
        nmf, W = fitted[K]
    else:
        # Defensive: the elbow heuristic returned a K that was not scanned.
        nmf = _new_model(K)
        W = nmf.fit_transform(X)
    H = nmf.components_

    # Aspect labels: top-4 terms per NMF component
    labels = []
    for k in range(K):
        top_idx = np.argsort(H[k])[-4:][::-1]
        labels.append(' / '.join(terms[top_idx]))
    return tfidf, X, nmf, W, H, K, labels
# ──────────────────────────────────────────────────────────────────────────────
# STEP 3 (FASTopic variant) — semantic aspect discovery [Wu et al. NeurIPS 2024]
# ──────────────────────────────────────────────────────────────────────────────
def discover_aspects_fastopic(texts: list,
                              max_aspects: int = 10,
                              fallback_tfidf: bool = True):
    """
    Recent SOTA semantic aspect discovery via FASTopic [Wu et al. NeurIPS 2024,
    arXiv:2405.17978].

    FASTopic uses a pretrained Transformer (SBERT) to embed documents, then
    learns topic/word embeddings via Dual Semantic-relation Reconstruction
    (DSR) with optimal transport.

    Adapter shape matches `discover_aspects` so it is a drop-in replacement:
        returns (tfidf, X, model, W, H, K, labels)
    `tfidf` and `X` are still produced for downstream TF-IDF based code;
    FASTopic provides W (doc-topic), while H is a proxy (K × n_terms) whose
    entries are 1/(rank+1) for each topic top word found in the TF-IDF
    vocabulary.

    Falls back to `discover_aspects` (NMF) when FASTopic is not installed,
    when there are fewer than 6 texts, when FASTopic raises, or when it
    returns a document-topic matrix of the wrong shape.

    Raises:
        RuntimeError: if a fallback is needed but `fallback_tfidf` is False
            (FASTopic's own exception is re-raised when fitting failed).
    """
    n_vars = len(texts)
    # Decide on the fallback BEFORE building a TF-IDF matrix: the NMF path
    # builds its own, so doing it first would only be thrown away.
    if not _FASTOPIC_AVAILABLE or n_vars < 6:
        if fallback_tfidf:
            return discover_aspects(texts, max_aspects)
        raise RuntimeError('FASTopic unavailable and TF-IDF fallback disabled')

    # Keep a TF-IDF matrix available for downstream code paths
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                            max_features=3000, min_df=1)
    X = tfidf.fit_transform(texts)
    terms = np.array(tfidf.get_feature_names_out())

    # FASTopic K selection: bounded similar to NMF elbow but simpler — pick a
    # reasonable K from corpus size (avoids overfitting tiny groups).
    K = max(2, min(max_aspects, int(np.ceil(np.sqrt(n_vars))) + 1, n_vars - 1))
    try:
        model = FASTopic(num_topics=K,
                         doc_embed_model='all-MiniLM-L6-v2',
                         verbose=False)
        # fit_transform returns (top_words_per_topic, doc_topic_dist)
        result = model.fit_transform(texts)
        if isinstance(result, tuple) and len(result) == 2:
            top_words, doc_topic = result
        else:
            # Some FASTopic versions return only doc_topic; pull top words via API
            doc_topic = result
            top_words = [model.get_top_words(topic_id=k, num_top_words=10)
                         for k in range(K)]
    except Exception:
        # Robust fallback if FASTopic fails (small corpus, OOM, etc.)
        if fallback_tfidf:
            return discover_aspects(texts, max_aspects)
        raise

    W = np.asarray(doc_topic, dtype=np.float32)
    if W.ndim != 2 or W.shape[0] != n_vars:
        if fallback_tfidf:
            return discover_aspects(texts, max_aspects)
        # Never continue with a malformed matrix: callers index W by document.
        raise RuntimeError('FASTopic returned a malformed document-topic '
                           'matrix and TF-IDF fallback disabled')

    # Build H_proxy: K × n_terms with weight = position-decay of each top word
    n_terms = len(terms)
    term_to_idx = {t: i for i, t in enumerate(terms)}
    H_proxy = np.zeros((K, n_terms), dtype=np.float32)
    labels = []
    for k in range(K):
        words_k = top_words[k] if k < len(top_words) else []
        # A topic may arrive as ONE space-joined string of words
        # (NOTE(review): believed to be FASTopic's usual format — confirm).
        # Split it, otherwise the loop below would walk over characters.
        if isinstance(words_k, str):
            words_k = words_k.split()
        # Each entry may be 'word', or 'word score', or (word, score)
        clean: list = []
        for w in words_k:
            if isinstance(w, (list, tuple)):
                w = w[0]
            w = str(w).split(' ')[0].strip().lower()
            if w:
                clean.append(w)
        for rank, w in enumerate(clean):
            if w in term_to_idx:
                H_proxy[k, term_to_idx[w]] += 1.0 / (rank + 1)
        labels.append(' / '.join(clean[:4]) if clean else f'aspect {k+1}')
    return tfidf, X, model, W, H_proxy, K, labels
# ──────────────────────────────────────────────────────────────────────────────
# STEP 4 — PER-ASPECT VARIABLE REPRESENTATIONS [ZHU §3.1]
# ──────────────────────────────────────────────────────────────────────────────
def per_aspect_representations(texts: list, H: np.ndarray,
                               tfidf: TfidfVectorizer,
                               sbert_model=None) -> list:
    """
    Build K independent representation matrices — one per aspect [ZHU §3.1].

    For each aspect k:
      • with an SBERT model: keep only the whitespace tokens of each text
        that are among the aspect's top-T terms (T = min(30, vocabulary
        size); the full text is used when nothing survives) and encode the
        result with normalised embeddings;
      • without one: weight each text's TF-IDF row by H[k] and L2-normalise
        (all-zero rows stay zero).

    Args:
        texts: Variable texts, one per variable.
        H: Aspect-term matrix with one row per aspect, aligned with the
            vectoriser's vocabulary.
        tfidf: Fitted vectoriser providing the vocabulary and transform().
        sbert_model: Optional encoder exposing ``encode``.

    Returns:
        List of K float32 arrays, each with one row per text.
    """
    terms = np.array(tfidf.get_feature_names_out())
    K = H.shape[0]
    T = min(30, len(terms))
    use_sbert = sbert_model is not None
    # The dense TF-IDF matrix is only read by the masked path; densifying it
    # for the SBERT path is wasted time and memory.
    X_arr = None if use_sbert else tfidf.transform(texts).toarray()

    reprs = []
    for k in range(K):
        if use_sbert:
            top_idx = np.argsort(H[k])[-T:]
            top_terms = set(terms[top_idx])
            filtered = []
            for txt in texts:
                tokens = txt.lower().split()
                kept = ' '.join(t for t in tokens if t in top_terms)
                filtered.append(kept if kept.strip() else txt)
            emb = sbert_model.encode(filtered, show_progress_bar=False,
                                     batch_size=64, normalize_embeddings=True)
        else:
            mask = H[k]
            emb = X_arr * mask[np.newaxis, :]
            norms = np.linalg.norm(emb, axis=1, keepdims=True)
            norms[norms == 0] = 1.0  # avoid 0/0 for texts with no aspect mass
            emb = emb / norms
        reprs.append(np.asarray(emb).astype(np.float32))
    return reprs
# ──────────────────────────────────────────────────────────────────────────────
# STEP 5 — INDEPENDENT PER-ASPECT CLUSTERING [ZHU §3.2]
# ──────────────────────────────────────────────────────────────────────────────
def cluster_aspect_gmm(emb: np.ndarray, max_k: int = 8, fast_threshold: int = 30):
    """
    Cluster variables within one aspect space [ZHU §3.2].

    Hybrid strategy:
      • n ≤ fast_threshold → GMM (diagonal covariance) selected by lowest BIC
      • n > fast_threshold → MiniBatchKMeans selected by highest silhouette
    Embeddings wider than min(20, n - 1) dimensions are first reduced with
    TruncatedSVD for numerical stability.

    Args:
        emb: Array with one row per variable.
        max_k: Largest number of clusters tried (candidates are 2..max_k,
            capped below n).
        fast_threshold: Row count above which the KMeans path is used.

    Returns:
        (labels, k, silhouette): integer label per row, the selected number of
        clusters, and the silhouette score of the returned labelling (0.0
        when it has a single cluster or scoring fails). When nothing could be
        fitted — n ≤ 2, no candidate k, or every fit failed — all rows get
        label 0 and k is 1.
    """
    n = emb.shape[0]
    if n <= 2:
        return np.zeros(n, dtype=int), 1, 0.0

    d_target = min(20, emb.shape[1], n - 1)
    if emb.shape[1] > d_target:
        svd = TruncatedSVD(n_components=d_target, random_state=42)
        emb_r = svd.fit_transform(emb)
    else:
        emb_r = emb.copy()

    best_score, best_labels, best_k = -np.inf, None, 2
    if n > fast_threshold:
        # Fast path: MiniBatchKMeans + silhouette (large groups)
        for k in range(2, min(max_k + 1, n)):
            try:
                km = MiniBatchKMeans(n_clusters=k, random_state=42,
                                     n_init=3, batch_size=min(256, n),
                                     max_iter=100)
                labels = km.fit_predict(emb_r)
                if len(set(labels)) < 2:
                    continue
                sil = float(silhouette_score(emb_r, labels))
                if sil > best_score:
                    best_score, best_labels, best_k = sil, labels, k
            except Exception:
                # A failing candidate k is skipped; others may still fit.
                continue
    else:
        # Accurate path: GMM + BIC (small groups)
        best_bic = np.inf
        for k in range(2, min(max_k + 1, n)):
            try:
                gmm = GaussianMixture(n_components=k, random_state=42,
                                      covariance_type='diag', reg_covar=1e-3,
                                      max_iter=80, n_init=1,
                                      init_params='random_from_data')
                gmm.fit(emb_r)
                bic = gmm.bic(emb_r)
                labels = gmm.predict(emb_r)
                if bic < best_bic:
                    best_bic, best_labels, best_k = bic, labels, k
            except Exception:
                continue

    if best_labels is None:
        # Nothing was fitted: report ONE cluster, matching the all-zero labels
        # (and the n <= 2 early return) instead of the placeholder k = 2.
        best_labels = np.zeros(n, dtype=int)
        best_k = 1

    sil = 0.0
    if len(set(best_labels)) > 1:
        try:
            sil = float(silhouette_score(emb_r, best_labels))
        except Exception:
            pass  # keep 0.0 when the score cannot be computed
    return best_labels.astype(int), best_k, sil
# ──────────────────────────────────────────────────────────────────────────────
# STEP 7 (optional) — CONSTRAINED LLM LABEL REFINEMENT [TopicTag, DocEng 2024]
# ──────────────────────────────────────────────────────────────────────────────
def _light_stem(w: str) -> str:
"""
Minimal English morphological normalisation — no NLTK dependency.
Used by the LLM grounding check so that 'latencies' matches 'latency',
'errors' matches 'error', 'completion' matches 'completed', etc.
Avoids rejecting plurals and common tense variants while still requiring
every label word to derive from evidence vocabulary.
Based on Porter-stemmer-style suffix stripping (Porter 1980, adapted).
"""
w = w.lower().strip()
for suffix in ('ization', 'isation', 'ousness', 'iveness',
'ization', 'ities', 'iness',
'ation', 'ments', 'ness',
'ies', 'ied', 'ing', 'ers',
'ed', 'es', 'er', 'ly', 's'):
if w.endswith(suffix) and len(w) > len(suffix) + 2:
return w[:-len(suffix)]
return w
def make_llm_label_fn(base_url: str = OLLAMA_URL_DEFAULT,
                      model: str = OLLAMA_MODEL_DEFAULT,
                      provider: str = 'ollama',
                      api_key: str = '') -> Optional[Callable]:
    """
    Build a TopicTag-style constrained LLM label refinement function backed
    by an OpenAI-compatible endpoint (local Ollama by default, or Groq).

    [TopicTag] Eren et al. (2024) run NMF to discover topics, then use an LLM
    to generate human-readable concept labels from the NMF topic terms.
    The LLM receives ONLY the extracted evidence — it cannot alter the tree,
    and its proposal must pass a strict grounding check: every label word of
    three or more characters must share a stem (see _light_stem) with a word
    in the evidence terms, the sample descriptions or the parent path.
    Text is tokenised on non-alphanumeric characters, so 'latency,' grounds
    'latency'. The model's own `evidence_terms` are recorded for audit but
    are NOT part of the grounding vocabulary — otherwise the model could
    ground an invented word simply by listing it.

    Args:
        base_url: Base URL of the OpenAI-compatible endpoint.
        model: Model name sent with each request.
        provider: 'groq' or anything else for Ollama.
        api_key: Required for Groq; ignored for Ollama.

    Returns:
        A callable (candidate, top_terms, parent_path, sample_texts)
        → (label, metadata_dict), or None if no client could be created.
        The callable returns the unchanged candidate whenever the proposal
        is empty, ungrounded, or the request/parse fails; metadata_dict
        carries 'confidence', 'evidence_terms', 'reason' and 'raw_label'.
    """
    client = _make_llm_client(provider, base_url, api_key)
    if client is None:
        return None

    def _words(text) -> list:
        # Lower-case and split on anything that is not a letter or digit.
        cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in str(text).lower())
        return [w for w in cleaned.split() if len(w) >= 3]

    def _refine(candidate: str, top_terms: list, parent_path: str,
                sample_texts: list):
        meta = {'confidence': 0.0, 'evidence_terms': [],
                'reason': '', 'raw_label': ''}
        prompt = (
            'You are labeling a cluster in a metadata variable hierarchy.\n'
            'The label MUST be derived strictly from the evidence terms and '
            'sample variable descriptions provided. Do not introduce concepts '
            'or vocabulary that are not visible in the evidence.\n\n'
            f'Parent path: {parent_path}\n'
            f'Evidence terms (from NMF/TF-IDF over the cluster): {", ".join(top_terms[:10])}\n'
            f'Sample variable descriptions:\n'
            + '\n'.join(f' - {str(t)[:160]}' for t in sample_texts[:4]) + '\n'
            f'\nCurrent candidate label: {candidate}\n\n'
            'Task: Return a concise 2–5 word concept label that PARAPHRASES '
            'the evidence into a cleaner concept name.\n'
            'Rules:\n'
            '1. Every word in the label must appear in (or be an obvious '
            'morphological variant of) the evidence terms or sample descriptions.\n'
            '2. Do not invent domain concepts that are not in the evidence.\n'
            '3. Prefer multi-word noun phrases over single keywords.\n'
            '4. Avoid generic words: data, score, variable, assessment, total, '
            'description, value, decimal.\n'
            '5. Use base forms — singular nouns (Latency not Latencies), and '
            'avoid -ing / -ed verb suffixes unless required.\n'
            '6. Output strict JSON only — no prose, no markdown.\n\n'
            'Output: {"label": "...", "evidence_terms": ["...", "..."], "confidence": 0.0}'
        )
        try:
            # max_tokens generous: reasoning models (Qwen3) emit traces
            # that consume budget before the JSON appears.
            resp = _safe_chat_completion(client, model, prompt,
                                         max_tokens=1200, temperature=0.2)
            raw = (resp.choices[0].message.content or '').strip()
            result = _parse_json_response(raw)
            label = str(result.get('label', '')).strip()

            # Confidence is informational only (see NOTE below), so a
            # missing or non-numeric value must not abort the refinement.
            try:
                conf = float(result.get('confidence', 0) or 0)
            except (TypeError, ValueError):
                conf = 0.0
            evid = result.get('evidence_terms', []) or []
            if not isinstance(evid, list):
                evid = [evid]
            meta['confidence'], meta['evidence_terms'] = conf, evid
            meta['raw_label'] = label
            if not label:
                meta['reason'] = 'empty'
                return candidate, meta

            # NOTE: we IGNORE the LLM's self-reported confidence. Qwen 3B
            # routinely returns conf ≈ 0.5 on perfectly good labels — using it
            # as a gate rejected useful refinements. Grounding (below) is the
            # real anti-hallucination check; if every label word stems back to
            # the corpus, the label is accepted regardless of self-confidence.
            #
            # The haystack is built ONLY from caller-supplied evidence; the
            # model's own evidence_terms are excluded on purpose.
            haystack_parts = (list(top_terms[:10]) + list(sample_texts[:4])
                              + [parent_path])
            haystack_stems = {_light_stem(w)
                              for part in haystack_parts
                              for w in _words(part)}
            label_words = _words(label)
            label_stems = {_light_stem(w) for w in label_words}
            ungrounded = label_stems - haystack_stems
            if label_words and ungrounded:
                meta['reason'] = f'ungrounded_words: {sorted(ungrounded)}'
                return candidate, meta
            meta['reason'] = 'accepted'
            return label, meta
        except Exception as e:
            # Refinement is optional: on any failure keep the deterministic
            # candidate and record the exception type for the audit trail.
            meta['reason'] = f'exception: {type(e).__name__}'
            return candidate, meta

    return _refine
# ──────────────────────────────────────────────────────────────────────────────
# STEP 5a — UPSTREAM LLM PHRASE-ROLE CLASSIFIER [TopicGPT, NAACL 2024 adapted]
#
# Used to discover semantic roles (Measure / Statistic / Condition / Subtype
# style dimensions) from the corpus and ASSIGN each mined phrase to a role.
# This is fundamentally different from TopicTag-style label refinement:
# • TopicTag (and the make_llm_label_fn above) uses LLMs to RENAME clusters
# after the tree structure is already decided. LLM is downstream.
# • This function uses an LLM to CREATE the role schema before the tree is
# built. Slot ordering and parent-child placement are driven by the
# LLM-derived roles. LLM is upstream — closer to TopicGPT (Pham et al.,
# NAACL 2024), where LLMs perform topic discovery, not topic labeling.
#
# Anti-hallucination contract (preserved):
# • Every phrase in the role assignment MUST come from the corpus verbatim.
# Outputs are validated against the input phrase list — anything new is
# dropped.
# • Role NAMES are proposed by the LLM but kept to 1–2 generic English
# words; long or fanciful role names are rejected.
# • If validation fails, the caller falls back to deterministic mutual-
# exclusion clustering (current slot mining behaviour). No silent
# failure modes.
# ──────────────────────────────────────────────────────────────────────────────
def make_llm_role_classifier_fn(base_url: str = OLLAMA_URL_DEFAULT,
                                model: str = OLLAMA_MODEL_DEFAULT,
                                provider: str = 'ollama',
                                api_key: str = '') -> Optional[Callable]:
    """
    Build a callable that classifies repeated corpus phrases into semantic
    roles using an LLM (Ollama or Groq).

    Returns:
        None if no client could be created; otherwise a callable
        (phrases, sample_descs, group_name='') → (roles, meta) where `roles`
        maps a role name to a list of input phrases (original casing), or is
        None when validation fails. `meta` holds 'reason', 'raw' (first 400
        characters of the reply) and 'role_count'.

    Validation applied to the reply:
      • role names must be non-empty, at most two words and 24 characters;
        names that normalise to the same text are merged;
      • only phrases present in the input (compared case-insensitively,
        ignoring surrounding whitespace) are kept, each at most once;
      • roles with fewer than two surviving phrases are dropped, and at
        least two roles must remain.
    """
    client = _make_llm_client(provider, base_url, api_key)
    if client is None:
        return None

    def _classify(phrases: list, sample_descs: list, group_name: str = ''):
        meta = {'reason': '', 'raw': '', 'role_count': 0}
        if not phrases:
            meta['reason'] = 'no_phrases'
            return None, meta
        # Truncate to keep the prompt small and the model focused
        phrases_list = list(dict.fromkeys(str(p) for p in phrases))[:60]
        prompt = (
            'You are analysing a list of repeated phrases mined from a '
            'data-dictionary corpus.\n\n'
            f'Group context: {group_name or "(unknown)"}\n\n'
            'Repeated phrases (verbatim from the corpus):\n'
            + '\n'.join(f' - {p}' for p in phrases_list) + '\n\n'
            'Sample variable descriptions for context:\n'
            + '\n'.join(f' - {str(s)[:180]}' for s in sample_descs[:4]) + '\n\n'
            'TASK: Group these phrases into 2–5 SEMANTIC ROLES. Each role '
            'represents one ORTHOGONAL DIMENSION of what the variable measures '
            '(for example: what is measured, what statistic is used, under '
            'what condition).\n\n'
            'STRICT RULES:\n'
            '1. Use 2 to 5 roles. Fewer is better when phrases do not really '
            'belong to different dimensions.\n'
            '2. Role names: ONE OR TWO generic English words, lowercase. '
            'Examples of good role names: "measure", "statistic", "condition", '
            '"outcome", "subtype", "modifier". Do not invent fancy or domain-'
            'specific role names.\n'
            '3. Every phrase from the input list MUST appear EXACTLY ONCE in '
            'exactly one role. Copy phrases VERBATIM. Do not rephrase, '
            'normalise, plural-strip, or invent new phrases.\n'
            '4. Phrases that are alternatives (rarely co-occur in the same '
            'variable) should go in the SAME role.\n'
            '5. Phrases that describe DIFFERENT dimensions of the same '
            'variable should go in DIFFERENT roles.\n'
            '6. Output strict JSON only — no prose, no markdown fences.\n\n'
            'Output schema:\n'
            '{\n'
            ' "roles": {\n'
            ' "role_name_1": ["phrase a", "phrase b", ...],\n'
            ' "role_name_2": ["phrase c", "phrase d", ...]\n'
            ' }\n'
            '}'
        )
        try:
            resp = _safe_chat_completion(client, model, prompt,
                                         max_tokens=2000, temperature=0.1)
            raw = (resp.choices[0].message.content or '').strip()
            meta['raw'] = raw[:400]
            result = _parse_json_response(raw)
            roles_raw = result.get('roles', {}) or {}
            if not isinstance(roles_raw, dict):
                meta['reason'] = 'roles_not_dict'
                return None, meta

            # ── Validation ────────────────────────────────────────────────
            # Key the lookup exactly the way reply items are normalised below
            # (strip + lower); otherwise an input phrase with surrounding
            # whitespace could never be matched.
            input_set = {p.strip().lower(): p for p in phrases_list}
            buckets: dict = {}
            seen_phrases: set = set()
            for role_name, items in roles_raw.items():
                # Role name must be 1–2 generic words
                rn = str(role_name).strip().lower()
                if not rn or len(rn.split()) > 2 or len(rn) > 24:
                    continue
                if not isinstance(items, list):
                    continue
                kept: list = []
                for it in items:
                    s = str(it).strip().lower()
                    if not s or s in seen_phrases:
                        continue
                    if s not in input_set:
                        # Phrase invented by LLM — drop (anti-hallucination)
                        continue
                    kept.append(input_set[s])  # original casing
                    seen_phrases.add(s)
                # Merge instead of overwrite: two reply keys such as
                # "Measure" and "measure" normalise to the same role, and
                # overwriting would lose phrases already marked as seen.
                buckets.setdefault(rn, []).extend(kept)

            roles_clean = {rn: ph for rn, ph in buckets.items() if len(ph) >= 2}
            if len(roles_clean) < 2:
                meta['reason'] = 'too_few_valid_roles'
                return None, meta
            meta['reason'] = 'accepted'
            meta['role_count'] = len(roles_clean)
            return roles_clean, meta
        except Exception as e:
            # Any request/parse failure is reported via meta so the caller
            # can fall back to the deterministic path.
            meta['reason'] = f'exception: {type(e).__name__}'
            return None, meta

    return _classify
# ──────────────────────────────────────────────────────────────────────────────
# STEP 7 — NODE LABELING [ZHU §4.3 / TopicTag]
# ──────────────────────────────────────────────────────────────────────────────
# Generic metadata field-name boilerplate that appears in essentially any
# data-dictionary header (description, value, name, ...). Dataset-specific
# column tokens (e.g. 'fullDisplayName' for HCP, 'Decimal Places' for AI-MIND)
# are added at runtime by build_field_noise() — derived from the user-confirmed
# column names, never enumerated by hand.
_GENERIC_FIELD_NOISE = frozenset({
'description', 'definition', 'value', 'metadata', 'desc',
'name', 'item', 'variable', 'field', 'attribute',
'code', 'type', 'dtype',
})
def build_field_noise(configs: dict) -> frozenset:
"""
Build the field-name boilerplate set entirely from the user-confirmed
column roles. For each detected column name we extract alphanumeric tokens
and add them to the noise set.
Examples (derived, NOT hardcoded):
HCP 'fullDisplayName' → {fulldisplayname}
HCP 'columnHeader' → {columnheader}
AI-MIND 'Decimal Places' → {decimal, places}
Result: the same effective filter as a hand-crafted list, but produced
deterministically from whatever columns the current CSV has. Datasets we
have never seen get an automatically-tailored noise set.
"""
noise = set(_GENERIC_FIELD_NOISE)
for cfg in (configs or {}).values():
for col_list in cfg.values():
for col in col_list:
tokens = ''.join(c if c.isalnum() else ' '
for c in str(col)).split()
noise.update(t.lower() for t in tokens if len(t) >= 2)
return frozenset(noise)
# Module-level fallback used when label_cluster is called without a configs-
# derived noise set. Replaced at build time by the Streamlit pipeline below.
# Starts out as the generic set only.
FIELD_NAME_NOISE: frozenset = _GENERIC_FIELD_NOISE
def _extract_common_prefix_phrase(cluster_texts: list,
                                  min_coverage: float = 0.6) -> str:
    """
    Find a concept-name prefix shared by most members of a cluster.

    Many data dictionaries write 'Concept Name: definition...' in the
    description. Each text is expected in the "col1: val1 | col2: val2 | ..."
    form; for every row the first field whose key contains 'descrip', 'def'
    or 'full' is inspected and the part of its value before the inner colon
    is taken as that row's candidate prefix (kept only if it has 2-6 tokens).

    The longest leading token sequence (at most 6 tokens, at least 2) shared
    by at least `min_coverage` of ALL cluster members is returned. Coverage is
    measured against the whole cluster — not just the rows that produced a
    candidate — so a single 'Name: def' row cannot label a cluster whose other
    members have no such prefix.

    Args:
        cluster_texts: metadata text objects of the cluster members.
        min_coverage: required fraction of cluster members sharing the prefix.

    Returns:
        A Title-cased phrase, or '' if no shared prefix is strong enough.
    """
    n_members = len(cluster_texts)
    if n_members == 0:
        return ''

    prefixes = []
    for t in cluster_texts:
        for chunk in str(t).split(' | '):
            if ':' not in chunk:
                continue
            key, val = chunk.split(':', 1)
            key_l = key.lower()
            if 'descrip' in key_l or 'def' in key_l or 'full' in key_l:
                tokens = val.split(':')[0].split()
                if 2 <= len(tokens) <= 6:
                    prefixes.append(tokens)
                # Only the first description-like field of a row is used.
                break
    if not prefixes:
        return ''

    max_len = max(len(p) for p in prefixes)
    # Try the longest candidate length first so the most specific shared
    # prefix wins.
    for length in range(min(6, max_len), 1, -1):
        starts = Counter(tuple(p[:length]) for p in prefixes
                         if len(p) >= length)
        if not starts:
            continue
        top, cnt = starts.most_common(1)[0]
        # Ratio comparison (rather than a truncated integer threshold) keeps
        # the documented "≥ min_coverage of members" contract exact.
        if cnt / n_members >= min_coverage:
            return ' '.join(top).title()
    return ''
def _bigram_preferred_terms(diff: np.ndarray, terms: np.ndarray,
                            boilerplate: set, prefix_lower: str,
                            n_terms: int) -> list:
    """
    Select the most discriminative terms, favouring bigrams over the unigrams
    they contain.

    Terms are visited in descending order of `diff`. A term is ignored when it
    is shorter than 3 characters, is in `boilerplate` or `FIELD_NAME_NOISE`,
    or (when `prefix_lower` is non-empty) occurs inside `prefix_lower`.
    A unigram is ignored if an already chosen bigram contains it; choosing a
    bigram evicts previously chosen unigrams that it contains. Containment is
    a plain substring test on the lower-cased text.

    Selection stops as soon as `n_terms` terms are held. Returns the chosen
    terms (original casing) in pick order.
    """
    chosen = []  # (term, lower-cased term) pairs, in pick order
    for idx in np.argsort(diff)[::-1]:
        term = terms[idx]
        low = term.lower()

        too_short = len(term) < 3
        is_noise = low in boilerplate or low in FIELD_NAME_NOISE
        in_prefix = bool(prefix_lower) and low in prefix_lower
        if too_short or is_noise or in_prefix:
            continue

        if ' ' in term:
            # New bigram: evict chosen unigrams it already covers.
            chosen = [(t, tl) for t, tl in chosen
                      if ' ' in tl or tl not in low]
        elif any(' ' in tl and low in tl for _, tl in chosen):
            # Unigram already covered by a chosen bigram.
            continue

        chosen.append((term, low))
        if len(chosen) >= n_terms:
            break
    return [t for t, _ in chosen]
def label_cluster(cluster_texts: list, all_texts: list,
                  tfidf: TfidfVectorizer, n_terms: int = 3,
                  cluster_groups: Optional[list] = None,
                  parent_path: str = '',
                  llm_label_fn: Optional[Callable] = None,
                  return_provenance: bool = False):
    """
    Label a cluster node using a five-stage deterministic pipeline.

    A) Description-prefix candidate: concept phrase extracted by
       _extract_common_prefix_phrase() from the 'Name: def...' pattern.
    B) Group-purity prefix: if ≥70% of `cluster_groups` share a top-level
       group value (text before ' > '), prepend it.
    C) Boilerplate filter: drop terms with TF-IDF IDF ≤ 1.7 plus
       FIELD_NAME_NOISE (applied inside _bigram_preferred_terms).
    D) Bigram-preferred discriminative suffix [ZHU §4.3], computed from the
       difference between the cluster's and the corpus' mean TF-IDF vectors.
    E) Optional constrained LLM refinement [TopicTag, DocEng 2024].

    Candidate priority: description phrase, then TF-IDF terms, then the bare
    group prefix, then the literal 'Group'.

    Args:
        cluster_texts: text objects of the cluster members.
        all_texts: text objects of the whole corpus (reference distribution).
        tfidf: fitted vectorizer supplying vocabulary, IDF and transform().
        n_terms: number of discriminative terms in the suffix.
        cluster_groups: optional group path per cluster member.
        parent_path: passed through to `llm_label_fn` as context.
        llm_label_fn: optional callable
            (candidate, top_terms, parent_path, sample_texts) → (label, meta).
        return_provenance: when True return (label, provenance_dict).

    Returns:
        The label string, or (label, provenance) where provenance holds
        'label_source', 'evidence_terms', 'confidence', 'llm_used',
        'llm_rejected' and, when the LLM ran, 'llm_raw_label' / 'llm_reason'.
    """
    prov: dict = {'label_source': 'fallback',
                  'evidence_terms': [],
                  'confidence': 1.0,
                  'llm_used': False,
                  'llm_rejected': False}
    if not cluster_texts:
        return ('Group', prov) if return_provenance else 'Group'

    terms = np.array(tfidf.get_feature_names_out())
    idf_vals = tfidf.idf_
    boilerplate = set(terms[idf_vals <= 1.7].tolist())

    # ── B) group-purity prefix ───────────────────────────────────────────────
    prefix = ''
    if cluster_groups:
        top_lvl = [str(g).split(' > ')[0].strip() for g in cluster_groups]
        top_grp, cnt = Counter(top_lvl).most_common(1)[0]
        if cnt / len(top_lvl) >= 0.70 and top_grp not in ('', 'Ungrouped'):
            prefix = top_grp

    # ── A) description-prefix candidate phrase ───────────────────────────────
    phrase = _extract_common_prefix_phrase(cluster_texts)
    if phrase and prefix and phrase.lower().startswith(prefix.lower()):
        remainder = phrase[len(prefix):]
        # Strip the group name only when it ends on a word boundary.
        # Otherwise a group called 'Cog' would turn 'Cognitive Score' into
        # 'nitive Score'.
        if not remainder or not remainder[0].isalnum():
            phrase = remainder.strip()

    # ── D) bigram-preferred discriminative terms ─────────────────────────────
    top_terms_raw, words = [], []
    try:
        X_all = tfidf.transform(all_texts).toarray()
        X_clus = tfidf.transform(cluster_texts).toarray()
        diff = X_clus.mean(axis=0) - X_all.mean(axis=0)
        top_terms_raw = [terms[i] for i in np.argsort(diff)[::-1][:20]]
        words = _bigram_preferred_terms(diff, terms, boilerplate,
                                        prefix.lower(), n_terms)
    except Exception:
        # Best effort: without TF-IDF evidence the label falls back to the
        # phrase / group prefix / 'Group' chain below.
        pass

    # ── compose deterministic candidate (records which stage produced it) ────
    if phrase:
        candidate = f'{prefix} — {phrase}' if prefix else phrase
        prov['label_source'] = 'description_prefix'
        prov['evidence_terms'] = [phrase] + ([prefix] if prefix else [])
    elif words:
        suffix = ' / '.join(w.title() for w in words)
        candidate = f'{prefix} — {suffix}' if prefix else suffix
        prov['label_source'] = 'tfidf_bigram'
        prov['evidence_terms'] = list(words) + ([prefix] if prefix else [])
    elif prefix:
        candidate = prefix
        prov['label_source'] = 'group_anchor'
        prov['evidence_terms'] = [prefix]
    else:
        candidate = 'Group'

    # ── E) optional constrained LLM refinement [TopicTag] ────────────────────
    # LLM is used ONLY as a re-phraser of evidence already present in the
    # cluster. Refinement is rejected if the returned label is not grounded
    # in the evidence terms — keeping the user's contract that "labels should
    # come from the csv itself".
    if llm_label_fn and candidate != 'Group':
        prov['llm_used'] = True
        try:
            refined, llm_meta = llm_label_fn(
                candidate, top_terms_raw[:10],
                parent_path, cluster_texts[:4])
            # Always record the raw LLM proposal and the grounding-check reason
            # — even when rejected — so the Provenance tab can show "what did
            # the LLM suggest and why was it dropped?"
            prov['llm_raw_label'] = llm_meta.get('raw_label', '')
            prov['llm_reason'] = llm_meta.get('reason', '')
            if refined and refined != candidate:
                candidate = refined
                prov['label_source'] = 'llm'
                prov['confidence'] = llm_meta.get('confidence', 0.0)
                prov['evidence_terms'] = llm_meta.get('evidence_terms',
                                                      prov['evidence_terms'])
            else:
                # No usable refinement (empty or identical to the candidate).
                prov['llm_rejected'] = True
        except Exception as _e:
            prov['llm_rejected'] = True
            prov['llm_reason'] = f'exception: {type(_e).__name__}'

    if return_provenance:
        return candidate, prov
    return candidate
# ──────────────────────────────────────────────────────────────────────────────
# STEP 5b — PHRASE-SLOT MINING (data-driven IE-style slot induction)
#
# Information-extraction adaptation for structured metadata dictionaries.
# Many data dictionaries (e.g. AI-MIND cognitive tests, HCP Study Completion)
# express each variable as a regular phrase combining several semantic
# dimensions (measure type, statistic, condition, etc.). Document-level
# methods (NMF, BERTopic, CTM) cannot separate these because all dimensions
# collapse into one vector. Phrase-slot mining decomposes each description
# into multiple phrases, identifies which phrases are mutually-exclusive
# alternatives across the corpus, and uses each alternative-set as a slot.
#
# No domain hardcoding: slot phrases are discovered from n-gram co-occurrence
# patterns in the actual descriptions. Slot names are derived from the
# phrases themselves; if structure is too weak, the algorithm reports
# `valid=False` and the caller falls back to NMF.
# ──────────────────────────────────────────────────────────────────────────────
# ── Generic dictionary markers stripped before slot mining ────────────────────
# These are universal data-dictionary conventions (`KEY: ...`, `Note: ...`,
# `Question: ...`) — not domain knowledge. Without stripping, they become
# false slot phrases (e.g. AI-MIND output contained `Key` nodes promoted from
# `KEY: DMS Percent Correct ...`).
_LEADING_MARKERS = frozenset((
    'key', 'note', 'notes', 'definition', 'description', 'desc',
    'question', 'q', 'item', 'value', 'meaning', 'label',
))


def _strip_leading_markers(text: str) -> str:
    """
    Remove up to four stacked leading dictionary markers ('KEY:', 'Note:', ...).

    While the text before the first colon is (case-insensitively, ignoring
    surrounding whitespace) one of `_LEADING_MARKERS`, that marker and its
    colon are dropped and the remainder is whitespace-trimmed. The input is
    returned untouched when it does not start with a marker.
    """
    remaining = text
    budget = 4  # bounded — never more than a few stacked markers
    while budget > 0 and ':' in remaining:
        head, _, tail = remaining.partition(':')
        if head.strip().lower() not in _LEADING_MARKERS:
            break
        remaining = tail.strip()
        budget -= 1
    return remaining
def _split_concept_and_body(text: str) -> tuple:
    """
    Separate a 'Concept Name: definition sentence' description into the
    concept prefix and the definition body.

    Data dictionaries overwhelmingly use the convention
        <Concept Name>: <definition sentence>
    e.g. 'DMS Correct Latency Standard Deviation (SD) (0 second delay):
          The standard deviation of response latencies for trials ...'

    Leading dictionary markers are stripped first. The text before the first
    colon counts as a concept only when it has between 1 and 14
    whitespace-separated words — a real concept name is phrase-length, not a
    whole sentence.

    Returns (concept, body). When there is no colon, or the head is not
    phrase-length, returns (whole cleaned text, '').
    """
    cleaned = _strip_leading_markers(str(text).strip())
    concept, colon, definition = cleaned.partition(':')
    if not colon:
        return cleaned, ''
    concept = concept.strip()
    word_count = len(concept.split())
    if word_count < 1 or word_count > 14:
        return cleaned, ''
    return concept, definition.strip()
# Generic data-dictionary definition-prose openers. Role values starting with
# these are explanatory fragments, not concept labels — reject them. Generic
# English, not domain vocabulary.
_BOILERPLATE_VALUE_PREFIXES = (
    'the number of', 'number of', 'the subject', 'a subject',
    'the participant', 'a participant', 'this measure', 'this variable',
    'calculated across', 'calculated as', 'the percentage of',
    'the proportion of', 'the total number', 'the mean of', 'the median of',
    'the standard deviation of', 'the amount of', 'the time',
    'the length of', 'expressed as', 'defined as', 'measured as',
)


def _is_boilerplate_value(val: str) -> bool:
    """
    Decide whether a role value reads like definition prose instead of a label.

    True when the trimmed, lower-cased value is empty, begins with one of
    `_BOILERPLATE_VALUE_PREFIXES`, or has more than six whitespace-separated
    words (sentence-length values are definitions, not concept labels).
    """
    text = str(val).strip().lower()
    if text == '':
        return True
    return (text.startswith(_BOILERPLATE_VALUE_PREFIXES)
            or len(text.split()) > 6)
# ── English stopwords for phrase-quality filtering ────────────────────────────
# Reject phrases like 'and', 'them', 'to be', 'have a lot of' from becoming
# slot nodes. This is general English filtering, not domain knowledge.
_STOPWORDS = frozenset(
    'a an the and or but of in on at by for with about into during '
    'through over under above below from to as is are was were be being '
    'been have has had do does did this that these those they them their '
    'it its he she his her you your we our i my me us not no nor '
    'how often when where which who why what '
    'lot lots much many more most some any all none '
    'very also too just so such only even still even also '
    'one two three first second '  # 'second' as standalone — careful: kept in n-grams via context
    .split()
)


def _phrase_is_meaningful(phrase: str, group_name_lower: str = '') -> bool:
    """
    Reject phrases that should never be a hierarchy node.

    Rules (all generic, no domain knowledge), applied to the trimmed,
    lower-cased phrase:
      • empty phrase
      • phrase equals the parent group name ('DMS' inside DMS branch)
      • single dictionary-marker words ('key', 'note')
      • boundary stopwords ('the response time' → 'the' boundary)
      • no content token (non-stopword of length ≥ 2), e.g. 'to be'
      • purely numeric phrases, whether one token ('12') or several
        ('2019 2020')

    Args:
        phrase: candidate phrase.
        group_name_lower: lower-cased name of the enclosing group.

    Returns:
        True if the phrase may become a node, False otherwise.
    """
    p = phrase.strip().lower()
    if not p or p == group_name_lower or p in _LEADING_MARKERS:
        return False
    tokens = p.split()
    # Boundary stopwords
    if tokens[0] in _STOPWORDS or tokens[-1] in _STOPWORDS:
        return False
    # Need at least one content token (non-stopword, length ≥ 2)
    if not any(t not in _STOPWORDS and len(t) >= 2 for t in tokens):
        return False
    # Phrases made only of digits carry no label meaning. The single-token
    # case was already rejected; multi-token runs such as '2019 2020' used
    # to slip through despite the "pure numeric tokens" rule.
    if all(t.isdigit() for t in tokens):
        return False
    return True
# ──────────────────────────────────────────────────────────────────────────────
# OPTION D — SBERT PHRASE CLUSTERING + CONSTRAINED LLM ROLE NAMING
#
# Following the EDC pattern (Zhang & Soh, EMNLP 2024) and ZOES (arXiv
# 2506.04458, 2025):
# Extract → mine repeated phrases from each variable description
# Define → semantically cluster phrases via SBERT + AgglomerativeClustering
# Canonicalize → constrained-vocabulary LLM names each cluster with a role
# (measure / statistic / condition / subtype / outcome / modifier)
#
# Anti-hallucination contract:
# • Phrases are extracted verbatim from the corpus, never invented
# • LLM only NAMES discovered clusters — cannot move phrases
# • Role names are constrained to a fixed generic English vocabulary
# (configurable via constrained=True/False)
# • Anonymous fallback when LLM is unavailable or invalid (cluster_0, ...)
#
# Different from Approach 1: Approach 1 embeds the WHOLE description as one
# vector; this embeds each EXTRACTED PHRASE as its own vector. Approach 1
# clusters variables; this clusters phrases. Approach 1 produces a single
# similarity tree; this produces a multi-role decomposition + nested tree.
# ──────────────────────────────────────────────────────────────────────────────
# Closed vocabulary the constrained role namer may answer with.
_ALLOWED_ROLES_CONSTRAINED = frozenset((
    'measure', 'statistic', 'condition', 'subtype', 'outcome', 'modifier',
))
# Generic role priority for hierarchy nesting (no domain knowledge).
# Higher priority = outermost (closest to root) level inside the group.
_ROLE_PRIORITY = dict(
    measure=6,
    outcome=5,
    statistic=4,
    condition=3,
    modifier=2,
    subtype=1,
)
def make_llm_role_namer_fn(base_url: str = OLLAMA_URL_DEFAULT,
                           model: str = OLLAMA_MODEL_DEFAULT,
                           constrained: bool = True,
                           provider: str = 'ollama',
                           api_key: str = '') -> Optional[Callable]:
    """
    Build the callable that names one phrase cluster with a semantic role.

    The returned function has the shape
        (phrases, sample_descs) → (role_name or None, meta)
    where meta carries 'reason', 'raw' (first 200 chars of the reply) and
    'constrained'. With constrained=True the answer must be one of
    `_ALLOWED_ROLES_CONSTRAINED`; otherwise it must be at most two lowercase
    alphabetic words and at most 24 characters.

    Works with Ollama (local) or Groq (cloud). Returns None when
    `_make_llm_client` provides no client.
    """
    llm = _make_llm_client(provider, base_url, api_key)
    if llm is None:
        return None

    # The task half of the prompt depends only on `constrained`: build it once.
    if constrained:
        task_text = (
            'Choose EXACTLY ONE semantic role from this fixed list that '
            'best names what these phrases share. Pick the SINGLE best fit:\n'
            ' - measure (the base concept being measured)\n'
            ' - statistic (mean, median, standard deviation, total, ...)\n'
            ' - condition (when/where/under what circumstances)\n'
            ' - outcome (result/output type)\n'
            ' - subtype (a specific kind of measure)\n'
            ' - modifier (a qualifier)\n\n'
            'Output strict JSON only — no markdown:\n'
            '{"role": "measure"}'
        )
    else:
        task_text = (
            'Propose ONE generic 1–2 word lowercase English noun that names '
            'the semantic role these phrases share. Examples of good roles: '
            'measure, statistic, condition, subtype, location, time, '
            'identifier, frequency, severity, quality.\n\n'
            'Output strict JSON only:\n'
            '{"role": "..."}'
        )

    def _rejection_reason(role: str) -> str:
        """Return '' when `role` is acceptable, else the rejection reason."""
        if not role:
            return 'empty_role'
        if constrained:
            if role in _ALLOWED_ROLES_CONSTRAINED:
                return ''
            return f'role_not_allowed: {role}'
        words = role.split()
        if len(words) > 2 or len(role) > 24:
            return 'role_too_long'
        if not all(w.isalpha() and w.islower() for w in words):
            return 'role_not_alpha_lower'
        return ''

    def _name_cluster(phrases: list, sample_descs: list = None):
        meta = {'reason': '', 'raw': '', 'constrained': constrained}
        if not phrases:
            meta['reason'] = 'empty_phrases'
            return None, meta

        contexts = sample_descs or []
        phrase_block = '\n'.join(f' - {p}' for p in phrases[:20])
        context_block = '\n'.join(f' - {str(s)[:160]}' for s in contexts[:3])
        prompt = (
            'Below is a cluster of semantically related phrases mined from a '
            'data-dictionary corpus. The phrases were grouped by sentence-'
            'transformer embedding similarity.\n\n'
            'Phrases in cluster:\n'
            + phrase_block + '\n\n'
            'Sample variable descriptions for context:\n'
            + context_block + '\n\n'
            + task_text
        )

        try:
            # max_tokens must cover reasoning-model traces (Qwen3)
            # before the tiny JSON answer appears.
            reply = _safe_chat_completion(llm, model, prompt,
                                          max_tokens=800, temperature=0.1)
            raw = (reply.choices[0].message.content or '').strip()
            meta['raw'] = raw[:200]
            parsed = _parse_json_response(raw)
            role = str(parsed.get('role', '')).strip().lower()
            problem = _rejection_reason(role)
        except Exception as e:
            meta['reason'] = f'exception: {type(e).__name__}'
            return None, meta

        if problem:
            meta['reason'] = problem
            return None, meta
        meta['reason'] = 'accepted'
        return role, meta

    return _name_cluster
def _extract_phrases_for_role_clustering(texts: list,
                                         text_col_names: Optional[list],
                                         group_name: str,
                                         min_phrase_count: int = 2
                                         ) -> tuple:
    """
    Shared helper: extract repeated meaningful phrases from a group of
    descriptions.

    Steps:
      1. Per row, take the value of the first 'key: value' chunk (restricted
         to `text_col_names` when given), keep the part before its inner
         colon, and strip leading dictionary markers. Rows without such a
         chunk use the whole text.
      2. Normalise: lower-case, non-alphanumerics → space, collapse spaces.
      3. Mine 1- to 4-token n-grams of at least 3 characters.
      4. Keep n-grams occurring in ≥ `min_phrase_count` rows and, per row,
         only the maximal ones (not contained, as a token sequence, in a
         longer kept phrase of the same row).

    Args:
        texts: metadata text objects ("col1: val1 | col2: val2 | ...").
        text_col_names: column names whose chunks may supply the description,
            or None/empty to accept the first chunk with a colon.
        group_name: name of the enclosing group; phrases equal to it are
            excluded from the returned phrase list.
        min_phrase_count: minimum number of rows a phrase must occur in.

    Returns:
        (phrases_list, per_row_phrase_sets, regularity) where phrases_list is
        sorted, and regularity = fraction of rows that contain ≥2 mined
        phrases (rounded to 4 decimals).
    """
    text_keys = ({c.strip().lower() for c in text_col_names}
                 if text_col_names else None)
    n = len(texts)

    # 1. extract description-prefix per row
    prefixes = []
    for t in texts:
        prefix = ''
        for chunk in str(t).split(' | '):
            if ':' not in chunk:
                continue
            key, val = chunk.split(':', 1)
            if text_keys is not None and key.strip().lower() not in text_keys:
                continue
            prefix = val.split(':')[0].strip()
            break
        if not prefix:
            prefix = str(t).strip()
        prefixes.append(_strip_leading_markers(prefix))

    # 2. normalise
    def _norm(p):
        s = ''.join(ch if ch.isalnum() or ch == ' ' else ' '
                    for ch in str(p).lower())
        return ' '.join(s.split())

    norm = [_norm(p) for p in prefixes]

    # 3. mine n-grams 1..4 ≥3 chars
    row_phrases: list = []
    phrase_count: Counter = Counter()
    for text in norm:
        tokens = text.split()
        row_set = set()
        for ngram_n in range(1, 5):
            for i in range(len(tokens) - ngram_n + 1):
                phrase = ' '.join(tokens[i:i + ngram_n])
                if len(phrase) >= 3:
                    row_set.add(phrase)
        row_phrases.append(row_set)
        phrase_count.update(row_set)

    # 4. keep repeated, retain maximal per row
    def _covers(longer: str, shorter: str) -> bool:
        # Token-level containment. Phrases are single-space-joined tokens, so
        # padding both sides makes the substring test respect word
        # boundaries ('age' is NOT covered by 'language score').
        return f' {shorter} ' in f' {longer} '

    keep = {p for p, c in phrase_count.items() if c >= min_phrase_count}
    refined = []
    for row in row_phrases:
        # Longest first; the secondary alphabetical key makes the scan order
        # independent of set iteration order (reproducible across runs).
        candidates = sorted(row & keep, key=lambda p: (-len(p.split()), p))
        kept: list = []
        for p in candidates:
            if not any(p != q and _covers(q, p) for q in kept):
                kept.append(p)
        refined.append(set(kept))

    phrase_count = Counter()
    for r in refined:
        phrase_count.update(r)

    group_lower = (group_name or '').strip().lower()
    phrases = sorted(p for p, c in phrase_count.items()
                     if c >= min_phrase_count
                     and _phrase_is_meaningful(p, group_lower))
    regularity = sum(1 for r in refined if len(r) >= 2) / max(1, n)
    return phrases, refined, round(regularity, 4)
# ──────────────────────────────────────────────────────────────────────────────
# PER-ROW LLM ROLE EXTRACTOR [Zhu et al. EMNLP 2025 — proper implementation]
#
# For each variable description, ONE LLM call extracts role values directly:
# measure = base concept being measured (e.g. "Correct Latency")
# statistic = statistical aggregation (e.g. "Standard Deviation")
# condition = experimental condition (e.g. "0 second delay")
# subtype = specific error / outcome subtype (e.g. "Incorrect Colour")
#
# Anti-hallucination contract:
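# • Each returned value MUST be grounded in the input description: either a
#   literal (case-insensitive) substring, or — as a fallback for morphological
#   variants — every content-word stem of the value must occur in the
#   description. The grounding check rejects anything else.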
# • LLM cannot invent new roles — only the four canonical roles are output.
# • Empty string is a valid output ("this row has no statistic" etc.).
#
# Cost: ONE call per variable. AI-MIND ≈ 108 calls, HCP ≈ 813.
# ──────────────────────────────────────────────────────────────────────────────
_PER_ROW_ROLES = ('measure', 'statistic', 'condition', 'subtype')


def make_per_row_role_extractor_fn(base_url: str = OLLAMA_URL_DEFAULT,
                                   model: str = OLLAMA_MODEL_DEFAULT,
                                   provider: str = 'ollama',
                                   api_key: str = '') -> Optional[Callable]:
    """
    Build the per-row role extractor (Zhu et al., EMNLP 2025 multi-aspect
    encoding pattern).

    The returned callable has the shape
        (concept, sample_concepts=None, full_text=None) → (roles_dict, meta)
    It asks the LLM once for the four `_PER_ROW_ROLES` and keeps a proposed
    value only if it is not definition boilerplate and is grounded in the
    (truncated) full text: either a case-insensitive substring, or all of its
    content-word stems occur among the text's stems.

    Returns None when `_make_llm_client` provides no client.
    """
    llm = _make_llm_client(provider, base_url, api_key)
    if llm is None:
        return None

    punctuation = ',.()[]{}"\''

    def _content_stems(lowered: str) -> set:
        """Stems of the non-stopword tokens (≥3 chars) of lower-cased text."""
        stems = set()
        for word in lowered.split():
            word = word.strip(punctuation)
            if len(word) >= 3 and word not in _STOPWORDS:
                stems.add(_light_stem(word))
        return stems

    def _extract(concept: str, sample_concepts: list = None,
                 full_text: str = None):
        meta = {'reason': '', 'raw': ''}
        if not concept or len(str(concept).strip()) < 3:
            return {}, {'reason': 'empty_description'}

        # `concept`   = the clean concept-name phrase (preferred source)
        # `full_text` = concept + definition body (subtype + grounding)
        concept = str(concept)[:300]
        full = str(full_text or concept)[:600]
        neighbours = sample_concepts or []
        neighbour_block = '\n'.join(f' - {str(s)[:120]}'
                                    for s in neighbours[:3])

        prompt = ''.join([
            'You are extracting semantic role values from one variable in a '
            'data dictionary.\n\n',
            'CONCEPT NAME (use this for measure / statistic / condition):\n',
            f' {concept}\n\n',
            'FULL DEFINITION (use ONLY for subtype, and only if needed):\n',
            f' {full}\n\n',
            'Similar concept names in the same group (context):\n',
            neighbour_block,
            '\n\n'
            'TASK: Extract values for these four semantic roles. Take '
            'measure, statistic and condition from the CONCEPT NAME wherever '
            'possible. Every value MUST be a verbatim substring of the CONCEPT '
            'NAME or FULL DEFINITION. Empty string if a role does not apply.\n\n'
            ' measure = the base quantity being measured\n'
            ' (e.g. "reaction time", "accuracy", "score")\n'
            ' statistic = a statistical aggregation operator\n'
            ' (e.g. "mean", "median", "standard deviation", "total")\n'
            ' condition = an experimental condition or scope\n'
            ' (e.g. "baseline", "follow-up", "task condition")\n'
            ' subtype = a specific subtype / kind / error type\n'
            ' (e.g. "error type", "response type", "trial type")\n\n'
            'STRICT RULES:\n'
            '1. Each value COPIED VERBATIM — do not invent, summarise, paraphrase.\n'
            '2. Prefer short concept phrases over long definition fragments.\n'
            '3. Do NOT return a value that is a sentence or starts with "the '
            'number of", "the subject", "calculated across" — those are '
            'definition prose, not labels.\n'
            '4. Empty string "" for roles that do not apply.\n'
            '5. Output strict JSON only:\n\n'
            '{"measure": "...", "statistic": "...", "condition": "...", "subtype": ""}',
        ])

        try:
            reply = _safe_chat_completion(llm, model, prompt,
                                          max_tokens=1500, temperature=0.1)
            raw = (reply.choices[0].message.content or '').strip()
            meta['raw'] = raw[:300]
            result = _parse_json_response(raw)
        except Exception as e:
            meta['reason'] = f'exception: {type(e).__name__}: {str(e)[:80]}'
            return {}, meta

        # Grounding is checked against the FULL text (concept + body) so that
        # subtype values living in the definition body still pass.
        haystack = full.lower()
        haystack_stems = _content_stems(haystack)

        accepted: dict = {}
        rejected: list = []
        for role in _PER_ROW_ROLES:
            proposed = result.get(role, '')
            if not isinstance(proposed, str):
                continue
            value = proposed.strip().strip('"').strip("'")
            if not value:
                continue
            # Definition-prose fragments are rejected before grounding.
            if _is_boilerplate_value(value):
                rejected.append((role, value, ['boilerplate']))
                continue
            # Strict substring (cheapest, most common)
            if value.lower() in haystack:
                accepted[role] = value
                continue
            # Token-stem grounding fallback (morphological variants)
            stems = _content_stems(value.lower())
            if stems and stems.issubset(haystack_stems):
                accepted[role] = value
                continue
            if stems:
                missing = sorted(stems - haystack_stems)
            else:
                missing = ['(no content tokens)']
            rejected.append((role, value, missing))

        meta['reason'] = 'accepted' if accepted else 'all_rejected'
        meta['rejected'] = rejected
        meta['n_extracted'] = len(accepted)
        meta['n_rejected'] = len(rejected)
        return accepted, meta

    return _extract
def discover_roles_via_per_row_extraction(
        texts: list,
        text_col_names: Optional[list],
        per_row_extractor_fn: Callable,
        group_name: str = '',
        regularity_threshold: float = 0.40) -> dict:
    """
    Zhu et al. (EMNLP 2025) style role discovery: one extractor call per row
    yields measure / statistic / condition / subtype values.

    Each row's description value is split into a concept prefix and a
    definition body; the concept goes to the extractor together with the
    first three concepts of the group (context) and the full text
    (concept + body). A role is kept when it collected at least two distinct
    values over the group. The result is valid when at least two roles are
    kept and the fraction of rows carrying a value for a kept role reaches
    `regularity_threshold`.

    Groups with fewer than 4 rows return an invalid result immediately.
    Progress is shown through a Streamlit progress bar when one can be
    created; failures of the bar are ignored.

    Returns a dict with 'roles', 'row_assignments', 'coverage', 'regularity',
    'valid', 'role_source', 'extractor_summary', 'per_row_audit' and
    'group_name'.
    """
    total = len(texts)
    if total < 4:
        return {'valid': False, 'reason': 'too_few_rows',
                'regularity': 0.0, 'roles': {}, 'row_assignments': []}

    wanted_keys = ({c.strip().lower() for c in text_col_names}
                   if text_col_names else None)

    def _row_desc_value(t: str) -> str:
        """Value of the first accepted 'key: value' chunk, else the whole text."""
        for chunk in str(t).split(' | '):
            key, colon, val = chunk.partition(':')
            if not colon:
                continue
            if wanted_keys is None or key.strip().lower() in wanted_keys:
                return val.strip()
        return str(t).strip()

    # Per row: concept-name prefix and full text (concept + definition body).
    row_concepts: list = []
    row_fulls: list = []
    for t in texts:
        concept, body = _split_concept_and_body(_row_desc_value(t))
        row_concepts.append(concept)
        row_fulls.append((concept + ' ' + body).strip() if body else concept)
    context_samples = row_concepts[:3]

    # Optional Streamlit progress bar so the user sees what is happening.
    bar = None
    try:
        import streamlit as _st_p
        bar = _st_p.progress(0.0, text=f'Extracting roles for "{group_name}"…')
    except Exception:
        bar = None

    per_row_roles: list = []
    per_row_audit: list = []   # proposed + rejected values per row
    values_by_role: dict = defaultdict(list)
    reason_counts: dict = defaultdict(int)

    for idx, (concept, full) in enumerate(zip(row_concepts, row_fulls)):
        if bar is not None:
            try:
                bar.progress((idx + 1) / max(1, total),
                             text=f'[{group_name}] row {idx+1}/{total}')
            except Exception:
                pass
        try:
            roles, meta = per_row_extractor_fn(
                concept, context_samples, full_text=full)
        except Exception as e:
            roles, meta = {}, {'reason': f'exception: {type(e).__name__}'}

        per_row_roles.append(roles)
        per_row_audit.append({
            'row_idx': idx,
            'description_snippet': str(concept)[:120],
            'accepted_roles': dict(roles),
            'rejected': meta.get('rejected', []),
            'reason': meta.get('reason', ''),
            'raw': meta.get('raw', ''),
        })
        reason_counts[meta.get('reason', 'unknown')] += 1
        for role, val in roles.items():
            seen = values_by_role[role]
            if val and val not in seen:
                seen.append(val)

    try:
        if bar is not None:
            bar.empty()
    except Exception:
        pass

    # Keep only roles that have ≥ 2 distinct values across the corpus
    roles_final: dict = {
        role: values_by_role.get(role, [])
        for role in _PER_ROW_ROLES
        if len(values_by_role.get(role, [])) >= 2
    }

    # Coverage = fraction of rows with ≥1 non-empty kept-role assignment
    covered = 0
    for assigned in per_row_roles:
        if any(assigned.get(role) for role in roles_final):
            covered += 1
    coverage = covered / max(1, total)
    regularity = coverage  # for per-row extractor, coverage is regularity
    is_valid = len(roles_final) >= 2 and coverage >= regularity_threshold

    return {
        'roles': roles_final,
        'row_assignments': per_row_roles,
        'coverage': round(coverage, 4),
        'regularity': round(regularity, 4),
        'valid': is_valid,
        'role_source': 'per_row_llm_extraction',
        'extractor_summary': dict(reason_counts),
        'per_row_audit': per_row_audit,
        'group_name': group_name,
    }
def discover_roles_via_sbert_phrase_clustering(
        texts: list,
        text_col_names: Optional[list],
        sbert_model,
        llm_role_namer_fn: Optional[Callable] = None,
        min_phrase_count: int = 2,
        min_role_size: int = 2,
        n_clusters_range: tuple = (2, 6),
        group_name: str = '',
        regularity_threshold: float = 0.40) -> dict:
    """
    Option D core: discover a semantic-role schema for a group via
    SBERT phrase clustering + LLM cluster naming.

    Pipeline (EDC / ZOES style):
      1. Extract repeated phrases per row (shared helper).
      2. If group regularity < `regularity_threshold`, or fewer than 4
         phrases were mined, return an invalid result.
      3. SBERT-embed each unique phrase (normalised embeddings).
      4. Agglomerative-cluster phrases (cosine, average linkage); pick K in
         `n_clusters_range` by best silhouette score.
      5. Drop clusters smaller than `min_role_size`; name each remaining
         cluster via `llm_role_namer_fn`, falling back to 'cluster_<id>' when
         the namer is absent, returns nothing, or raises. Duplicate names get
         a numeric suffix.
      6. Per row, assign one phrase per role: the longest one, with
         equal-length ties broken alphabetically so results are reproducible.
      7. Valid when coverage ≥ 0.50 and at least two roles exist.

    Returns:
        dict with 'roles', 'row_assignments', 'coverage', 'regularity',
        'valid', 'role_source', 'phrase_silhouette', 'n_clusters' and
        'naming_meta'; early exits return 'valid': False with a 'reason'.
    """
    def _invalid(reason: str, regularity_value: float = 0.0) -> dict:
        # Common shape of every early-exit result.
        return {'valid': False, 'reason': reason,
                'regularity': regularity_value,
                'roles': {}, 'row_assignments': []}

    n = len(texts)
    if n < 4 or sbert_model is None:
        return _invalid('too_few_rows_or_no_sbert')

    # 1. extract phrases
    phrases, refined, regularity = _extract_phrases_for_role_clustering(
        texts, text_col_names, group_name, min_phrase_count)
    if regularity < regularity_threshold:
        return _invalid(f'low_regularity ({regularity:.2f})', regularity)
    if len(phrases) < 4:
        return _invalid('too_few_phrases', regularity)

    # 2. SBERT embed
    try:
        embs = sbert_model.encode(phrases, normalize_embeddings=True,
                                  show_progress_bar=False, batch_size=64)
    except Exception as e:
        return _invalid(f'sbert_failed: {type(e).__name__}', regularity)

    # 3. agglomerative + silhouette K selection
    best_score, best_labels, best_k = -1.0, None, 2
    for k in range(n_clusters_range[0],
                   min(n_clusters_range[1] + 1, len(phrases))):
        try:
            ac = AgglomerativeClustering(n_clusters=k, metric='cosine',
                                         linkage='average')
            labels = ac.fit_predict(embs)
            if len(set(labels)) < 2:
                continue
            sil = float(silhouette_score(embs, labels, metric='cosine'))
            if sil > best_score:
                best_score, best_labels, best_k = sil, labels, k
        except Exception:
            # A failing K is simply not a candidate.
            continue
    if best_labels is None:
        return _invalid('no_clusters', regularity)

    # 4. group phrases by cluster id
    clusters_by_id: dict = defaultdict(list)
    for p, lbl in zip(phrases, best_labels):
        clusters_by_id[int(lbl)].append(p)
    valid_clusters = {cid: ps for cid, ps in clusters_by_id.items()
                      if len(ps) >= min_role_size}
    if len(valid_clusters) < 2:
        return _invalid('too_few_valid_clusters', regularity)

    # 5. name each cluster
    sample_descs = [str(t)[:200] for t in texts[:3]]
    roles: dict = {}
    naming_meta: dict = {}
    used_names: set = set()
    for cid, cluster_phrases in valid_clusters.items():
        role_name = None
        if llm_role_namer_fn:
            # A misbehaving namer must not abort the whole decomposition;
            # the cluster just keeps its anonymous name.
            try:
                role_name, name_meta = llm_role_namer_fn(cluster_phrases,
                                                         sample_descs)
            except Exception as e:
                role_name = None
                name_meta = {'reason': f'exception: {type(e).__name__}'}
            naming_meta[cid] = name_meta
        if not role_name:
            role_name = f'cluster_{cid}'
        # Disambiguate if LLM gave the same name to two clusters
        original = role_name
        suffix = 2
        while role_name in used_names:
            role_name = f'{original}_{suffix}'
            suffix += 1
        used_names.add(role_name)
        roles[role_name] = cluster_phrases

    # 6. per-row role assignment (longest phrase per role per row)
    phrase_to_role = {p: r for r, ps in roles.items() for p in ps}
    row_assignments: list = []
    covered = 0
    for r in refined:
        assignment: dict = {}
        # Sorted iteration: with the strict '>' below, equal-length ties go
        # to the alphabetically first phrase instead of depending on set
        # iteration order (which varies with string hash randomisation).
        for p in sorted(r):
            role = phrase_to_role.get(p)
            if not role:
                continue
            if role not in assignment or len(p) > len(assignment[role]):
                assignment[role] = p
        if assignment:
            covered += 1
        row_assignments.append(assignment)

    coverage = covered / n
    valid = (coverage >= 0.50 and len(roles) >= 2)
    return {
        'roles': roles,
        'row_assignments': row_assignments,
        'coverage': round(coverage, 4),
        'regularity': regularity,
        'valid': valid,
        'role_source': 'sbert_phrase_clustering',
        'phrase_silhouette': round(float(best_score), 4),
        'n_clusters': best_k,
        'naming_meta': naming_meta,
    }
def build_role_hierarchy(vi_list: list,
                         role_result: dict,
                         can: pd.DataFrame,
                         parent_id: int,
                         parent_path: str,
                         nodes: list,
                         node_map: dict,
                         var_nodes: dict,
                         max_depth_remaining: int = 4,
                         post_split_fn: Optional[Callable] = None,
                         min_post_split_size: int = 4) -> bool:
    """
    Nest the variables of `vi_list` under one aggregation level per role.

    `role_result` is a role-decomposition result: it must carry 'valid',
    'roles' (role name -> phrases) and 'row_assignments' (one dict
    role -> phrase per entry of `vi_list`, in the same order).

    Roles are tried in descending `_ROLE_PRIORITY` order (roles missing from
    that table rank 0). At each level the variables are grouped by the phrase
    they carry for the current role:
      * a role that yields at most one distinct phrase is skipped without
        consuming depth;
      * a phrase shared by >= 2 variables becomes a new aggregation node
        (appended to `nodes` and registered in `node_map`);
      * a phrase carried by a single variable attaches that leaf directly;
      * variables without a phrase for the role stay at the current parent.
    When roles or depth run out, `post_split_fn(vi_grp, parent, depth_left)`
    (if given, and if the group has >= `min_post_split_size` variables) may
    deepen the branch; it is expected to return the number of aggregation
    nodes it added. Any exception it raises is swallowed and the leaves are
    attached directly instead.

    `can` and `parent_path` are accepted for signature parity but are not
    read by this function.

    Returns True when at least 2 aggregation nodes were added, else False.
    Nodes added before a False return are NOT rolled back.
    """
    if not role_result.get('valid'):
        return False
    role_table = role_result['roles']
    row_roles = role_result['row_assignments']
    if len(vi_list) != len(row_roles):
        return False
    ranked_roles = sorted(role_table.keys(),
                          key=lambda name: _ROLE_PRIORITY.get(name, 0),
                          reverse=True)
    if not ranked_roles:
        return False

    # Provenance strings depend only on the route that produced role_result,
    # so they are resolved once here instead of on every recursive split.
    route = role_result.get('role_source', 'sbert_phrase_clustering')
    if route == 'per_row_llm_extraction':
        label_source = 'per_row_llm_role'
        source_note = 'per-row LLM extraction (Zhu et al. 2025)'
    else:
        label_source = 'sbert_phrase_role'
        source_note = 'SBERT phrase cluster + LLM role naming'

    row_of = {vi: pos for pos, vi in enumerate(vi_list)}
    made = 0  # aggregation nodes added so far (including post-split ones)

    def _attach(members: list, parent: int, depth_left: int):
        """Hand `members` to the post-split callback, or attach them as leaves."""
        nonlocal made
        delegate = (post_split_fn is not None
                    and len(members) >= min_post_split_size
                    and depth_left > 0)
        if delegate:
            try:
                added = post_split_fn(members, parent, depth_left)
                if added > 0:
                    made += added
                    return
            except Exception:
                # Best effort: a failing splitter falls back to flat leaves.
                pass
        for vi in members:
            _add_child(node_map, parent, var_nodes[vi])

    def _partition(members: list, level: int, parent: int, depth_left: int):
        """Split `members` on role number `level`, recursing into each group."""
        nonlocal made
        exhausted = (depth_left <= 0 or level >= len(ranked_roles)
                     or len(members) <= 1)
        if exhausted:
            _attach(members, parent, depth_left)
            return

        role = ranked_roles[level]
        by_phrase: dict = defaultdict(list)
        leftover: list = []
        for vi in members:
            pos = row_of.get(vi)
            phrase = row_roles[pos].get(role) if pos is not None else None
            if phrase:
                by_phrase[phrase].append(vi)
            else:
                leftover.append(vi)

        if len(by_phrase) <= 1:
            # This role does not discriminate here; try the next one.
            _partition(members, level + 1, parent, depth_left)
            return

        # Largest groups first; the sort is stable so ties keep first-seen order.
        ordered = sorted(by_phrase.items(),
                         key=lambda item: len(item[1]), reverse=True)
        for phrase, grp in ordered:
            if len(grp) == 1:
                # Singleton prevention: no wrapper node around a single leaf.
                _add_child(node_map, parent, var_nodes[grp[0]])
                continue
            new_id = _next_id(nodes)
            node = _make_node(new_id, phrase.title(),
                              desc=(f'Role: {role} | Value: "{phrase}" | '
                                    f'Variables: {len(grp)} | '
                                    f'Source: {source_note}'))
            node['label_provenance'] = {
                'label_source': label_source,
                'evidence_terms': [phrase],
                'confidence': 1.0,
                'llm_used': True,
                'llm_rejected': False,
                'role': role,
            }
            node['structure_provenance'] = {
                'route': route,
                'aspect_method': route,
                'slot_role': role,
                'phrase_silhouette': role_result.get('phrase_silhouette'),
                'regularity': role_result.get('regularity'),
                'n_clusters': role_result.get('n_clusters'),
            }
            nodes.append(node)
            node_map[new_id] = node
            _add_child(node_map, parent, new_id)
            made += 1
            _partition(grp, level + 1, new_id, depth_left - 1)

        if leftover:
            _attach(leftover, parent, depth_left)

    _partition(vi_list, 0, parent_id, max_depth_remaining)
    return made >= 2
def strip_group_prefix_from_labels(nodes: list) -> int:
    """
    Post-build pass: for every aggregation node, if its name starts with the
    name of its nearest aggregation/root ancestor (case-insensitive) followed
    by a separator, strip that prefix.

    Effect: 'DMS — Total Errors' under DMS becomes 'Total Errors'.
            'Pal Total Errors' under PAL becomes 'Total Errors'.

    A separator is a space, em dash, en dash, hyphen, colon or slash. The
    label is only rewritten when at least 2 characters remain. Children are
    compared against the parent's name *after* the parent itself has been
    rewritten. The walk starts at node id 0 and follows 'related' links;
    nodes are modified in place.

    Returns the number of labels modified.
    """
    separators = (' ', '—', '–', '-', ':', '/')
    strip_chars = ' —–-:/'
    node_map = {int(n['id']): n for n in nodes}
    modified = 0

    def _walk(nid: int, parent_name: str):
        nonlocal modified
        n = node_map.get(int(nid))
        if not n:
            return
        if n.get('type') == 'aggregation' and parent_name:
            current = str(n.get('name', '')).strip()
            # Measure the prefix on the *stripped* parent name: the match is
            # done on the stripped form, so slicing with the raw length would
            # cut into the child label whenever the parent name is padded.
            prefix = parent_name.strip()
            if prefix:
                head = current[:len(prefix)]
                tail = current[len(prefix):]
                # Compare slices of the original strings rather than two
                # lower-cased copies, so a lowercase form of different length
                # cannot shift the cut point.
                if head.lower() == prefix.lower() and tail[:1] in separators:
                    stripped = tail.lstrip(strip_chars).strip()
                    if len(stripped) >= 2:
                        n['name'] = stripped
                        modified += 1
        if n.get('type') in ('aggregation', 'root'):
            new_parent = n.get('name', '')
        else:
            new_parent = parent_name
        for cid in n.get('related', []):
            _walk(int(cid), new_parent)

    _walk(0, '')
    return modified
def enforce_single_parent(nodes: list) -> int:
    """
    POST-BUILD PASS 4 — guarantee the result is a tree (each node has at most
    one parent).

    The role builder can attach a variable both directly to a group and again
    under a sub-aggregation of that same group — e.g. a leaf under '3 Targets'
    *and* under '3 Targets > False Alarm Sequences'. That makes the branch a
    DAG, which (a) diverges from the single-parent tree format, (b) fragments
    the branch, and (c) breaks proportional ('total') sunburst/treemap sizing.

    For every node with more than one parent, keep the MOST SPECIFIC (deepest)
    parent and detach it from the others. Depth is the shortest distance from
    the root (id 0) along 'related' links. A parent that is not reachable from
    the root ranks below every reachable one, so a node is never detached from
    the live tree in favour of an orphaned parent. Ties keep the parent that
    appears first in `nodes`. Fully generic — no domain knowledge, no
    hardcoded labels.

    `nodes` is modified in place. Returns the number of redundant parent links
    removed.
    """
    from collections import deque

    node_map = {int(n['id']): n for n in nodes}

    # Breadth-first search from the root gives each reachable node its
    # shortest depth.
    depth = {0: 0}
    frontier = deque([0])
    while frontier:
        cur = frontier.popleft()
        for c in node_map.get(cur, {}).get('related', []):
            c = int(c)
            if c not in depth:
                depth[c] = depth[cur] + 1
                frontier.append(c)

    parents = defaultdict(list)
    for n in nodes:
        for c in n.get('related', []):
            parents[int(c)].append(int(n['id']))

    removed = 0
    for child, ps in parents.items():
        unique_parents = list(dict.fromkeys(ps))  # order-preserving de-dup
        if len(unique_parents) <= 1:
            continue
        # -1 for unreachable parents: previously they defaulted to depth 0 and
        # could tie with (and beat) the root.
        keep = max(unique_parents, key=lambda p: depth.get(p, -1))
        for p in unique_parents:
            if p == keep:
                continue
            par = node_map.get(p)
            if not par:
                continue
            links = par.get('related', [])
            # Normalise with int() on both sides so ids stored as strings are
            # still matched.
            remaining = [x for x in links if int(x) != child]
            if len(remaining) != len(links):
                par['related'] = remaining
                removed += 1
    return removed
def prune_empty_aggregations(nodes: list) -> int:
    """
    POST-BUILD PASS 5 — drop aggregation nodes that ended up with no children.

    `enforce_single_parent` can empty a shallow aggregation when all of its
    variables were kept under a deeper/sibling parent (e.g. 'RVP Response
    Latency' losing every leaf to a more specific group). An empty category
    node is export noise — it renders as a blank sector and has no members.

    Repeats until no childless aggregation remains, because removing a node
    can empty its parent in turn. Only nodes whose type is 'aggregation' are
    ever removed. `nodes` is modified in place (slice assignment, so callers
    holding the same list see the change); surviving nodes that referenced a
    removed id get a fresh 'related' list of ints.

    Returns the number of nodes removed.
    """
    removed = 0
    while True:
        empties = {int(n['id']) for n in nodes
                   if n.get('type') == 'aggregation' and not n.get('related')}
        if not empties:
            return removed
        nodes[:] = [n for n in nodes if int(n['id']) not in empties]
        for n in nodes:
            links = n.get('related', [])
            # Only rewrite lists that actually pointed at a removed node.
            if any(int(c) in empties for c in links):
                n['related'] = [int(c) for c in links
                                if int(c) not in empties]
        removed += len(empties)
def mine_phrase_slots(texts: list,
                      text_col_names: Optional[list] = None,
                      min_phrase_count: int = 2,
                      min_slot_size: int = 2,
                      coverage_threshold: float = 0.55,
                      excl_threshold: float = 0.75,
                      group_name: str = '',
                      llm_role_classifier_fn: Optional[Callable] = None) -> dict:
    """
    Discover phrase slots in a group of variable descriptions.

    Algorithm:
      1. Extract a concept prefix from each description: the text before the
         first ':' in the value of the first 'key: value' chunk (chunks are
         separated by ' | ') whose key is in `text_col_names`, or of the first
         such chunk at all when `text_col_names` is not given. Falls back to
         the full text.
      2. Lowercase; every non-alphanumeric character (parentheses included)
         becomes a space.
      3. Generate n-grams (1–4 tokens, ≥3 characters) per row.
      4. Keep n-grams that appear in ≥ min_phrase_count rows; per row retain
         only maximal phrases (drop phrases that are a contiguous token run
         of a longer kept phrase in the same row). Recount, then drop phrases
         rejected by `_phrase_is_meaningful`.
      5. Compute a mutual-exclusion score per phrase pair:
             M[a,b] = 1 − cooc[a,b] / min(count[a], count[b])
         Phrases with M[a,b] ≥ excl_threshold are 'alternatives' (rarely
         appear together → likely fill the same slot in different rows).
      6. Build slots: (a) from `llm_role_classifier_fn(phrases, texts,
         group_name)` when it is given and yields ≥ 2 usable roles, otherwise
         (b) by greedy mutual-exclusion BFS seeded from the most frequent
         phrase.
      7. A slot is kept if it has ≥ min_slot_size distinct phrases.
      8. Coverage: fraction of rows that contain ≥1 phrase from ≥1 slot.
         If coverage < coverage_threshold or < 2 slots survive → valid=False.

    Returns:
        {
          'slots':           list[ {phrases: set[str], best: str,
                                    role_name: str} ],
          'row_assignments': list[ dict[slot_id → phrase] ] for each row,
          'coverage':        float in [0,1],
          'valid':           bool — True if slot structure is strong enough,
          'slot_source':     'llm_role_classification' or 'mutual_exclusion',
        }
        The early-exit result (fewer than 4 texts, too few phrases, or fewer
        than 2 slots) has empty 'slots'/'row_assignments', coverage 0.0,
        valid False and no 'slot_source' key.
    """
    n = len(texts)
    if n < 4:
        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}
    text_keys = ({c.strip().lower() for c in text_col_names}
                 if text_col_names else None)

    # ── 1. extract concept prefixes ──────────────────────────────────────────
    prefixes = []
    for t in texts:
        prefix = ''
        for chunk in str(t).split(' | '):
            if ':' not in chunk:
                continue
            key, val = chunk.split(':', 1)
            key_l = key.strip().lower()
            if text_keys is not None and key_l not in text_keys:
                continue
            prefix = val.split(':')[0].strip()
            break
        if not prefix:
            prefix = str(t).strip()
        # Strip 'KEY:', 'Note:', etc. before tokenising
        prefix = _strip_leading_markers(prefix)
        prefixes.append(prefix)

    # ── 2. normalise: alphanumerics kept, everything else → spaces ───────────
    def _normalize(p: str) -> str:
        s = ''.join(ch if ch.isalnum() or ch == ' ' else ' '
                    for ch in str(p).lower())
        return ' '.join(s.split())

    norm = [_normalize(p) for p in prefixes]

    # ── 3. extract n-grams (1..4) per row, keep ≥3 chars ─────────────────────
    row_phrases: list = []
    phrase_count: Counter = Counter()
    for text in norm:
        tokens = text.split()
        row_set = set()
        for ngram_n in range(1, 5):
            for i in range(len(tokens) - ngram_n + 1):
                phrase = ' '.join(tokens[i:i + ngram_n])
                if len(phrase) >= 3:
                    row_set.add(phrase)
        row_phrases.append(row_set)
        phrase_count.update(row_set)

    # ── 4. keep repeated phrases, retain only maximal phrases per row ────────
    def _is_token_subphrase(shorter: str, longer: str) -> bool:
        # Padding both sides with a space forces the match onto token
        # boundaries. A bare `shorter in longer` is a character-substring
        # test and would treat 'art' as part of 'start time'.
        return f' {shorter} ' in f' {longer} '

    keep = {p for p, c in phrase_count.items() if c >= min_phrase_count}
    refined = []
    for row in row_phrases:
        # Longest phrases first, so each phrase is only compared against
        # phrases at least as long as itself.
        sorted_by_len = sorted(row & keep, key=lambda p: -len(p.split()))
        kept: list = []
        for p in sorted_by_len:
            if not any(p != q and _is_token_subphrase(p, q) for q in kept):
                kept.append(p)
        refined.append(set(kept))

    # Recount after refinement, re-filter
    phrase_count = Counter()
    for r in refined:
        phrase_count.update(r)

    # Filter out non-meaningful phrases (stopwords, group-name echoes,
    # dictionary markers) before they enter mutual-exclusion clustering.
    group_lower = (group_name or '').strip().lower()
    phrases = sorted(p for p, c in phrase_count.items()
                     if c >= min_phrase_count
                     and _phrase_is_meaningful(p, group_lower))
    if len(phrases) < 2 * min_slot_size:
        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}
    p_idx = {p: i for i, p in enumerate(phrases)}
    n_p = len(phrases)
    counts = np.array([phrase_count[p] for p in phrases])

    # ── 5. co-occurrence + mutual-exclusion matrix ───────────────────────────
    cooc = np.zeros((n_p, n_p), dtype=int)
    for r in refined:
        idxs = [p_idx[p] for p in r if p in p_idx]
        for i in idxs:
            for j in idxs:
                if i != j:
                    cooc[i, j] += 1
    min_counts = np.minimum.outer(counts, counts).astype(float)
    min_counts[min_counts == 0] = 1.0  # avoid division by zero
    mut_excl = 1.0 - cooc / min_counts
    np.fill_diagonal(mut_excl, 0)

    # ── 6a. UPSTREAM ROUTE: ask the LLM to classify phrases into roles ───────
    # The classifier proposes a role schema (e.g. {measure: [...],
    # statistic: [...], condition: [...]}). Only phrases that match a mined
    # phrase verbatim are used here; anything else is ignored.
    slot_source = 'mutual_exclusion'
    role_names: list = []
    slots: list = []          # list[set[int]] — phrase indices per slot
    if llm_role_classifier_fn is not None:
        try:
            classified, _ = llm_role_classifier_fn(phrases, texts, group_name)
        except Exception:
            # Best effort: any classifier failure falls back to route 6b.
            classified = None
        if classified:
            # Build slots in the order the LLM proposed them. Each role is
            # one slot containing the phrase-index set.
            for role_name, role_phrases in classified.items():
                idx_set = {p_idx[p] for p in role_phrases if p in p_idx}
                if len(idx_set) >= min_slot_size:
                    slots.append(idx_set)
                    role_names.append(role_name)
        if len(slots) >= 2:
            slot_source = 'llm_role_classification'

    # ── 6b. FALLBACK: greedy mutual-exclusion BFS ────────────────────────────
    if slot_source == 'mutual_exclusion':
        visited: set = set()
        slots = []
        order = np.argsort(-counts)
        for seed in order:
            if seed in visited:
                continue
            slot = {int(seed)}
            queue = [int(seed)]
            while queue:
                cur = queue.pop()
                for j in np.where(mut_excl[cur] >= excl_threshold)[0]:
                    j = int(j)
                    if j in slot:
                        continue
                    # A candidate must be (almost) exclusive with every
                    # phrase already in the slot, not just the current one.
                    if all(mut_excl[j, k] >= excl_threshold - 0.15 for k in slot):
                        slot.add(j)
                        queue.append(j)
            if len(slot) >= min_slot_size:
                slots.append(slot)
                visited |= slot
        # Synthesise anonymous role names from the most-frequent phrase in
        # each slot — these become the visible slot tags in provenance.
        role_names = [phrases[max(s, key=lambda i: counts[i])] for s in slots]

    if len(slots) < 2:
        return {'slots': [], 'row_assignments': [], 'coverage': 0.0, 'valid': False}

    # ── 7. assign per-row phrase per slot ────────────────────────────────────
    row_assignments: list = []
    covered = 0
    for r in refined:
        row_idx = {p_idx[p] for p in r if p in p_idx}
        assignment: dict = {}
        for slot_id, slot in enumerate(slots):
            matched = row_idx & slot
            if matched:
                # When a row hits several phrases of one slot, the most
                # frequent phrase represents it.
                best = max(matched, key=lambda i: counts[i])
                assignment[slot_id] = phrases[best]
        if assignment:
            covered += 1
        row_assignments.append(assignment)
    coverage = covered / n
    valid = (coverage >= coverage_threshold and len(slots) >= 2)
    return {
        'slots': [{'phrases': {phrases[i] for i in s},
                   'best': phrases[max(s, key=lambda i: counts[i])],
                   'role_name': role_names[idx] if idx < len(role_names) else ''}
                  for idx, s in enumerate(slots)],
        'row_assignments': row_assignments,
        'coverage': round(coverage, 4),
        'valid': valid,
        'slot_source': slot_source,   # 'llm_role_classification' or 'mutual_exclusion'
    }
def build_slot_hierarchy(vi_list: list,
                         slot_result: dict,
                         can: pd.DataFrame,
                         parent_id: int,
                         parent_path: str,
                         nodes: list,
                         node_map: dict,
                         var_nodes: dict,
                         max_depth_remaining: int = 4,
                         post_slot_split_fn: Optional[Callable] = None,
                         min_post_slot_size: int = 4) -> bool:
    """
    Build a hierarchy for `vi_list` from inferred phrase slots.

    `slot_result` must carry 'valid', 'slots' (list of dicts, optionally with
    'role_name') and 'row_assignments' (one dict slot_id -> phrase per entry
    of `vi_list`, same order). Each usable slot becomes one tree level.

    Slot order (shallowest first):
      * data-driven score — more distinct phrases, fewer singleton phrases,
        higher row coverage, longer phrases; penalised when the phrases look
        like conditions (number/'all'/'none'/'simultaneous' plus a unit-like
        word such as 'seconds' or 'trials');
      * when the slots came from LLM role classification, the role name is
        ranked first (measure-like > statistic-like > condition-like >
        subtype-like > unknown) and the score only breaks ties.
    Slots with a non-positive score are dropped.

    At each level variables are grouped by their phrase for the slot: a slot
    yielding at most one distinct phrase is skipped without consuming depth;
    a phrase shared by >= 2 variables becomes an aggregation node; a phrase
    carried by one variable attaches that leaf directly; variables without a
    phrase stay at the current parent. When slots or depth run out,
    `post_slot_split_fn(vi_grp, parent, depth_left)` may deepen the branch
    (group size >= `min_post_slot_size`); exceptions from it are swallowed
    and the leaves attached directly.

    `can` and `parent_path` are not read by this function.

    Returns True when at least 2 aggregation nodes were added; False when the
    result is too shallow to be useful (caller should then fall back to
    NMF/GMM). Nodes added before a False return are NOT rolled back.
    """
    if not slot_result.get('valid'):
        return False
    slots = slot_result['slots']
    row_slots = slot_result['row_assignments']
    if len(vi_list) != len(row_slots):
        return False

    # Global variable index → position in row_slots.
    row_of = {vi: pos for pos, vi in enumerate(vi_list)}

    # Words that, next to a number, mark a phrase as a condition rather than
    # a base concept. Generic vocabulary, no domain knowledge.
    condition_words = {'second', 'seconds', 'minute', 'minutes', 'hour',
                       'hours', 'day', 'days', 'month', 'months', 'year',
                       'years', 'week', 'weeks', 'box', 'boxes', 'token',
                       'tokens', 'pattern', 'patterns', 'trial', 'trials'}
    quantifiers = {'all', 'none', 'simultaneous'}

    def _looks_like_condition(phrase: str) -> bool:
        words = phrase.split()
        if not words:
            return False
        numeric = any(w.isdigit() or w in quantifiers for w in words)
        unit_like = any(w in condition_words for w in words)
        return numeric and unit_like

    def _slot_score(slot_id: int) -> float:
        """Partition-quality score of one slot; -1.0 when no row uses it."""
        tally = Counter()
        for assignment in row_slots:
            phrase = assignment.get(slot_id)
            if phrase:
                tally[phrase] += 1
        if not tally:
            return -1.0
        n_distinct = len(tally)
        n_singletons = sum(1 for hits in tally.values() if hits == 1)
        coverage = sum(tally.values()) / max(1, len(row_slots))
        avg_tokens = float(np.mean([len(p.split()) for p in tally]))
        condition_frac = (sum(1 for p in tally if _looks_like_condition(p))
                          / n_distinct)
        return (n_distinct - 0.6 * n_singletons
                + 0.4 * coverage
                + 0.3 * (avg_tokens - 1)
                - 0.8 * condition_frac)

    # Role-name tiers, checked in order; keywords match as substrings of the
    # lower-cased role name.
    role_tiers = (
        (4, {'measure', 'outcome', 'metric', 'variable', 'quantity'}),
        (3, {'statistic', 'stat', 'aggregate', 'summary'}),
        (2, {'condition', 'modifier', 'context', 'setting'}),
        (1, {'subtype', 'type', 'kind', 'category'}),
    )

    def _role_priority(role: str) -> int:
        name = (role or '').strip().lower()
        for rank, keywords in role_tiers:
            if any(k in name for k in keywords):
                return rank
        return 0   # unknown role — fall back to data-driven score

    slot_source = slot_result.get('slot_source', 'mutual_exclusion')
    slot_ids = range(len(slots))
    # Score each slot once; the values feed both the ordering and the filter.
    score = {sid: _slot_score(sid) for sid in slot_ids}
    if slot_source == 'llm_role_classification':
        ranked = sorted(
            slot_ids,
            key=lambda sid: (_role_priority(slots[sid].get('role_name', '')),
                             score[sid]),
            reverse=True)
    else:
        ranked = sorted(slot_ids, key=lambda sid: score[sid], reverse=True)
    ordered_slots = [sid for sid in ranked if score[sid] > 0]
    if not ordered_slots:
        return False

    made = 0   # aggregation nodes added so far; drives the fallback decision

    def _attach(members: list, parent: int, depth_left: int):
        """
        Terminal attach point. When slots are exhausted but the group is
        still large enough and a post-slot callback exists, let it deepen
        the tree; otherwise attach the leaves directly.
        """
        nonlocal made
        delegate = (post_slot_split_fn is not None
                    and len(members) >= min_post_slot_size
                    and depth_left > 0)
        if delegate:
            try:
                added = post_slot_split_fn(members, parent, depth_left)
                if added > 0:
                    made += added
                    return
            except Exception:
                # Best effort: a failing splitter falls back to flat leaves.
                pass
        for vi in members:
            _add_child(node_map, parent, var_nodes[vi])

    def _partition(members: list, level: int, parent: int, depth_left: int):
        """Split `members` on the slot at position `level` of ordered_slots."""
        nonlocal made
        exhausted = (depth_left <= 0 or level >= len(ordered_slots)
                     or len(members) <= 1)
        if exhausted:
            _attach(members, parent, depth_left)
            return

        slot_id = ordered_slots[level]
        by_phrase: dict = defaultdict(list)
        leftover: list = []
        for vi in members:
            pos = row_of.get(vi)
            phrase = row_slots[pos].get(slot_id) if pos is not None else None
            if phrase:
                by_phrase[phrase].append(vi)
            else:
                leftover.append(vi)

        if len(by_phrase) <= 1:
            # This slot doesn't partition the subset; move to the next one.
            _partition(members, level + 1, parent, depth_left)
            return

        # Role name of the slot (set when slots came from LLM classification).
        if slot_id < len(slots):
            slot_role = slots[slot_id].get('role_name', '')
        else:
            slot_role = ''

        # Largest groups first; the sort is stable so ties keep first-seen order.
        ordered = sorted(by_phrase.items(),
                         key=lambda item: len(item[1]), reverse=True)
        for phrase, grp in ordered:
            if len(grp) == 1:
                # Singleton prevention: no wrapper node around a single leaf.
                _add_child(node_map, parent, var_nodes[grp[0]])
                continue
            new_id = _next_id(nodes)
            node = _make_node(new_id, phrase.title(),
                              desc=(f'Role: {slot_role or "—"} | '
                                    f'Slot phrase: "{phrase}" | '
                                    f'Variables: {len(grp)} | '
                                    f'Source: phrase-slot mining ({slot_source})'))
            node['label_provenance'] = {
                'label_source': 'phrase_slot',
                'evidence_terms': [phrase],
                'confidence': 1.0,
                'llm_used': slot_source == 'llm_role_classification',
                'llm_rejected': False,
                'role': slot_role,
            }
            node['structure_provenance'] = {
                'route': 'slot_mining',
                'aspect_method': slot_source,
                'silhouette': None,
                'slot_coverage': round(float(slot_result.get('coverage', 0)), 3),
                'slot_role': slot_role,
            }
            nodes.append(node)
            node_map[new_id] = node
            _add_child(node_map, parent, new_id)
            made += 1
            _partition(grp, level + 1, new_id, depth_left - 1)

        if leftover:
            _attach(leftover, parent, depth_left)

    _partition(vi_list, 0, parent_id, max_depth_remaining)
    # Reject the slot-built tree if it added almost no structure (likely the
    # slots were not actually useful for this group).
    return made >= 2
# ──────────────────────────────────────────────────────────────────────────────
# STEP 6 — DYNAMIC TOP-DOWN LOD TREE [ZHU §3.3 adapted]
# ──────────────────────────────────────────────────────────────────────────────
def _next_id(nodes: list) -> int:
return max((int(n['id']) for n in nodes), default=0) + 1
def _add_child(node_map: dict, parent_id: int, child_id: int):
p = node_map.get(int(parent_id))
if p and int(child_id) not in p['related']:
p['related'].append(int(child_id))
def _make_node(nid, name, ntype='aggregation', desc='', dtype='determine') -> dict:
return {'id': int(nid), 'name': str(name), 'related': [],
'type': ntype, 'desc': str(desc), 'dtype': dtype, 'isShown': True}
# ──────────────────────────────────────────────────────────────────────────────
# POST-BUILD PASS 1 — SIBLING COMMON-PREFIX FACTORING
# ──────────────────────────────────────────────────────────────────────────────
def factor_sibling_common_prefixes(nodes: list,
                                   min_siblings: int = 3,
                                   min_prefix_tokens: int = 2) -> int:
    """
    For each parent whose ≥`min_siblings` aggregation children share a
    common multi-token title prefix, insert a new intermediate parent named
    by that prefix and re-attach the matching siblings under it (with the
    prefix stripped from each name).

    Details:
      * Prefixes are compared case-insensitively on whitespace tokens; the
        longest qualifying prefix wins, and a child must have at least one
        token left after the prefix.
      * Prefixes that start or end with a stopword (`_STOPWORDS`) are
        rejected — a factored parent should be a real concept phrase.
      * The new node's name uses the capitalisation of the first matched
        child (children are sorted by their token lists).
      * `label_provenance['evidence_terms']` holds the ORIGINAL sibling names
        the prefix was taken from; the sibling ids are recorded in
        `structure_provenance['factored_from']`.
      * After an insertion both the parent and the new node are re-examined,
        so several prefix groups and nested prefixes are all factored.

    Generic, no domain knowledge. Inspired by sibling-label factoring
    common in faceted-classification systems (Stoica & Hearst, NAACL 2007,
    'Castanet') — collapsing redundant repeated tokens in sibling names.

    `nodes` is modified in place. Returns the number of factor-parents
    inserted.
    """
    from collections import deque

    node_map = {int(n['id']): n for n in nodes}
    inserted = 0
    # Work list seeded with a snapshot of the current aggregation/root nodes.
    queue = deque(int(n['id']) for n in nodes
                  if n.get('type') in ('aggregation', 'root'))
    while queue:
        parent_id = queue.popleft()
        parent = node_map.get(parent_id)
        if not parent:
            continue

        # Gather aggregation children with their tokenised names
        agg_children = []
        for cid in parent.get('related', []):
            child = node_map.get(int(cid))
            if not child or child.get('type') != 'aggregation':
                continue
            toks = str(child.get('name', '')).split()
            if len(toks) >= min_prefix_tokens:
                agg_children.append((int(cid), toks))
        # The emptiness check keeps max() below safe even if a caller passes
        # min_siblings <= 0.
        if not agg_children or len(agg_children) < min_siblings:
            continue

        # Greedy: find the longest prefix shared by ≥ min_siblings children
        best_prefix: list = []
        best_group: list = []
        agg_children.sort(key=lambda item: item[1])   # stable grouping order
        max_len = max(len(toks) for _, toks in agg_children)
        for length in range(max_len, min_prefix_tokens - 1, -1):
            prefix_counts: Counter = Counter()
            for _, toks in agg_children:
                if len(toks) > length:   # must have something AFTER the prefix
                    prefix_counts[tuple(t.lower() for t in toks[:length])] += 1
            for pfx, cnt in prefix_counts.most_common():
                if cnt < min_siblings:
                    break   # most_common() is sorted; nothing later qualifies
                # Reject low-quality prefixes: empty, or starting/ending with
                # a stopword (e.g. "the number of", "the"). This also covers
                # all-stopword prefixes.
                if not pfx or pfx[0] in _STOPWORDS or pfx[-1] in _STOPWORDS:
                    continue
                group = [(cid, toks) for cid, toks in agg_children
                         if len(toks) > length
                         and tuple(t.lower() for t in toks[:length]) == pfx]
                if len(group) >= min_siblings:
                    best_prefix = list(pfx)
                    best_group = group
                    break
            if best_prefix:
                break
        if not best_prefix:
            continue

        # Build the new intermediate parent
        new_id = max(node_map) + 1
        moved_ids = [cid for cid, _ in best_group]
        # Capture the sibling names BEFORE they are shortened below: these
        # are the textual evidence the factored label was derived from.
        sibling_names = [' '.join(toks) for _, toks in best_group]
        # Use the original capitalisation of the first matched child.
        prefix_name = ' '.join(best_group[0][1][:len(best_prefix)])
        new_node = _make_node(new_id, prefix_name, ntype='aggregation',
                              desc=(f'Factored common prefix: "{prefix_name}" | '
                                    f'Siblings: {len(best_group)} | '
                                    f'Source: sibling factoring [Castanet 2007]'))
        new_node['label_provenance'] = {
            'label_source': 'factored_common_prefix',
            'evidence_terms': sibling_names,
            'confidence': 1.0,
            'llm_used': False,
            'llm_rejected': False,
        }
        new_node['structure_provenance'] = {
            'route': 'sibling_factoring',
            'aspect_method': None,
            'silhouette': None,
            'slot_coverage': None,
            'factored_from': list(moved_ids),
            'common_prefix_tokens': len(best_prefix),
        }

        # Rename the factored siblings (strip the prefix from their names)
        for cid, toks in best_group:
            new_name = ' '.join(toks[len(best_prefix):]).strip()
            if new_name:
                node_map[cid]['name'] = new_name

        # Rewire parent → new_node → factored siblings
        moved = set(moved_ids)
        parent['related'] = [c for c in parent['related']
                             if int(c) not in moved]
        parent['related'].append(new_id)
        new_node['related'] = list(moved_ids)
        nodes.append(new_node)
        node_map[new_id] = new_node
        inserted += 1

        # Re-examine this parent in case multiple prefix groups exist
        queue.append(parent_id)
        # Also examine the new parent for further nesting
        queue.append(new_id)
    return inserted
# ──────────────────────────────────────────────────────────────────────────────
# POST-BUILD PASS 2 — TRACO-INSPIRED LOW-QUALITY NODE PRUNING
# ──────────────────────────────────────────────────────────────────────────────
def prune_low_quality_aggregations(nodes: list,
                                   tfidf: TfidfVectorizer = None,
                                   min_coherence: float = 0.0,
                                   max_child_ratio: float = 1.0) -> int:
    """
    Conservative noise pruning: dissolve aggregation nodes whose label is
    pure noise and promote their children to the grandparent.

    A label is noise when it is empty, or when every whitespace token is
    either a stopword / dictionary marker ('key', 'note', 'item', 'label',
    'group', 'cluster') or a single non-digit character. Examples: 'And',
    'Them', 'Key', 'To Be', 'Have A Lot Of'. Single-digit tokens are kept —
    numeric labels may be meaningful.

    Never pruned:
      * nodes that are not of type 'aggregation';
      * nodes without a parent;
      * nodes whose structure_provenance route is 'group_anchor' or
        'sibling_factoring' — these are structural and may legitimately
        carry short names.

    `tfidf`, `min_coherence` and `max_child_ratio` are accepted but NOT used
    by this implementation: the parent-child coherence and lopsided-split
    rules they would drive are intentionally disabled, because both punish
    good hierarchies (children legitimately more specific than parents, or a
    branch dominated by one large subgroup).

    Inspired by TraCo (Wu et al., AAAI 2024): affinity / rationality /
    diversity failures are common in hierarchical topic models. Only the most
    conservative form is enforced here — obviously-noise names.

    A dissolved node stays in `nodes` with isShown=False, type='dissolved'
    and structure_provenance['dissolved_reason']='noise_label_only'; its own
    'related' list is left untouched. Returns the number of nodes dissolved.
    """
    by_id = {int(node['id']): node for node in nodes}

    # First parent wins when a child is listed under several nodes.
    parent_of: dict = {}
    for node in nodes:
        owner = int(node['id'])
        for child in node.get('related', []):
            parent_of.setdefault(int(child), owner)

    noise_words = frozenset(_STOPWORDS) | frozenset({
        'key', 'note', 'item', 'label', 'group', 'cluster',
    })
    # Routes that are ALWAYS structural — never pruned even when the name is
    # short or stopword-like.
    protected_routes = frozenset({'group_anchor', 'sibling_factoring'})

    def _is_noise_token(tok: str) -> bool:
        if tok in noise_words:
            return True
        return len(tok) < 2 and not tok.isdigit()

    # ── select the nodes to dissolve ─────────────────────────────────────────
    doomed: list = []
    for node in nodes:
        if node.get('type') != 'aggregation':
            continue
        node_id = int(node['id'])
        if node_id not in parent_of:
            continue
        provenance = node.get('structure_provenance', {})
        if provenance.get('route') in protected_routes:
            continue
        label = str(node.get('name', '')).strip().lower()
        if not label:
            doomed.append(node_id)
            continue
        tokens = label.split()
        every_token_noise = bool(tokens) and all(
            _is_noise_token(tok) for tok in tokens)
        if label in noise_words or every_token_noise:
            doomed.append(node_id)

    # ── dissolve: promote children to the grandparent ────────────────────────
    dissolved = 0
    for node_id in doomed:
        node = by_id.get(node_id)
        if not node:
            continue
        grand_id = parent_of.get(node_id)
        if grand_id is None:
            continue
        grand = by_id.get(grand_id)
        if not grand:
            continue
        grand['related'] = [c for c in grand['related'] if int(c) != node_id]
        for child in node.get('related', []):
            child = int(child)
            if child not in grand['related']:
                grand['related'].append(child)
            # Keep the parent map current so that a dissolved child of this
            # node is later promoted to the right ancestor.
            parent_of[child] = grand_id
        node['isShown'] = False
        node['type'] = 'dissolved'
        provenance = node.setdefault('structure_provenance', {})
        provenance['dissolved_reason'] = 'noise_label_only'
        dissolved += 1
    return dissolved
def build_dynamic_lod_tree(can: pd.DataFrame,
aspect_reprs: list,
aspect_labels: list,
tfidf: TfidfVectorizer,
max_depth: int = 6,
min_cluster_size: int = 2,
sil_threshold: float = 0.04,
max_clusters_per_split: int = 8,
project: str = 'project',
local_nmf: bool = True,
min_local_nmf_size: int = 8,
max_aspects: int = 10,
sbert_model=None,
llm_label_fn: Optional[Callable] = None,
use_slot_mining: bool = True,
text_col_names: Optional[list] = None,
use_fastopic: bool = True,
fastopic_min_size: int = 8,
llm_role_classifier_fn: Optional[Callable] = None,
use_role_decomposition: bool = True,
llm_role_namer_fn: Optional[Callable] = None,
role_regularity_threshold: float = 0.40,
per_row_role_extractor_fn: Optional[Callable] = None,
use_per_row_role_extraction: bool = True) -> list:
"""
Build a dynamic top-down LoD tree.
Entry strategy (new — fully data-driven, no hardcoding):
• If detected _group metadata provides L1/L2 structure, materialise those
path segments as aggregation nodes first.
• Then apply NMF aspect discovery and GMM clustering *locally* inside each
terminal group (not globally across all variables).
• Falls back to global NMF from root when no group structure is found.
Within each recursive split [ZHU §3.3 adapted]:
• Evaluate K aspects by silhouette — select the highest (simplified
best-aspect split; not the full probabilistic search of Zhu Eq.6/7).
• Single-variable clusters are attached directly without an aggregation
wrapper (singleton prevention).
"""
texts = can['_text'].fillna('').astype(str).tolist()
n_vars = len(can)
# ── build leaf attribute nodes (ids 1..n_vars) ───────────────────────────
nodes: list = [_make_node(0, project, ntype='root', desc='Root node')]
var_nodes: dict = {} # can positional index → node id
for i, (_, row) in enumerate(can.iterrows(), start=1):
nd = _make_node(i, row['_label'], ntype='attribute',
desc=row['_text'], dtype='determine')
nd['metadata'] = {
'leaf_id': str(row['_id']),
'row_index': int(row['_row']),
'group': row['_group'],
}
nodes.append(nd)
var_nodes[int(row.name)] = i
node_map: dict = {int(n['id']): n for n in nodes}
# ── recursive splitter ────────────────────────────────────────────────────
def _recurse(vi_global: list,
cur_reprs: list,
cur_labels: list,
cur_tfidf: TfidfVectorizer,
parent_id: int,
depth: int,
parent_path: str,
aspect_method_tag: str = 'nmf'):
"""
vi_global : global positional indices into `can` for this node's variables.
cur_reprs : list of K arrays, each shape (len(vi_global), d).
Rows correspond positionally to vi_global — no global indexing.
cur_labels : NMF aspect labels for cur_reprs.
cur_tfidf : TF-IDF vectorizer fitted on this scope's texts.
"""
if depth >= max_depth or len(vi_global) < min_cluster_size:
for vi in vi_global:
_add_child(node_map, parent_id, var_nodes[vi])
return
# NEW: shortcut for tiny homogeneous clusters — same _group, ≤3 vars.
# Avoids spending GMM/silhouette evaluation on already-meaningful leaves.
if len(vi_global) <= 3:
groups_here = {str(can.iloc[vi]['_group']) for vi in vi_global}
if len(groups_here) == 1:
for vi in vi_global:
_add_child(node_map, parent_id, var_nodes[vi])
return
sub_texts = [texts[vi] for vi in vi_global]
# Evaluate every aspect — pick the one with highest silhouette [ZHU §3.3]
best_sil, best_k_idx, best_lbls = -1.0, -1, None
for k_idx, rep in enumerate(cur_reprs):
if rep.shape[0] < 3:
continue
lbls, _, sil = cluster_aspect_gmm(
rep, max_k=min(max_clusters_per_split, len(vi_global) // 2))
if sil > best_sil:
best_sil, best_k_idx, best_lbls = sil, k_idx, lbls
if best_k_idx == -1 or best_sil < sil_threshold or best_lbls is None:
for vi in vi_global:
_add_child(node_map, parent_id, var_nodes[vi])
return
# NEW: split-quality rejection (TraCo-inspired structural check).
# Reject splits that are extremely imbalanced or mostly singletons —
# silhouette can be high even when one cluster swallows most variables.
cluster_sizes = list(Counter(best_lbls).values())
max_ratio = max(cluster_sizes) / sum(cluster_sizes)
n_singletons = sum(1 for s in cluster_sizes if s == 1)
if max_ratio > 0.85 or n_singletons > len(cluster_sizes) // 2:
for vi in vi_global:
_add_child(node_map, parent_id, var_nodes[vi])
return
# Group variables by cluster assignment
cluster_to_global: dict = defaultdict(list)
for local_pos, (vi, cl) in enumerate(zip(vi_global, best_lbls)):
cluster_to_global[int(cl)].append(vi)
aspect_name = cur_labels[best_k_idx]
global_to_local = {vi: idx for idx, vi in enumerate(vi_global)}
for cl_id, cl_global in cluster_to_global.items():
if not cl_global:
continue
# Singleton prevention: attach single-variable clusters directly [NEW]
if len(cl_global) == 1:
_add_child(node_map, parent_id, var_nodes[cl_global[0]])
continue
cl_texts = [texts[vi] for vi in cl_global]
cl_groups = [str(can.iloc[vi]['_group']) for vi in cl_global]
lbl, prov = label_cluster(cl_texts, sub_texts, cur_tfidf,
cluster_groups=cl_groups,
parent_path=parent_path,
llm_label_fn=llm_label_fn,
return_provenance=True)
desc = (f'Aspect: {aspect_name} | '
f'Silhouette: {best_sil:.3f} | '
f'Variables: {len(cl_global)}')
nid = _next_id(nodes)
nd = _make_node(nid, lbl, desc=desc)
nd['label_provenance'] = prov
nd['structure_provenance'] = {
'route': 'aspect_clustering',
'aspect_method': aspect_method_tag,
'silhouette': round(float(best_sil), 4),
'slot_coverage': None,
}
nodes.append(nd)
node_map[nid] = nd
_add_child(node_map, parent_id, nid)
# Slice embeddings to this cluster's local positions and recurse
cl_pos = [global_to_local[vi] for vi in cl_global]
sub_reprs = [rep[cl_pos] for rep in cur_reprs]
_recurse(cl_global, sub_reprs, cur_labels, cur_tfidf,
nid, depth + 1, f'{parent_path} > {lbl}',
aspect_method_tag=aspect_method_tag)
# ── group-anchored entry ─────────────────────────────────────────────────
# The _group column is built from user-confirmed group_cols — fully data-driven.
# We materialise each path segment as an aggregation node, then run NMF/GMM
# only inside each terminal group.
group_node_ids: dict = {} # path string → node id
def _get_or_create_group_node(path: str, parent_id: int) -> int:
if path in group_node_ids:
return group_node_ids[path]
seg_name = path.split(' > ')[-1].strip()
nid = _next_id(nodes)
nd = _make_node(nid, seg_name, ntype='aggregation', desc=f'Group: {path}')
nd['structure_provenance'] = {
'route': 'group_anchor',
'aspect_method': None,
'silhouette': None,
'slot_coverage': None,
}
nodes.append(nd)
node_map[nid] = nd
_add_child(node_map, parent_id, nid)
group_node_ids[path] = nid
return nid
# Collect global variable positions per terminal group path
terminal_groups: dict = defaultdict(list)
for vi, (_, row) in enumerate(can.iterrows()):
terminal_groups[str(row['_group'])].append(vi)
non_ungrouped = [g for g in terminal_groups if g.strip().lower() != 'ungrouped']
if not non_ungrouped:
# No group structure detected → fall back to global NMF from root
full_reprs = [rep[list(range(n_vars))] for rep in aspect_reprs]
_recurse(list(range(n_vars)), full_reprs, aspect_labels, tfidf,
0, 0, project)
else:
# Optional Streamlit progress bar — visible feedback for large datasets
# like HCP where the per-group loop dominates runtime.
try:
import streamlit as _st_progress
_pbar = _st_progress.progress(0.0, text='Building groups…')
_show_progress = True
except Exception:
_pbar = None
_show_progress = False
sorted_groups = sorted(terminal_groups.items())
n_groups_total = len(sorted_groups)
for _g_idx, (group_path, vi_list) in enumerate(sorted_groups):
if _show_progress and _pbar is not None:
try:
short_path = group_path[:60] + ('…' if len(group_path) > 60 else '')
_pbar.progress((_g_idx + 1) / max(1, n_groups_total),
text=f'[{_g_idx + 1}/{n_groups_total}] '
f'{short_path} ({len(vi_list)} vars)')
except Exception:
pass
# Build L1/L2 path nodes from detected group metadata
segments = [s.strip() for s in group_path.split(' > ') if s.strip()]
pid = 0
for depth_seg in range(len(segments)):
cumpath = ' > '.join(segments[:depth_seg + 1])
pid = _get_or_create_group_node(cumpath, pid)
existing_depth = len(segments)
if len(vi_list) < min_cluster_size:
for vi in vi_list:
_add_child(node_map, pid, var_nodes[vi])
continue
# ── Aspect-discovery callback for this group ──
# Top-level: optionally uses FASTopic (slow, transformer-based,
# semantic). Sub-recursion: NMF only (fast, lexical). This split
# gives the best of both: rich top-level structure + quick deeper
# splits. Critical for HCP performance (~10× speedup vs running
# FASTopic in every sub-recursion).
def _aspect_recurse_for_group(vi_sub: list, parent: int,
depth_left: int = 99,
is_top_level: bool = False) -> int:
local_texts = [texts[vi] for vi in vi_sub]
use_reprs, use_labels, use_tfidf = None, None, None
aspect_method_tag = 'sliced_global'
if (is_top_level and use_fastopic and _FASTOPIC_AVAILABLE
and len(vi_sub) >= fastopic_min_size):
try:
f_tfidf, _, _, _, f_H, _, f_labels = discover_aspects_fastopic(
local_texts, max_aspects=max_aspects)
f_reprs = per_aspect_representations(
local_texts, f_H, f_tfidf, sbert_model)
use_reprs, use_labels, use_tfidf = f_reprs, f_labels, f_tfidf
aspect_method_tag = 'fastopic'
except Exception:
use_reprs = None
if use_reprs is None and local_nmf and len(vi_sub) >= min_local_nmf_size:
try:
l_tfidf, _, _, _, l_H, _, l_labels = discover_aspects(
local_texts, max_aspects)
l_reprs = per_aspect_representations(
local_texts, l_H, l_tfidf, sbert_model)
use_reprs, use_labels, use_tfidf = l_reprs, l_labels, l_tfidf
aspect_method_tag = 'nmf'
except Exception:
use_reprs = None
if use_reprs is None:
use_reprs = [rep[vi_sub] for rep in aspect_reprs]
use_labels, use_tfidf = aspect_labels, tfidf
before = sum(1 for n in nodes if n.get('type') == 'aggregation')
effective_depth = max(0, min(depth_left, max_depth))
_recurse(vi_sub, use_reprs, use_labels, use_tfidf,
parent, max(0, max_depth - effective_depth), group_path,
aspect_method_tag=aspect_method_tag)
after = sum(1 for n in nodes if n.get('type') == 'aggregation')
return max(0, after - before)
local_texts = [texts[vi] for vi in vi_list]
_top_grp = group_path.split(' > ')[0].strip()
role_built = False
role_regularity = None
route_label = None
# ── ROUTING 1A: PER-ROW LLM ROLE EXTRACTION (Zhu et al. EMNLP 2025) ──
# The cleanest semantic route: one LLM call per row extracts
# measure / statistic / condition / subtype values directly from
# the description text. Strict substring grounding prevents
# hallucination. Bypasses SBERT phrase clustering entirely.
if (use_per_row_role_extraction
and per_row_role_extractor_fn is not None
and len(vi_list) >= 4):
try:
role_result = discover_roles_via_per_row_extraction(
local_texts,
text_col_names=text_col_names,
per_row_extractor_fn=per_row_role_extractor_fn,
group_name=_top_grp,
regularity_threshold=role_regularity_threshold,
)
role_regularity = role_result.get('regularity')
# Always capture audit data, even when valid=False, so the
# Role Decomposition tab can show what the LLM proposed
# and why proposals were rejected. group_path + vi_list
# let the display map each audit row back to its variable.
try:
st.session_state.a2_per_row_audit.append({
'group_name': _top_grp,
'group_path': group_path,
'vi_list': [int(v) for v in vi_list],
'n_rows': len(vi_list),
'coverage': role_result.get('coverage'),
'valid': role_result.get('valid'),
'roles_final': list(role_result.get('roles', {}).keys()),
'summary': role_result.get('extractor_summary', {}),
'per_row_audit': role_result.get('per_row_audit', []),
})
except Exception:
pass
if role_result['valid']:
role_built = build_role_hierarchy(
vi_list, role_result, can,
pid, group_path, nodes, node_map, var_nodes,
max_depth_remaining=max(2, max_depth - existing_depth),
post_split_fn=lambda vi_grp, p, d:
_aspect_recurse_for_group(vi_grp, p, d,
is_top_level=False),
min_post_split_size=max(6, min_cluster_size + 4))
if role_built:
route_label = 'per_row_llm_extraction'
except Exception:
role_built = False
# ── ROUTING 1B: SBERT phrase clustering fallback (Option D original) ──
if not role_built and (use_role_decomposition and sbert_model is not None
and len(vi_list) >= 6):
try:
role_result = discover_roles_via_sbert_phrase_clustering(
local_texts,
text_col_names=text_col_names,
sbert_model=sbert_model,
llm_role_namer_fn=llm_role_namer_fn,
group_name=_top_grp,
regularity_threshold=role_regularity_threshold,
)
role_regularity = role_result.get('regularity')
if role_result['valid']:
role_built = build_role_hierarchy(
vi_list, role_result, can,
pid, group_path, nodes, node_map, var_nodes,
max_depth_remaining=max(2, max_depth - existing_depth),
post_split_fn=lambda vi_grp, p, d:
_aspect_recurse_for_group(vi_grp, p, d,
is_top_level=False),
min_post_split_size=max(6, min_cluster_size + 4))
if role_built:
route_label = 'sbert_phrase_clustering'
except Exception:
role_built = False
if role_built:
anchor = node_map.get(pid)
if anchor is not None:
anchor.setdefault('structure_provenance', {})
anchor['structure_provenance']['phrase_regularity'] = role_regularity
anchor['structure_provenance']['route_used'] = route_label
continue
# ── ROUTING 2: phrase-slot mining (IE / mutual-exclusion) ──
slot_built = False
if use_slot_mining and len(vi_list) >= 6:
try:
slot_result = mine_phrase_slots(
local_texts,
text_col_names=text_col_names,
group_name=_top_grp,
llm_role_classifier_fn=llm_role_classifier_fn,
)
if slot_result['valid']:
slot_built = build_slot_hierarchy(
vi_list, slot_result, can,
pid, group_path, nodes, node_map, var_nodes,
max_depth_remaining=max(2, max_depth - existing_depth),
post_slot_split_fn=lambda vi_grp, p, d:
_aspect_recurse_for_group(vi_grp, p, d,
is_top_level=False),
min_post_slot_size=max(6, min_cluster_size + 4))
except Exception:
slot_built = False
if slot_built:
anchor = node_map.get(pid)
if anchor is not None:
anchor.setdefault('structure_provenance', {})
anchor['structure_provenance']['phrase_regularity'] = role_regularity
anchor['structure_provenance']['route_used'] = 'slot_mining_fallback'
continue
# ── ROUTING 3: aspect discovery (FASTopic / NMF) ──
anchor = node_map.get(pid)
if anchor is not None:
anchor.setdefault('structure_provenance', {})
anchor['structure_provenance']['phrase_regularity'] = role_regularity
anchor['structure_provenance']['route_used'] = 'aspect_clustering_fallback'
_aspect_recurse_for_group(vi_list, pid, is_top_level=True)
# Clear the progress bar
try:
if _show_progress and _pbar is not None:
_pbar.empty()
except Exception:
pass
# ── POST-BUILD PASS 1 — sibling common-prefix factoring [Castanet 2007] ──
try:
n_factored = factor_sibling_common_prefixes(
nodes, min_siblings=3, min_prefix_tokens=2)
except Exception:
n_factored = 0
# ── POST-BUILD PASS 2 — conservative noise-label pruning ──────────────────
try:
n_dissolved = prune_low_quality_aggregations(nodes)
except Exception:
n_dissolved = 0
# ── POST-BUILD PASS 3 — strip group prefix from child labels ──────────────
# 'DMS — Total Errors' under DMS → 'Total Errors'. Reduces visual
# redundancy without altering tree structure.
try:
n_stripped = strip_group_prefix_from_labels(nodes)
except Exception:
n_stripped = 0
# ── POST-BUILD PASS 4 — enforce single parent (collapse DAG → tree) ───────
# Keeps each variable under its most specific parent so the hierarchy is a
# true tree, matching the viewer/evaluator assumptions and rendering proportionally.
try:
n_reparented = enforce_single_parent(nodes)
except Exception:
n_reparented = 0
# ── POST-BUILD PASS 5 — drop aggregation nodes left childless by PASS 4 ───
try:
n_empty_pruned = prune_empty_aggregations(nodes)
except Exception:
n_empty_pruned = 0
# Annotate the root with post-build statistics
if nodes and nodes[0].get('type') == 'root':
nodes[0]['post_build_stats'] = {
'sibling_factor_nodes_inserted': int(n_factored),
'low_quality_nodes_dissolved': int(n_dissolved),
'group_prefix_labels_stripped': int(n_stripped),
'dag_links_removed': int(n_reparented),
'empty_aggregations_pruned': int(n_empty_pruned),
}
# Deduplicate children
for nd in nodes:
nd['related'] = list(dict.fromkeys(int(x) for x in nd['related']))
return nodes
# ──────────────────────────────────────────────────────────────────────────────
# STEP 9 — EVALUATION [ZHU Table 2 / TraCo / TICL §3.4]
# ──────────────────────────────────────────────────────────────────────────────
def purity_score(true_labels, pred_labels) -> float:
    """
    Cluster purity: the share of items whose ground-truth label is the
    majority ground-truth label of the predicted cluster they fall in.

    true_labels : sequence of ground-truth labels. Any values NumPy can sort
                  are accepted (non-negative ints, negative ints such as a
                  -1 "noise" label, strings).
    pred_labels : sequence of predicted cluster ids, aligned with true_labels.

    Returns a float between 0 and 1; 0.0 when the input is empty.
    """
    true = np.asarray(true_labels)
    pred = np.asarray(pred_labels)
    total = len(true)
    if total == 0:
        return 0.0
    # np.bincount only accepts non-negative integers, so map the ground-truth
    # labels onto dense codes 0..C-1 first. This keeps the result unchanged
    # for ordinary integer labels and makes negative / string labels work.
    _, true_codes = np.unique(true, return_inverse=True)
    true_codes = np.ravel(true_codes)
    majority_total = 0
    for cl in np.unique(pred):
        mask = pred == cl
        # A NaN cluster id never compares equal to itself -> empty mask.
        if not mask.any():
            continue
        majority_total += int(np.bincount(true_codes[mask]).max())
    return majority_total / total
def evaluate(true_labels, pred_labels) -> dict:
    """
    Score a flat clustering against reference labels.

    The reference labels are stringified and label-encoded; the predicted
    labels are cast to an integer array. Returns a dict with the keys
    'NMI', 'ARI' and 'Purity', each rounded to four decimals.
    """
    encoded_truth = LabelEncoder().fit_transform(
        [str(label) for label in true_labels])
    predicted = np.array(pred_labels, dtype=int)

    nmi = normalized_mutual_info_score(encoded_truth, predicted)
    ari = adjusted_rand_score(encoded_truth, predicted)
    purity = purity_score(encoded_truth, predicted)

    report = {}
    report['NMI'] = round(float(nmi), 4)
    report['ARI'] = round(float(ari), 4)
    report['Purity'] = round(purity, 4)
    return report
def hierarchy_quality_metrics(nodes: list, tfidf: TfidfVectorizer) -> dict:
    """
    Compute lightweight structural quality metrics inspired by TraCo (AAAI 2024).
    [TraCo] Wu et al. identify three failure modes in hierarchical topic models:
    low affinity (children unrelated to parent), low rationality (children not
    more specific than parent), and low diversity (sibling topics too similar).
    Here we measure:
    • parent-child coherence: mean cosine(parent_label, child_label) — proxy for affinity.
    • sibling diversity: 1 - mean cosine over the n·(n-1) ordered pairs of
      *distinct* siblings — proxy for diversity.

    Only aggregation nodes with at least two aggregation children (children
    named 'Group' are ignored) contribute a score. Labels are vectorised with
    the supplied, already-fitted `tfidf`.

    Returns a dict with 'mean_parent_child_coherence', 'mean_sibling_diversity'
    (both 0.0 when no node qualified), 'n_aggregation_nodes' and
    'n_singleton_splits' (aggregation nodes with exactly one child).
    """
    node_map = {n['id']: n for n in nodes}
    coherence_scores, diversity_scores = [], []
    for n in nodes:
        if n['type'] != 'aggregation' or not n['related']:
            continue
        children = [node_map[c] for c in n['related'] if c in node_map]
        child_labels = [c['name'] for c in children
                        if c.get('type') == 'aggregation' and c['name'] != 'Group']
        n_sib = len(child_labels)
        if n_sib < 2:
            continue
        try:
            parent_vec = tfidf.transform([n['name']])
            child_vecs = tfidf.transform(child_labels)
            # Parent-child coherence
            coh = float(cosine_similarity(parent_vec, child_vecs).mean())
            # Sibling diversity: zero the self-similarity diagonal, then
            # average over the off-diagonal cells only. Dividing by n² (a
            # plain .mean()) would dilute the similarity with the zeroed
            # diagonal and overstate diversity — two identical siblings would
            # score 0.5 instead of 0.
            sib_sims = cosine_similarity(child_vecs)
            np.fill_diagonal(sib_sims, 0)
            mean_pair_sim = float(sib_sims.sum()) / (n_sib * (n_sib - 1))
            div = 1.0 - mean_pair_sim
        except Exception:
            # Best-effort diagnostic: a label the vectoriser cannot handle
            # skips this node rather than aborting the whole report.
            continue
        # Append both together so the two lists always describe the same nodes.
        coherence_scores.append(coh)
        diversity_scores.append(div)
    aggregations = [n for n in nodes if n['type'] == 'aggregation']
    return {
        'mean_parent_child_coherence': (round(float(np.mean(coherence_scores)), 4)
                                        if coherence_scores else 0.0),
        'mean_sibling_diversity': (round(float(np.mean(diversity_scores)), 4)
                                   if diversity_scores else 0.0),
        'n_aggregation_nodes': len(aggregations),
        'n_singleton_splits': len([n for n in aggregations
                                   if len(n['related']) == 1]),
    }
# ──────────────────────────────────────────────────────────────────────────────
# DISPLAY-TIME ONE-CHILD CHAIN COMPRESSION (visualization only — structure preserved in JSON)
# ──────────────────────────────────────────────────────────────────────────────
def compress_one_child_chains(nodes: list) -> list:
    """
    Display-only transformation: collapse chains where an aggregation node has
    exactly one aggregation child (e.g. 'DMS → DMS Recommended Standard').
    The collapsed node displays as 'DMS / DMS Recommended Standard'.
    Structural data in the exported JSON is NOT modified — this returns a new
    node list used only for visualisation.

    The merged node keeps the parent's id and position, takes every other
    field (including the child list) from the child, and gets a combined
    name ('parent / child') and description ('parent desc | child desc').
    Dissolved / hidden nodes are filtered out first.
    """
    nodes = _filter_dissolved(nodes)
    # Shallow per-node copies: 'related' is always re-assigned (never mutated
    # in place) below, so the caller's node dicts stay untouched.
    nm = {int(n['id']): dict(n) for n in nodes}

    def _is_chain_link(nid: int, n: dict) -> bool:
        if n.get('type') != 'aggregation':
            return False
        children = n.get('related', [])
        if len(children) != 1:
            return False
        child_id = int(children[0])
        # A node listing itself as its only child is not a chain; merging it
        # would delete the node that was just written.
        if child_id == nid:
            return False
        return nm.get(child_id, {}).get('type') == 'aggregation'

    changed = True
    while changed:
        changed = False
        # Iterate over a snapshot and restart after every merge, because a
        # merge both removes a key and can create a new chain link.
        for nid, n in list(nm.items()):
            if not _is_chain_link(nid, n):
                continue
            child_id = int(n['related'][0])
            child = nm.pop(child_id)
            # Merge: child takes parent's id, parent's slot, but combined name
            merged = dict(child)
            merged['id'] = nid
            merged['name'] = f"{n['name']} / {child['name']}"
            merged['desc'] = f"{n.get('desc','')} | {child.get('desc','')}"
            nm[nid] = merged
            # Re-point any remaining references to child_id → nid
            for other in nm.values():
                other['related'] = [nid if int(c) == child_id else int(c)
                                    for c in other.get('related', [])]
            changed = True
            break
    return list(nm.values())
# ──────────────────────────────────────────────────────────────────────────────
# VISUALISATION HELPERS
# ──────────────────────────────────────────────────────────────────────────────
def _filter_dissolved(nodes: list) -> list:
    """
    Return the node list without dissolved or hidden nodes, for rendering.

    A node is removed when its type is 'dissolved' or its 'isShown' flag is
    exactly False. Surviving nodes are shallow-copied and their 'related'
    lists are rebuilt as ints with references to removed nodes taken out.
    When nothing needs removing the input list itself is returned unchanged.
    """
    hidden = set()
    for node in nodes:
        if node.get('type') == 'dissolved' or node.get('isShown') is False:
            hidden.add(int(node['id']))
    if not hidden:
        return nodes

    kept = []
    for node in nodes:
        if int(node['id']) in hidden:
            continue
        clone = dict(node)
        clone['related'] = [int(ref) for ref in node.get('related', [])
                            if int(ref) not in hidden]
        kept.append(clone)
    return kept
def _leaf_ids(nodes: list, nid: int) -> list:
    """
    Return the ids of all 'attribute' nodes reachable from `nid` through
    'related' links, in depth-first order, each id once.

    Unknown ids (including an unknown `nid`) are ignored. The walk is
    iterative and tracks visited nodes, so shared subtrees in a DAG are
    expanded only once and a cyclic 'related' link cannot recurse forever.
    """
    by_id = {int(n['id']): n for n in nodes}
    leaves = []
    seen = set()
    stack = [int(nid)]
    while stack:
        current = stack.pop()
        if current in seen:
            continue
        seen.add(current)
        node = by_id.get(current)
        if not node:
            continue
        if node.get('type') == 'attribute':
            leaves.append(current)
            continue
        # Push children reversed so they are popped in their listed order,
        # which reproduces the order of a recursive pre-order walk.
        stack.extend(int(c) for c in reversed(node.get('related', [])))
    return leaves
def _parent_map(nodes: list) -> dict:
    """
    Map each child id to the id of the first node (in list order) that names
    it in 'related'. Later parents of the same child are ignored, which turns
    a multi-parent structure into a single-parent one for rendering.
    """
    first_parent = {}
    for node in nodes:
        for raw_child in node.get('related', []):
            child_id = int(raw_child)
            if child_id in first_parent:
                continue
            first_parent[child_id] = int(node['id'])
    return first_parent
def _tree_value_map(nodes: list, pm: dict) -> dict:
    """
    Per-node leaf count measured on the single-parent tree described by `pm`
    (child id → parent id), returned as {node id: count} for every node in
    `nodes`.

    An 'attribute' node counts as 1. Any other node counts the sum of its
    children's values, with a floor of 1 (so a childless branch is still 1).
    Because every node has exactly one parent in `pm`, a parent's value is
    the sum of its children's values, which is what a Plotly chart with
    branchvalues='total' needs. Counting unique leaves on the full hierarchy
    instead can give a parent less than the sum of its children when a leaf
    sits under several branches.
    """
    by_id = {int(node['id']): node for node in nodes}
    children_of = defaultdict(list)
    for child_id, parent_id in pm.items():
        children_of[int(parent_id)].append(int(child_id))

    sizes = {}

    def subtree_size(node_id: int) -> int:
        cached = sizes.get(node_id)
        if cached is not None:
            return cached
        # Provisional value: if a cycle leads back here while the children
        # are being summed, the lookup above answers 1 instead of recursing.
        sizes[node_id] = 1
        node = by_id.get(node_id)
        if node is not None and node.get('type') == 'attribute':
            return 1
        branch = children_of.get(node_id)
        if branch:
            sizes[node_id] = max(1, sum(subtree_size(c) for c in branch))
        return sizes[node_id]

    return {node_id: subtree_size(node_id) for node_id in by_id}
def _wrap_hover(text: str, width: int = 80) -> str:
    """Soft-wrap long descriptions onto multiple <br>-separated lines so the
    Plotly hover tooltip shows the full text instead of being cut off.

    Each '\\n'-separated line of `text` is wrapped to `width` characters; an
    empty line is kept as an empty segment. None / empty input gives ''.
    """
    import textwrap as _tw
    s = str(text or '')
    if not s:
        return ''
    lines = []
    for raw_line in s.split('\n'):
        # textwrap.wrap('') returns [], so substitute [''] to keep blank lines.
        lines.extend(_tw.wrap(raw_line, width=width) or [''])
    return '<br>'.join(lines)
def plot_sunburst(nodes: list, max_depth: int = 4):
    """
    Build a Plotly sunburst of the hierarchy.

    nodes     : hierarchy node dicts; dissolved / hidden nodes are filtered out.
    max_depth : passed to Plotly as `maxdepth`.

    Sector sizes come from `_tree_value_map` on the single-parent tree. The
    hover text shows the unique-leaf count from `_leaf_ids`. Returns the
    `go.Figure`.
    """
    nodes = _filter_dissolved(nodes)
    pm = _parent_map(nodes)
    # A node whose parent was filtered out (or that never had one) is drawn
    # under the root below. Record that link before computing values so the
    # root's total includes it — with branchvalues='total' a parent must not
    # be smaller than the sum of its children.
    for n in nodes:
        nid = int(n['id'])
        if nid != 0:
            pm.setdefault(nid, 0)
    vm = _tree_value_map(nodes, pm)
    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n['id'])
        lc = len(_leaf_ids(nodes, nid))
        ids.append(str(nid))
        labels.append(str(n.get('name', ''))[:40])
        parents.append('' if nid == 0 else str(pm.get(nid, 0)))
        values.append(vm.get(nid, 1))
        hover.append(f"{n.get('name','')}<br>Type: {n.get('type','')}<br>"
                     f"Variables: {lc}<br>{_wrap_hover(n.get('desc',''))}")
    fig = go.Figure(go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        maxdepth=max_depth, insidetextorientation='radial',
        marker=dict(colorscale='Viridis', line=dict(width=1, color='white'))))
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
                      title=dict(text='Click sector to drill down — click centre to go back',
                                 font=dict(size=13), x=0.5))
    return fig
def plot_treemap(nodes: list):
    """
    Build a Plotly treemap of the hierarchy.

    nodes : hierarchy node dicts; dissolved / hidden nodes are filtered out.

    Tile sizes come from `_tree_value_map` on the single-parent tree. The
    hover text shows the unique-leaf count from `_leaf_ids`. Returns the
    `go.Figure`.
    """
    nodes = _filter_dissolved(nodes)
    pm = _parent_map(nodes)
    # A node whose parent was filtered out (or that never had one) is drawn
    # under the root below. Record that link before computing values so the
    # root's total includes it — with branchvalues='total' a parent must not
    # be smaller than the sum of its children.
    for n in nodes:
        nid = int(n['id'])
        if nid != 0:
            pm.setdefault(nid, 0)
    vm = _tree_value_map(nodes, pm)
    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n['id'])
        lc = len(_leaf_ids(nodes, nid))
        ids.append(str(nid))
        labels.append(str(n.get('name', ''))[:40])
        parents.append('' if nid == 0 else str(pm.get(nid, 0)))
        values.append(vm.get(nid, 1))
        hover.append(f"{n.get('name','')}<br>Variables: {lc}<br>"
                     f"{_wrap_hover(n.get('desc',''))}")
    fig = go.Figure(go.Treemap(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        textinfo='label+value',
        marker=dict(colorscale='Viridis', line=dict(width=1, color='white'))))
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
    return fig
# ──────────────────────────────────────────────────────────────────────────────
# NODE-LINK TREE — Reingold-Tilford layout (matches Approach 1.1 interface)
# ──────────────────────────────────────────────────────────────────────────────
def _a2_node_color(n: dict) -> str:
    """Return the marker colour (hex string) for a node, chosen by its
    'type'; any type without its own colour gets the neutral grey."""
    palette = {
        'root': '#c44e52',
        'attribute': '#4C72B0',
        'collapsed': '#bbbbbb',
    }
    return palette.get(n.get('type', ''), '#8C8C8C')
def _display_graph(nodes: list, max_depth: int = 4, show_hidden: bool = False):
    """
    Walk the hierarchy from node 0 down to `max_depth` and return
    (display_nodes, edges).

    A node reached at `max_depth` that still has children gets one synthetic
    'collapsed' child (ids count up from 10**9 + 1) labelled with its leaf
    count instead of its real children. Unless `show_hidden` is set, nodes
    whose 'isShown' is exactly False are skipped — except the start node.
    `edges` is a list of (parent id, child id) tuples.
    """
    by_id = {int(node['id']): node for node in nodes}
    shown: dict = {}
    links: list = []
    stub_seq = {'last': 10 ** 9}

    def _suppressed(node: dict) -> bool:
        return (not show_hidden) and node.get('isShown') is False

    def walk(raw_id, level: int) -> None:
        node_id = int(raw_id)
        node = by_id.get(node_id)
        if not node:
            return
        if level > 0 and _suppressed(node):
            return
        shown[node_id] = node

        if level >= max_depth and node.get('related'):
            # Depth limit hit: stand in a single placeholder for the subtree.
            stub_seq['last'] += 1
            stub_id = stub_seq['last']
            leaf_total = len(_leaf_ids(nodes, raw_id))
            shown[stub_id] = {'id': stub_id,
                              'name': f'… {leaf_total} variables',
                              'type': 'collapsed', 'dtype': 'determine',
                              'related': [], 'desc': f"Collapsed: {node.get('name')}",
                              'isShown': True}
            links.append((node_id, stub_id))
            return

        for raw_child in node.get('related', []):
            child_id = int(raw_child)
            child = by_id.get(child_id)
            if not child or _suppressed(child):
                continue
            links.append((node_id, child_id))
            walk(child_id, level + 1)

    walk(0, 0)
    return list(shown.values()), links
def _positions(dnodes: list, edges: list):
    """
    Lay out the tree rooted at node 0 and return {node id: (x, y)}.

    x is the depth times a fixed horizontal step. Leaves take consecutive
    vertical slots in the order they are reached; every inner node sits at
    the mean y of its children. Only `edges` drives the layout.
    """
    H_SCALE, V_SPACE = 3.0, 1.8

    kids_of: dict = defaultdict(list)
    for parent, child in edges:
        kids_of[parent].append(child)

    coords: dict = {}
    leaf_rows = 0

    def place(node_id, level):
        nonlocal leaf_rows
        kids = kids_of.get(node_id, [])
        if kids:
            y = float(np.mean([place(kid, level + 1) for kid in kids]))
        else:
            y = leaf_rows * V_SPACE
            leaf_rows += 1
        coords[node_id] = (level * H_SCALE, y)
        return y

    place(0, 0)
    return coords
def plot_node_link(nodes: list, max_depth: int = 4,
                   show_hidden: bool = False, show_leaf_labels: bool = False):
    """
    Node-link tree with elbow edges (matches Approach 1.1 layout).
    Best for exploring structure at moderate depth — Sunburst remains
    recommended for large hierarchies per Taxonomizer (Bian et al. 2020).

    nodes            : hierarchy node dicts.
    max_depth        : depth at which subtrees are replaced by a
                       'collapsed' placeholder (see `_display_graph`).
    show_hidden      : forwarded to `_display_graph`.
    show_leaf_labels : draw text labels next to attribute nodes.

    Returns a `go.Figure` with one line trace for the edges, one marker
    trace for root / aggregation / collapsed nodes and one for leaves.
    """
    # NOTE(review): _filter_dissolved already removes every node whose
    # 'isShown' is False, so `show_hidden=True` has nothing left to reveal
    # here — confirm whether that is intended.
    nodes = _filter_dissolved(nodes)
    dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
    pos = _positions(dnodes, edges)
    # Elbow edges: horizontal to the midpoint, vertical, horizontal again.
    # The trailing None breaks the line between consecutive edges.
    ex, ey = [], []
    for p, c in edges:
        if p not in pos or c not in pos:
            continue
        x0, y0 = pos[p]
        x1, y1 = pos[c]
        xm = (x0 + x1) / 2
        ex += [x0, xm, xm, x1, None]
        ey += [y0, y0, y1, y1, None]
    traces = [go.Scatter(x=ex, y=ey, mode='lines',
                         line=dict(width=1, color='#c8c8c8'),
                         hoverinfo='skip', showlegend=False)]
    agg_xs, agg_ys, agg_labels, agg_colors, agg_hover = [], [], [], [], []
    lf_xs, lf_ys, lf_labels, lf_colors, lf_hover = [], [], [], [], []
    for n in dnodes:
        nid = int(n['id'])
        if nid not in pos:
            continue
        x, y = pos[nid]
        # Placeholder ids are not in `nodes`, so they get a leaf count of 0
        # and no '(count)' suffix — their name already carries the number.
        lc = len(_leaf_ids(nodes, nid))
        lab = n.get('name', str(nid))
        htxt = (f"{n.get('name','')}<br>"
                f"Type: {n.get('type','')}<br>"
                f"Variables: {lc}<br>{_wrap_hover(n.get('desc',''))}")
        col = _a2_node_color(n)
        if n.get('type') in ('root', 'aggregation', 'collapsed'):
            display_lab = (lab + (f' ({lc})' if lc else ''))[:50]
            agg_xs.append(x); agg_ys.append(y)
            agg_labels.append(display_lab); agg_colors.append(col); agg_hover.append(htxt)
        else:
            display_lab = lab[:40] if show_leaf_labels else ''
            lf_xs.append(x); lf_ys.append(y)
            lf_labels.append(display_lab); lf_colors.append(col); lf_hover.append(htxt)
    if agg_xs:
        traces.append(go.Scatter(
            x=agg_xs, y=agg_ys, mode='markers+text',
            text=agg_labels, textposition='middle right',
            hovertext=agg_hover, hoverinfo='text',
            marker=dict(size=16, color=agg_colors,
                        line=dict(color='white', width=2)),
            showlegend=False))
    if lf_xs:
        traces.append(go.Scatter(
            x=lf_xs, y=lf_ys, mode='markers+text',
            text=lf_labels, textposition='middle right',
            hovertext=lf_hover, hoverinfo='text',
            marker=dict(size=7, color=lf_colors, symbol='circle',
                        opacity=0.75, line=dict(color='white', width=1)),
            showlegend=False))
    # Figure height grows with the number of drawn leaves, clamped to 700-4000 px.
    n_leaves = max(12, len(lf_xs))
    fig = go.Figure(traces)
    fig.update_layout(
        height=max(700, min(4000, int(n_leaves * 32))),
        margin=dict(l=20, r=220, t=40, b=20),
        plot_bgcolor='white', paper_bgcolor='white',
        xaxis=dict(visible=False, fixedrange=False),
        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
        dragmode='pan',
        annotations=[dict(
            text='Tip: Sunburst is better for large hierarchies [Taxonomizer 2020]',
            xref='paper', yref='paper', x=0.0, y=1.01,
            showarrow=False, font=dict(size=11, color='grey'), align='left')]
    )
    return fig
# ──────────────────────────────────────────────────────────────────────────────
# STREAMLIT APP
# ──────────────────────────────────────────────────────────────────────────────
# Page chrome: wide layout, title and a one-line summary of the pipeline.
st.set_page_config(page_title='Approach 2 — Multi-Aspect Hierarchy',
                   layout='wide')
st.title('Approach 2 — Role-Decomposed Metadata Hierarchy')
st.caption('Group anchoring → LLM role extraction → role-nested LoD tree. '
           'Full method details and citations in the Method tab.')
# Session-state init — defensive: make sure these keys exist with safe
# defaults on every run. setdefault leaves an existing value untouched, so it
# works on both new and existing sessions.
st.session_state.setdefault('a2_nodes', None)
st.session_state.setdefault('a2_can', None)
st.session_state.setdefault('a2_meta', {})
st.session_state.setdefault('a2_per_row_audit', [])  # list of group audits
# Local-LLM auto-detection: probe the Ollama server once per script run.
# Env vars OLLAMA_URL and OLLAMA_MODEL override the built-in defaults; a
# blank (whitespace-only) value falls back to the default.
_ollama_url = os.environ.get('OLLAMA_URL', OLLAMA_URL_DEFAULT).strip() or OLLAMA_URL_DEFAULT
_ollama_model = os.environ.get('OLLAMA_MODEL', OLLAMA_MODEL_DEFAULT).strip() or OLLAMA_MODEL_DEFAULT
_ollama_reachable = _ping_ollama(_ollama_url)
# Groq detection: env-var GROQ_API_KEY makes the cloud option available.
# GROQ_URL / GROQ_MODEL override the defaults the same way as for Ollama.
_groq_url = os.environ.get('GROQ_URL', GROQ_URL_DEFAULT).strip() or GROQ_URL_DEFAULT
_groq_model = os.environ.get('GROQ_MODEL', GROQ_MODEL_DEFAULT).strip() or GROQ_MODEL_DEFAULT
_groq_key_env = os.environ.get('GROQ_API_KEY', '').strip()
# Groq is the default provider only when a key is set and the LLM client is
# available; otherwise Ollama.
_default_provider = 'groq' if (_groq_key_env and _LLM_CLIENT_AVAILABLE) else 'ollama'
# True when at least one backend looks usable and the LLM client is available.
_default_llm_on = (
    (_ollama_reachable or bool(_groq_key_env)) and _LLM_CLIENT_AVAILABLE)
# Sidebar: all user-facing controls. Widget order here is the on-screen order.
with st.sidebar:
    st.header('1 · Input')
    uploads = st.file_uploader('Metadata / data-dictionary file(s)',
                               type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
                               accept_multiple_files=True)
    st.header('2 · Algorithm')
    max_aspects = st.slider('Max aspects (K upper bound)', 3, 15, 6)
    max_depth = st.slider('Max tree depth', 2, 10, 6)
    min_cluster_sz = st.slider('Min variables per cluster', 1, 10, 2)
    sil_thresh = st.slider('Silhouette threshold',
                           0.01, 0.30,
                           value=0.04 if not _SBERT_AVAILABLE else 0.05,
                           step=0.01)
    max_k_split = st.slider('Max child clusters per split', 2, 12, 5)
    use_sbert = st.checkbox('SBERT embeddings',
                            value=_SBERT_AVAILABLE,
                            disabled=not _SBERT_AVAILABLE)
    local_nmf = st.checkbox('Local NMF fallback', value=True)
    min_local_nmf = st.slider('Min group size for NMF', 5, 30, 6)
    use_slot_mining = st.checkbox('Phrase-slot mining', value=True)
    use_fastopic = st.checkbox(
        'FASTopic aspects',
        value=_FASTOPIC_AVAILABLE,
        disabled=not _FASTOPIC_AVAILABLE,
    )
    fastopic_min_sz = st.slider('Min group size for FASTopic', 6, 100, 40)
    if not _FASTOPIC_AVAILABLE:
        st.warning('FASTopic not installed — using NMF.')
    st.header('3 · LLM provider')
    provider_options = ['Ollama (local)', 'Groq (cloud)']
    provider_default_idx = 1 if _default_provider == 'groq' else 0
    provider_label = st.radio('Provider', provider_options,
                              index=provider_default_idx, horizontal=True)
    llm_provider = 'groq' if provider_label.startswith('Groq') else 'ollama'
    if llm_provider == 'groq':
        if not _LLM_CLIENT_AVAILABLE:
            st.warning('`openai` package not installed.')
        elif _groq_key_env:
            st.success('GROQ_API_KEY detected → ready')
        groq_key_in = st.text_input(
            'Groq API key', value=_groq_key_env, type='password',
            help='Free key at console.groq.com/keys, or set GROQ_API_KEY env var.')
        groq_model_in = st.text_input('Groq model', value=_groq_model)
        llm_base_url = _groq_url
        llm_model = groq_model_in
        llm_api_key = groq_key_in
        llm_ready = bool(groq_key_in and _LLM_CLIENT_AVAILABLE)
    else:
        # Reserve the status slot above the inputs; it is filled in below,
        # once the URL the user actually entered has been checked.
        _ollama_status = st.empty()
        ollama_url_in = st.text_input('Ollama URL', value=_ollama_url)
        ollama_model_in = st.text_input('Ollama model', value=_ollama_model)
        # Readiness must describe the URL that will be used. The start-up
        # probe only covered the env/default URL, so re-probe when the user
        # typed a different one (this runs on every rerun while it differs).
        if ollama_url_in == _ollama_url:
            _ollama_ok = _ollama_reachable
        elif ollama_url_in.strip():
            try:
                _ollama_ok = bool(_ping_ollama(ollama_url_in))
            except Exception:
                # A malformed URL must not crash the sidebar; treat it as
                # unreachable.
                _ollama_ok = False
        else:
            _ollama_ok = False
        if not _LLM_CLIENT_AVAILABLE:
            _ollama_status.warning('`openai` package not installed.')
        elif _ollama_ok:
            _ollama_status.success(f'Ollama reachable at `{ollama_url_in}`')
        else:
            _ollama_status.warning(f'Ollama not reachable at `{ollama_url_in}`.')
        llm_base_url = ollama_url_in
        llm_model = ollama_model_in
        llm_api_key = ''
        llm_ready = bool(_LLM_CLIENT_AVAILABLE and _ollama_ok)
    st.header('4 · LLM features')
    # Every LLM-backed option defaults to on, and is only editable, when the
    # selected provider is ready.
    use_per_row_role_extraction = st.checkbox(
        'Per-row role extraction (primary route)',
        value=llm_ready,
        disabled=not llm_ready,
        help='One LLM call per variable extracts measure / statistic / '
             'condition / subtype, grounded to the description text.'
    )
    use_llm = st.checkbox(
        'LLM label refinement',
        value=llm_ready,
        disabled=not llm_ready,
    )
    use_llm_roles = st.checkbox(
        'LLM phrase-role classifier (fallback)',
        value=llm_ready,
        disabled=not llm_ready,
    )
    use_role_decomposition = st.checkbox(
        'SBERT phrase clustering (fallback)',
        value=_SBERT_AVAILABLE and llm_ready,
        disabled=not (_SBERT_AVAILABLE and llm_ready),
    )
    role_namer_constrained = st.checkbox(
        'Constrained role vocabulary',
        value=True,
    )
    role_regularity_threshold = st.slider(
        'Min phrase regularity', 0.05, 0.80, 0.20, 0.05,
    )
    st.header('5 · Project')
    project_name = st.text_input('Project name', value='project')
# ── load and configure files ──────────────────────────────────────────────────
if uploads:
    import tempfile

    raw_by = {}   # uploaded file name -> loaded DataFrame
    cfg_by = {}   # uploaded file name -> auto-detected column roles
    st.subheader('Step 1 — Inspect metadata')
    # Streamlit re-executes this script on every interaction, so a bare
    # mkdtemp() left one orphaned directory behind per rerun. The uploads only
    # need to exist on disk while load_any() reads them, so use a directory
    # that is removed as soon as loading is finished.
    # NOTE(review): assumes load_any() reads the file eagerly into a DataFrame
    # and keeps no handle on the path — confirm against its definition.
    with tempfile.TemporaryDirectory() as _tmp_dir:
        tmp = Path(_tmp_dir)
        for f in uploads:
            p = tmp / safe_name(f.name)
            p.write_bytes(f.getbuffer())
            try:
                df = load_any(p)
                roles = detect_roles(df)
                # Register the file only once BOTH steps succeeded; otherwise
                # Step 2 would look up a cfg_by entry that was never written.
                raw_by[f.name] = df
                cfg_by[f.name] = roles
                with st.expander(f'{f.name}', expanded=False):
                    st.write(f'Rows: **{len(df):,}** Columns: **{len(df.columns)}**')
                    st.dataframe(df.head(8), use_container_width=True)
            except Exception as e:
                st.error(f'Could not load {f.name}: {e}')

    st.subheader('Step 2 — Confirm column roles')
    configs = {}
    for name, df in raw_by.items():
        cols = list(df.columns)
        auto = cfg_by[name]
        with st.expander(f'{name}', expanded=True):
            c1, c2 = st.columns(2)
            with c1:
                leaf = st.multiselect('Leaf variable column(s)', cols,
                                      default=[c for c in auto['leaf_cols'] if c in cols],
                                      key=f'lf_{name}')
                group = st.multiselect('Group / task column(s)', cols,
                                       default=[c for c in auto['group_cols'] if c in cols],
                                       key=f'gr_{name}')
            with c2:
                text = st.multiselect('Description column(s)', cols,
                                      default=[c for c in auto['text_cols'] if c in cols],
                                      key=f'tx_{name}')
                meta = st.multiselect('Type / unit column(s)', cols,
                                      default=[c for c in auto['meta_cols'] if c in cols],
                                      key=f'mt_{name}')
        configs[name] = {'leaf_cols': leaf, 'group_cols': group,
                         'text_cols': text, 'meta_cols': meta}

    build_clicked = st.button('Build Approach 2 Hierarchy', type='primary')
    if build_clicked and not raw_by:
        # Nothing loaded: pd.concat([]) would otherwise surface as a traceback.
        st.error('No metadata file could be loaded — nothing to build.')
    elif build_clicked:
        try:
            # Clear stale data from any previous build: the per-row audit and
            # the cached evaluation metrics both describe the OLD tree.
            st.session_state.a2_per_row_audit = []
            st.session_state['a2_eval_cache'] = None
            cans = [build_canonical(df, configs[name], name)
                    for name, df in raw_by.items()]
            can = pd.concat(cans, ignore_index=True)
            st.session_state.a2_can = can
            texts = can['_text'].fillna('').astype(str).tolist()
            # Global NMF (used as the lexical fallback)
            with st.spinner('Step 3 — Discovering global aspects via NMF …'):
                tfidf, X, nmf, W, H, K, alabels = discover_aspects(texts, max_aspects)
            st.session_state.a2_meta = {
                'K': K, 'aspect_labels': alabels, 'tfidf': tfidf, 'W': W, 'H': H
            }
            st.info(f'Global aspects discovered: **{K}** — {" | ".join(alabels)}')
            sbert_model = None
            if use_sbert and _SBERT_AVAILABLE:
                with st.spinner('Loading SBERT model …'):
                    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
            with st.spinner('Step 4 — Building global per-aspect representations …'):
                reprs = per_aspect_representations(texts, H, tfidf, sbert_model)
            # Build the LLM helper functions — same provider + endpoint for all.
            llm_fn = None
            if use_llm:
                llm_fn = make_llm_label_fn(llm_base_url, llm_model,
                                           provider=llm_provider,
                                           api_key=llm_api_key)
                if llm_fn is None:
                    st.warning('LLM unreachable — deterministic labels only.')
            llm_role_namer = None
            if use_role_decomposition:
                llm_role_namer = make_llm_role_namer_fn(
                    llm_base_url, llm_model,
                    constrained=role_namer_constrained,
                    provider=llm_provider, api_key=llm_api_key)
            llm_role_fn = None
            if use_llm_roles:
                llm_role_fn = make_llm_role_classifier_fn(
                    llm_base_url, llm_model,
                    provider=llm_provider, api_key=llm_api_key)
            # Per-row role extractor (Zhu et al. EMNLP 2025) — primary route
            per_row_extractor = None
            if use_per_row_role_extraction:
                per_row_extractor = make_per_row_role_extractor_fn(
                    llm_base_url, llm_model,
                    provider=llm_provider, api_key=llm_api_key)
                if per_row_extractor:
                    st.info(f'Per-row role extraction: **{llm_provider}** · `{llm_model}`')
                else:
                    st.warning('LLM unreachable for per-row role extraction.')
            # Collect detected text-column names across all uploaded configs —
            # used by phrase-slot mining to identify description-like fields.
            # Order of first appearance is preserved.
            all_text_cols: list = []
            for cfg in configs.values():
                for c in cfg.get('text_cols', []):
                    if c not in all_text_cols:
                        all_text_cols.append(c)
            # Replace the module-level FIELD_NAME_NOISE with a noise set DERIVED
            # from the actual detected column names — zero hardcoding. All
            # downstream calls (label_cluster, _bigram_preferred_terms) read
            # the module-level name so this swap propagates everywhere.
            import sys as _sys
            _sys.modules[__name__].FIELD_NAME_NOISE = build_field_noise(configs)
            with st.spinner('Step 6 — Building group-anchored LoD tree '
                            '(slot mining → FASTopic → NMF) …'):
                nodes = build_dynamic_lod_tree(
                    can, reprs, alabels, tfidf,
                    max_depth=max_depth,
                    min_cluster_size=min_cluster_sz,
                    sil_threshold=sil_thresh,
                    max_clusters_per_split=max_k_split,
                    project=project_name,
                    local_nmf=local_nmf,
                    min_local_nmf_size=min_local_nmf,
                    max_aspects=max_aspects,
                    sbert_model=sbert_model,
                    llm_label_fn=llm_fn,
                    use_slot_mining=use_slot_mining,
                    text_col_names=all_text_cols,
                    use_fastopic=use_fastopic,
                    fastopic_min_size=fastopic_min_sz,
                    llm_role_classifier_fn=llm_role_fn,
                    use_role_decomposition=use_role_decomposition,
                    llm_role_namer_fn=llm_role_namer,
                    role_regularity_threshold=role_regularity_threshold,
                    per_row_role_extractor_fn=per_row_extractor,
                    use_per_row_role_extraction=use_per_row_role_extraction,
                )
            st.session_state.a2_nodes = nodes
            # Concise build summary — per-route counts of aggregation nodes
            route_counts: dict = Counter()
            for n in nodes:
                if n.get('type') == 'aggregation':
                    route_counts[
                        n.get('structure_provenance', {}).get('route', '—')] += 1
            n_leaves = sum(1 for n in nodes if n.get('type') == 'attribute')
            n_internal = sum(1 for n in nodes if n.get('type') == 'aggregation')
            route_str = ' · '.join(f'{r}: {c}' for r, c in route_counts.most_common())
            st.success(f'Done — {n_leaves} variables · {n_internal} internal nodes '
                       f'({route_str})')
            # If any audit summary key mentions a rate-limit (429), the model
            # ran out of tokens — tell the user to switch model and rebuild.
            ran_out = any(
                ('RateLimit' in str(r) or '429' in str(r))
                for a in (st.session_state.get('a2_per_row_audit') or [])
                for r in (a.get('summary') or {})
            )
            if ran_out:
                st.error(f'Ran out of tokens on `{llm_model}`. '
                         f'Switch to another Groq model in the sidebar '
                         f'(e.g. llama-3.1-8b-instant) and rebuild.')
        except Exception as e:
            # Top-level UI boundary: report the failure with its traceback
            # instead of letting the whole page crash.
            st.error(f'Build failed: {e}')
            import traceback
            st.code(traceback.format_exc())
# ── display ───────────────────────────────────────────────────────────────────
# Robust session-state reads — use .get() so a partial/incomplete build that
# wrote some keys but not others doesn't crash the display layer.
# Pull the build artefacts out of session state once; every read goes through
# .get() so a missing key yields None instead of raising.
nodes = st.session_state.get('a2_nodes')
if nodes is None:
    # Nothing has been built in this session — show the hint and halt the run.
    st.info('Upload a metadata file and click **Build Approach 2 Hierarchy** to start.')
    st.stop()
can = st.session_state.get('a2_can')
meta = st.session_state.get('a2_meta') or {}
tab_titles = ['LoD Tree', 'Evaluation', 'Role Decomposition',
              'Label Provenance', 'Metadata', 'Export', 'Method']
tabs = st.tabs(tab_titles)
with tabs[0]:
    # ── Visualization controls (above chart — easy to find, matches Approach 1.1) ─
    vc1, vc2, vc3, vc4, vc5 = st.columns([2, 2, 1, 1, 1])
    with vc1:
        viz_mode = st.radio(
            'View mode',
            ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
            horizontal=True, index=0,
            help='Sunburst best for large hierarchies [Taxonomizer]. '
                 'Node-link best for moderate depth structure inspection.'
        )
    with vc2:
        depth_display = st.slider('Depth (Level of Detail)', 1, 8, 4, 1)
    with vc3:
        show_leaf_labels = st.checkbox('Leaf labels', value=False)
    with vc4:
        show_hidden = st.checkbox('Hidden nodes', value=False)
    with vc5:
        compress_chains = st.checkbox('Compress chains', value=True,
                                      help='Merge one-child aggregation chains '
                                           '(e.g. "DMS → DMS Recommended Standard") '
                                           'for display. Export JSON keeps original structure.')
    st.divider()
    # Chain compression affects the chart only; the counts below use `nodes`.
    display_nodes = compress_one_child_chains(nodes) if compress_chains else nodes
    if viz_mode == 'Sunburst (drill-down)':
        st.plotly_chart(plot_sunburst(display_nodes, max_depth=depth_display),
                        use_container_width=True)
    elif viz_mode == 'Treemap':
        st.plotly_chart(plot_treemap(display_nodes), use_container_width=True)
    else:
        st.plotly_chart(plot_node_link(display_nodes, depth_display,
                                       show_hidden, show_leaf_labels),
                        use_container_width=True)
    n_l = sum(1 for n in nodes if n.get('type') == 'attribute')
    n_i = sum(1 for n in nodes if n.get('type') == 'aggregation')
    # max depth
    pm = _parent_map(nodes)

    def _node_depth(nid):
        """Return the number of parent links between `nid` and its root.

        Walks `pm` upwards. A malformed tree whose parent links form a cycle
        would make a plain walk spin forever and hang the page, so the walk
        stops as soon as it reaches a node it has already visited.
        """
        depth = 0
        cur = nid
        seen = {nid}
        while cur in pm:
            cur = pm[cur]
            if cur in seen:
                break
            seen.add(cur)
            depth += 1
        return depth

    max_d = max((_node_depth(n['id']) for n in nodes), default=0)
    c1, c2, c3, c4 = st.columns(4)
    c1.metric('Variables', n_l)
    c2.metric('Internal nodes', n_i)
    c3.metric('Global aspects', meta.get('K', '?'))
    c4.metric('Max depth', max_d)
with tabs[1]:
    import hierarchy_eval as he
    st.markdown('### Evaluation')
    if can is None or meta.get('W') is None:
        st.info('Run the builder first.')
    else:
        st.caption(
            'No manually curated reference taxonomy is available for this experiment. '
            'The primary metrics below are **reference-free** — they assess the hierarchy '
            'itself. Group-column figures later are descriptive self-consistency checks only. '
            'Identical definitions to the Baseline and Approach 1 apps make the primary '
            'diagnostics directly comparable.'
        )
        # ── PRIMARY: reference-free hierarchy quality (compute on demand) ──────
        # These use SBERT, which is slow to load. Computing them only on a button
        # click keeps the tree, sliders and Save button instant. The result is
        # kept in session state so it survives the reruns triggered by other widgets.
        st.markdown('#### Primary — reference-free hierarchy quality')
        if st.button('▶Compute reference-free metrics', key='a2_eval_btn'):
            with st.spinner('Computing reference-free metrics (loads SBERT once)…'):
                tm = he.traco_metrics(nodes)
                npmi = he.npmi_coherence(nodes, can['_text'].tolist())
            st.session_state['a2_eval_cache'] = {'tm': tm, 'npmi': npmi}
        _ev = st.session_state.get('a2_eval_cache')
        if _ev:
            tm, npmi = _ev['tm'], _ev['npmi']
            p1, p2, p3 = st.columns(3)
            p1.metric('Parent–child coherence', tm['pc_coherence'],
                      help='TraCo (Wu et al., AAAI 2024). Children nest under parent theme.')
            p2.metric('Sibling diversity', tm['sibling_diversity'],
                      help='TraCo (Wu et al., AAAI 2024). Higher = distinct siblings; LOW = redundant.')
            p3.metric('NPMI label coherence', npmi,
                      help='Lau et al., EACL 2014. Label terms genuinely co-occur in the data.')
            st.caption(f'Embedding backend: **{tm["encoder"]}**.')
        else:
            st.info('Click the button above to compute coherence / diversity / NPMI '
                    '(takes a few seconds the first time while SBERT loads).')
        # ── Label-quality proxies (interpretability) ──────────────────────────
        st.markdown('#### Label quality *(interpretability — reference-free)*')
        lq = he.label_quality(nodes)
        l1, l2, l3 = st.columns(3)
        l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%",
                  help='% of internal labels that read as a real concept (short noun '
                       'phrase, WordNet head) rather than a "/"-joined term fragment.')
        l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%",
                  help='% of internal labels duplicating a sibling label (lower is better).')
        l3.metric('Avg label words', lq['avg_label_words'],
                  help='Mean label length in words.')
        # ── Structural statistics ─────────────────────────────────────────────
        st.markdown('#### Structural statistics')
        sm = he.structural_stats(nodes)
        s1, s2, s3, s4, s5 = st.columns(5)
        s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
        s2.metric('Max leaf depth', sm['max_depth'])
        s3.metric('Avg leaf depth', sm['avg_leaf_depth'])
        s4.metric('Avg branching', sm['avg_branching_factor'])
        s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%")
        # ── Group-structure self-consistency (descriptive, NOT accuracy) ───────
        st.markdown('#### Group-structure self-consistency *(descriptive — not accuracy)*')
        st.caption(
            'The group column is a **construction input** (group-anchored L1/L2), so this '
            'only confirms the NMF aspect partition reflects its own input — expected high, '
            'NOT a quality signal and NOT comparable across methods as an accuracy metric.'
        )
        W = meta['W']
        if W.shape[0] != len(can):
            # The build stores the canonical table before the aspect matrix, so
            # a build that failed in between leaves the two out of step. Both
            # diagnostics below pair rows of W with rows of `can` and would
            # raise on the length mismatch.
            st.warning('Stored aspect matrix does not match the current metadata '
                       'table — rebuild the hierarchy to refresh these diagnostics.')
        else:
            true_labels = can['_group'].apply(
                lambda x: str(x).split(' > ')[0].strip()).tolist()
            # Each variable is assigned to its highest-weight aspect.
            pred_nmf = np.argmax(W, axis=1).tolist()
            metrics = evaluate(true_labels, pred_nmf)
            st.metric('ARI (self-consistency)', metrics['ARI'])
            # ── legacy global aspect table (diagnostic only) ─────────────────
            with st.expander('Legacy global NMF aspect table (diagnostic — not the '
                             'primary result)'):
                st.caption('Global NMF aspects are a coarse lexical fallback. The '
                           'hierarchy is built from per-variable role extraction — '
                           'see the Role Decomposition tab for the actual result.')
                # Labels may be missing or shorter than the number of aspect
                # columns; fall back to a bare "Aspect N" header instead of
                # raising IndexError.
                alabs = meta.get('aspect_labels') or []
                aspect_cols = [
                    f'Aspect {k+1}: {str(alabs[k])[:30]}' if k < len(alabs)
                    else f'Aspect {k+1}'
                    for k in range(W.shape[1])
                ]
                W_df = pd.DataFrame(W, columns=aspect_cols)
                W_df.insert(0, 'Variable', can['_label'].tolist())
                st.dataframe(W_df.round(4), use_container_width=True)
# Role Decomposition tab: per-group routing summary, per-variable role table
# (with CSV download) and the per-row LLM extraction audit with drill-down.
with tabs[2]:
st.markdown('### Role decomposition')
st.caption('Each variable decomposed into measure / statistic / condition / subtype.')
if can is None:
st.info('Build a hierarchy first.')
else:
# ── Per-group summary ─────────────────────────────────────────────────
# One row per node whose structure provenance has route 'group_anchor' and
# carries a 'phrase_regularity' entry.
st.markdown('#### Per-group routing')
reg_rows = []
for n in nodes:
sp = n.get('structure_provenance', {})
if sp.get('route') == 'group_anchor' and 'phrase_regularity' in sp:
reg_rows.append({
'Group': n.get('name', ''),
'Regularity': sp.get('phrase_regularity'),
'Route used': sp.get('route_used', '—'),
})
if reg_rows:
# Highest regularity first; missing values sort to the bottom.
reg_df = pd.DataFrame(reg_rows).sort_values(
'Regularity', ascending=False, na_position='last')
st.dataframe(reg_df, use_container_width=True, hide_index=True)
# ── Per-variable role table ───────────────────────────────────────────
st.markdown('#### Per-variable role table')
# Primary source: raw per-row LLM extractions captured in the audit
# (complete — includes roles that didn't become tree levels, e.g.
# condition values skipped by singleton prevention).
# Fallback per variable: roles collected from tree-ancestor nodes.
audit_roles_by_vi: dict = {}
for a in (st.session_state.get('a2_per_row_audit') or []):
# `row_idx` is used as a position into this audit's `vi_list`; the value
# found there becomes the key of audit_roles_by_vi.
vi_list_a = a.get('vi_list') or []
for r in a.get('per_row_audit', []):
ridx = r.get('row_idx')
accepted = r.get('accepted_roles') or {}
if ridx is not None and ridx < len(vi_list_a) and accepted:
audit_roles_by_vi[int(vi_list_a[ridx])] = accepted
# Tree-walk fallback (roles that became hierarchy levels)
node_map_disp = {int(n['id']): n for n in nodes}
parent_lookup: dict = {}
for n in nodes:
for c in n.get('related', []):
# setdefault: the first node that lists `c` as related is kept as its parent.
parent_lookup.setdefault(int(c), int(n['id']))
def _tree_roles_for_attr(node_id: int) -> dict:
# Walk from `node_id` up through parent_lookup and collect {role: node name}
# from each ancestor; the nearest ancestor wins for a repeated role.
roles_here: dict = {}
cur = node_id
while cur in parent_lookup:
cur = parent_lookup[cur]
cur_node = node_map_disp.get(cur)
if not cur_node:
break
lp = cur_node.get('label_provenance', {})
# Role comes from label provenance, else from the structure provenance slot role.
role = (lp.get('role')
or cur_node.get('structure_provenance', {}).get('slot_role'))
if role and role not in roles_here:
roles_here[role] = cur_node.get('name', '')
return roles_here
role_rows = []
for vi, (_, crow) in enumerate(can.iterrows()):
# Audit roles win; tree roles fill any gaps
roles_here = dict(_tree_roles_for_attr(vi + 1)) # attribute ids = 1..n
for r, v in (audit_roles_by_vi.get(vi) or {}).items():
if v:
roles_here[r] = v
row = {
'Group': str(crow.get('_group', '')).split(' > ')[0].strip(),
'Variable': str(crow.get('_label', '')),
}
# Standard roles get their own column; pop() removes them so that only
# non-standard roles remain for the 'Other roles' column.
for std_role in ('measure', 'statistic', 'condition',
'subtype', 'outcome', 'modifier'):
row[std_role.title()] = roles_here.pop(std_role, '')
if roles_here:
row['Other roles'] = '; '.join(
f'{r}: {v}' for r, v in roles_here.items())
role_rows.append(row)
if role_rows:
role_df = pd.DataFrame(role_rows)
st.dataframe(role_df, use_container_width=True, hide_index=True)
st.download_button(
'Download per-variable role CSV',
data=role_df.to_csv(index=False).encode('utf-8'),
file_name=f'{safe_name(project_name)}_approach2_role_decomposition.csv',
mime='text/csv',
)
# ── Per-row LLM extractor audit ───────────────────────────────────────
audits = st.session_state.get('a2_per_row_audit', []) or []
if audits:
st.markdown('#### Extraction audit')
# Summary table per group
sum_rows = []
for a in audits:
sum_rows.append({
'Group': a.get('group_name', ''),
'Rows': a.get('n_rows', 0),
'Coverage': a.get('coverage'),
'Valid': a.get('valid'),
'Roles found': ', '.join(a.get('roles_final', []))[:60],
'Reasons': ', '.join(f'{k}:{v}' for k, v in
(a.get('summary', {}) or {}).items()),
})
st.dataframe(pd.DataFrame(sum_rows), use_container_width=True,
hide_index=True)
# Drill-down per group
grp_names = [a.get('group_name', '?') for a in audits]
if grp_names:
sel_grp = st.selectbox(
'Drill into a group to see per-row proposals + rejections:',
grp_names)
# First audit whose group name matches the selection.
sel_audit = next((a for a in audits
if a.get('group_name') == sel_grp), None)
if sel_audit:
row_rows = []
# Only the first 60 per-row entries are shown (and exported below).
for r in sel_audit.get('per_row_audit', [])[:60]:
accepted = r.get('accepted_roles', {}) or {}
rejected = r.get('rejected', []) or []
row_rows.append({
'Row #': r.get('row_idx', ''),
'Description': r.get('description_snippet', ''),
'Accepted': '; '.join(f'{k}={v}'
for k, v in accepted.items())[:140],
# Rejections with at least three fields are formatted as
# role=value (missing stems: …); anything else is stringified.
'Rejected': '; '.join(
f'{x[0]}={x[1]!r} (missing stems: {x[2]})'
if isinstance(x, (list, tuple)) and len(x) >= 3
else str(x) for x in rejected)[:200],
'Reason': r.get('reason', ''),
})
if row_rows:
st.dataframe(pd.DataFrame(row_rows),
use_container_width=True, hide_index=True)
# Download as CSV for offline analysis
csv_bytes = pd.DataFrame(row_rows).to_csv(index=False).encode('utf-8')
st.download_button(
'Download per-row audit for this group',
data=csv_bytes,
file_name=f'{safe_name(project_name)}_audit_{safe_name(sel_grp)}.csv',
mime='text/csv',
)
# NOTE(review): with indentation unavailable in this view, this `else` presumably
# pairs with `if audits:` above (no audit entries recorded) — confirm.
else:
st.info('No role assignments recorded yet — Option D may have '
'fallen back to slot mining or aspect clustering for all '
'groups in this dataset.')
with tabs[3]:
    st.markdown('### Label provenance')
    st.caption('Audit trail: which stage produced each node label.')
    # One table row per aggregation node. Provenance values are read
    # defensively: a key that is present but holds None must not crash the tab.
    rows = []
    for n in nodes:
        if n.get('type') != 'aggregation':
            continue
        p = n.get('label_provenance') or {}
        s = n.get('structure_provenance') or {}
        confidence = p.get('confidence')
        rows.append({
            'Node': n.get('name', ''),
            'Source': p.get('label_source', '—'),
            'Route': s.get('route', '—'),
            'Aspect method': s.get('aspect_method') or '—',
            'Silhouette': s.get('silhouette') if s.get('silhouette') is not None else '—',
            'LLM used': p.get('llm_used', False),
            'LLM rejected': p.get('llm_rejected', False),
            'LLM proposed': p.get('llm_raw_label', ''),
            'LLM reason': str(p.get('llm_reason') or '')[:60],
            'Confidence': round(float(1.0 if confidence is None else confidence), 3),
            'Evidence': ', '.join(str(t) for t in (p.get('evidence_terms') or []))[:120],
        })
    if not rows:
        st.info('No internal nodes yet — build a hierarchy first.')
    else:
        prov_df = pd.DataFrame(rows)
        # ── Labels by source ──────────────────────────────────────────────────
        # Between 2 and 5 metric columns; extra sources wrap around.
        source_counts = prov_df['Source'].value_counts()
        st.write('**Labels by source**')
        cols_src = st.columns(min(5, max(2, len(source_counts))))
        for i, (src, cnt) in enumerate(source_counts.items()):
            cols_src[i % len(cols_src)].metric(str(src), int(cnt))
        # ── Structure routes ──────────────────────────────────────────────────
        am_counts = prov_df['Aspect method'].value_counts()
        st.write('**Structure routes used**')
        cols_am = st.columns(min(5, max(2, len(am_counts))))
        for i, (am, cnt) in enumerate(am_counts.items()):
            cols_am[i % len(cols_am)].metric(str(am), int(cnt))
        # ── LLM usage — split per-row extraction from the downstream refiner ──
        # Per-row nodes are LLM-BUILT (source 'per_row_llm_role'); the refiner
        # only renames deterministically-labeled nodes (source 'llm' when its
        # proposal is accepted). Counting them together made the panel read
        # "N calls, 0 accepted" even on a fully successful build.
        is_per_row = prov_df['Source'] == 'per_row_llm_role'
        was_rejected = prov_df['LLM rejected'] == True  # noqa: E712
        n_per_row = int(is_per_row.sum())
        refiner_accepted = int((prov_df['Source'] == 'llm').sum())
        refiner_rejected = int((was_rejected & ~is_per_row).sum())
        st.write('**LLM usage**')
        cL1, cL2, cL3 = st.columns(3)
        cL1.metric('Per-row extraction nodes', n_per_row)
        cL2.metric('Refiner accepted', refiner_accepted)
        cL3.metric('Refiner rejected', refiner_rejected)
        if refiner_accepted == 0 and refiner_rejected == 0 and n_per_row > 0:
            st.caption('Label refiner did not run — the tree was built entirely '
                       'by per-row extraction, leaving no deterministic labels '
                       'to refine.')
        if refiner_rejected > 0:
            with st.expander('Rejected refiner proposals'):
                # Same per-row exclusion as the count above, so the table lists
                # exactly the refiner rejections that carry a proposed label.
                rej = prov_df[was_rejected & ~is_per_row
                              & (prov_df['LLM proposed'].astype(str).str.len() > 0)]
                if len(rej):
                    st.dataframe(rej[['Node', 'LLM proposed', 'LLM reason']],
                                 use_container_width=True, hide_index=True)
        # ── Full provenance table ─────────────────────────────────────────────
        st.write('**Full per-node provenance**')
        st.dataframe(prov_df, use_container_width=True, hide_index=True)
with tabs[4]:
    # Metadata tab: the canonical table minus the `_row` column when present.
    if can is not None:
        metadata_view = can.drop(columns=['_row'], errors='ignore')
        st.dataframe(metadata_view, use_container_width=True)
with tabs[5]:
    # ── derive a per-CSV base name from the uploaded files ────────────────────
    # Uses the actual uploaded file names so different CSVs get different
    # output filenames (e.g. ai-mind-…json vs HCP_S1200_…json).
    csv_basis = ''
    if can is not None and '_source' in can.columns:
        sources = [str(s) for s in can['_source'].dropna().unique().tolist()]
        # Drop extensions, join with '+' if multiple files merged. Stems that
        # sanitise to an empty string are skipped so the basis never starts or
        # ends with a stray '+'.
        bases = [b for b in (safe_name(Path(s).stem) for s in sources) if b]
        csv_basis = '+'.join(bases)
    if not csv_basis:
        # No usable source name — fall back to the project name.
        csv_basis = safe_name(project_name)
    # Build each export name once so the download buttons and the
    # save-to-folder action cannot drift apart.
    lod_fname = f'{csv_basis}_approach2_lod.json'
    can_fname = f'{csv_basis}_approach2_canonical.csv'
    st.caption(f'Filename basis: **{csv_basis}** '
               f'(taken from the uploaded CSV — different CSVs export under different names)')
    col1, col2 = st.columns(2)
    with col1:
        if nodes:
            st.download_button(
                'LoD tree JSON',
                data=json.dumps(nodes, indent=2, ensure_ascii=False).encode(),
                file_name=lod_fname,
                mime='application/json',
                use_container_width=True,
            )
    with col2:
        if can is not None:
            st.download_button(
                'Canonical CSV',
                data=can.to_csv(index=False).encode('utf-8'),
                file_name=can_fname,
                mime='text/csv',
                use_container_width=True,
            )
    st.divider()
    # ── Save directly into the project's outputs/approach_2/ folder ────────────
    # The folder is resolved relative to this script's own location.
    _out_dir = Path(__file__).resolve().parent / 'outputs' / 'approach_2'
    st.markdown('### Save to project folder')
    st.caption(
        'The download buttons above go to your browser’s Downloads folder (a browser '
        f'restriction). This button instead writes the files into `{_out_dir}` with the '
        'dataset name — convenient for `evaluate_all.py`.'
    )
    if st.button('Save all to outputs/approach_2/', type='primary',
                 use_container_width=True):
        try:
            _out_dir.mkdir(parents=True, exist_ok=True)
            saved = []
            if nodes:
                (_out_dir / lod_fname).write_text(
                    json.dumps(nodes, indent=2, ensure_ascii=False), encoding='utf-8')
                saved.append(lod_fname)
            if can is not None:
                can.to_csv(_out_dir / can_fname, index=False)
                saved.append(can_fname)
            st.success(f'Saved to `{_out_dir}`:\n\n- ' + '\n- '.join(saved))
        except Exception as _e:
            # UI boundary: show the failure (permissions, disk, serialisation)
            # instead of crashing the page.
            st.error(f'Could not save: {_e}')
with tabs[6]:
st.markdown("""
## Approach 2 — Role-Decomposed Hierarchy via SBERT Phrase Clustering
### Option D primary route + slot mining + FASTopic + constrained LLM labels
### Algorithm
```
Step 1 Build metadata text objects (variable name + description + group)
[GON §3] — Gonçalves et al. (2019)
Step 2 Group-anchored L1/L2 structure [NEW]
— detected group columns → path nodes (no hardcoding)
— e.g. category > assessment, or task > variant
— falls back to global NMF if no groups detected
Step 3a Phrase-slot mining (slot-first routing) [IE / slot induction]
— For each terminal group, attempt deterministic phrase-slot
decomposition of variable descriptions:
• extract concept-prefix per row
• mine repeated n-grams (1–4 tokens, ≥2 rows)
• compute mutual-exclusion across phrase pairs:
M[a,b] = 1 − cooc[a,b] / min(count[a], count[b])
• cluster phrases by mutual exclusion → slots
— A slot is a set of phrases that rarely co-occur within a row but
each co-occur with phrases from other slots (= alternatives at
the same semantic position).
— Activates only when slot structure is statistically strong:
• ≥ 2 slots discovered
• ≥ 55% row coverage
• each slot ≥ 2 distinct phrases
— No domain hardcoding: phrase content is discovered from the data;
slot names = the highest-coverage phrase in each slot.
— When valid, the local hierarchy is built directly from slot values.
— When invalid (free-form descriptions, e.g. parts of HCP), routing
falls through to NMF (Step 3b).
Step 3b Local NMF aspect discovery per terminal group [ZHU §3.1 adapted]
— Fallback path when slot mining does not apply.
— NMF runs inside each group, not across all variables
— prevents globally-dominant terms from polluting local aspects
— K selected by reconstruction-error elbow (deterministic)
— falls back to global embeddings for small groups (< min_local_nmf_size)
Step 4 Per-aspect variable representations [ZHU §3.1]
— for each aspect k: filter text to top-T terms → SBERT or masked TF-IDF
Step 5 Independent per-aspect GMM clustering [ZHU §3.2]
— GMM with diagonal covariance + BIC for stable k selection
— runs inside each group's aspect space
Step 6 Simplified best-aspect split [ZHU §3.3 adapted]
— at each node: evaluate all K aspects by silhouette score
— highest silhouette → GMM split → child aggregation nodes
— NOTE: this is a silhouette-based greedy split, not the full
probabilistic search of Zhu et al. Eq. 6/7
— singleton prevention: 1-variable clusters attach directly (no wrapper node)
Step 5a UPSTREAM LLM phrase-role classification [TopicGPT, NAACL 2024 adapted]
— One LLM call per terminal group:
input : (i) all repeated mined phrases (verbatim from the corpus)
(ii) 2–4 sample variable descriptions for context
(iii) the group name
output : {role_name: [phrases]} e.g. {measure: [...], statistic: [...],
condition: [...]}
— Anti-hallucination:
• every phrase in the returned roles MUST match an input phrase
verbatim (validator drops anything else)
• role names must be 1–2 generic English words (drops fancy/long names)
• ≥ 2 valid roles required; else falls back to mutual-exclusion
— Phrase ↔ role mapping drives the slot hierarchy in Step 6:
measure-like roles become outer levels, condition-like roles inner.
— STRUCTURAL use of the LLM (TopicGPT discovers topics). Distinct
from the downstream label-refiner step (TopicTag, see Step 7e).
Step 7 Node labeling [ZHU §4.3 / TopicTag DocEng 2024]
a) description-prefix phrase shared by ≥60% of cluster
b) group-purity prefix: if ≥70% share one _group top-level value
c) data-driven boilerplate + FIELD_NAME_NOISE filter
d) bigram-preferred discriminative TF-IDF suffix
e) OPTIONAL downstream LLM refinement [TopicTag]:
— receives only evidence terms + parent path + sample descriptions
— strict grounding check: every label word must appear in evidence
— rejected proposals fall back to deterministic label
— provenance stored on each node (label_source, confidence, evidence_terms)
Step 8 Evaluation [TraCo AAAI 2024 / Lau EACL 2014 / TICL §3.4]
— reference-free parent-child coherence and sibling diversity
— reference-free NPMI label coherence and label-quality proxies
— optional group-structure self-consistency, explicitly not manual-reference accuracy
— label and structure provenance audit tables
```
### Key design decisions
| Decision | Rationale |
|---|---|
| FASTopic replaces NMF as primary aspect discovery | NMF (1999) is lexical only; FASTopic (NeurIPS 2024) uses pretrained Transformer + Dual Semantic-relation Reconstruction → semantic, not lexical. |
| NMF kept as fallback | Required for very small groups or when FASTopic / SBERT model is unavailable. |
| Slot mining tried first | Decomposes variables along multiple semantic dimensions before any topic model. No document-level method (NMF, BERTopic, FASTopic) can do this — they all collapse one variable into one vector. |
| No facet trees | Removed: a single coherent LoD tree is easier to defend than parallel views of one clustering. |
| Deterministic labels = default thesis result | Reproducible without API access. LLM refinement is optional re-phrasing only and can be disabled. |
| LLM via local Ollama | Localhost OpenAI-compatible endpoint (`http://localhost:11434/v1`) → LLM ON by default whenever Ollama is reachable; easy to disable. Override `OLLAMA_URL` / `OLLAMA_MODEL` env vars for non-default deployments. No external API, no key management, fully reproducible from a known model checkpoint. |
| Strict LLM grounding | Every label word must appear in evidence — labels come from the CSV, LLM only rewords. |
| Per-node provenance | Audit trail: `label_source ∈ {description_prefix, tfidf_bigram, group_anchor, phrase_slot, llm, fallback}`. |
### Thesis wording (defense-safe)
*Approach 2 is a dataset-constrained multi-aspect hierarchy with strict separation
between structural decisions and label generation. The hierarchy topology is produced
deterministically: detected group metadata anchors the upper levels, IE-style phrase-slot
mining decomposes variables along multiple semantic dimensions when description structure
permits, and FASTopic (NeurIPS 2024) discovers latent semantic aspects in the remaining
groups, with NMF retained as a lexical fallback for small groups. Concept labels are
generated by a deterministic four-stage pipeline whose evidence comes exclusively from
the dataset itself. An optional TopicTag-style LLM refinement layer may re-phrase these
labels, but every LLM proposal must pass a grounding check — each word in the proposed
label must appear in the extracted evidence — and every label records its provenance
(source stage, confidence, evidence terms). The label-refinement LLM can neither alter the
tree structure nor introduce vocabulary absent from the input CSV.*
### Papers used
| Ref | Citation | Role in this method |
|---|---|---|
| [ZHU] | Zhu et al. (2025). *Context-Aware Hierarchical Taxonomy Generation via LLM-Guided Multi-Aspect Clustering.* EMNLP 2025. | Main scaffold — adapted (FASTopic+NMF replace LLM aspect generation; greedy silhouette replaces Eq. 6/7 search). |
| [FASTopic] | Wu et al. (2024). *FASTopic: Pretrained Transformer is a Fast, Adaptive, Stable, and Transferable Topic Model.* NeurIPS 2024 (arXiv:2405.17978). | Recent SOTA replacement for NMF — semantic topic discovery via Dual Semantic-relation Reconstruction with optimal transport. |
| [IE-Slot] | Established IE literature on slot induction (surveyed *ACM Computing Surveys* 2022). | Phrase-slot mining adaptation — decomposes one variable into multiple alternative-phrase signals before clustering. |
| [GON] | Gonçalves et al. (2019). ESWC 2019. | Canonical metadata text-object construction. |
| [TopicGPT] | Pham et al. (2024). *TopicGPT: A Prompt-based Topic Modeling Framework.* NAACL 2024 (arXiv:2311.01449). | **STRUCTURAL** upstream LLM use — discovers semantic-role schema from mined phrases (one call per group); drives slot ordering in the hierarchy. Anti-hallucination contract: every phrase verbatim from corpus. |
| [TopicTag] | Eren et al. (2024). DocEng 2024 (arXiv:2407.19616). | Constrained LLM label-refinement pattern — LLM only names existing clusters, never modifies structure. Downstream use only. |
| [Qwen2.5] | Qwen Team (2024). *Qwen 2.5 Technical Report.* arXiv:2412.15115. | Open instruction-tuned model used as the local LLM (via Ollama) for label refinement. Replaces a hosted LLM for full offline reproducibility. |
| [TraCo] | Wu et al. (2024). AAAI 2024 (arXiv:2401.14113). | Diagnostic metrics: parent-child coherence + sibling diversity. |
| [TaxoAdapt] | Kargupta et al. (2025). ACL 2025 (arXiv:2506.10737). | Multidimensional taxonomy motivation. |
| [SC-Taxo] | (2026). arXiv:2605.00620. | Future work — bidirectional semantic consistency. |
| [BISE-26] | Motamedi, Novalija, Rei (2026). Springer BISE. | Validates group-anchored entry strategy. |
| [TICL] | Kejriwal et al. (2022). EAAI 108, 104548. | Concept-label evaluation framework. |
### Known limitations (honest)
* **FASTopic is still document-level** — better than NMF semantically, but a single variable
is still one vector. Multi-dimension decomposition relies on phrase-slot mining.
* **Greedy silhouette split** — not the full probabilistic search of Zhu et al. Eq. 6/7.
* **TraCo metrics are diagnostic only** — measured, not enforced (no neural transport plan).
* **LLM labels depend on the local Ollama model (e.g. Qwen 2.5)** — model choice affects reproducibility;
the deterministic pipeline is the canonical thesis result.
""")