# approach 1.py — Automatic Metadata Hierarchy Builder — Approach 1 # # Algorithm (no hardcoded domain-specific labels): # 1. Read metadata → detect roles → build canonical schema # 2. Extract candidate concepts automatically from metadata text # 3. Detect domain → select external sources # 4. Retrieve concept TABLE from external sources (Wikidata, Wikipedia, WordNet, BioPortal) # 5. Embed variables + concept table (SBERT or TF-IDF fallback) # 6. Compute N×M cosine similarity matrix [GON] — variables × concepts # 7. Score concept assignment: embedding + string + frequency + source + hierarchy # 8. Build task/group-first hierarchy using automatically assigned concept labels # 9. HiExpan refinement: sibling coherence, width expansion, depth expansion, global opt # 10. VIANNA LoD tree + Castanet parallel facets # 11. Export with label provenance # # Papers: # [GON] Gonçalves et al. — biomedical metadata alignment via N×M concept similarity matrix # [TAX] Taxonomizer (Sultanum et al.) — leaf=attribute, internal node=abstract group # [HIE] HiExpan (Shen et al.) — width/depth expansion, sibling coherence, global opt # [CAS] Castanet — parallel faceted hierarchies over the same variable set from __future__ import annotations import csv, json, re, time, warnings from collections import defaultdict from difflib import SequenceMatcher from pathlib import Path import numpy as np import pandas as pd import plotly.graph_objects as go from plotly.subplots import make_subplots import requests import streamlit as st from sklearn.cluster import AgglomerativeClustering from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import TruncatedSVD from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score from sklearn.metrics.pairwise import cosine_distances, cosine_similarity from sklearn.preprocessing import LabelEncoder try: from sentence_transformers import SentenceTransformer _ST_AVAILABLE = True except ImportError: _ST_AVAILABLE = False try: import nltk for _pkg in ('wordnet', 'omw-1.4'): try: nltk.data.find(f'corpora/{_pkg}') except LookupError: nltk.download(_pkg, quiet=True) from nltk.corpus import wordnet as wn _WORDNET_AVAILABLE = True except Exception: _WORDNET_AVAILABLE = False warnings.filterwarnings('ignore') st.set_page_config(page_title='Metadata Hierarchy — Approach 1', page_icon='🌳', layout='wide') st.title('Metadata Hierarchy Builder — Approach 1') st.caption( 'Automatic concept-label extraction from metadata text + HiExpan refinement + Castanet facets. ' 'External enrichment (Wikidata / Wikipedia / PubMed) activates automatically for biomedical, ' 'cognitive, and neurological domains.' ) # ───────────────────────────────────────────────────────────────────────────── # CONSTANTS # ───────────────────────────────────────────────────────────────────────────── LEAF_KEYS = 'variable var field column attribute name code id item indicator question measure concept'.split() GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split() TEXT_KEYS = 'description definition desc label title question meaning note notes text display full details explanation comment'.split() META_KEYS = 'type dtype data_type datatype unit units format decimal precision values value coding codebook range min max scale'.split() RELATION_TYPES = { 'has_measure': 'has measure type', 'is_statistic_of': 'is statistic of', 'has_condition': 'has condition', 'part_of': 'part of', # Wikidata P361 'instance_of': 'instance of', # Wikidata P31 'subclass_of': 'subclass of', # Wikidata P279 'belongs_to': 'belongs to', 'related_to': 'semantically related to', } # Source confidence weights for concept scoring [GON] SOURCE_CONFIDENCE = { 'group_path': 0.95, 'description_title': 0.91, # [FIX4][TAX][LOB] Text before first colon in description — highly discriminative 'bioportal': 0.92, 'cognitive_atlas': 0.94, # [C5] Cognitive Atlas — domain-specific for CANTAB/cognitive, above Wikidata 'wikidata': 0.88, 'wordnet': 0.83, 'pubmed': 0.82, 'wikipedia': 0.78, 'metadata_tfidf': 0.65, 'noun_phrase': 0.55, } # English stop words (standard, domain-agnostic) _STOP = { 'the','a','an','is','are','was','were','be','been','being','have','has','had', 'do','does','did','will','would','shall','should','may','might','must','can', 'could','of','in','on','at','to','for','with','by','from','about','as','into', 'through','during','before','after','above','below','between','each','all', 'both','few','more','most','other','some','such','no','nor','not','only', 'same','so','than','too','very','just','this','that','these','those','which', 'who','when','where','why','how','what','and','but','or','if','then','because', 'while','although','however','therefore','thus','hence','also','well','used', 'using','use','based','given','defined','number','value','values','score', } # ─── KeyBERT / labelling configuration ─────────────────────────────────────── # These tune the KeyBERT label synthesizer used in the hybrid scorer. # # USE_NOUN_PHRASES — True: candidate phrases are NLTK POS-tagged noun phrases # (needs the 'averaged_perceptron_tagger' corpus); False: plain n-gram candidates # from tokens. False is robust for short CANTAB/AI-MIND descriptions and avoids the # extra NLTK dependency. USE_NOUN_PHRASES = False # USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide # boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid. USE_CTFIDF = True # KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid # (pick the single most relevant phrase); 0.5 = standard MMR diversification. KEYBERT_DIVERSITY = 0 # ─── Title-SEEDED KeyBERT label-scorer weights ─────────────────────────────── # Concept labels are FORMED FROM THE DESCRIPTIONS (KeyBERT candidate phrases over the # cluster's member descriptions). The pre-colon title is a ranking SEED/anchor, not the # label itself: LABEL_W_TITLE controls how strongly it biases the choice toward the # human-canonical phrasing (this is "Guided/Seeded KeyBERT"). Set LABEL_W_TITLE=0 for a # pure-description ablation. Magnitudes are relative (need not sum to 1). LABEL_W_RELEVANCE = 0.45 # cosine(candidate, cluster centroid) — description fit (α) LABEL_W_TITLE = 0.35 # cosine(candidate, pre-colon title) — title influence (β) LABEL_W_CONTRAST = 0.15 # discriminativeness vs sibling clusters (γ) # NOTE: node labels are formed from DESCRIPTIONS + pre-colon TITLE only. External # ontology sources (Cognitive Atlas / Wikidata / WordNet / PubMed) inform the embedding # space / semantic understanding but are never used to name a node — so there is no # external-grounding term in the label score. # Corpus IDF over description n-grams; populated in build_concept_hierarchy() and # consumed by _keybert_label when USE_CTFIDF=True. _CORPUS_IDF: dict = {} # Active dataset domain; set in build_concept_hierarchy(), read by the hybrid label # scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing). _ACTIVE_DOMAIN: str = 'general' # Label boilerplate: web/URL artifacts and Likert response-scale tokens that leak from # data-dictionary descriptions (e.g. HCP FreeSurfer rows embed Neurolex URLs; survey rows # embed "strongly agree" scales). These are stripped from KeyBERT candidates AND from the # embedding text so they can neither name a node nor distort clustering. Domain-agnostic # documentation/scale tokens only — not concept vocabulary. _LABEL_BOILERPLATE = { 'http', 'https', 'href', 'www', 'org', 'com', 'net', 'wiki', 'url', 'link', 'neurolex', 'connectomedb', 'humanconnectome', 'definition', 'category', 'sa', 'sd', 'strongly', 'agree', 'disagree', 'neither', 'somewhat', } # Inline URLs in free text (http://…, www.…/…) — removed from the embedding text. _URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)', re.IGNORECASE) # ───────────────────────────────────────────────────────────────────────────── # FILE LOADING # ───────────────────────────────────────────────────────────────────────────── def safe_name(name): return ''.join(ch if ch.isalnum() or ch in '-_.' else '_' for ch in name) def try_read_csv(path): best, best_score = None, -1 for enc in ['utf-8-sig', 'utf-8', 'latin1']: for sep in [None, ',', '\t', ';', '|']: try: df = pd.read_csv(path, sep=sep, engine='python', encoding=enc) score = df.shape[1] * 10 - float(df.isna().mean().mean()) if score > best_score: best, best_score = df, score except Exception: pass if best is None: raise ValueError(f'Could not read {path.name}') best.columns = [str(c).strip().replace(';', '') for c in best.columns] if len(best) > 0: first = best.iloc[:, 0].astype(str) other_null = best.iloc[:, 1:].isna().mean().mean() if best.shape[1] > 1 else 1.0 if first.str.contains(',').mean() > 0.50 and other_null > 0.70: lines = path.read_text(encoding='utf-8-sig', errors='replace').splitlines() if lines: header = [h.strip().replace(';', '') for h in lines[0].split(',')] rows = [] for line in lines[1:]: line = line.strip().rstrip(';') if not line: continue if line.startswith('"') and line.endswith('"'): line = line[1:-1] try: parts = next(csv.reader([line], quotechar='"')) except Exception: continue if len(parts) >= len(header): rows.append(parts[:len(header)]) if rows: best = pd.DataFrame(rows, columns=header) best.columns = [str(c).strip().replace(';', '') for c in best.columns] return best def load_any(path): s = path.suffix.lower() if s in ['.csv', '.tsv', '.txt']: return try_read_csv(path) if s in ['.xlsx', '.xls']: return pd.read_excel(path) if s == '.json': obj = json.loads(path.read_text(encoding='utf-8', errors='replace')) if isinstance(obj, list): return pd.json_normalize(obj) if isinstance(obj, dict): for v in obj.values(): if isinstance(v, list): return pd.json_normalize(v) if s in ['.md', '.markdown']: rows = [] for ln in path.read_text(encoding='utf-8', errors='replace').splitlines(): if '|' not in ln: continue cells = [c.strip() for c in ln.strip().strip('|').split('|')] if cells and not all(re.fullmatch(r':?-{2,}:?', c or '') for c in cells): rows.append(cells) if len(rows) < 2: raise ValueError('No Markdown table found') header = rows[0] data = [r[:len(header)] + [''] * max(0, len(header) - len(r)) for r in rows[1:]] return pd.DataFrame(data, columns=header) raise ValueError(f'Unsupported: {s}') def probably_raw(df): cols = [str(c).lower() for c in df.columns] return df.shape[1] > 20 and not any(any(k in c for k in TEXT_KEYS) for c in cols) def raw_to_metadata(df): rows = [] for c in df.columns: s = df[c] dtype = 'number' if pd.api.types.is_numeric_dtype(s) else 'string' sample = ', '.join(map(str, s.dropna().astype(str).unique()[:5])) rows.append({'name': str(c), 'description': f'Column dtype:{dtype}. Values:{sample}', 'dtype': dtype}) return pd.DataFrame(rows) import tempfile def save_uploads(files): tmp = Path(tempfile.mkdtemp(prefix='meta_app1_')) paths = [] for f in files: p = tmp / safe_name(f.name) p.write_bytes(f.getbuffer()) paths.append(p) return paths # ───────────────────────────────────────────────────────────────────────────── # ROLE DETECTION # ───────────────────────────────────────────────────────────────────────────── def norm(c): return re.sub(r'[^a-z0-9]+', '_', str(c).strip().lower()).strip('_') def kscore(c, keys): nc = norm(c) return sum(1 for k in keys if k in nc) def profile_columns(df): out = [] n = max(len(df), 1) for col in df.columns: s = df[col] non = float(s.notna().mean()) nun = int(s.nunique(dropna=True)) ur = nun / n avg = float(s.dropna().astype(str).map(len).mean()) if s.notna().any() else 0 out.append({ 'column': str(col), 'non_null': round(non, 3), 'unique_values': nun, 'unique_ratio': round(ur, 3), 'avg_length': round(avg, 1), 'leaf_score': 4 * kscore(col, LEAF_KEYS) + (3 if 0.5 <= ur <= 1 else 0) + (1 if avg < 80 else 0), 'group_score': 4 * kscore(col, GROUP_KEYS) + (3 if 1 < nun < min(n * 0.5, 80) else 0) + (1 if avg < 60 else 0), 'text_score': 5 * kscore(col, TEXT_KEYS) + (4 if avg > 50 else 0) + (1 if non > 0.5 else 0), 'metadata_score': 4 * kscore(col, META_KEYS) + (2 if 1 < nun < min(n * 0.8, 100) else 0), }) return pd.DataFrame(out) def detect_roles(df): prof = profile_columns(df) leaf = prof.sort_values(['leaf_score', 'unique_ratio'], ascending=False).head(1)['column'].tolist() text = prof[(prof.text_score >= 4) | (prof.avg_length > 80)].sort_values('text_score', ascending=False)['column'].tolist() or leaf.copy() group = prof[(prof.group_score >= 4) & (~prof.column.isin(leaf)) & (prof.unique_values > 1)].sort_values('group_score', ascending=False)['column'].head(3).tolist() meta = prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))].sort_values('metadata_score', ascending=False)['column'].head(5).tolist() # DDI/CDISC: representation columns must never become structural hierarchy levels [GON][TAX] # These substrings identify physical metadata — universally, across any domain. _META_SUBSTR_BLOCK = { 'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale', 'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing', } def _col_is_repr(col_name): nc = re.sub(r'[^a-z0-9]', '', str(col_name).lower()) return any(sub in nc for sub in _META_SUBSTR_BLOCK) # Force representation columns out of group and into metadata meta_extra = [c for c in prof['column'].tolist() if _col_is_repr(c) and c not in text and c not in leaf and c not in meta] group = [c for c in group if not _col_is_repr(c)] meta = list(dict.fromkeys(meta + meta_extra))[:8] return {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'metadata_cols': meta}, prof def sv(x): return '' if pd.isna(x) else str(x).strip() def guess_dtype(row, dtype_cols, label): joined = ' '.join(sv(row.get(c, '')) for c in dtype_cols).lower() if any(t in joined for t in ['num', 'int', 'float', 'double', 'decimal', 'continuous', 'number']): return 'number' if any(t in joined for t in ['string', 'text', 'char', 'category', 'categorical', 'nominal']): return 'string' if re.search(r'(name|country|gender|sex|site|visit|status)', label.lower()): return 'string' return 'determine' def build_canonical(df, cfg, source): """[GON] Build unified metadata text object from any tabular metadata file.""" leaf_cols = cfg.get('leaf_cols', []) group_cols = cfg.get('group_cols', []) text_cols = cfg.get('text_cols', []) meta_cols = cfg.get('metadata_cols', []) if not leaf_cols: raise ValueError('Choose at least one leaf column') rows = [] for i, row in df.iterrows(): leaf_parts = [sv(row.get(c, '')) for c in leaf_cols] leaf_parts = [p for p in leaf_parts if p] label = ' / '.join(leaf_parts) if leaf_parts else f'variable_{i+1}' group_parts = [sv(row.get(c, '')) for c in group_cols] group_parts = [p for p in group_parts if p and p.lower() not in ['nan', 'none']] gpath = ' > '.join(group_parts) if group_parts else 'Ungrouped' parts = [] for c in list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols)): v = sv(row.get(c, '')) if v: parts.append(f'{c}: {v}') text = ' | '.join(parts) if parts else label # _semantic_text: description values only, no "fieldname: " prefixes [TAX][GON] # TAX embeds description text, not the full metadata row. # This is the input to SBERT and TF-IDF candidate extraction. sem_parts = [] for c in text_cols: v = sv(row.get(c, '')) if v: sem_parts.append(v) if not sem_parts: sem_parts = list(leaf_parts) if leaf_parts else [] semantic_text = ' '.join(sem_parts) if sem_parts else text # Strip inline URLs (HCP FreeSurfer rows embed Neurolex links) so web tokens # cannot dominate either the embedding (clustering) or the KeyBERT label. semantic_text = _URL_RE.sub(' ', semantic_text) rows.append({ '_source_file': source, '_row_index': int(i), '_leaf_label': label, '_leaf_id': f'{gpath}.{label}' if gpath != 'Ungrouped' else label, '_group_path': gpath, '_text': text, '_semantic_text': semantic_text, '_dtype': guess_dtype(row, meta_cols, label), '_raw': row.to_dict(), '_concept_label': '', '_concept_score': 0.0, '_concept_source': '', '_code_family': '', }) can = pd.DataFrame(rows) if can['_leaf_id'].duplicated().any(): cnt = defaultdict(int) ids = [] for lid in can['_leaf_id']: cnt[lid] += 1 ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}') can['_leaf_id'] = ids return can # ───────────────────────────────────────────────────────────────────────────── # [F3] EARLY FACET PRE-COMPUTATION [CAS] # Castanet: parallel facets (Statistic, Condition) are orthogonal split dimensions. # These must be available BEFORE build_concept_hierarchy so _cluster_and_label # can use them for sub-splitting. detect_facets/build_castanet_facets is called # AFTER the hierarchy build, which is too late — so we compute them here first. # ───────────────────────────────────────────────────────────────────────────── def precompute_stat_cond_facets(can): """ Pre-compute _facet_cond on can (numeric experimental conditions only). Called before build_concept_hierarchy so that _cluster_and_label can use it to insert Condition sub-tiers. NOTE: the statistic tier (Mean / Median / SD / …) is NO LONGER computed here. It used to come from a hardcoded statistic vocabulary regex, which (a) is domain hardcoding and (b) is not derived from the data's own concept titles. Statistic depth is now produced data-drivenly by _nest_by_measure(), which discovers the shared measure phrase and keeps the residual (Mean/Median/SD) as children — no word list. Condition detection below stays: it is structural (a digit in the code validated against the description text), not a hardcoded vocabulary. [CAS] Castanet parallel facets · [HIE] HiExpan sub-set discovery """ can = can.copy() sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text' # ── Condition: digit in variable code VALIDATED by description text ────────── # [FIX2][GON] Gonçalves et al. (ESWC 2019): structural code alignment must be # validated against description text — the description is the authoritative source. # Previous rule: any digit in the code = condition value → caused false labels # like "468" (from SWMBE468) and HCP numeric suffixes that are not conditions. # New rule: a digit is accepted as a condition only if it ALSO appears as a # standalone token in the variable's description text, confirming it is a real # experimental parameter (delay, boxes, items, etc.). _num_re = re.compile(r'(\d+)') def _extract_cond(row): code = str(row['_leaf_label']).split('/')[0].strip() hits = _num_re.findall(code) if not hits: return '' desc_text = str(row.get(sem_col, row.get('_text', ''))).lower() for digit in hits: # Accept digit only if it appears as a whole word in the description if re.search(r'\b' + re.escape(digit) + r'\b', desc_text): return digit return '' cond_col = can.apply(_extract_cond, axis=1) can['_facet_cond'] = cond_col.where(cond_col != '', '') return can # ───────────────────────────────────────────────────────────────────────────── # SEMANTIC EMBEDDER [TAX][GON] # ───────────────────────────────────────────────────────────────────────────── class SemanticEmbedder: """SBERT with TF-IDF+SVD fallback. [TAX] Word2Vec→SBERT; [GON] GloVe→SBERT. Critical fix: in TF-IDF mode, a single vectorizer+SVD is fit jointly on ALL texts (variables + concept entries) so both live in the same vector space. Without this, N×M cosine similarity between separately-fit spaces is meaningless. """ def __init__(self, model_name='all-MiniLM-L6-v2'): self.model_name = model_name self._model = None self._using_st = False self._joint_vec = None # shared TF-IDF vectorizer (fallback mode only) self._joint_svd = None # shared SVD (fallback mode only) self._joint_dim = 64 def load(self): if _ST_AVAILABLE: try: self._model = SentenceTransformer(self.model_name) self._using_st = True return True, f'Loaded {self.model_name} (SBERT)' except Exception as e: return False, f'sentence-transformers failed: {e}' return False, 'sentence-transformers not installed — using TF-IDF+SVD fallback' def fit_joint(self, all_texts): """ Call once with variable texts + concept full_texts combined BEFORE encoding. Ensures TF-IDF fallback uses a single shared vector space for N×M alignment. No-op when SBERT is active (SBERT is already a universal space). """ if self._using_st: return clean = [str(t) for t in all_texts if str(t).strip()] if len(clean) < 2: return vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000, min_df=1) X = vec.fit_transform(clean) n_comp = min(self._joint_dim, X.shape[1] - 1, X.shape[0] - 1) if n_comp >= 2: svd = TruncatedSVD(n_components=n_comp, random_state=42) svd.fit(X) self._joint_vec = vec self._joint_svd = svd def encode(self, texts): if self._using_st and self._model is not None: embs = self._model.encode(texts, show_progress_bar=False, batch_size=64, normalize_embeddings=True) return np.array(embs) # TF-IDF fallback — use shared space if available clean = [str(t) for t in texts] if self._joint_vec is not None and self._joint_svd is not None: X = self._joint_vec.transform(clean) embs = self._joint_svd.transform(X) else: # Independent fit (only before fit_joint is called — e.g. early pipeline stages) vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000, min_df=1) X = vec.fit_transform(clean) n_comp = min(self._joint_dim, X.shape[1] - 1, X.shape[0] - 1) embs = (TruncatedSVD(n_components=n_comp, random_state=42).fit_transform(X) if n_comp >= 2 else X.toarray().astype(float)) norms = np.linalg.norm(embs, axis=1, keepdims=True) norms[norms == 0] = 1 return (embs / norms).astype(float) @property def backend(self): return self.model_name if self._using_st else 'TF-IDF+SVD (joint-fit, fallback)' # ───────────────────────────────────────────────────────────────────────────── # CODE / ACRONYM ANALYSIS [GON] # Detects coded variable names and groups them by shared structural prefix. # Gonçalves et al. use string-distance clustering before semantic alignment. # ───────────────────────────────────────────────────────────────────────────── def detect_coded_variables(can): """ Returns mask of rows whose leaf label looks like a variable code: all-uppercase strings with digits, short, no spaces (e.g. DMSL0SD). """ pattern = re.compile(r'^[A-Z][A-Z0-9_]{2,}$') return can['_leaf_label'].apply(lambda x: bool(pattern.match(str(x).strip().split('/')[0].strip()))) def cluster_codes_by_prefix(can): """ [F7] Groups coded variable names by their structural prefix. Improvement over simple ^[A-Z]+ regex: uses longest-common-prefix detection so that codes without digits (DMSLADSD, DMSLSSD) join the same family as codes with digits (DMSL0SD, DMSL4SD, DMSL12SD). Algorithm: 1. For each coded variable, extract the alphabetic prefix before first digit (same as before for codes WITH digits, e.g. DMSL from DMSL0SD). 2. For codes WITHOUT digits, try progressively shorter prefixes until finding one shared by ≥2 other codes — so DMSLADSD tries "DMSLADSD", "DMSLADS", "DMSLA", "DMSL" → "DMSL" matches ≥2 others → family = "DMSL". 3. Assign the LONGEST matching prefix as the family key. Result: DMSL0SD, DMSL4SD, DMSL12SD, DMSLADSD, DMSLSSD, DMSLSD all share family "DMSL" regardless of digit presence. Works on any CSV domain. """ can = can.copy() coded_mask = detect_coded_variables(can) if not coded_mask.any(): return can # Step 1: collect all codes and their alpha prefix before first digit idx_to_code = {} idx_to_alpha = {} for idx, row in can[coded_mask].iterrows(): code = str(row['_leaf_label']).strip().split('/')[0].strip() idx_to_code[idx] = code m = re.match(r'^([A-Z]+)', code) idx_to_alpha[idx] = m.group(1) if m else code # Step 2: build prefix → {indices} map for all possible prefix lengths ≥ 3 prefix_to_idxs = defaultdict(set) for idx, alpha in idx_to_alpha.items(): for length in range(3, len(alpha) + 1): prefix_to_idxs[alpha[:length]].add(idx) # Step 3: for each code find the longest prefix with ≥2 total matching codes prefix_counts = {p: len(idxs) for p, idxs in prefix_to_idxs.items()} best_prefix = {} for idx, alpha in idx_to_alpha.items(): chosen = None for length in range(len(alpha), 2, -1): # try longest first candidate = alpha[:length] if prefix_counts.get(candidate, 0) >= 2: chosen = candidate break best_prefix[idx] = chosen # Step 4: assign — only use a prefix if it appears in ≥2 variables for idx, prefix in best_prefix.items(): if prefix: can.at[idx, '_code_family'] = prefix return can def expand_variable_codes(can): """ [GON] Automatically expand variable code segments to human-readable terms. Three evidence sources — all data-driven, no hardcoded domain terms: 1. Parenthetical patterns in description text: 'DMS (Delayed Matching to Sample)' → DMS = Delayed Matching to Sample 2. Repeated positional suffix across a code family: DMSL0SD, DMSL4SD → suffix 'SD' constant → search descriptions for 'SD' expansion 3. Group name as expansion of code prefix: codes in group 'Delayed Matching to Sample' → prefix DMSL ≈ group name Returns dict: {segment → {'expansion': str, 'evidence': [str]}} """ expansions = {} # Source 1: parenthetical patterns "(ABBR)" or "(Full Name)" paren_re = re.compile( r'\b([A-Z]{2,8})\b\s*[\(\[]\s*([A-Za-z][^)\]]{3,80})\s*[\)\]]' r'|([A-Za-z][^(\[]{3,60})\s*[\(\[]\s*([A-Z]{2,8})\s*[\)\]]' ) for text in can['_text'].fillna('').astype(str): for m in paren_re.finditer(text): if m.group(1): # ABBR (Full Name) seg, exp = m.group(1), m.group(2).strip() else: # Full Name (ABBR) seg, exp = m.group(4), m.group(3).strip() exp = exp.split('.')[0].split(';')[0].strip() if len(exp) >= 4 and seg not in expansions: expansions[seg] = {'expansion': exp, 'evidence': ['description_parenthetical']} # Source 2: repeated positional suffix across a code family coded_mask = detect_coded_variables(can) family_rows = defaultdict(list) for _, row in can[coded_mask].iterrows(): fam = str(row.get('_code_family', '')) if fam: family_rows[fam].append(row) seg_tok = re.compile(r'([A-Z]{2,})') for fam, rows in family_rows.items(): if len(rows) < 2: continue codes = [str(r['_leaf_label']).strip().split('/')[0] for r in rows] all_segs = [seg_tok.findall(c) for c in codes] min_len = min((len(s) for s in all_segs), default=0) for pos in range(-1, -min_len - 1, -1): vals = [s[pos] for s in all_segs if len(s) >= abs(pos)] if not vals or vals[0].isdigit(): continue seg_val = vals[0] if all(v == seg_val for v in vals) and seg_val not in expansions: look_re = re.compile( rf'\b{re.escape(seg_val)}\b[\s\-–:]*([A-Za-z][a-zA-Z ]+)', re.IGNORECASE ) for r in rows: hit = look_re.search(str(r.get('_text', ''))) if hit: exp = hit.group(1).strip().split('.')[0].split('(')[0].strip() if 4 <= len(exp) <= 60: expansions[seg_val] = { 'expansion': exp, 'evidence': [f'code_family_{fam}_positional_suffix'] } break # Source 3: group name as prefix expansion for fam, rows in family_rows.items(): if fam in expansions: continue groups = [str(r.get('_group_path', '')).split(' > ')[0].strip() for r in rows if str(r.get('_group_path', '')) not in ('', 'nan', 'Ungrouped')] if groups and groups[0].lower() != fam.lower(): expansions[fam] = {'expansion': groups[0], 'evidence': ['group_name_match']} return expansions # ───────────────────────────────────────────────────────────────────────────── # DOMAIN DETECTION # Routes to domain-specific external sources automatically. # ───────────────────────────────────────────────────────────────────────────── _DOMAIN_SIGNALS = { 'cognitive': [ 'reaction time', 'response time', 'memory', 'attention', 'executive', 'cognitive', 'correct', 'error', 'delay', 'task', 'trial', 'stimulus', 'recall', 'recognition', 'working memory', 'inhibition', 'processing speed', 'latency', 'accuracy', 'hit', 'false alarm', 'miss', ], 'biomedical': [ 'patient', 'clinical', 'diagnosis', 'treatment', 'disease', 'symptom', 'medication', 'hospital', 'brain', 'neural', 'mri', 'fmri', 'eeg', 'biomarker', 'genetic', 'phenotype', 'cohort', 'longitudinal', ], 'finance': [ 'price', 'return', 'portfolio', 'equity', 'bond', 'yield', 'market', 'stock', 'currency', 'gdp', 'inflation', 'revenue', 'profit', 'index', ], 'environment': [ 'temperature', 'precipitation', 'climate', 'emission', 'pollution', 'biodiversity', 'ecosystem', 'carbon', 'species', 'habitat', 'soil', ], 'survey': [ 'questionnaire', 'likert', 'respondent', 'survey', 'agree', 'disagree', 'strongly', 'satisfaction', 'attitude', 'opinion', ], } def detect_domain(can): """ Detect domain from all metadata text. Returns domain string used to select external sources. """ all_text = ' '.join(can['_text'].fillna('').astype(str).tolist()).lower() scores = {domain: sum(1 for sig in signals if sig in all_text) for domain, signals in _DOMAIN_SIGNALS.items()} best = max(scores, key=scores.get) return best if scores[best] >= 2 else 'general' # ───────────────────────────────────────────────────────────────────────────── # CANDIDATE CONCEPT EXTRACTION FROM METADATA [GON][TAX] # Mines the metadata text itself for candidate concept labels. # No external source needed at this stage — purely data-driven. # Sources: group path components, TF-IDF n-grams, repeated noun phrases. # ───────────────────────────────────────────────────────────────────────────── def extract_candidate_concepts_from_metadata(can, max_concepts=150): """ Extract candidate concept labels from the metadata itself. Uses _semantic_text (description values only) to avoid field-name contamination [TAX][HIE][YAKE]. Returns list of dicts: {label, full_text, frequency, source, tfidf_score} """ # No hardcoded structural/boilerplate word list. Candidate EXTRACTION now keeps # everything that survives standard stop-word removal; boilerplate suppression is # done downstream and SEMANTICALLY in score_concepts_for_cluster() via the # specificity (semantic-IDF) signal, which is corpus-derived and dataset-agnostic. # A term like "Calculated Assessed Trials" is no longer blacklisted here — it is # simply ranked low because it is close to every group centroid. [GON][TaxoGen] _STRUCT_NOISE: set = set() candidates = {} # label_lower → dict # Boolean / value-state noise tokens — candidates made entirely of these words # are documentation artefacts, NOT semantic concepts [FIX5][TaxoGen KDD 2018] _BOOL_NOISE = { 'true', 'false', 'yes', 'no', 'completed', 'incomplete', 'missing', 'unknown', 'none', 'other', 'na', 'n/a', 'not', 'done', 'pending', 'available', 'unavailable', } def _is_bool_noise(label): """Return True if every non-stop word in label is a boolean/value-state token.""" words = set(re.findall(r'\b[a-z]{2,}\b', label.lower())) - _STOP return len(words) > 0 and words.issubset(_BOOL_NOISE) sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text' # ── Source 0: Description titles — colon-structured descriptions only ────── # [FIX4][TAX] Taxonomizer: "text before first colon" is the concept anchor # ONLY when a genuine colon separates label from explanation, e.g. # "Reaction Time: time from stimulus to response" → anchor = "Reaction Time". # [Sultanum & Mueller, IEEE TVCG 2019]; [Lobo et al., ISWC 2023] # # [FIX-R1] Cross-task leakage fix: # CANTAB/AI-Mind descriptions have NO colon — they are plain text like # "DMS Correct Latency Standard Deviation 0 second delay". The naive # implementation treated the FULL description as the title, so # "Prm Correct Latency (Sd) Delayed" entered the global pool and was # assigned to DMS clusters (high SBERT similarity). # Gate: only accept title_raw that is <80% the length of the full description. # This confirms a colon genuinely separates a short label from a long explanation. # CANTAB descriptions (no colon → title == full text) are skipped entirely. # Additional guards: strip task prefix (data-driven), ≤4 words, freq ≥ 2. # Discover top-level task tokens from _group_path — data-driven, NOT hardcoded top_task_tokens: set = set() if '_group_path' in can.columns: for _gp in can['_group_path'].dropna().astype(str): _first = _gp.split(' > ')[0].strip() if _first and _first.lower() not in ('ungrouped', 'nan', ''): top_task_tokens.add(_first.lower()) _task_pfx_re = ( re.compile( r'^(?:' + '|'.join(re.escape(t) for t in sorted(top_task_tokens, key=len, reverse=True)) + r')\s+', re.IGNORECASE, ) if top_task_tokens else None ) title_counts: dict = defaultdict(int) for raw_text in can[sem_col].fillna('').astype(str): desc_part = raw_text if 'description:' in raw_text.lower(): desc_part = re.split(r'description\s*:', raw_text, maxsplit=1, flags=re.IGNORECASE)[-1].strip() full_len = len(desc_part.strip()) if full_len < 3: continue title_raw = re.split(r'[:|]', desc_part)[0].strip() # Gate: title must be genuinely shorter than the full description. # If title ≥ 80% of full text there is no colon structure → skip. if len(title_raw) >= full_len * 0.80: continue title_clean = re.sub(r'^[\s\d\W]+', '', title_raw).strip() if len(title_clean) < 3 or title_clean.replace(' ', '').isdigit(): continue # Strip leading task prefix (data-driven) if _task_pfx_re: title_clean = _task_pfx_re.sub('', title_clean).strip() if len(title_clean) < 3: continue # ≤4 words: a concept anchor must be a short label, not a sentence if len(title_clean.split()) > 4: continue if _is_bool_noise(title_clean): # [FIX5] continue title_counts[title_clean] += 1 for title, cnt in title_counts.items(): if cnt < 2: # must appear in ≥2 variables to be a real shared concept continue # Reject titles containing underscores — always raw variable/column names if '_' in title: continue key = title.lower() if key not in candidates: candidates[key] = { 'label': title, 'full_text': title, 'frequency': cnt, 'source': 'description_title', 'tfidf_score': 0.95, 'cross_group_count': 0, } else: candidates[key]['frequency'] = max(candidates[key]['frequency'], cnt) # Source 1: Group path components — already structured, highest quality for gpath in can['_group_path'].dropna().unique(): for part in str(gpath).split(' > '): part = part.strip() if len(part) >= 3 and part.lower() not in ('ungrouped', 'nan', 'none', ''): key = part.lower() if key not in candidates: candidates[key] = { 'label': part, 'full_text': part, 'frequency': 0, 'source': 'group_path', 'tfidf_score': 1.0, 'cross_group_count': 0, # [FIX6] } candidates[key]['frequency'] += 1 # Source 2: TF-IDF n-grams from _semantic_text (description values only) [TAX][YAKE] texts = can[sem_col].fillna('').astype(str).tolist() if texts: try: vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_features=min(600, max_concepts * 4), min_df=1) X = vec.fit_transform(texts) terms = vec.get_feature_names_out() mean_tfidf = np.asarray(X.mean(axis=0)).flatten() freq_arr = np.asarray((X > 0).sum(axis=0)).flatten() for i in np.argsort(mean_tfidf)[::-1][:max_concepts]: term = terms[i] words = term.split() if not (len(term) >= 4 and any(c.isalpha() for c in term) and not term.replace(' ', '').isdigit()): continue # Skip stop words and structural noise (single-word filter) if len(words) == 1 and (term in _STOP or term.lower() in _STRUCT_NOISE): continue # [FIX5][TaxoGen] Skip boolean/value-state noise candidates if _is_bool_noise(term): continue # YAKE: single-word candidates penalised — prefer multi-word phrases score_mult = 0.5 if len(words) == 1 else 1.0 key = term.lower() if key not in candidates: candidates[key] = { 'label': term, 'full_text': term, 'frequency': int(freq_arr[i]), 'source': 'metadata_tfidf', 'tfidf_score': float(mean_tfidf[i]) * score_mult, 'cross_group_count': 0, # [FIX6] } except Exception: pass # Source 3: Repeated multi-word noun phrases from _semantic_text [TAX] phrase_re = re.compile(r'\b([a-z][a-z0-9]{1,}(?:\s+[a-z][a-z0-9]{1,}){1,3})\b') phrase_counts = defaultdict(int) for text in texts: for m in phrase_re.finditer(text.lower()): phrase = m.group(1) words = phrase.split() if any(w not in _STOP and w not in _STRUCT_NOISE and len(w) >= 3 for w in words): phrase_counts[phrase] += 1 for phrase, count in sorted(phrase_counts.items(), key=lambda x: -x[1]): if count >= 2 and len(phrase) >= 5: if _is_bool_noise(phrase): # [FIX5] continue key = phrase.lower() if key not in candidates: candidates[key] = { 'label': phrase, 'full_text': phrase, 'frequency': count, 'source': 'noun_phrase', 'tfidf_score': 0.0, 'cross_group_count': 0, # [FIX6] } # ── [FIX6][TaxoGen][CAS] Cross-group boilerplate tagging ────────────────── # A concept that appears in EVERY top-level group is dataset-wide boilerplate # (e.g. "Calculated Assessed Trials" across DMS/MOT/PAL/SWM/…). # Count how many distinct top-level groups contain each candidate label. # The penalty is applied later in score_concepts_for_cluster(). # Paper rationale — TaxoGen KDD 2018: contrastive term selection prefers # locally dominant, globally rare terms. Castanet NAACL 2007: facet labels # must discriminate between top-level categories. if '_group_path' in can.columns: top_groups = ( can['_group_path'].fillna('').astype(str) .apply(lambda p: p.split(' > ')[0].strip().lower()) ) all_top_groups = [g for g in top_groups.unique() if g not in ('', 'ungrouped', 'nan')] n_top_groups = max(1, len(all_top_groups)) # Build per-group text corpus for fast membership testing group_texts = {} for grp in all_top_groups: mask = top_groups == grp group_texts[grp] = ' '.join(can.loc[mask, sem_col].fillna('').astype(str)).lower() for key, cand in candidates.items(): cand_words = set(re.findall(r'\b[a-z]{3,}\b', cand['label'].lower())) - _STOP if not cand_words: continue count_in_groups = sum( 1 for grp_text in group_texts.values() if all(w in grp_text for w in cand_words) ) cand['cross_group_count'] = count_in_groups cand['_n_top_groups'] = n_top_groups # store for scorer # Sort: description_title / group_path first, then by tfidf_score, then by frequency _src_priority = {'group_path': 0, 'description_title': 1} result = sorted( candidates.values(), key=lambda x: (_src_priority.get(x['source'], 2), -x['tfidf_score'], -x['frequency']) ) return result[:max_concepts] # ───────────────────────────────────────────────────────────────────────────── # EXTERNAL CONCEPT SOURCES # Build a concept TABLE (not just append text). Each entry has a full_text # that is encoded by SBERT for the N×M alignment matrix. # ───────────────────────────────────────────────────────────────────────────── @st.cache_data(ttl=3600, show_spinner=False) def wikidata_search(term): """Wikidata entity search — returns concatenated descriptions. [GON][HIE]""" try: resp = requests.get( 'https://www.wikidata.org/w/api.php', params={'action': 'wbsearchentities', 'search': term, 'language': 'en', 'format': 'json', 'limit': 3}, timeout=6, headers={'User-Agent': 'MetadataHierarchyTool/1.0'} ) items = resp.json().get('search', []) descs = [it.get('description', '') for it in items if it.get('description')] return ' '.join(descs[:2]) except Exception: return '' @st.cache_data(ttl=3600, show_spinner=False) def wikidata_broader(term): """ P31=instance_of, P279=subclass_of, P361=part_of from Wikidata SPARQL. These are the beyond-is-a relations from [HIE]. """ try: sr = requests.get( 'https://www.wikidata.org/w/api.php', params={'action': 'wbsearchentities', 'search': term, 'language': 'en', 'format': 'json', 'limit': 1}, timeout=5, headers={'User-Agent': 'MetadataHierarchyTool/1.0'} ) items = sr.json().get('search', []) if not items: return [] qid = items[0]['id'] sparql = f""" SELECT ?rel ?broaderLabel WHERE {{ VALUES ?prop {{ wdt:P31 wdt:P279 wdt:P361 }} wd:{qid} ?prop ?broader . BIND(REPLACE(STR(?prop),'.*P','P') AS ?rel) SERVICE wikibase:label {{ bd:serviceParam wikibase:language 'en' . }} }} LIMIT 5 """ resp = requests.get( 'https://query.wikidata.org/sparql', params={'query': sparql, 'format': 'json'}, headers={'Accept': 'application/json', 'User-Agent': 'MetadataHierarchyTool/1.0'}, timeout=10 ) results = resp.json().get('results', {}).get('bindings', []) rel_map = {'P31': 'instance_of', 'P279': 'subclass_of', 'P361': 'part_of'} out = [] for r in results: rel = rel_map.get(r.get('rel', {}).get('value', ''), 'related_to') lbl = r.get('broaderLabel', {}).get('value', '') if lbl: out.append((rel, lbl)) return out except Exception: return [] @st.cache_data(ttl=3600, show_spinner=False) def wikipedia_summary(term): """Wikipedia intro paragraph. Taxonomizer trained on Wikipedia — same corpus. [TAX]""" try: resp = requests.get( 'https://en.wikipedia.org/api/rest_v1/page/summary/' + term.replace(' ', '_'), timeout=6, headers={'User-Agent': 'MetadataHierarchyTool/1.0'} ) extract = resp.json().get('extract', '') return extract[:300] if extract else '' except Exception: return '' @st.cache_data(ttl=3600, show_spinner=False) def pubmed_keywords(query): """PubMed enrichment — biomedical domain only. [GON]""" try: search = requests.get( 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi', params={'db': 'pubmed', 'term': query[:100], 'retmax': 3, 'retmode': 'json'}, timeout=8 ) ids = search.json().get('esearchresult', {}).get('idlist', []) if not ids: return '' fetch = requests.get( 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi', params={'db': 'pubmed', 'id': ','.join(ids), 'rettype': 'abstract', 'retmode': 'text'}, timeout=10 ) return fetch.text[:400] except Exception: return '' @st.cache_data(ttl=3600, show_spinner=False) def bioportal_search(phrase, api_key): """BioPortal ontology search — biomedical domain, requires free API key. [GON]""" if not api_key: return [] try: resp = requests.get( 'https://data.bioontology.org/search', params={'q': phrase, 'pagesize': 3, 'display_links': 'false'}, headers={'Authorization': f'apikey token={api_key}'}, timeout=8 ) results = resp.json().get('collection', []) out = [] for r in results: lbl = r.get('prefLabel', '') defn = ' '.join(r.get('definition', []))[:200] if lbl: out.append({ 'label': lbl, 'full_text': f'{lbl}. {defn}' if defn else lbl, 'source': 'bioportal', 'frequency': 0, 'tfidf_score': 0.0, 'broader_relations': [], }) return out except Exception: return [] def wordnet_lookup(phrase): """WordNet definition + hypernyms — synonym/hypernym expansion. [GON]""" if not _WORDNET_AVAILABLE: return None try: synsets = wn.synsets(phrase.replace(' ', '_')) if not synsets: synsets = wn.synsets(phrase.split()[0]) if phrase.split() else [] if not synsets: return None ss = synsets[0] defn = ss.definition() hypernyms = [h.name().replace('_', ' ').split('.')[0] for h in ss.hypernyms()[:3]] return {'definition': defn, 'hypernyms': hypernyms} except Exception: return None def _is_acronym(phrase): """True if phrase is a short all-caps token — high Wikidata polysemy risk. [GON][BLINK] These tokens must not be queried raw; use the expanded form instead.""" p = phrase.strip() return p.isupper() and 2 <= len(p) <= 6 and sum(c.isalpha() for c in p) >= 2 @st.cache_data(ttl=86400, show_spinner=False) def cognitive_atlas_search(term): """Cognitive Atlas REST API — domain-specific for cognitive/neurological tasks. Preferred over Wikidata for cognitive domain codes. [GON]""" try: resp = requests.get( 'https://www.cognitiveatlas.org/api/v-alpha/task', params={'search': term, 'format': 'json'}, timeout=8, headers={'User-Agent': 'MetadataHierarchyTool/1.0'} ) items = resp.json() if isinstance(items, list) and items: item = items[0] name = item.get('name', '') defn = item.get('definition_text', '') or item.get('alias', '') if name: return f'{name}. {defn[:250]}' if defn else name except Exception: pass return '' def retrieve_concept_table(candidates, domain='general', use_wikidata=True, use_wikipedia=False, use_wordnet=True, use_pubmed=False, bioportal_key='', progress_cb=None, code_expansions=None): """ Build a concept TABLE from candidates + external sources. Each entry: {label, full_text, source, frequency, tfidf_score, broader_relations}. full_text = label + external description → encoded by SBERT for N×M matrix. This is the right-hand side of the Gonçalves N×M alignment matrix. [GON] """ # Start with all candidates as base entries table = {} # label_lower → dict for c in candidates: key = c['label'].lower() table[key] = { 'label': c['label'], 'full_text': c['label'], 'source': c['source'], 'frequency': c.get('frequency', 0), 'tfidf_score': c.get('tfidf_score', 0.0), 'broader_relations': [], } # Enrich top candidates with external sources api_candidates = sorted(candidates, key=lambda x: (0 if x['source'] == 'group_path' else 1, -x.get('tfidf_score', 0), -x.get('frequency', 0)))[:60] n_api = len(api_candidates) code_expansions = code_expansions or {} for i, c in enumerate(api_candidates): if progress_cb: progress_cb(i / n_api) phrase = c['label'] key = phrase.lower() # Determine the query phrase — never query raw acronyms on Wikidata [GON][BLINK] if _is_acronym(phrase): exp = code_expansions.get(phrase, {}).get('expansion', '') query_phrase = exp if exp else None # None = skip Wikidata entirely else: query_phrase = phrase # Cognitive Atlas (cognitive/neurological domain — before Wikidata) [GON] if domain in ('cognitive', 'neurological', 'biomedical') and query_phrase: cat_def = cognitive_atlas_search(query_phrase) if cat_def and key in table: table[key]['full_text'] = f'{phrase}. {cat_def}' table[key]['source'] = 'cognitive_atlas' # Wikidata — use expanded form for acronyms, skip if no expansion found if use_wikidata and query_phrase is not None: wd_desc = wikidata_search(query_phrase) wd_rel = wikidata_broader(query_phrase) if key in table: if wd_desc and table[key]['source'] not in ('cognitive_atlas',): table[key]['full_text'] = f'{phrase}. {wd_desc}' table[key]['source'] = 'wikidata' table[key]['broader_relations'] = wd_rel # WordNet — synonyms, hypernyms, definitions if use_wordnet and _WORDNET_AVAILABLE: wn_res = wordnet_lookup(phrase) if wn_res: wn_key = f'wordnet_{key}' table[wn_key] = { 'label': phrase, 'full_text': f'{phrase}. {wn_res["definition"]}', 'source': 'wordnet', 'frequency': c.get('frequency', 0), 'tfidf_score': c.get('tfidf_score', 0.0), 'broader_relations': [('related_to', h) for h in wn_res.get('hypernyms', [])], } # Wikipedia (optional) if use_wikipedia and i < 20: wiki = wikipedia_summary(phrase) if wiki: wp_key = f'wikipedia_{key}' table[wp_key] = { 'label': phrase, 'full_text': f'{phrase}. {wiki[:200]}', 'source': 'wikipedia', 'frequency': c.get('frequency', 0), 'tfidf_score': c.get('tfidf_score', 0.0), 'broader_relations': [], } # PubMed (biomedical only, optional) if use_pubmed and domain in ('biomedical', 'cognitive') and i < 8: pm = pubmed_keywords(phrase) if pm: pm_key = f'pubmed_{key}' table[pm_key] = { 'label': phrase, 'full_text': f'{phrase}. {pm[:200]}', 'source': 'pubmed', 'frequency': c.get('frequency', 0), 'tfidf_score': c.get('tfidf_score', 0.0), 'broader_relations': [], } time.sleep(0.35) # NCBI rate limit # BioPortal (biomedical only, optional API key) if bioportal_key and domain in ('biomedical', 'cognitive') and i < 20: for bp in bioportal_search(phrase, bioportal_key): bp_key = f"bioportal_{bp['label'].lower()}" table[bp_key] = bp return list(table.values()) # ───────────────────────────────────────────────────────────────────────────── # CONCEPT ALIGNMENT — N×M COSINE SIMILARITY [GON] # Gonçalves et al. build an N×M similarity matrix between metadata field # embeddings and ontology term embeddings, then rank alignments. # Here: N=variable clusters, M=concept table entries. # ───────────────────────────────────────────────────────────────────────────── def _string_overlap(cluster_texts, concept_label): """ Word overlap between cluster descriptions and concept label words. Measures string-level evidence that this concept matches this cluster. """ concept_words = set(re.findall(r'\b[a-z]{3,}\b', concept_label.lower())) - _STOP if not concept_words: return 0.0 cluster_combined = ' '.join(cluster_texts).lower() cluster_words = set(re.findall(r'\b[a-z]{3,}\b', cluster_combined)) - _STOP overlap = len(concept_words & cluster_words) / len(concept_words) return float(overlap) def _seq_sim(a, b): """SequenceMatcher ratio between two strings — for code/label similarity.""" return SequenceMatcher(None, a.lower(), b.lower()).ratio() def score_concepts_for_cluster(cluster_emb, concept_embs, concept_table, cluster_texts, n_total_vars=None, member_embs=None, sibling_centroids=None, ref_centroids=None, corpus_centroid=None, own_group_centroid=None): """ Fully-semantic, multi-signal concept scoring for one cluster. [GON] Gonçalves ESWC 2019 (IDF-weighted embeddings + cosine); [TaxoGen KDD 2018] contrastive term selection; [CAS] Castanet facet contrast. Every signal is cosine-in-embedding-space — no word-overlap, no hardcoded boilerplate lists. All references are data-derived, so it transfers to any set. score = 0.30 × fit (mean cosine of label to THIS cluster's members) + 0.35 × contrast (fit − best cosine to a SIBLING cluster, same task) + 0.25 × home (cosine to OWN-task centroid − mean over all tasks) + 0.10 × source_conf − url_noise_penalty Why three signals, and what each one kills: • fit — must actually describe this cluster. • contrast — kills dataset-wide BOILERPLATE ("Calculated Assessed Trials"): it sits in every sibling cluster too, so fit ≈ sibling-sim → contrast ≈ 0. A real sub-topic ("Total Errors") is in its own cluster but not the latency sibling → positive contrast. • home — kills CROSS-TASK leakage ("Rvp 3 Targets" under DMS): it is far from the DMS group centroid, so (cos to own group − mean over groups) is negative → clipped to 0. A genuine DMS term is at or above the cross-task average → positive. This is task-RELATIVE, unlike a group-agnostic max−mean peak (which wrongly rewarded a sharp RVP-specific label even while labelling a DMS cluster). `own_group_centroid` is the centroid of the current task's variables (passed in by the caller); `ref_centroids` are all top-level task centroids. Returns list of dicts sorted by score descending. """ if concept_embs is None or len(concept_table) == 0: return [] concept_embs = np.asarray(concept_embs, dtype=float) # Similarity of each candidate to this cluster's centroid emb_sims = cosine_similarity([cluster_emb], concept_embs)[0] # ── fit: mean cosine to the cluster's MEMBER embeddings (robust to outliers) if member_embs is not None and len(member_embs) > 0: fit = cosine_similarity(concept_embs, np.asarray(member_embs, dtype=float)).mean(axis=1) else: fit = emb_sims # ── contrast: fit minus closeness to the nearest SIBLING cluster (same task) if sibling_centroids is not None and len(sibling_centroids) > 0: sib_sims = cosine_similarity(concept_embs, np.asarray(sibling_centroids, dtype=float)) contrast = np.clip(fit - sib_sims.max(axis=1), 0.0, 1.0) else: contrast = np.zeros(len(concept_table)) # ── home: does the label belong to THIS task more than to tasks on average? # Task-relative — measured against the CURRENT group centroid, not a peak. home_active = False if own_group_centroid is not None and ref_centroids is not None and len(ref_centroids) >= 2: own_sim = cosine_similarity(concept_embs, [own_group_centroid])[:, 0] all_mean = cosine_similarity(concept_embs, np.asarray(ref_centroids, dtype=float)).mean(axis=1) home = np.clip((own_sim - all_mean) * 3.0, 0.0, 1.0) home_active = True elif own_group_centroid is not None and corpus_centroid is not None: own_sim = cosine_similarity(concept_embs, [own_group_centroid])[:, 0] gen = cosine_similarity(concept_embs, [corpus_centroid])[:, 0] home = np.clip((own_sim - gen) * 3.0, 0.0, 1.0) home_active = True else: # No task reference (e.g. single Ungrouped bucket): neutral, don't filter. home = np.full(len(concept_table), 0.34) src_sc = np.array([SOURCE_CONFIDENCE.get(c.get('source', 'noun_phrase'), 0.55) for c in concept_table]) # URL / HTML artifact penalty — strips documentation junk, not domain terms _url_noise_re = re.compile( r'\b(http|href|wiki|neurolex|org|www|definition|category|link|url)\b', re.IGNORECASE ) noise_penalty = np.array( [0.35 if _url_noise_re.search(c['label']) else 0.0 for c in concept_table] ) total = (0.30 * fit + 0.35 * contrast + 0.25 * home + 0.10 * src_sc - noise_penalty) # Reported only (provenance/debug) — not scored. str_sims = np.array([_string_overlap(cluster_texts, c['label']) for c in concept_table]) results = [] for i, concept in enumerate(concept_table): # Drop candidates that don't fit this cluster, or (when a task reference # exists) that belong to a DIFFERENT task — i.e. home collapsed to 0. if float(fit[i]) < 0.12: continue if home_active and float(home[i]) <= 0.0: continue results.append({ 'label': concept['label'], 'score': float(total[i]), 'embedding_sim': float(emb_sims[i]), 'coverage': float(fit[i]), 'contrast': float(contrast[i]), 'specificity': float(home[i]), 'string_sim': float(str_sims[i]), 'source': concept.get('source', 'unknown'), 'broader_relations': concept.get('broader_relations', []), '_emb': concept_embs[i], }) return sorted(results, key=lambda x: -x['score']) def assign_concept_label(scores, fallback='Group', min_score=0.08, ancestor_names=None, used_sibling_labels=None, top_level_tasks=None, ancestor_embs=None, sibling_label_embs=None, dup_sim=0.82): """ Pick best concept label from scored results. Rejection combines STRUCTURAL guards (domain-agnostic, not hardcoding) with SEMANTIC ones (embedding cosine): Structural: - token self-repetition ("Dms Dms") - label is a substring of / equal to an ancestor, or vice-versa (kills "Dms" and "Dms Recommended Standard" sitting under ancestor "DMS") - all of the label's content words already appear in an ancestor label - exact match with an already-used sibling label - FOREIGN-TASK token: label contains a top-level task name that is NOT the current ancestor task (e.g. "Rvp 3 Targets" / "Swm Errors" under DMS). Task names are discovered from _group_path — data-driven, not hardcoded. Semantic: - cosine(label_emb, any ancestor_emb) > dup_sim → parent paraphrase - cosine(label_emb, any sibling_emb) > dup_sim → sibling paraphrase Returns (label, provenance_dict). """ ancestor_set = {str(a).lower().strip() for a in (ancestor_names or [])} used_set = {str(u).lower().strip() for u in (used_sibling_labels or [])} anc_embs = np.asarray(ancestor_embs, dtype=float) if ancestor_embs is not None and len(ancestor_embs) else None sib_embs = list(sibling_label_embs) if sibling_label_embs else [] # Current task = the ancestor that is itself a top-level task (data-driven) _task_set = {str(t).lower() for t in (top_level_tasks or [])} _current_task = next((str(a).lower() for a in (ancestor_names or []) if str(a).lower() in _task_set), None) def _is_degenerate(lbl, emb=None): """True if label should be rejected.""" l = lbl.strip().lower() # Structural 1: token self-repetition ("Dms Dms", "Swm Swm") toks = l.split() if len(toks) >= 2 and len(set(toks)) < len(toks): return True # Structural 2: substring of / equal to an ancestor (or vice-versa) for anc in ancestor_set: if l == anc or l in anc or anc in l: return True # Structural 3: all content words already present in an ancestor label lbl_words = set(re.findall(r'\b[a-z]{3,}\b', l)) - _STOP for anc in ancestor_set: anc_words = set(re.findall(r'\b[a-z]{3,}\b', anc)) - _STOP if lbl_words and lbl_words.issubset(anc_words): return True # Structural 4: already used by a sibling group if l in used_set: return True # Structural 5: foreign-task token (cross-task contamination) if _current_task and _task_set: for task in _task_set: if task != _current_task and re.search(r'\b' + re.escape(task) + r'\b', l): return True # Semantic parent-duplication: candidate paraphrases an ancestor label if emb is not None and anc_embs is not None: if float(cosine_similarity([emb], anc_embs).max()) > dup_sim: return True # Semantic sibling-duplication: candidate paraphrases a chosen sibling label if emb is not None and sib_embs: if float(cosine_similarity([emb], np.asarray(sib_embs, dtype=float)).max()) > dup_sim: return True return False # Walk ranked scores; skip degenerate candidates chosen = None for s in scores: if s['score'] < min_score: break candidate = s['label'].strip().title() if not _is_degenerate(candidate, s.get('_emb')): chosen = s break if chosen is None: return fallback, { 'node_label': fallback, 'confidence': 0.0, 'alternatives': [], 'source_evidence': ['tfidf_fallback'], 'embedding_sim': 0.0, 'string_sim': 0.0, 'coverage': 0.0, 'contrast': 0.0, 'specificity': 0.0, } label = chosen['label'].strip().title() alts = [s['label'] for s in scores[1:4] if s['label'] != chosen['label'] and not _is_degenerate(s['label'].strip().title(), s.get('_emb'))] provenance = { 'node_label': label, 'confidence': round(chosen['score'], 3), 'alternatives': alts, 'source_evidence': [chosen['source']], 'embedding_sim': round(chosen['embedding_sim'], 3), 'coverage': round(chosen.get('coverage', 0.0), 3), 'contrast': round(chosen.get('contrast', 0.0), 3), 'specificity': round(chosen.get('specificity', 0.0), 3), 'string_sim': round(chosen['string_sim'], 3), } return label, provenance # ───────────────────────────────────────────────────────────────────────────── # TF-IDF FALLBACK CLUSTERING # Used when concept table is unavailable or similarity is too low. # ───────────────────────────────────────────────────────────────────────────── def tfidf_cluster_labels(texts, max_clusters=8): """[GON] TF-IDF agglomerative clustering — discriminative label per cluster.""" n = len(texts) if n <= 1: return [''] * n try: vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=500, min_df=1) X = vec.fit_transform(texts) n_clust = min(max_clusters, max(2, n // 3), X.shape[0]) dist = cosine_distances(X).astype(float) np.fill_diagonal(dist, 0.0) labels = AgglomerativeClustering(n_clusters=n_clust, metric='precomputed', linkage='average').fit_predict(dist) terms = vec.get_feature_names_out() X_dense = X.toarray() cnames = {} for k in range(n_clust): mask = labels == k if not mask.any(): cnames[k] = f'Group {k+1}' continue scores = X_dense[mask].mean(axis=0) - (X_dense[~mask].mean(axis=0) if (~mask).any() else 0) top = [i for i in np.argsort(scores)[::-1] if len(terms[i]) > 3] cnames[k] = terms[top[0]].title() if top else f'Group {k+1}' return [cnames[int(lb)] for lb in labels] except Exception: return [''] * n # ───────────────────────────────────────────────────────────────────────────── # [C8] WORDNET HYPERNYM CHAIN FALLBACK [CAS][TAX] # Castanet: "carves out a structure from WordNet that reflects the collection." # Taxonomizer: "labeling inner nodes requires the identification of hypernyms." # Walks IS-A chain upward from the dominant noun in cluster texts. # Returns the highest-confidence hypernym that is not in excluded_names. # ───────────────────────────────────────────────────────────────────────────── def wordnet_hypernym_fallback(cluster_texts, excluded_names=None): """ [C8][CAS][TAX] Walk WordNet IS-A chain upward from cluster centroid noun. Returns the best hypernym that is: - not in excluded_names (ancestors, parent label) - not a stop word - not too generic (not 'entity','object','thing','abstraction','whole') Falls back to None if WordNet unavailable or no valid hypernym found. """ if not _WORDNET_AVAILABLE: return None excluded = {str(n).lower().strip() for n in (excluded_names or [])} _too_generic = {'entity', 'object', 'thing', 'abstraction', 'whole', 'physical entity', 'psychological feature', 'group', 'attribute', 'measure', 'amount', 'number', 'quantity'} # Extract most frequent meaningful nouns from cluster texts all_text = ' '.join(cluster_texts).lower() words = [w for w in re.findall(r'\b[a-z]{4,}\b', all_text) if w not in _STOP and w not in _too_generic] if not words: return None from collections import Counter freq = Counter(words) candidates_words = [w for w, _ in freq.most_common(8)] best_label = None best_depth = 0 # prefer specific (deeper) hypernyms over generic ones for word in candidates_words: try: synsets = wn.synsets(word, pos=wn.NOUN) if not synsets: continue ss = synsets[0] # Walk hypernym chain — collect all hypernyms with their depth paths = ss.hypernym_paths() for path in paths: for depth, hyp_ss in enumerate(reversed(path)): hyp_name = hyp_ss.name().split('.')[0].replace('_', ' ') if (hyp_name.lower() not in excluded and hyp_name.lower() not in _too_generic and hyp_name.lower() not in _STOP and len(hyp_name) > 3 and depth > 0): # skip the synset itself (depth=0) if depth > best_depth: best_depth = depth best_label = hyp_name.title() break # use deepest valid hypernym per path except Exception: continue return best_label if best_depth > 0 else None # ───────────────────────────────────────────────────────────────────────────── # [C3] DISCRIMINATIVE TF-IDF LABEL PER GROUP [GON][TAX] # Gonçalves: N×M alignment uses text signals from descriptions, not raw codes. # TaxoGen: "local embedding module for discriminative power at each level." # Computes TF-IDF across ALL groups so terms unique to THIS group score high. # ───────────────────────────────────────────────────────────────────────────── def get_discriminative_tfidf_label(cluster_texts, all_groups_texts): """ [C3][GON][TAX] Return the most discriminative 1-2 word label for cluster_texts relative to all_groups_texts (list of text lists from sibling groups). Uses TF-IDF contrast: high TF in cluster, low IDF across all groups = discriminative. Returns a title-cased string or None. """ try: # Build one document per group (cluster + all siblings) cluster_doc = ' '.join(cluster_texts) sibling_docs = [' '.join(g) for g in all_groups_texts if g] all_docs = [cluster_doc] + sibling_docs if len(all_docs) < 2: return None vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=300, min_df=1) X = vec.fit_transform(all_docs) terms = vec.get_feature_names_out() cluster_vec = X[0].toarray()[0] # Score = cluster TF-IDF score — mean across sibling docs sibling_mean = X[1:].toarray().mean(axis=0) if X.shape[0] > 1 else np.zeros(len(terms)) contrast = cluster_vec - sibling_mean best_idxs = [i for i in np.argsort(contrast)[::-1] if len(terms[i]) > 3 and contrast[i] > 0.01] if best_idxs: return terms[best_idxs[0]].title() except Exception: pass return None # ───────────────────────────────────────────────────────────────────────────── # NODE MANIPULATION UTILITIES # ───────────────────────────────────────────────────────────────────────────── def nmap(nodes): return {int(n['id']): n for n in nodes} def next_id(nodes): return max([int(n['id']) for n in nodes] or [0]) + 1 def add_child(nodes, parent, child): m = nmap(nodes); p = m.get(int(parent)) if not p: return rel = list(p.get('related', [])) if int(child) not in rel: rel.append(int(child)) p['related'] = rel def remove_child(nodes, parent, child): m = nmap(nodes); p = m.get(int(parent)) if p: p['related'] = [x for x in p.get('related', []) if int(x) != int(child)] def make_agg(id, name, related=None, op='concat', dtype='determine', desc='', shown=True, relation_type='belongs_to', provenance=None): node = { 'id': int(id), 'name': str(name), 'related': [int(x) for x in (related or [])], 'type': 'aggregation', 'info': {'operation': op, 'usedAttributes': [], 'formula': '', 'exec': '', 'relation_type': relation_type, 'relation_label': RELATION_TYPES.get(relation_type, 'belongs to')}, 'isShown': bool(shown), 'desc': desc or '', 'dtype': dtype, 'recover': True, } if provenance: node['concept_provenance'] = provenance return node def get_node(nodes, id): return nmap(nodes).get(int(id)) def update_node(nodes, id, **upd): for n in nodes: if int(n['id']) == int(id): n.update(upd) return nodes def parents(nodes, child): return [int(n['id']) for n in nodes if int(child) in [int(x) for x in n.get('related', [])]] def ancestor_names(nodes, nid): """ [FIX1][HIE] Walk up the tree from nid collecting all ancestor node names. HiExpan (Shen et al., KDD 2018) Section 4.3 — Conflict Resolution: "avoid assigning a label already present in the path from root to the node." Used by hiexpan_depth_expansion_semantic to pass ancestor context to assign_concept_label, preventing repeated labels across hierarchy levels. """ m = nmap(nodes) result, cur, visited = [], int(nid), set() while cur not in visited: visited.add(cur) pars = parents(nodes, cur) if not pars: break p = pars[0] pn = m.get(p) if pn and pn.get('name') and pn.get('type') != 'root': result.append(str(pn['name'])) cur = p return result def descendants(nodes, id): m = nmap(nodes); seen = [] def rec(nid): n = m.get(int(nid)) if not n: return for c in n.get('related', []): c = int(c) if c in seen: continue seen.append(c); rec(c) rec(id); return seen def leaf_ids(nodes, id): m = nmap(nodes); out = [] def rec(nid): n = m.get(int(nid)) if not n: return if n.get('type') == 'attribute': out.append(int(nid)); return for c in n.get('related', []): rec(int(c)) rec(id); return list(dict.fromkeys(out)) def move_node(nodes, child, new_parent): if int(child) == 0 or int(new_parent) == int(child) or int(new_parent) in descendants(nodes, child): return nodes for p in parents(nodes, child): remove_child(nodes, p, child) add_child(nodes, new_parent, child); return nodes def delete_agg(nodes, id, reattach=True): if int(id) == 0: return nodes n = get_node(nodes, id) if not n or n.get('type') == 'attribute': return nodes ps = parents(nodes, id); children = list(n.get('related', [])) for p in ps: remove_child(nodes, p, id) if reattach: for c in children: add_child(nodes, p, c) return [x for x in nodes if int(x['id']) != int(id)] def build_parent_map(nodes): pm = {} for n in nodes: for c in n.get('related', []): cid = int(c) if cid not in pm: pm[cid] = int(n['id']) return pm # ───────────────────────────────────────────────────────────────────────────── # EVALUATION HELPERS # ───────────────────────────────────────────────────────────────────────────── def _eval_cluster_assignments(nodes: list, can: pd.DataFrame) -> list: """Map each canonical row to the id of its depth-1 aggregation ancestor.""" pm = build_parent_map(nodes) def depth1(nid: int) -> int: while pm.get(nid, -1) not in (-1, 0): nid = pm[nid] return nid lid_to_nid = {n['metadata']['leaf_id']: int(n['id']) for n in nodes if n.get('type') == 'attribute' and 'metadata' in n} return [depth1(lid_to_nid[lid]) if lid in lid_to_nid else -1 for lid in can['_leaf_id']] def _purity_score(y_true, y_pred) -> float: from collections import Counter clusters: dict = {} for t, p in zip(y_true, y_pred): clusters.setdefault(p, []).append(t) correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values()) return correct / max(len(y_true), 1) def _structural_stats(nodes: list) -> dict: pm = build_parent_map(nodes) def depth_of(nid: int) -> int: d = 0 while nid in pm: nid = pm[nid]; d += 1 return d agg = [n for n in nodes if n.get('type') == 'aggregation'] leafs = [n for n in nodes if n.get('type') == 'attribute'] depths = [depth_of(int(n['id'])) for n in leafs] branches = [len(n.get('related', [])) for n in agg] singletons = sum(1 for b in branches if b == 1) return { 'n_aggregation_nodes': len(agg), 'max_depth': int(max(depths, default=0)), 'avg_leaf_depth': round(float(np.mean(depths)), 2) if depths else 0.0, 'avg_branching_factor': round(float(np.mean(branches)), 2) if branches else 0.0, 'singleton_nodes_%': round(100.0 * singletons / max(len(agg), 1), 1), } def path_rows(nodes): m = nmap(nodes); rows = [] def rec(nid, path, depth): n = m.get(int(nid)) if not n: return label = n.get('name', str(nid)) full = f'{path} / {label}' if path else label lc = len(leaf_ids(nodes, nid)) rows.append({'id': int(nid), 'name': label, 'path': full, 'depth': depth, 'type': n.get('type', ''), 'leaf_count': lc, 'relation': n.get('info', {}).get('relation_label', ''), 'choice': f'{full} [{n.get("type","")}, {lc} vars]'}) for c in n.get('related', []): rec(int(c), full, depth + 1) rec(0, '', 0); return rows def attr_opts(nodes): return [{'label': f'{n["name"]} (id={n["id"]})', 'id': int(n['id']), 'node': n} for n in nodes if n.get('type') == 'attribute'] def agg_opts(nodes, root=True): types = ['aggregation', 'root'] if root else ['aggregation'] return [{'label': f'{n["name"]} (id={n["id"]})', 'id': int(n['id']), 'node': n} for n in nodes if n.get('type') in types] def _centroid(embs): if embs is None or len(embs) == 0: return None c = np.mean(embs, axis=0) norm = np.linalg.norm(c) return c / norm if norm > 0 else c # ───────────────────────────────────────────────────────────────────────────── # [F4] FACET-GUIDED SUB-SPLITTING HELPERS [CAS][HIE] # # Castanet (Stoica & Hearst, 2007): "a single hierarchy conflates dimensions # and produces hard-to-label, counter-intuitive sub-hierarchies." # These helpers insert Statistic and Condition tiers below each concept node, # using the _facet_stat and _facet_cond columns pre-computed by # precompute_stat_cond_facets(). Only splits when ≥2 valid sub-groups exist # with ≥2 variables each — consistent with HiExpan's coherence requirement. # No hardcoding: the column values (Mean/SD/Median, 0/4/12/...) come from data. # ───────────────────────────────────────────────────────────────────────────── _MIN_FACET_GROUP = 2 # minimum variables per facet sub-group def _do_facet_subsplit(sub_can, parent_id, current_path, nodes, leaf_to_id, ensure_path_fn): """ [F4][CAS] Facet sub-split by _facet_cond (numeric condition) only. The statistic tier is no longer inserted here — it came from a hardcoded statistic vocabulary and is now produced data-drivenly by _nest_by_measure(). Kept defensive: if a legacy _facet_stat column is present it is still honoured, but precompute_stat_cond_facets() no longer produces one. """ # A facet tier that merely repeats the parent concept label (e.g. a "Total" # statistic under a "Total" concept) is redundant — skip it. _parent_lbl = str(current_path[-1]).lower() if current_path else '' def _dups_parent(name): nl = str(name).lower().strip() return bool(nl) and (nl == _parent_lbl or nl in _parent_lbl or _parent_lbl in nl) if '_facet_stat' in sub_can.columns: stat_groups = {} for sv in sub_can['_facet_stat'].fillna('').unique(): if str(sv) in ('', 'nan') or _dups_parent(sv): continue sg = sub_can[sub_can['_facet_stat'] == sv] if len(sg) >= _MIN_FACET_GROUP: stat_groups[sv] = sg if len(stat_groups) >= 2: # Identify leftover rows not in any valid stat group grouped_idx = pd.concat(stat_groups.values()).index if stat_groups else pd.Index([]) leftover = sub_can[~sub_can.index.isin(grouped_idx)] for sv, sg in stat_groups.items(): stat_pid = ensure_path_fn(current_path + [str(sv)], relation='is_statistic_of') _do_cond_subsplit(sg, stat_pid, current_path + [str(sv)], nodes, leaf_to_id, ensure_path_fn) # Leftover variables (no valid stat value) go directly under parent for _, row in leftover.iterrows(): add_child(nodes, parent_id, leaf_to_id[row['_leaf_id']]) return # No valid stat split — try cond split at this level directly _do_cond_subsplit(sub_can, parent_id, current_path, nodes, leaf_to_id, ensure_path_fn) def _do_cond_subsplit(sub_can, parent_id, current_path, nodes, leaf_to_id, ensure_path_fn): """ [F4][CAS] Split by _facet_cond (numeric condition: delay/session/timepoint). Castanet (Stoica & Hearst, NAACL 2007) treats a condition as a valid facet when it has ≥2 distinct values, but HiExpan (Shen et al., KDD 2018) sibling coherence requires each resulting group to hold ≥2 variables — a node with a single child is not a coherent group. [FIX8] Earlier code (the reverted [FIX3] rule) allowed groups of size 1 whenever ≥3 distinct condition values existed, to expose 0/4/12-second delay tiers. On AI-MIND this produced one aggregation node per delay value, each wrapping a SINGLE variable (e.g. `Standard Deviation > 0 > DMSL0SD`): 53.7% singleton nodes and meaningless bare-digit labels. We now require ≥2 variables per condition group unconditionally; variables whose condition value is unique fall through and attach directly to the statistic node, keeping siblings together instead of fragmenting them. This is both more faithful to HiExpan and removes the over-split that the granularity-tolerant set-overlap metric flagged. """ if '_facet_cond' in sub_can.columns: # Count all distinct non-empty condition values in this sub-group all_cond_vals = [cv for cv in sub_can['_facet_cond'].fillna('').unique() if str(cv) not in ('', 'nan')] # [FIX8] Require ≥2 variables per condition group (HiExpan sibling coherence). min_size = _MIN_FACET_GROUP cond_groups = {} for cv in all_cond_vals: cg = sub_can[sub_can['_facet_cond'] == cv] if len(cg) >= min_size: cond_groups[cv] = cg if len(cond_groups) >= 2: grouped_idx = pd.concat(cond_groups.values()).index if cond_groups else pd.Index([]) leftover = sub_can[~sub_can.index.isin(grouped_idx)] for cv, cg in cond_groups.items(): cond_pid = ensure_path_fn(current_path + [str(cv)], relation='has_condition') for _, row in cg.iterrows(): add_child(nodes, cond_pid, leaf_to_id[row['_leaf_id']]) for _, row in leftover.iterrows(): add_child(nodes, parent_id, leaf_to_id[row['_leaf_id']]) return # No facet split possible — attach directly for _, row in sub_can.iterrows(): add_child(nodes, parent_id, leaf_to_id[row['_leaf_id']]) # ───────────────────────────────────────────────────────────────────────────── # MAIN HIERARCHY BUILDER [GON][TAX][HIE][CAS] # # Algorithm: # 1. Create leaf nodes (all variables) # 2. Group by top-level _group_path (task/domain — structural backbone) # 3. For each group: embed variables → agglomerative cluster → for each cluster # compute centroid → score N×M against concept table → assign best label # 4. [F4] For each concept cluster: facet sub-split by Statistic → Condition # 5. Store concept assignment back on each variable in can # ───────────────────────────────────────────────────────────────────────────── def _noun_phrases(text, max_words=4): """ Grammatical noun phrases via NLTK POS tagging (used when USE_NOUN_PHRASES=True). Returns [] if NLTK / the tagger is unavailable, so the caller falls back to n-grams. Phrases are contiguous runs of adjectives/nouns up to max_words long. """ try: import nltk for _pkg in ('averaged_perceptron_tagger', 'punkt'): try: nltk.data.find(f'taggers/{_pkg}' if 'tagger' in _pkg else f'tokenizers/{_pkg}') except LookupError: nltk.download(_pkg, quiet=True) toks = nltk.word_tokenize(str(text)) tags = nltk.pos_tag(toks) except Exception: return [] phrases, cur = [], [] for w, t in tags: if t.startswith('NN') or t.startswith('JJ'): cur.append(w) if len(cur) > max_words: cur = cur[-max_words:] else: if len(cur) >= 1: phrases.append(' '.join(cur)) cur = [] if cur: phrases.append(' '.join(cur)) return [p for p in phrases if len(p) >= 3] def _keybert_label(member_texts, cluster_centroid, embedder, ancestor_words=None, corpus_centroid=None, used_labels=None, max_words=4, gen_weight=0.0, diversity=KEYBERT_DIVERSITY, cap=500): """ KeyBERT-style extractive labeller. Extract candidate phrases from the cluster's DESCRIPTIONS, embed them, and pick by: score = (1 − diversity)·cos(phrase, cluster_centroid) − diversity ·cos(phrase, mean candidate phrase) # MMR redundancy With diversity=0 this is plain cosine-to-centroid (argmax relevance). When USE_CTFIDF=True the relevance is modulated by corpus IDF so boilerplate (low IDF) is suppressed. Candidates come from noun phrases (USE_NOUN_PHRASES=True) or n-grams. Extractive — never hallucinates a label. Returns a title-cased string. """ ancestor_words = ancestor_words or set() used = {str(u).lower() for u in (used_labels or [])} cand = set() for t in member_texts: raw = re.sub(r'\([^)]*\)', ' ', str(t)) # drop parentheticals nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else [] if nps: for p in nps: toks = [w for w in p.lower().split() if w not in _STOP and w not in ancestor_words] if toks: cand.add(' '.join(toks)) else: toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower()) if w not in _STOP and w not in ancestor_words] for nlen in range(1, max_words + 1): for i in range(len(toks) - nlen + 1): cand.add(' '.join(toks[i:i + nlen])) # Junk filter: drop used labels, pure-number phrases, immediately-repeated words. cand = [c for c in cand if len(c) >= 4 and c.lower() not in used and not c.replace(' ', '').isdigit() and not re.search(r'\b(\w+)\s+\1\b', c.lower())] if not cand: return '' cand = cand[:cap] embs = np.asarray(embedder.encode(cand), dtype=float) sims = cosine_similarity([cluster_centroid], embs)[0] # relevance if USE_CTFIDF and _CORPUS_IDF: mx = max(_CORPUS_IDF.values()) or 1.0 idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), mx) / mx) for c in cand]) sims = sims * (0.5 + 0.5 * idf) if gen_weight and corpus_centroid is not None: sims = sims - gen_weight * cosine_similarity([corpus_centroid], embs)[0] if diversity > 0 and len(embs) > 1: # MMR penalty generic = cosine_similarity(embs.mean(axis=0, keepdims=True), embs)[0] score = (1.0 - diversity) * sims - diversity * generic else: score = sims return cand[int(np.argmax(score))].title() def _keybert_candidates(member_texts, ancestor_words=None, used_labels=None, max_words=3, cap=500): """ Extract the KeyBERT CANDIDATE phrases from a cluster's member descriptions — the same generation logic as _keybert_label but returns the full candidate list (un-ranked) so the caller can score them with the title-seeded scorer. Phrases are noun phrases (USE_NOUN_PHRASES=True) or n-grams, with ancestor/task words, pure numbers, used labels and immediate repeats filtered out. """ ancestor_words = ancestor_words or set() used = {str(u).lower() for u in (used_labels or [])} block = _STOP | ancestor_words | _LABEL_BOILERPLATE # boilerplate/web/Likert tokens out cand = set() for t in member_texts: raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t))) nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else [] if nps: for p in nps: toks = [w for w in p.lower().split() if w not in block] if toks: cand.add(' '.join(toks)) else: toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower()) if w not in block] for nlen in range(1, max_words + 1): for i in range(len(toks) - nlen + 1): cand.add(' '.join(toks[i:i + nlen])) def _ok(c): words = c.split() if len(c) < 4 or c.lower() in used or c.replace(' ', '').isdigit(): return False if re.search(r'\b(\w+)\s+\1\b', c.lower()): # adjacent word repeat return False if len(words) == 4 and words[:2] == words[2:]: # phrase repeat "x y x y" return False if len(words) == 1 and (len(c) <= 3 or _is_acronym(c)): # bare fragment/acronym return False return True return [c for c in cand if _ok(c)][:cap] def _concept_title(text): """ Extract the human-written concept TITLE from a metadata description. Data-dictionary descriptions read `Title: long definition sentence`. The title is the colon-segment immediately BEFORE the longest segment (the definition). Structural, no vocabulary — works whether the title is segment 0 ("DMS Correct Latency SD: The standard deviation...") or later ("KEY: SWM Between errors: The number of times..."). Returns the title only. """ t = str(text).strip() if not t: return '' segs = t.split(':') if len(segs) < 2: title = t else: lens = [len(s.strip()) for s in segs] def_i = max(range(1, len(segs)), key=lambda i: lens[i]) # longest = definition title = segs[def_i - 1].strip() or t # A genuine concept title is short. If what we extracted is a full SENTENCE # (e.g. MOT has no "Title: def" structure — just prose), it is not a title; # return '' so the caller falls back to the embedding concept scorer instead of # labelling from a sentence. Length-based, no vocabulary. if len(re.findall(r'[A-Za-z]+', title)) > 9: return '' return title def _title_cluster_label(member_titles, sibling_title_lists, ancestor_words=None, max_words=4, used_labels=None): """ Label a cluster from the concept TITLES its members share, chosen CONTRASTIVELY against sibling clusters (tree-based local-IDF). Titles are concept-dense (no boilerplate definition text), so this returns the genuine shared concept — "Correct Latency", "Standard Deviation" — never "Calculated Assessed Trials". Strips ancestor/task tokens and avoids repeating a parent or a used sibling. Returns a title-cased label or ''. """ ancestor_words = {w.lower() for w in (ancestor_words or [])} used_labels = {str(u).lower() for u in (used_labels or [])} def _phrases(title): t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', title.lower())) # drop parens + URLs toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t) if w not in _STOP and w not in ancestor_words and w not in _LABEL_BOILERPLATE] # web/Likert out out = set() for nlen in range(1, max_words + 1): for i in range(len(toks) - nlen + 1): out.add(' '.join(toks[i:i + nlen])) return out M = len(member_titles) if M == 0: return '' member_df = defaultdict(int) for ph_set in (_phrases(t) for t in member_titles): for ph in ph_set: member_df[ph] += 1 sib_flat = [t for lst in (sibling_title_lists or []) for t in lst] S = max(1, len(sib_flat)) sib_df = defaultdict(int) for t in sib_flat: for ph in _phrases(t): sib_df[ph] += 1 best, best_score = '', -1.0 for ph, mdf in member_df.items(): if mdf < 2: # must be shared by ≥2 members continue words = ph.split() if all(w in ancestor_words for w in words): # don't repeat the parent continue if ph in used_labels: # don't repeat a sibling continue score = (mdf / M - sib_df.get(ph, 0) / S) * (1.0 + 0.25 * (len(words) - 1)) if score > best_score: best_score, best = score, ph return best.title() if best else '' def _raw_title(text): """Title segment, keeping parentheticals (the error TYPE lives in them).""" segs = str(text).split(':') if len(segs) < 2: return str(text).strip() lens = [len(s.strip()) for s in segs] di = max(range(1, len(segs)), key=lambda i: lens[i]) return segs[di - 1].strip() def _label_from_own_title(title, ancestor_words, max_words=4): """[Fix5] Label a singleton variable from its OWN title (minus ancestor/task words and parentheticals). Returns '' for sentence-like / empty titles.""" t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(title).lower())) toks = [w for w in re.findall(r'[a-z][a-z\-]+', t) if w not in _STOP and w not in ancestor_words and w not in _LABEL_BOILERPLATE] if not toks or len(toks) > 7: # >7 words ⇒ prose, not a concept title return '' return ' '.join(toks[:max_words]).title() def _strip_leading_prose(label): """Drop a leading word that is a verb but NOT a noun in WordNet (e.g. 'Include Shapes' → 'Shapes', from a prose description) — data-driven, no word list. Keeps qualifiers like 'Correct' only when applied where appropriate (sub-split labels).""" if not _WORDNET_AVAILABLE: return label words = label.split() while len(words) > 1: w = words[0].lower() try: if wn.synsets(w, pos=wn.VERB) and not wn.synsets(w, pos=wn.NOUN): words = words[1:] else: break except Exception: break return ' '.join(words) def _subsplit_concept_by_title(nodes): """ [Fix2] Split a concept node's leaves by a distinctive TITLE descriptor when they fall into ≥2 groups of ≥2 — e.g. DMS 'Error' → {Incorrect Colour, Incorrect Pattern, Distractor} (the type is in the title parenthetical). Greedy prefers LONGER descriptors so "incorrect colour"/"incorrect pattern" win over the bare "incorrect". The ≥2-groups-of-≥2 gate is what stops a delay over-split: delay variants form only ONE group ("second delay") so they are never split out. No hardcoded vocabulary. """ pm = build_parent_map(nodes) def _anc_words(nid): w, x = set(), nid m = nmap(nodes) while True: nd = m.get(x) if nd and nd.get('type') != 'root': w |= set(re.findall(r'[a-z]{3,}', str(nd.get('name', '')).lower())) if x not in pm: break x = pm[x] return w for node in [n for n in nodes if n.get('type') == 'aggregation']: nid = int(node['id']) m = nmap(nodes) leaf_children = [int(c) for c in node.get('related', []) if m.get(int(c), {}).get('type') == 'attribute'] if len(leaf_children) < 4: continue aw = _anc_words(nid) # Tokens present in (nearly) ALL leaves are parent-level, not sub-categories # — e.g. "sd" under a Standard Deviation node. Excluding them stops the # delay over-split (without them, delay variants form only one group). nL = len(leaf_children) tok_df = defaultdict(int) for cid in leaf_children: ln = m[cid] title = _raw_title(ln.get('semantic_desc', ln.get('desc', ''))) for w in set(re.findall(r'[a-z][a-z\-]+', title.lower())): if w not in _STOP and w not in aw: tok_df[w] += 1 common = {w for w, c in tok_df.items() if c > 0.7 * nL} p2l = defaultdict(set) for cid in leaf_children: ln = m[cid] title = _raw_title(ln.get('semantic_desc', ln.get('desc', ''))) toks = [w for w in re.findall(r'[a-z][a-z\-]+', title.lower()) if w not in _STOP and w not in aw and w not in common] phs = set() for nl in (3, 2, 1): for i in range(len(toks) - nl + 1): phs.add(' '.join(toks[i:i + nl])) for p in phs: p2l[p].add(cid) covered, groups = set(), [] for p, cids in sorted(p2l.items(), key=lambda kv: (-len(kv[0].split()), -len(kv[1]))): avail = cids - covered if len(avail) >= 2: groups.append((p, avail)); covered |= avail if len(groups) < 2: continue for p, cids in groups: sub_lbl = _strip_leading_prose(p.title()) or p.title() snid = next_id(nodes) nodes.append(make_agg(snid, sub_lbl, desc=f'Sub-group: {sub_lbl}', relation_type='belongs_to')) add_child(nodes, nid, snid) for cid in cids: remove_child(nodes, nid, cid) add_child(nodes, snid, cid) return nodes def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder, concept_table, concept_embs, ensure_path_fn, n_clusters_max, can, ref_centroids=None, corpus_centroid=None): """ Cluster variables in tdf and assign concept labels from concept_table. Modifies nodes in-place. Updates can['_concept_label'] for each variable. Semantic label selection: [GON][CAS] score_concepts_for_cluster ranks candidates by embedding coverage, sibling contrast, and dataset-wide specificity (semantic IDF) using the member / sibling / group (ref_centroids) embeddings passed in. [HIE] assign_concept_label rejects labels that paraphrase an ancestor or a chosen sibling (semantic dedup via embeddings, not word lists). [HIE] Singleton clusters (n=1) attach directly to parent — no group node. """ sem_col = '_semantic_text' if '_semantic_text' in tdf.columns else '_text' texts = tdf[sem_col].fillna('').tolist() # Concept TITLES (pre-definition) — clean label vocabulary, never boilerplate. titles = [_concept_title(t) for t in texts] n = len(tdf) if n == 0: return # Ancestor names + their embeddings for semantic parent-duplication filter [HIE] ancestor_names = list(path_prefix) ancestor_embs = (embedder.encode(ancestor_names) if ancestor_names else None) # [ChangeC] Discover top-level task tokens from the full dataset (data-driven) _top_level_tasks: set = set() if '_group_path' in can.columns: for _gp in can['_group_path'].dropna().astype(str): _f = _gp.split(' > ')[0].strip() if _f and _f.lower() not in ('ungrouped', 'nan', ''): _top_level_tasks.add(_f.lower()) _aw_base = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks if n < 3 or concept_embs is None or len(concept_table) == 0: # Too few variables to cluster — label each from its own title [Fix5], or # KeyBERT over its description when no title exists. ensure_path merges it # into an existing concept of the same name. pid = ensure_path_fn(path_prefix) _small = embedder.encode(texts) if texts else None for i, (_, row) in enumerate(tdf.iterrows()): lbl = _label_from_own_title(titles[i], _aw_base) if not lbl and _small is not None: lbl = _keybert_label([texts[i]], _small[i], embedder, ancestor_words=_aw_base, used_labels=set(), max_words=2, gen_weight=0.3, diversity=KEYBERT_DIVERSITY) tgt = ensure_path_fn(path_prefix + [lbl]) if lbl and lbl.lower() not in \ {a.lower() for a in ancestor_names} else pid add_child(nodes, tgt, leaf_to_id[row['_leaf_id']]) return # Embed variables var_embs = embedder.encode(texts) # Centroid of THIS task/subgroup — reference for the scorer's `home` signal. own_group_centroid = _centroid(var_embs) # Choose number of clusters adaptively n_clust = min(n_clusters_max, max(2, n // 3), n) # Agglomerative clustering on variable embeddings [TAX][GON] # [FIX7][GON][TAX] Code-family cohesion bias: # Variables sharing the same _code_family (e.g. DMSL, SWMBE) are structurally # related by the instrument's own naming convention. Reduce their pairwise # cosine distance by a factor of 0.80 so the clusterer prefers to keep them # together. Rationale — Gonçalves ESWC 2019: structural prefix affinity; # Taxonomizer IEEE TVCG 2019: compound labels align with code morphology. # Factor 0.80 is a cohesion weight, not a fixed threshold — it is applied # multiplicatively so the relative ordering of similarities is preserved. try: dist = cosine_distances(var_embs).astype(float) np.fill_diagonal(dist, 0.0) # Apply code-family cohesion if _code_family is available if '_code_family' in tdf.columns: families = tdf['_code_family'].fillna('').astype(str).tolist() _COHESION_FACTOR = 0.80 # same-family pairs: distance × 0.80 (pulled together) for ii in range(n): for jj in range(ii + 1, n): if families[ii] and families[ii] == families[jj]: dist[ii, jj] *= _COHESION_FACTOR dist[jj, ii] *= _COHESION_FACTOR c_lbls = AgglomerativeClustering(n_clusters=n_clust, metric='precomputed', linkage='average').fit_predict(dist) except Exception: c_lbls = np.zeros(n, dtype=int) n_clust = 1 rows_list = list(tdf.iterrows()) # [C3] Pre-compute all cluster text groups for discriminative TF-IDF [GON][TAX] # and each cluster's centroid (used as sibling-contrast references [CAS]). all_cluster_texts = [] all_cluster_titles = [] all_centroids = [] for k in range(n_clust): mask = c_lbls == k cluster_idxs = [i for i, m in enumerate(mask) if m] all_cluster_texts.append([texts[i] for i in cluster_idxs] if cluster_idxs else []) all_cluster_titles.append([titles[i] for i in cluster_idxs] if cluster_idxs else []) all_centroids.append(_centroid(var_embs[mask]) if cluster_idxs else None) # Track used sibling labels (string) and their embeddings (semantic dedup) [TAX][GON] used_sibling_labels = set() sibling_label_embs = [] parent_pid = ensure_path_fn(path_prefix) # get parent node id upfront for k in range(n_clust): mask = c_lbls == k cluster_idxs = [i for i, m in enumerate(mask) if m] if not cluster_idxs: continue cluster_texts_k = [texts[i] for i in cluster_idxs] cluster_emb = _centroid(var_embs[mask]) # [Fix5] Singleton: label it from its OWN title and attach under that concept # (ensure_path merges it into an existing same-named concept if one exists), # instead of dumping it unclassified under the task. if len(cluster_idxs) == 1: _, row = rows_list[cluster_idxs[0]] lbl = _label_from_own_title(titles[cluster_idxs[0]], _aw_base) src = 'singleton_title' if not lbl and cluster_emb is not None: lbl = _keybert_label([cluster_texts_k[0]], cluster_emb, embedder, ancestor_words=_aw_base, used_labels=used_sibling_labels, max_words=2, gen_weight=0.3, diversity=KEYBERT_DIVERSITY) src = 'singleton_keybert' if lbl and lbl.lower() not in {a.lower() for a in ancestor_names}: tgt = ensure_path_fn(path_prefix + [lbl], relation='belongs_to') can.at[row.name, '_concept_label'] = lbl else: tgt = parent_pid can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root' add_child(nodes, tgt, leaf_to_id[row['_leaf_id']]) can.at[row.name, '_concept_score'] = 0.0 can.at[row.name, '_concept_source'] = src continue if cluster_emb is not None: # Sibling centroids = every OTHER cluster in this parent (contrast ref) [CAS] sibling_centroids = [all_centroids[j] for j in range(n_clust) if j != k and all_centroids[j] is not None] scores = score_concepts_for_cluster( cluster_emb, concept_embs, concept_table, cluster_texts_k, n_total_vars=len(can), member_embs=var_embs[mask], sibling_centroids=np.array(sibling_centroids) if sibling_centroids else None, ref_centroids=ref_centroids, # all top-level task centroids corpus_centroid=corpus_centroid, own_group_centroid=own_group_centroid, # current task → home signal ) else: scores = [] # ── TITLE-SEEDED LABEL SELECTION (Guided KeyBERT) ───────────────────── # The label is FORMED FROM THE DESCRIPTIONS: candidates are KeyBERT phrases # extracted from the cluster's member descriptions (+ scored concept-table # entries). The pre-colon TITLE does NOT override — it is a ranking SEED: # score = α·cos(cand, cluster centroid) # description fit # + β·cos(cand, title embedding) # title INFLUENCE (LABEL_W_TITLE) # + γ·contrast(vs siblings) # + δ·external grounding # So the displayed label is always a description-derived phrase, pulled toward # the human-canonical title phrasing. Set LABEL_W_TITLE=0 for a pure-description # ablation. The title phrase is also added as ONE candidate so a clean title can # still win on merit (it is usually present verbatim in the descriptions anyway). ancestor_words = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks member_titles_k = [titles[i] for i in cluster_idxs] sibling_title_lists = [all_cluster_titles[j] for j in range(n_clust) if j != k] sibling_texts = [all_cluster_texts[j] for j in range(n_clust) if j != k] # Pre-colon title → used only as the SEED ANCHOR (and one candidate), never a # direct override. title_label = _title_cluster_label(member_titles_k, sibling_title_lists, ancestor_words=ancestor_words, used_labels=used_sibling_labels) title_emb = (embedder.encode([title_label])[0] if title_label else None) # Candidate phrases drawn ONLY from the cluster's DESCRIPTIONS (KeyBERT) plus # the pre-colon title. External ontology sources (Cognitive Atlas / Wikidata / # WordNet / PubMed) are deliberately NOT candidates — per design they inform the # embedding space / semantic understanding only, and must never name a node. kb_cands = _keybert_candidates(cluster_texts_k, ancestor_words=ancestor_words, used_labels=used_sibling_labels, max_words=3) pool_src = [(c, 'keybert') for c in kb_cands] if title_label: pool_src.append((title_label, 'description_title')) # Dedup; title's source tag takes priority over keybert when the phrase matches. seen_pool = {} for lbl, src in pool_src: key = lbl.lower() if key not in seen_pool or src == 'description_title': seen_pool[key] = (lbl, src) pool = [v[0] for v in seen_pool.values()] pool_srcs = [v[1] for v in seen_pool.values()] keybert_label = kb_cands[0] if kb_cands else '' # for fallback only candidate_scores = [] if pool and cluster_emb is not None: cand_embs = np.asarray(embedder.encode(pool), dtype=float) relevance = cosine_similarity([cluster_emb], cand_embs)[0] # c-TF-IDF: down-weight dataset-wide boilerplate (low corpus IDF) so generic # phrases ("test", "description", "measure", "scores") lose to distinctive ones. if USE_CTFIDF and _CORPUS_IDF: _mx = max(_CORPUS_IDF.values()) or 1.0 _idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), _mx) / _mx) for c in pool]) relevance = relevance * (0.5 + 0.5 * _idf) if sibling_centroids: sib_sim = cosine_similarity(cand_embs, np.asarray(sibling_centroids, dtype=float)).max(axis=1) contrast = np.clip(relevance - sib_sim, 0.0, 1.0) else: contrast = np.zeros(len(pool)) # Title SEED: cosine of each description-derived candidate to the title. if title_emb is not None: title_sim = cosine_similarity(cand_embs, [title_emb])[:, 0] else: title_sim = np.zeros(len(pool)) for i, cand in enumerate(pool): hyb = (LABEL_W_RELEVANCE * float(relevance[i]) + LABEL_W_TITLE * float(title_sim[i]) + LABEL_W_CONTRAST * float(contrast[i])) candidate_scores.append({ 'label': cand, 'score': hyb, 'embedding_sim': float(relevance[i]), 'coverage': float(relevance[i]), 'contrast': float(contrast[i]), 'specificity': 0.0, 'string_sim': float(title_sim[i]), # title seed alignment 'source': pool_srcs[i], 'broader_relations': [], '_emb': cand_embs[i], }) candidate_scores.sort(key=lambda x: -x['score']) fallback_label = (title_label or keybert_label or get_discriminative_tfidf_label(cluster_texts_k, sibling_texts) or f'Group {k+1}') label, provenance = assign_concept_label( candidate_scores, fallback=fallback_label, min_score=0.0, ancestor_names=ancestor_names, used_sibling_labels=used_sibling_labels, top_level_tasks=_top_level_tasks, ancestor_embs=ancestor_embs, sibling_label_embs=sibling_label_embs, ) # Skip the node only when there is truly NO concept name (empty title, no # scored candidate → a bare "Group k"). Title labels are trusted and kept. if (not title_label) and (not candidate_scores) and label.startswith('Group '): for ci in cluster_idxs: _, row = rows_list[ci] add_child(nodes, parent_pid, leaf_to_id[row['_leaf_id']]) can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root' can.at[row.name, '_concept_score'] = 0.0 can.at[row.name, '_concept_source'] = 'weak_label_direct' continue # WordNet hypernym — ONLY when there is no title concept name. if (not title_label) and (label == fallback_label or label.lower() in {a.lower() for a in ancestor_names}): wn_label = wordnet_hypernym_fallback(cluster_texts_k, excluded_names=ancestor_names) if wn_label: label = wn_label provenance['node_label'] = label provenance['source_evidence'] = ['wordnet_hypernym'] # Guarantee distinct siblings: qualify a colliding label with a distinguishing # word from this cluster's own titles (never emit a duplicate sibling). if label.lower() in used_sibling_labels: from collections import Counter as _Counter _cnt = _Counter() for _tt in member_titles_k: for _w in re.findall(r'[a-z]{3,}', _tt.lower()): if _w not in _STOP and _w not in ancestor_words and _w not in label.lower(): _cnt[_w] += 1 _extra = next((w for w, _ in _cnt.most_common() if f'{label} {w}'.lower() not in used_sibling_labels), None) if _extra: label = f'{label} {_extra.title()}' else: _i = 2 while f'{label} {_i}'.lower() in used_sibling_labels: _i += 1 label = f'{label} {_i}' used_sibling_labels.add(label.lower()) # register for sibling dedup (string + emb) try: sibling_label_embs.append(embedder.encode([label])[0]) except Exception: pass pid = ensure_path_fn(path_prefix + [label], relation='belongs_to', provenance=provenance) # Store concept assignment on can (needed by Castanet facets later). # Provenance reflects the HYBRID winner (title / keybert / concept_table), # not the old semantic-only scorer — so the exported labels CSV is accurate. for ci in cluster_idxs: _, row = rows_list[ci] can.at[row.name, '_concept_label'] = label can.at[row.name, '_concept_score'] = provenance.get('confidence', 0.0) can.at[row.name, '_concept_source'] = (provenance.get('source_evidence') or ['fallback'])[0] # Attach the cluster's variables directly under the concept node. The former # Statistic/Condition facet sub-split is removed: the statistic tier came from # a hardcoded vocabulary (now produced data-drivenly by _nest_by_measure), and # the numeric Condition tier produced bare-digit nodes (0/4/12) that inflated # singleton%/n_agg and moved the tree away from gold. Castanet's Condition facet # still exists as a separate parallel view via detect_facets() — not a tier. for ci in cluster_idxs: _, row = rows_list[ci] add_child(nodes, pid, leaf_to_id[row['_leaf_id']]) def _remove_phrase(tokens, phrase_tokens): """Remove the first contiguous occurrence of phrase_tokens from tokens.""" nlen = len(phrase_tokens) for i in range(len(tokens) - nlen + 1): if tokens[i:i + nlen] == phrase_tokens: return tokens[:i] + tokens[i + nlen:] return [t for t in tokens if t not in phrase_tokens] def _nest_by_measure(nodes): """ [Fix2] Group concept-sibling nodes that SHARE a measure phrase into a Measure parent, renaming each child to its residual statistic. Example under DMS: Mean Correct Latency, Median Correct Latency, Correct Latency Standard Deviation → Correct Latency → { Mean, Median, Standard Deviation } The measure is simply the phrase shared by ≥2 siblings; the statistic is what remains after removing it. No hardcoded statistic list. Adds Measure→Statistic depth only where the data supports it; other concepts stay flat. """ pm = build_parent_map(nodes) task_ids = [int(n['id']) for n in nodes if n.get('type') == 'aggregation' and pm.get(int(n['id'])) == 0] for task_id in task_ids: while True: m = nmap(nodes) task = m.get(task_id) if not task: break child_ids = [int(c) for c in task.get('related', []) if m.get(int(c), {}).get('type') == 'aggregation'] if len(child_ids) < 3: break labels = {cid: str(m[cid]['name']) for cid in child_ids} phrase_children = defaultdict(set) for cid, lbl in labels.items(): toks = [w for w in re.findall(r'[a-z][a-z\-]+', lbl.lower()) if w not in _STOP] for nlen in (3, 2): for i in range(len(toks) - nlen + 1): phrase_children[' '.join(toks[i:i + nlen])].add(cid) cand = [(ph, cids) for ph, cids in phrase_children.items() if len(cids) >= 2] if not cand: break ph, grouped = max(cand, key=lambda x: (len(x[1]), len(x[0].split()))) ptoks = ph.split() nid = next_id(nodes) nodes.append(make_agg(nid, ph.title(), desc=f'Measure: {ph.title()}', relation_type='belongs_to')) add_child(nodes, task_id, nid) for cid in list(grouped): remove_child(nodes, task_id, cid) ctoks = [w for w in re.findall(r'[a-z][a-z\-]+', labels[cid].lower()) if w not in _STOP] resid = _remove_phrase(ctoks, ptoks) if len(resid) == 1: # A lone modifier ("Double", "Within") reads poorly on its own — # qualify it with the measure's most-informative word (longest; # ties → last), e.g. "Double" → "Double Errors". No hardcoding. mword = max(ptoks, key=lambda w: (len(w), ptoks.index(w))) if mword not in resid: resid = resid + [mword] if resid: m[cid]['name'] = ' '.join(resid).title() add_child(nodes, nid, cid) else: # child label == measure → dissolve it, leaves go under new parent for leaf in list(m[cid].get('related', [])): add_child(nodes, nid, int(leaf)) nodes[:] = [x for x in nodes if int(x['id']) != cid] return nodes def _singular(w): return w[:-1] if (len(w) > 3 and w.endswith('s') and not w.endswith('ss')) else w def _nest_by_category(nodes): """ [Fix3] Add a Measure-CATEGORY tier: group a task's concept-sibling nodes by their HEAD noun (last significant word, singularised) when ≥2 share it, e.g. Total Correct, Percent Correct → Correct → { Total, Percent } Total Errors, Probability Error → Errors → { Total, Probability } The HEAD is used (not any shared word) specifically so "Correct Latency" (head = Latency) is NOT pulled under "Correct". Children are renamed to the residual (label minus the head). No hardcoded category list. """ pm = build_parent_map(nodes) task_ids = [int(n['id']) for n in nodes if n.get('type') == 'aggregation' and pm.get(int(n['id'])) == 0] for task_id in task_ids: m = nmap(nodes) task = m.get(task_id) if not task: continue child_ids = [int(c) for c in task.get('related', []) if m.get(int(c), {}).get('type') == 'aggregation'] if len(child_ids) < 3: continue head_groups, head_forms, labels = defaultdict(list), defaultdict(list), {} for cid in child_ids: lbl = str(m[cid]['name']) labels[cid] = lbl words = [w for w in re.findall(r'[a-z][a-z\-]+', lbl.lower()) if w not in _STOP] if not words: continue sg = _singular(words[-1]) head_groups[sg].append(cid) head_forms[sg].append(words[-1]) for sg, cids in list(head_groups.items()): if len(cids) < 2: continue cat = max(head_forms[sg], key=len).title() # nicest surface form nid = next_id(nodes) nodes.append(make_agg(nid, cat, desc=f'Category: {cat}', relation_type='belongs_to')) add_child(nodes, task_id, nid) for cid in cids: remove_child(nodes, task_id, cid) ctoks = [w for w in re.findall(r'[a-z][a-z\-]+', labels[cid].lower()) if w not in _STOP] resid = [t for t in ctoks if _singular(t) != sg] if resid: m[cid]['name'] = ' '.join(resid).title() add_child(nodes, nid, cid) else: for leaf in list(m[cid].get('related', [])): add_child(nodes, nid, int(leaf)) nodes[:] = [x for x in nodes if int(x['id']) != cid] m = nmap(nodes) return nodes def _merge_duplicate_concepts(nodes): """ [Fix] Merge aggregation nodes that share the SAME name within the same task (keeping the shallowest), e.g. SWM had a singleton 'Within Errors' AND an 'Errors Boxes > Within Errors' — both become one flat 'Within Errors'. Removes duplicates created when clustering split a concept's variants and #5 / _nest_by_measure labelled them identically. """ pm = build_parent_map(nodes) def depth(nid): d, x = 0, nid while x in pm: x = pm[x]; d += 1 return d def task_of(nid): x = nid while True: p = pm.get(x) if p is None or p == 0: return x x = p groups = defaultdict(list) for n in nodes: if n.get('type') == 'aggregation' and int(n['id']) in pm: groups[(task_of(int(n['id'])), str(n['name']).lower())].append(int(n['id'])) removed = set() for (_t, _name), ids in groups.items(): ids = [i for i in ids if i not in removed] if len(ids) < 2: continue keeper = min(ids, key=depth) m = nmap(nodes) for dup in ids: if dup == keeper: continue for c in list(m[dup].get('related', [])): remove_child(nodes, dup, int(c)); add_child(nodes, keeper, int(c)) # Remove dup from whatever node CURRENTLY references it (not the stale pm — # earlier post-processes may have re-parented it). Leaving a stale ref makes # a dangling child that breaks the Plotly sunburst. for pn in nodes: if dup in [int(x) for x in pn.get('related', [])]: remove_child(nodes, int(pn['id']), dup) removed.add(dup) nodes[:] = [n for n in nodes if int(n['id']) not in removed] # Defensive: drop any child reference to a node that no longer exists. _alive = {int(n['id']) for n in nodes} for n in nodes: n['related'] = [int(x) for x in n.get('related', []) if int(x) in _alive] return nodes def _prune_empty_aggregations(nodes): """Remove aggregation nodes whose subtree contains NO variable (leaf). Empty concept nodes are meaningless AND break the Plotly sunburst: every node gets a min value of 1, so an empty child makes its parent's value < sum(children) and branchvalues='total' refuses to render (blank chart).""" m = nmap(nodes) def has_leaf(nid, seen): if nid in seen: return False seen.add(nid) n = m.get(nid) if not n: return False if n.get('type') == 'attribute': return True return any(has_leaf(int(c), seen) for c in n.get('related', [])) empty = {int(n['id']) for n in nodes if n.get('type') == 'aggregation' and not has_leaf(int(n['id']), set())} if empty: nodes[:] = [n for n in nodes if int(n['id']) not in empty] alive = {int(n['id']) for n in nodes} for n in nodes: n['related'] = [int(c) for c in n.get('related', []) if int(c) in alive] return nodes def _dissolve_facet_singletons(nodes): """ Dissolve FACET tier nodes (Statistic / Condition) that wrap a single variable. A condition or statistic node with exactly one leaf child carries no grouping value — e.g. `Standard Deviation > 0 > DMSL0SD`. We remove such nodes and reattach their single child to the node's parent, keeping siblings together. Scope is deliberately narrow: only nodes whose relation_type is 'has_condition' or 'is_statistic_of' are touched, so genuine single-member CONCEPT nodes that carry a distinctive name are preserved (per the chosen policy). """ _FACET_RELS = {'has_condition', 'is_statistic_of'} changed = True while changed: changed = False pm = build_parent_map(nodes) m = nmap(nodes) for n in list(nodes): if n.get('type') != 'aggregation': continue if n['info'].get('relation_type') not in _FACET_RELS: continue nid = int(n['id']) children = [int(c) for c in n.get('related', [])] # "Single variable" = exactly one child and that child is a leaf attribute. if len(children) == 1 and m.get(children[0], {}).get('type') == 'attribute': parent = pm.get(nid) if parent is None: continue add_child(nodes, parent, children[0]) remove_child(nodes, parent, nid) nodes[:] = [x for x in nodes if int(x['id']) != nid] changed = True break return nodes def build_concept_hierarchy(can, embedder, concept_table, project='metadata_project', n_clusters_per_group=8): """ Build hierarchy using automatic concept label assignment. No hardcoded patterns. Labels come from metadata + external concept table. [GON] N×M alignment · [TAX] leaf=attribute, node=abstract concept · [HIE] task-first """ nodes = [{'id': 0, 'name': project, 'desc': 'Root node', 'type': 'root', 'dtype': 'root', 'isShown': True, 'related': []}] leaf_to_id = {} for i, (_, r) in enumerate(can.iterrows(), start=1): leaf_to_id[r['_leaf_id']] = i nodes.append({ 'id': i, 'name': r['_leaf_label'], 'dtype': r['_dtype'], 'related': [], 'isShown': True, 'type': 'attribute', 'desc': r['_text'], 'semantic_desc': r.get('_semantic_text', r['_text']), 'source_file': r['_source_file'], 'metadata': {'leaf_id': r['_leaf_id'], 'group_path': r['_group_path']}, }) # Embed concept table once for the whole hierarchy build if concept_table: concept_texts = [c['full_text'] for c in concept_table] concept_embs = embedder.encode(concept_texts) else: concept_embs = None # ── Dataset-wide reference embeddings for semantic IDF / specificity [GON] ── # Encode every variable once, then build one centroid per top-level group. # A candidate label that is close to ONE group centroid and far from the rest # is discriminative; one close to ALL of them is boilerplate. corpus_centroid # is the global mean (generic = central). Both are derived purely from data. sem_col_all = '_semantic_text' if '_semantic_text' in can.columns else '_text' # Active domain — used by the hybrid label scorer's external-grounding signal. global _ACTIVE_DOMAIN _ACTIVE_DOMAIN = detect_domain(can) # Corpus IDF over description n-grams — KeyBERT c-TF-IDF distinctiveness weight # (only consulted when USE_CTFIDF=True). Data-derived, dataset-agnostic. global _CORPUS_IDF _CORPUS_IDF = {} try: from sklearn.feature_extraction.text import CountVectorizer as _CV _docs = can[sem_col_all].fillna('').astype(str).tolist() _cv = _CV(ngram_range=(1, 3), binary=True, lowercase=True, token_pattern=r'[a-z][a-z\-]+') _dt = _cv.fit_transform(_docs) _dfa = np.asarray(_dt.sum(axis=0)).ravel(); _N = _dt.shape[0] _CORPUS_IDF = {p: float(np.log((_N + 1) / (_dfa[i] + 1)) + 1.0) for p, i in _cv.vocabulary_.items()} except Exception: _CORPUS_IDF = {} ref_centroids = corpus_centroid = None try: all_var_embs = embedder.encode(can[sem_col_all].fillna('').astype(str).tolist()) corpus_centroid = _centroid(all_var_embs) _tops = can['_group_path'].fillna('Ungrouped').apply( lambda x: str(x).split(' > ')[0].strip() or 'Ungrouped') _cent = [] for g in _tops.unique(): gm = (_tops == g).to_numpy() if gm.sum() >= 1: _cent.append(_centroid(all_var_embs[gm])) ref_centroids = np.array(_cent) if len(_cent) >= 2 else None except Exception: pass path_ids = {} def ensure_path(parts, relation='belongs_to', provenance=None): key = tuple(str(p) for p in parts) if key in path_ids: return path_ids[key] nid = next_id(nodes) path_ids[key] = nid nodes.append(make_agg(nid, parts[-1], desc=f'Concept group: {" > ".join(str(p) for p in parts)}', relation_type=relation, provenance=provenance)) parent = 0 if len(parts) == 1 else ensure_path(parts[:-1]) add_child(nodes, parent, nid) return nid # Group variables by top-level group path work = can.copy() work['_top'] = work['_group_path'].apply( lambda x: str(x).split(' > ')[0].strip() if str(x) not in ('', 'nan', 'Ungrouped') else 'Ungrouped' ) for top_label, tdf in work.groupby('_top', dropna=False, sort=False): top_label = str(top_label) # Check if sub-group paths already exist (level 2+) subgroup_paths = tdf['_group_path'].apply( lambda x: ' > '.join(str(x).split(' > ')[1:]).strip() if len(str(x).split(' > ')) > 1 else '' ) has_subgroups = subgroup_paths.str.strip().str.len().gt(0).any() if has_subgroups: # [C4][CAS] UnaryPenalty: count distinct subgroup paths under this top group. # Castanet: "eliminate a child whose name appears within the parent's name" # and nodes that create unary (1-child) chains weaken the hierarchy. # If a structural column creates only ONE branch under this parent, # it is a pass-through (e.g. "DMS Recommended Standard" under "DMS") # and should be demoted — cluster directly under the top-level node instead. distinct_subpaths = subgroup_paths[subgroup_paths.str.strip().str.len().gt(0)].unique() n_distinct_subpaths = len(distinct_subpaths) if n_distinct_subpaths <= 1: # UnaryPenalty triggered — structural column creates only 1 branch. # Cluster directly under top_label, skip the variant pass-through. [C4] _cluster_and_label( tdf, [top_label], nodes, leaf_to_id, embedder, concept_table, concept_embs, ensure_path, n_clusters_per_group, can, ref_centroids, corpus_centroid ) else: # Multiple distinct subgroups — structural column is meaningful, keep it. for subpath, sdf in tdf.groupby(subgroup_paths, dropna=False, sort=False): subpath = str(subpath).strip() if subpath: parts = [top_label] + [p.strip() for p in subpath.split(' > ') if p.strip()] else: parts = [top_label] # Cluster and label within this subgroup _cluster_and_label( sdf, parts, nodes, leaf_to_id, embedder, concept_table, concept_embs, ensure_path, n_clusters_per_group, can, ref_centroids, corpus_centroid ) else: # No pre-existing subgroups — cluster all variables under this top group _cluster_and_label( tdf, [top_label], nodes, leaf_to_id, embedder, concept_table, concept_embs, ensure_path, n_clusters_per_group, can, ref_centroids, corpus_centroid ) # [Fix2] Nest statistics under their shared measure (Correct Latency → Mean/…), # then merge same-named duplicates, THEN sub-split the consolidated concept # nodes by a distinctive title descriptor (Error → Incorrect Colour / …). Order # matters: sub-splitting last avoids the merge re-parenting sub-nodes oddly. _nest_by_measure(nodes) _merge_duplicate_concepts(nodes) _subsplit_concept_by_title(nodes) # Remove empty concept nodes (no variables) — meaningless and they break the # branchvalues='total' sunburst (parent value < sum of children → blank render). _prune_empty_aggregations(nodes) # Dissolve 1-variable Statistic/Condition facet nodes (no grouping value). _dissolve_facet_singletons(nodes) _prune_empty_aggregations(nodes) # NOTE: a head-noun "category" tier (Errors/Correct) was tried and reverted — # it regressed setOverlap (0.914→0.836: mis-grouping) and added depth beyond gold. # _nest_by_category() is kept defined but intentionally NOT called. _alive = {int(n['id']) for n in nodes} for n in nodes: n['related'] = [x for x in dict.fromkeys(int(x) for x in n.get('related', [])) if x in _alive] # dedup + drop dangling refs (sunburst safety) return nodes # ───────────────────────────────────────────────────────────────────────────── # HIEXPAN-INSPIRED REFINEMENT [HIE] # ───────────────────────────────────────────────────────────────────────────── def _leaf_texts(nodes, nid, text_cache): return [text_cache[i] for i in leaf_ids(nodes, nid) if i in text_cache] def _build_emb_cache(nodes, embedder, text_cache): """ [HIE] Pre-compute ALL leaf embeddings in one batch call. Paper-correct: HiExpan pre-computes entity representations once upfront, then all expansion passes reuse the cache — no re-encoding per leaf per pass. """ ids = [int(n['id']) for n in nodes if n['type'] == 'attribute' and text_cache.get(int(n['id']), '').strip()] if not ids: return {} texts = [text_cache[i] for i in ids] embs = embedder.encode(texts) # ONE batch call for everything return {nid: embs[i] for i, nid in enumerate(ids)} def hiexpan_sibling_coherence(nodes, embedder, text_cache, emb_cache=None): """[HIE] Mean pairwise cosine similarity of attribute children per group node.""" report = [] m = nmap(nodes) if emb_cache is None: emb_cache = _build_emb_cache(nodes, embedder, text_cache) for node in nodes: if node['type'] not in ('aggregation', 'root'): continue attr_ch = [int(c) for c in node.get('related', []) if m.get(int(c), {}).get('type') == 'attribute'] if len(attr_ch) < 2: continue embs_ = np.array([emb_cache[cid] for cid in attr_ch if cid in emb_cache]) if len(embs_) < 2: continue sims = cosine_similarity(embs_) n_ = len(embs_) mask_ = np.triu(np.ones((n_, n_), dtype=bool), k=1) mean_sim = float(sims[mask_].mean()) if mask_.any() else 1.0 report.append({'node_id': int(node['id']), 'name': node['name'], 'n_attr_children': n_, 'coherence_score': round(mean_sim, 3), 'is_incoherent': mean_sim < 0.25}) return sorted(report, key=lambda x: x['coherence_score']) def hiexpan_width_expansion(nodes, embedder, text_cache, threshold=0.45, emb_cache=None): """[HIE] Move each leaf to the sibling group with highest centroid similarity. Uses pre-built emb_cache — no per-leaf encode() calls (paper-correct, fast).""" import copy nodes = copy.deepcopy(nodes) m = nmap(nodes) if emb_cache is None: emb_cache = _build_emb_cache(nodes, embedder, text_cache) measure_nodes = [n for n in nodes if n['type'] == 'aggregation' and any(m.get(int(c), {}).get('type') == 'attribute' for c in n.get('related', []))] if len(measure_nodes) < 2: return nodes, 0 # Build group centroids from cached embeddings — no new encode() calls node_centroids = {} for mn in measure_nodes: leaf_ids_ = [int(c) for c in leaf_ids(nodes, int(mn['id'])) if int(c) in emb_cache] if leaf_ids_: embs_ = np.array([emb_cache[i] for i in leaf_ids_]) node_centroids[int(mn['id'])] = _centroid(embs_) if len(node_centroids) < 2: return nodes, 0 cent_ids = list(node_centroids.keys()) cent_array = np.stack([node_centroids[i] for i in cent_ids]) n_moves = 0 for leaf in [n for n in nodes if n['type'] == 'attribute']: lid = int(leaf['id']) if lid not in emb_cache: continue leaf_emb = emb_cache[lid] # cached — no encode() call current_parents = parents(nodes, lid) current_measure = [p for p in current_parents if p in node_centroids] if not current_measure: continue cur_p = current_measure[0] sims = cent_array.dot(leaf_emb) best_idx = int(np.argmax(sims)) best_p = cent_ids[best_idx] if best_p != cur_p and float(sims[best_idx]) > node_centroids[cur_p].dot(leaf_emb) + 0.02: nodes = move_node(nodes, lid, best_p) n_moves += 1 return nodes, n_moves def hiexpan_depth_expansion_semantic(nodes, embedder, text_cache, concept_table, concept_embs, n_subclusters=3, emb_cache=None, coherence_threshold=0.45, top_level_tasks=None): """ [HIE][C1][C2] Depth expansion — embedding-based, no hardcoded patterns. Uses pre-built emb_cache — no per-group encode() calls (paper-correct, fast). [C2][TAX][RAPTOR] Quality gate added: - Balance check: largest sub-cluster must be ≤ 70% of total (prevents degenerate splits) - Min size: each sub-cluster must have ≥ 2 variables (HiExpan coherence requires siblings) - coherence_threshold: tunable per recursive pass (lowered across passes for deeper trees) """ import copy nodes = copy.deepcopy(nodes) m = nmap(nodes) if emb_cache is None: emb_cache = _build_emb_cache(nodes, embedder, text_cache) n_exp = 0 for agg in [n for n in nodes if n['type'] == 'aggregation']: attr_ch = [int(c) for c in agg.get('related', []) if m.get(int(c), {}).get('type') == 'attribute'] if len(attr_ch) < 3: continue # Check coherence using cached embeddings — no new encode() calls cached_ids = [cid for cid in attr_ch if cid in emb_cache] if len(cached_ids) < 2: continue embs = np.array([emb_cache[cid] for cid in cached_ids]) sims = cosine_similarity(embs) n_ = len(cached_ids) mask_ = np.triu(np.ones((n_, n_), dtype=bool), k=1) coherence = float(sims[mask_].mean()) if mask_.any() else 1.0 # [F6][HIE][RAPTOR] Revised depth-expansion gate: # Original: skip if coherence ≥ threshold — WRONG for narrow-vocab domains # (CANTAB/HCP have high cosine similarity even across sub-types). # New rule: skip ONLY when coherent AND small AND low vocabulary diversity. # HiExpan: expands wide nodes; RAPTOR: splits while BIC improves. # A coherent-but-large node with diverse sub-vocabularies is a good parent # whose children have not yet been discovered — depth expansion IS needed. if coherence >= coherence_threshold: # Allow depth expansion for large nodes with vocabulary diversity if len(attr_ch) < 6: continue # Truly small coherent cluster — no further split needed # Compute vocabulary range: max unique non-stop tokens minus min vocab_sizes = [] for cid in cached_ids: txt = text_cache.get(cid, '').lower() toks = set(re.findall(r'\b[a-z]{4,}\b', txt)) - _STOP vocab_sizes.append(len(toks)) vocab_range = max(vocab_sizes) - min(vocab_sizes) if vocab_sizes else 0 if vocab_range < 3: continue # Low internal diversity — truly homogeneous, stop here # Re-cluster the leaves k_sub = min(n_subclusters, max(2, len(attr_ch) // 3)) try: dist = cosine_distances(embs).astype(float) np.fill_diagonal(dist, 0.0) sub_lbs = AgglomerativeClustering(n_clusters=k_sub, metric='precomputed', linkage='average').fit_predict(dist) except Exception: continue # [C2][TAX][RAPTOR] Quality gate: reject unbalanced or degenerate splits cluster_sizes = [int((sub_lbs == sk).sum()) for sk in range(k_sub)] total_size = sum(cluster_sizes) max_cluster = max(cluster_sizes) if cluster_sizes else 0 # Balance: largest cluster ≤ 70% of total if total_size > 0 and max_cluster / total_size > 0.70: continue # Degenerate split — one cluster dominates, no real gain # Min-size: every sub-cluster must have ≥ 2 variables (HiExpan sibling coherence) if any(s < 2 for s in cluster_sizes): continue # Remove direct leaf connections from this agg node agg_id = int(agg['id']) for cid in attr_ch: remove_child(nodes, agg_id, cid) # Per sub-cluster leaf ids + their TITLES (so HiExpan labels from titles too, # not the boilerplate definition path). sub_cids_by_sk = [[cached_ids[i] for i, m_ in enumerate(sub_lbs == sk) if m_] for sk in range(k_sub)] sub_titles_by_sk = [[_concept_title(text_cache.get(cid, '')) for cid in cids] for cids in sub_cids_by_sk] agg_ancestors = ancestor_names(nodes, agg_id) + [agg['name']] _anc_words = set(re.findall(r'[a-z]{3,}', ' '.join(agg_ancestors).lower())) _used_sub = set() # Create sub-nodes — TITLE label wins; concept scoring only as fallback. for sk in range(k_sub): sub_cids = sub_cids_by_sk[sk] if not sub_cids: continue sub_mask = sub_lbs == sk sub_texts = [text_cache.get(cid, '') for cid in sub_cids] sub_emb = _centroid(embs[sub_mask]) title_label = _title_cluster_label( sub_titles_by_sk[sk], [sub_titles_by_sk[j] for j in range(k_sub) if j != sk], ancestor_words=_anc_words, used_labels=_used_sub) if title_label: label = title_label provenance = {'node_label': label, 'source_evidence': ['description_title'], 'confidence': 0.0, 'alternatives': []} elif sub_emb is not None and concept_embs is not None and concept_table: scores = score_concepts_for_cluster(sub_emb, concept_embs, concept_table, sub_texts) label, provenance = assign_concept_label( scores, fallback=f'{agg["name"]} {sk+1}', ancestor_names=agg_ancestors, top_level_tasks=top_level_tasks, ) else: label = f'{agg["name"]} {sk+1}' provenance = None _used_sub.add(str(label).lower()) nid = next_id(nodes) nodes.append(make_agg(nid, label, desc=f'Sub-group of {agg["name"]}: {label}', relation_type='belongs_to', provenance=provenance)) add_child(nodes, agg_id, nid) for cid in sub_cids: add_child(nodes, nid, cid) n_exp += 1 return nodes, n_exp def hiexpan_global_optimization(nodes, embedder, text_cache, n_passes=2, emb_cache=None): """[HIE] Global optimization — repeated width expansion passes until convergence. Reuses emb_cache — no new encode() calls across passes.""" total = 0 for _ in range(n_passes): nodes, moves = hiexpan_width_expansion(nodes, embedder, text_cache, threshold=0.40, emb_cache=emb_cache) total += moves if moves == 0: break return nodes, total def run_hiexpan(nodes, can, embedder, concept_table=None, concept_embs=None, max_depth_passes=4): """ [HIE][C1][C2] Run all HiExpan passes with a single pre-built embedding cache. Paper-correct: encode all leaves ONCE, reuse across coherence / width / depth / global. This reduces HiExpan from O(n_passes × n_leaves) encode calls to O(1). [C1][HIE][TAX] Recursive depth expansion loop: HiExpan: "builds the taxonomy by recursively expanding all these sets." TaxoGen: "splitting a coarse topic into fine-grained ones" is iterative. Progressive coherence thresholds: [0.45, 0.38, 0.30, 0.22] — each pass allows finer splits, pushing the hierarchy deeper until max_depth_passes or convergence. """ text_cache = {int(n['id']): str(n.get('semantic_desc', n.get('desc', ''))) for n in nodes if n['type'] == 'attribute'} # ── Pre-encode ALL leaves once (HiExpan paper: pre-compute entity representations) emb_cache = _build_emb_cache(nodes, embedder, text_cache) report = {} report['coherence_before'] = hiexpan_sibling_coherence( nodes, embedder, text_cache, emb_cache=emb_cache) nodes, n_width = hiexpan_width_expansion( nodes, embedder, text_cache, emb_cache=emb_cache) report['width_expansion_moves'] = n_width # [C1][HIE][TAX] Recursive depth expansion — progressive threshold schedule # Pass 1: threshold=0.45 (broad splits) # Pass 2: threshold=0.38 (medium splits) # Pass 3: threshold=0.30 (fine splits) # Pass 4: threshold=0.22 (very fine — only if still incoherent) # [FIX1] DEPTH-EXPANSION DISABLED. It split concept nodes (Total Correct, # Percent Correct, Error) by DELAY condition into sub-clusters whose titles # differ only by a parenthetical number — so the title labeler found nothing # distinctive and fell back to the boilerplate "Calculated Assessed Trials" # candidate. Those repetitive "same children" tiers are removed by not running # this pass; the hierarchy stays Task → concept → leaves. Measure→Statistic # depth is handled separately from the title composition (Fix 2), not here. total_depth_exp = 0 pass_idx = -1 report['depth_expansion_nodes'] = total_depth_exp report['depth_expansion_passes'] = pass_idx + 1 nodes, n_global = hiexpan_global_optimization( nodes, embedder, text_cache, emb_cache=emb_cache) report['global_optimization_moves'] = n_global report['coherence_after'] = hiexpan_sibling_coherence( nodes, embedder, text_cache, emb_cache=emb_cache) return nodes, report # ───────────────────────────────────────────────────────────────────────────── # CONFLICT RESOLUTION TABLE [HIE] # Variables where the top-2 concept assignments differ by < 0.05 in score. # ───────────────────────────────────────────────────────────────────────────── def compute_conflict_table(can, nodes): """ [HIE] Full conflict resolution table. For each low-confidence variable, computes similarity to current parent centroid and to all sibling group centroids — shows top-2 alternative placements. """ pm = build_parent_map(nodes) m = nmap(nodes) # Build centroid cache for all aggregation nodes agg_nodes = [n for n in nodes if n.get('type') == 'aggregation'] text_cache = {int(n['id']): str(n.get('desc', '')) for n in nodes if n.get('type') == 'attribute'} # TF-IDF similarity proxy (no embedder available here — use text overlap) def _sim_to_group(var_text, agg_node): agg_texts = [text_cache.get(int(c), '') for c in agg_node.get('related', []) if m.get(int(c), {}).get('type') == 'attribute'] if not agg_texts: return 0.0 combined = ' '.join(agg_texts).lower() var_words = set(re.findall(r'\b[a-z]{3,}\b', var_text.lower())) - _STOP group_words = set(re.findall(r'\b[a-z]{3,}\b', combined)) - _STOP if not var_words: return 0.0 return len(var_words & group_words) / len(var_words) rows = [] for _, row in can.iterrows(): score = float(row.get('_concept_score', 0.0)) if not (0 < score < 0.25): continue lid = row['_leaf_id'] lid_n = [n for n in nodes if n.get('metadata', {}).get('leaf_id') == lid] if not lid_n: continue nid = int(lid_n[0]['id']) parent_id = pm.get(nid) parent_n = m.get(parent_id, {}) parent_nm = parent_n.get('name', '') var_text = str(row.get('_text', '')) # Sibling groups = all aggregation nodes under same grandparent grandparent_id = pm.get(parent_id) sibling_groups = [ n for n in agg_nodes if pm.get(int(n['id'])) == grandparent_id and int(n['id']) != parent_id ] sib_sims = sorted( [{'name': sn['name'], 'sim': round(_sim_to_group(var_text, sn), 3)} for sn in sibling_groups], key=lambda x: -x['sim'] ) cur_sim = round(_sim_to_group(var_text, parent_n), 3) if parent_n else 0.0 alt1 = sib_sims[0] if len(sib_sims) > 0 else {'name': '—', 'sim': 0.0} alt2 = sib_sims[1] if len(sib_sims) > 1 else {'name': '—', 'sim': 0.0} decision = ('Move to alt-1' if alt1['sim'] > cur_sim + 0.10 else 'Review manually' if alt1['sim'] > cur_sim else 'Keep current') rows.append({ 'variable': row['_leaf_label'], 'concept_label': row.get('_concept_label', ''), 'concept_score': round(score, 3), 'current_parent': parent_nm, 'current_sim': cur_sim, 'alt_parent_1': alt1['name'], 'alt_sim_1': alt1['sim'], 'alt_parent_2': alt2['name'], 'alt_sim_2': alt2['sim'], 'decision': decision, 'source': row.get('_concept_source', ''), }) cols = ['variable', 'concept_label', 'concept_score', 'current_parent', 'current_sim', 'alt_parent_1', 'alt_sim_1', 'alt_parent_2', 'alt_sim_2', 'decision', 'source'] return pd.DataFrame(rows, columns=cols) if rows else pd.DataFrame(columns=cols) # ───────────────────────────────────────────────────────────────────────────── # CASTANET PARALLEL FACETED HIERARCHIES [CAS] # Uses automatic concept assignments — no hardcoded dimension patterns. # ───────────────────────────────────────────────────────────────────────────── def detect_facets(can, code_expansions=None): """ [CAS] Auto-detect semantic facet dimensions from the actual metadata structure. Each facet is only added when meaningful (>1 distinct value in the data). No hardcoded labels — facet names and groups come entirely from the data. Detects (in order, only if data supports them): Task — top-level group path Variant — second-level group path Measure Type — automatically assigned concept label (N×M alignment) Statistic — detected from description text (mean/sd/median/percent etc.) Condition — numeric code suffix (delay/timepoint conditions) Outcome Type — outcome/error keywords detected from description text Scale/Precision — metadata columns (decimal places, unit, format) Code Family — structural prefix groups """ facets = [] code_expansions = code_expansions or {} sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text' # ── Facet: Task (top-level group path) ──────────────────────────────────── top_groups = can['_group_path'].apply( lambda x: str(x).split(' > ')[0].strip() if str(x) not in ('', 'nan', 'Ungrouped') else 'Ungrouped' ) if top_groups.nunique() > 1: can['_facet_task'] = top_groups facets.append({ 'name': 'Task', 'desc': 'Variables grouped by their top-level task or domain.', 'fn': (lambda col: lambda row: str(row.get(col, 'Ungrouped')))('_facet_task'), 'relation': 'belongs_to', }) # ── Facet: Variant (second-level group path, if present) ────────────────── def _second_level(gpath): parts = str(gpath).split(' > ') return parts[1].strip() if len(parts) > 1 else '' second = can['_group_path'].apply(_second_level) if second.ne('').any() and second.nunique() > 1: can['_facet_variant'] = second.where(second != '', 'No Variant') facets.append({ 'name': 'Variant', 'desc': 'Variables grouped by their second-level structural variant.', 'fn': (lambda col: lambda row: str(row.get(col, 'No Variant')))('_facet_variant'), 'relation': 'belongs_to', }) # ── Facet: Measure Type (concept label from N×M alignment) ──────────────── has_concepts = can['_concept_label'].fillna('').ne('').any() if has_concepts and can['_concept_label'].nunique() > 1: facets.append({ 'name': 'Measure Type', 'desc': 'Variables grouped by automatically discovered concept label (N×M embedding alignment).', 'fn': lambda row: str(row['_concept_label']) if str(row['_concept_label']) not in ('', 'nan') else 'Unclassified', 'relation': 'related_to', }) # ── Facet: Statistic (detected from description text) ───────────────────── _stat_re = re.compile( r'\b(mean|average|median|standard deviation|std|percent|proportion|' r'probability|total|sum|count|maximum|minimum|range|variance|' r'coefficient|ratio|rate|frequency)\b', re.IGNORECASE ) _stat_norm = { 'average': 'Mean', 'std': 'Standard Deviation', 'proportion': 'Percent', 'sum': 'Total', 'count': 'Total', 'frequency': 'Rate', } def _extract_stat(row): hits = _stat_re.findall(str(row.get(sem_col, row.get('_text', ''))).lower()) if not hits: return '' h = hits[0].lower() return _stat_norm.get(h, h.title()) stat_col = can.apply(_extract_stat, axis=1) if stat_col.ne('').any() and stat_col.nunique() > 1: can['_facet_stat'] = stat_col.where(stat_col != '', 'Other') facets.append({ 'name': 'Statistic', 'desc': 'Variables grouped by statistical summary type detected from descriptions.', 'fn': (lambda col: lambda row: str(row.get(col, 'Other')))('_facet_stat'), 'relation': 'is_statistic_of', }) # ── Facet: Condition (numeric code suffix) ───────────────────────────────── _num_re = re.compile(r'(\d+)') def _extract_cond(row): hits = _num_re.findall(str(row['_leaf_label']).split('/')[0].strip()) return hits[0] if hits else '' cond_col = can.apply(_extract_cond, axis=1) if cond_col.ne('').any() and cond_col.nunique() > 1: can['_facet_cond'] = cond_col.where(cond_col != '', 'No Condition') # Name facet from most common unit word in descriptions _unit_re = re.compile(r'\b(second|msec|millisecond|month|week|day|year|trial|block|session|delay)\b', re.IGNORECASE) all_text = ' '.join(can[sem_col].fillna('').astype(str).tolist()).lower() unit_hits = _unit_re.findall(all_text) fname = (max(set(unit_hits), key=unit_hits.count).title() + ' Condition') if unit_hits else 'Condition' facets.append({ 'name': fname, 'desc': 'Variables grouped by numeric condition variant in variable codes.', 'fn': (lambda col: lambda row: str(row.get(col, 'No Condition')))('_facet_cond'), 'relation': 'has_condition', }) # ── Facet: Outcome/Error Type (from description keywords) ───────────────── _out_re = re.compile(r'\b(error|errors|miss|false alarm|omission|commission|incorrect|outcome|penalty)\b', re.IGNORECASE) def _extract_outcome(row): hits = _out_re.findall(str(row.get(sem_col, row.get('_text', ''))).lower()) return hits[0].title() if hits else '' out_col = can.apply(_extract_outcome, axis=1) if out_col.ne('').any() and out_col.nunique() > 1: can['_facet_outcome'] = out_col.where(out_col != '', 'Other') facets.append({ 'name': 'Outcome Type', 'desc': 'Variables grouped by outcome/error type detected from description text.', 'fn': (lambda col: lambda row: str(row.get(col, 'Other')))('_facet_outcome'), 'relation': 'has_measure', }) # ── Facet: Scale/Precision (from _raw metadata columns) ─────────────────── _prec_re = re.compile(r'\b(decimal|precision|unit|scale|format)\b', re.IGNORECASE) if '_raw' in can.columns: sample_raw = can['_raw'].dropna().iloc[0] if len(can) > 0 else {} prec_cols = [c for c in (sample_raw.keys() if isinstance(sample_raw, dict) else []) if _prec_re.search(str(c))] if prec_cols: def _extract_prec(row): raw = row.get('_raw', {}) if not isinstance(raw, dict): return '' for pc in prec_cols: v = str(raw.get(pc, '')).strip() if v and v.lower() not in ('nan', 'none', ''): return v return '' prec_col = can.apply(_extract_prec, axis=1) if prec_col.ne('').any() and prec_col.nunique() > 1: can['_facet_prec'] = prec_col.where(prec_col != '', 'Unspecified') facets.append({ 'name': 'Scale/Precision', 'desc': 'Variables grouped by decimal places or unit of measurement.', 'fn': (lambda col: lambda row: str(row.get(col, 'Unspecified')))('_facet_prec'), 'relation': 'belongs_to', }) # ── Facet: Code Family ───────────────────────────────────────────────────── has_families = can['_code_family'].fillna('').ne('').any() if has_families and can['_code_family'].nunique() > 1: facets.append({ 'name': 'Code Family', 'desc': 'Variables grouped by variable-code structural prefix.', 'fn': lambda row: str(row['_code_family']) if str(row['_code_family']) not in ('', 'nan') else 'Other', 'relation': 'belongs_to', }) # Fallback: TF-IDF semantic clusters if fewer than 2 facets detected if len(facets) < 2: texts = can[sem_col].fillna('').tolist() for nc in [5, 8]: lbls = tfidf_cluster_labels(texts, max_clusters=nc) lbl_col = f'_tfidf_cluster_{nc}' can[lbl_col] = lbls facets.append({ 'name': f'Semantic Cluster (k={nc})', 'desc': f'TF-IDF agglomerative clustering into {nc} groups.', 'fn': (lambda col: lambda row: str(row.get(col, 'Other')))(lbl_col), 'relation': 'related_to', }) return facets def build_facet_hierarchy(can, facet, project='root'): """[CAS] Single-level facet hierarchy: Root → Group → Leaf.""" nodes = [{'id': 0, 'name': project, 'type': 'root', 'dtype': 'root', 'isShown': True, 'related': [], 'desc': f"Facet: {facet['name']}"}] group_ids = {} for i, (_, row) in enumerate(can.iterrows(), start=1): group_label = str(facet['fn'](row)) if group_label not in group_ids: gid = len(nodes) group_ids[group_label] = gid nodes.append(make_agg(gid, group_label, desc=f"{facet['name']}: {group_label}", relation_type=facet['relation'])) add_child(nodes, 0, gid) lid = len(nodes) nodes.append({'id': lid, 'name': str(row['_leaf_label']), 'dtype': str(row['_dtype']), 'related': [], 'isShown': True, 'type': 'attribute', 'desc': str(row['_text']), 'source_file': str(row['_source_file']), 'metadata': {'leaf_id': str(row['_leaf_id']), 'group_path': str(row['_group_path'])}}) add_child(nodes, group_ids[group_label], lid) for n in nodes: n['related'] = list(dict.fromkeys([int(x) for x in n.get('related', [])])) return nodes # ───────────────────────────────────────────────────────────────────────────── # VISUALIZATION # ───────────────────────────────────────────────────────────────────────────── RELATION_COLORS = { 'has_measure': '#4C72B0', 'is_statistic_of': '#DD8452', 'has_condition': '#55A868', 'part_of': '#C44E52', 'instance_of': '#8172B2', 'subclass_of': '#937860', 'belongs_to': '#8C8C8C', 'related_to': '#CCB974', '': '#8C8C8C', } def _node_color(n): if n.get('type') == 'root': return '#c44e52' if n.get('type') == 'attribute': return '#4C72B0' if n.get('type') == 'collapsed': return '#bbbbbb' return RELATION_COLORS.get(n.get('info', {}).get('relation_type', ''), '#8C8C8C') def _wrap_hover(text, width=90): """Word-wrap plain text into HTML lines for Plotly hover tooltips.""" text = str(text).replace('<', '<').replace('>', '>') words, lines, line, length = text.split(), [], [], 0 for w in words: if length + len(w) + 1 > width and line: lines.append(' '.join(line)) line, length = [w], len(w) else: line.append(w); length += len(w) + 1 if line: lines.append(' '.join(line)) return '
'.join(lines) def _rich_hover(n, nodes): """Full, word-wrapped hover tooltip (name, provenance, complete description). Shared by every view so the treemap and node-link tooltips are as readable as the sunburst's — leaves show their full semantic_desc, no truncation.""" nid = int(n['id']); lc = len(leaf_ids(nodes, nid)) rel = n.get('info', {}).get('relation_label', '') if n.get('type') == 'aggregation' else '' prov = n.get('concept_provenance', {}) raw_desc = (n.get('semantic_desc') or n.get('desc', '')) \ if n.get('type') == 'attribute' else n.get('desc', '') desc_html = _wrap_hover(raw_desc) alts = ', '.join(prov.get('alternatives', [])) src = ', '.join(prov.get('source_evidence', [])) return (f'{n.get("name","")}
Type: {n.get("type","")}
' f'Relation: {rel}
Variables: {lc}' + (f'
Confidence: {prov.get("confidence","")} | Source: {src}' f'
Alternatives: {alts}' if prov else '') + f'

{desc_html}') def plot_sunburst(nodes, max_depth=4): pm = build_parent_map(nodes) ids, labels, parents_, values, hover = [], [], [], [], [] for n in nodes: nid = int(n['id']); lc = len(leaf_ids(nodes, nid)) ids.append(str(nid)); labels.append(str(n.get('name', ''))[:40]) parents_.append('' if nid == 0 else str(pm.get(nid, 0))) values.append(max(1, lc)) hover.append(_rich_hover(n, nodes)) fig = go.Figure(go.Sunburst(ids=ids, labels=labels, parents=parents_, values=values, branchvalues='total', hovertext=hover, hoverinfo='text', maxdepth=max_depth, insidetextorientation='radial', marker=dict(colorscale='Blues', line=dict(width=1, color='white')), leaf=dict(opacity=0.85))) fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10), title='Click sector to drill down — hover for concept provenance') return fig def plot_treemap(nodes): pm = build_parent_map(nodes) ids, labels, parents_, values, hover = [], [], [], [], [] for n in nodes: nid = int(n['id']); lc = len(leaf_ids(nodes, nid)) ids.append(str(nid)); labels.append(str(n.get('name', ''))[:40]) parents_.append('' if nid == 0 else str(pm.get(nid, 0))) values.append(max(1, lc)) hover.append(_rich_hover(n, nodes)) fig = go.Figure(go.Treemap(ids=ids, labels=labels, parents=parents_, values=values, branchvalues='total', hovertext=hover, hoverinfo='text', textinfo='label+value', marker=dict(colorscale='Blues', line=dict(width=1, color='white')))) fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10)) return fig def plot_facets_parallel(facet_trees): """[CAS] Side-by-side sunbursts — one per facet dimension.""" names = list(facet_trees.keys()); nf = len(names) if nf == 0: return go.Figure() cols = min(3, nf); rows = (nf + cols - 1) // cols specs = [[{'type': 'sunburst'} for _ in range(cols)] for _ in range(rows)] fig = make_subplots(rows=rows, cols=cols, specs=specs, subplot_titles=names) for idx, (fname, fnodes) in enumerate(facet_trees.items()): row = idx // cols + 1; col = idx % cols + 1 pm = build_parent_map(fnodes) ids, labels, parents_, values, hover = [], [], [], [], [] for n_ in fnodes: nid = int(n_['id']); lc = len(leaf_ids(fnodes, nid)) ids.append(f'{fname}_{nid}'); labels.append(str(n_.get('name', ''))[:28]) parents_.append('' if nid == 0 else f'{fname}_{pm.get(nid, 0)}') values.append(max(1, lc)) hover.append(f'{n_.get("name","")}
Variables: {lc}') fig.add_trace(go.Sunburst(ids=ids, labels=labels, parents=parents_, values=values, branchvalues='total', hovertext=hover, hoverinfo='text', maxdepth=2, leaf=dict(opacity=0.8), marker=dict(line=dict(width=1, color='white'))), row=row, col=col) fig.update_layout(height=420 * rows, margin=dict(l=10, r=10, t=50, b=10), title_text='Castanet Parallel Faceted Hierarchies — same variables, different views', title_font_size=13) return fig def display_graph(nodes, max_depth=4, show_hidden=False): m = nmap(nodes); dnodes = {}; edges = []; counter = 10 ** 9 def rec(nid, depth): nonlocal counter n = m.get(int(nid)) if not n: return if not show_hidden and n.get('isShown') is False and depth > 0: return dnodes[int(nid)] = n if depth >= max_depth and n.get('related'): counter += 1; cid = counter dnodes[cid] = {'id': cid, 'name': f'… {len(leaf_ids(nodes,nid))} variables', 'type': 'collapsed', 'dtype': 'determine', 'related': [], 'desc': f'Collapsed: {n.get("name")}', 'isShown': True} edges.append((int(nid), cid)); return for c in n.get('related', []): ch = m.get(int(c)) if not ch: continue if not show_hidden and ch.get('isShown') is False: continue edges.append((int(nid), int(c))); rec(int(c), depth + 1) rec(0, 0); return list(dnodes.values()), edges def positions(dnodes, edges): """ Reingold-Tilford style layout. x = depth × horizontal_scale (breathing room between levels) y = subtree-aware vertical placement with 1.8 spacing per leaf Aggregation nodes centered over their children's y range. """ H_SCALE = 3.0 # horizontal gap between depth levels V_SPACE = 1.8 # vertical gap between leaf slots children = defaultdict(list) for p, c in edges: children[p].append(c) pos = {} counter = {'v': 0} def rec(nid, depth): ch = children.get(nid, []) if not ch: # Leaf — assign next vertical slot y_pos = counter['v'] * V_SPACE counter['v'] += 1 pos[nid] = (depth * H_SCALE, y_pos) return y_pos child_ys = [rec(c, depth + 1) for c in ch] # Parent centered over children range y_pos = float(np.mean(child_ys)) pos[nid] = (depth * H_SCALE, y_pos) return y_pos rec(0, 0) return pos def plot_node_link(nodes, max_depth, show_hidden, show_leaf_labels): """ Node-link tree with Reingold-Tilford layout. Paper: Taxonomizer recommends Sunburst as primary view for large hierarchies. Node-link is supplementary — best for exploring structure at moderate depth. """ dnodes, edges = display_graph(nodes, max_depth, show_hidden) pos = positions(dnodes, edges) # Edges: elbow-style (horizontal then vertical) ex, ey = [], [] for p, c in edges: if p not in pos or c not in pos: continue x0, y0 = pos[p] x1, y1 = pos[c] # Draw: parent → midpoint horizontally → child vertically → child xm = (x0 + x1) / 2 ex += [x0, xm, xm, x1, None] ey += [y0, y0, y1, y1, None] traces = [go.Scatter(x=ex, y=ey, mode='lines', line=dict(width=1, color='#c8c8c8'), hoverinfo='skip', showlegend=False)] # Nodes — split aggregation and leaf into two traces for cleaner rendering agg_xs, agg_ys, agg_labels, agg_colors, agg_hover = [], [], [], [], [] lf_xs, lf_ys, lf_labels, lf_colors, lf_hover = [], [], [], [], [] for n in dnodes: nid = int(n['id']) if nid not in pos: continue x, y = pos[nid] lc = len(leaf_ids(nodes, nid)) lab = n.get('name', str(nid)) htxt = _rich_hover(n, nodes) col = _node_color(n) if n.get('type') in ('root', 'aggregation'): display_lab = (lab + (f' ({lc})' if lc else ''))[:50] agg_xs.append(x); agg_ys.append(y) agg_labels.append(display_lab) agg_colors.append(col); agg_hover.append(htxt) else: display_lab = lab[:40] if show_leaf_labels else '' lf_xs.append(x); lf_ys.append(y) lf_labels.append(display_lab) lf_colors.append(col); lf_hover.append(htxt) if agg_xs: traces.append(go.Scatter( x=agg_xs, y=agg_ys, mode='markers+text', text=agg_labels, textposition='middle right', hovertext=agg_hover, hoverinfo='text', marker=dict(size=16, color=agg_colors, line=dict(color='white', width=2)), showlegend=False )) if lf_xs: traces.append(go.Scatter( x=lf_xs, y=lf_ys, mode='markers+text', text=lf_labels, textposition='middle right', hovertext=lf_hover, hoverinfo='text', marker=dict(size=7, color=lf_colors, symbol='circle', opacity=0.75, line=dict(color='white', width=1)), showlegend=False )) n_leaves = max(12, len(lf_xs)) fig = go.Figure(traces) fig.update_layout( height=max(700, min(4000, int(n_leaves * 32))), margin=dict(l=20, r=220, t=40, b=20), plot_bgcolor='white', paper_bgcolor='white', xaxis=dict(visible=False, fixedrange=False), yaxis=dict(visible=False, autorange='reversed', fixedrange=False), dragmode='pan', annotations=[dict( text='Tip: Sunburst (above) is better for large hierarchies [Taxonomizer]', xref='paper', yref='paper', x=0.0, y=1.01, showarrow=False, font=dict(size=11, color='grey'), align='left' )] ) return fig def semantic_map(can): texts = can['_text'].fillna('').astype(str).tolist() labels = can['_leaf_label'].astype(str).tolist() groups = can['_group_path'].fillna('Ungrouped').astype(str).apply(lambda x: x.split(' > ')[0]) X = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000).fit_transform(texts) coords = TruncatedSVD(n_components=2, random_state=42).fit_transform(X) if X.shape[1] >= 2 \ else np.zeros((len(texts), 2)) fig = go.Figure() for g in list(dict.fromkeys(groups)): mask = groups == g; idx = np.where(mask.values)[0] fig.add_trace(go.Scatter(x=coords[mask, 0], y=coords[mask, 1], mode='markers', name=str(g), text=[labels[i] for i in idx], hovertext=[f'{labels[i]}
{texts[i][:400]}' for i in idx], hoverinfo='text', marker=dict(size=8, opacity=0.85))) fig.update_layout(height=600, plot_bgcolor='white', paper_bgcolor='white') return fig # ───────────────────────────────────────────────────────────────────────────── # STREAMLIT STATE # ───────────────────────────────────────────────────────────────────────────── for _key in ['nodes', 'canonical', 'configs', 'embedder', 'facet_trees', 'hiexpan_report', 'concept_table', 'domain']: if _key not in st.session_state: st.session_state[_key] = None # ───────────────────────────────────────────────────────────────────────────── # SIDEBAR # ───────────────────────────────────────────────────────────────────────────── with st.sidebar: st.header('1. Input') uploads = st.file_uploader('Upload metadata file(s)', type=['csv','tsv','txt','xlsx','xls','json','md','markdown'], accept_multiple_files=True) existing = st.file_uploader('Load existing hierarchy JSON', type=['json']) st.header('2. Generation') project = st.text_input('Root / project name', value='metadata_project') max_rows = st.slider('Max variables', 10, 3000, 600, 10) merge_files = st.checkbox('Merge uploaded files', value=True) n_clusters = st.slider('Max clusters per group', 2, 16, 8, 1, help='Maximum number of concept sub-groups per top-level group.') st.header('3. Semantic embedder') model_choice = st.selectbox('Embedding model', ['all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'paraphrase-MiniLM-L6-v2', 'TF-IDF (no ST)'], help='[TAX][GON] Sentence-BERT for dense semantic embeddings.') max_concepts = st.slider('Max candidate concepts', 30, 300, 120, 10, help='How many candidate concepts to extract from metadata text.') st.caption('HiExpan refinement runs automatically after every build. ' 'Wikidata / Wikipedia / PubMed activate automatically for biomedical, ' 'cognitive, and neurological domains.') # ───────────────────────────────────────────────────────────────────────────── # LOAD EXISTING HIERARCHY # ───────────────────────────────────────────────────────────────────────────── if existing is not None: try: obj = json.loads(existing.getvalue().decode('utf-8', errors='replace')) if isinstance(obj, list): st.session_state.nodes = obj st.success('Loaded hierarchy JSON.') else: st.error('Hierarchy JSON must be a list of nodes.') except Exception as e: st.error(f'Could not load: {e}') # ───────────────────────────────────────────────────────────────────────────── # MAIN PIPELINE # ───────────────────────────────────────────────────────────────────────────── if uploads: paths = save_uploads(uploads) raw_by = {}; cfg_by = {}; prof_by = {} st.subheader('Step 1 — Inspect files') for p in paths: try: df = load_any(p); warn = False if probably_raw(df): df = raw_to_metadata(df); warn = True cfg, prof = detect_roles(df) raw_by[p.name] = df; cfg_by[p.name] = cfg; prof_by[p.name] = prof with st.expander(f'📄 {p.name}', expanded=False): if warn: st.warning('Looked like raw data — columns converted to metadata rows.') st.write(f'Rows: **{len(df):,}**, Columns: **{len(df.columns)}**') st.dataframe(df.head(10), use_container_width=True) except Exception as e: st.error(f'Failed to load {p.name}: {e}') st.subheader('Step 2 — Confirm column roles') configs = {} for name, df in raw_by.items(): with st.expander(f'⚙️ {name}', expanded=True): cols = list(df.columns); auto = cfg_by[name] c1, c2 = st.columns(2) with c1: leaf = st.multiselect('Leaf column(s)', cols, default=[c for c in auto['leaf_cols'] if c in cols], key=f'leaf_{name}') group = st.multiselect('Group/Task column(s)', cols, default=[c for c in auto['group_cols'] if c in cols], key=f'group_{name}') with c2: text = st.multiselect('Description column(s)', cols, default=[c for c in auto['text_cols'] if c in cols], key=f'text_{name}') meta = st.multiselect('Type/unit column(s)', cols, default=[c for c in auto['metadata_cols'] if c in cols], key=f'meta_{name}') prev = list(dict.fromkeys(leaf + group + text + meta)) if prev: st.dataframe(df[prev].head(6), use_container_width=True) configs[name] = {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'metadata_cols': meta} if st.button('🌳 Build Approach 1 hierarchy', type='primary'): try: # ── Step A: Build canonical schemas per file ─────────────────────── cans = [build_canonical(df.head(max_rows), configs[name], name) for name, df in raw_by.items()] # ── Step A.5: Domain check before merging multiple files ────────── if merge_files and len(cans) > 1: file_domains = [detect_domain(c) for c in cans] unique_domains = list(dict.fromkeys(file_domains)) if len(unique_domains) > 1: names_str = ', '.join( f'{n} → {d}' for n, d in zip(list(raw_by.keys()), file_domains) ) st.warning( f'Files appear to be from different domains: {names_str}. ' f'Merging may produce a mixed hierarchy. ' f'Uncheck **Merge uploaded files** to process separately.' ) can = pd.concat(cans, ignore_index=True) else: can = cans[0] if len(can) > max_rows: can = can.head(max_rows).copy() # ── Step B: Code family + acronym expansion ─────────────────────── with st.spinner('Detecting variable code families and expanding acronyms...'): can = cluster_codes_by_prefix(can) n_families = can['_code_family'].ne('').sum() if n_families > 0: st.info(f'Detected {can["_code_family"].nunique()} code families ' f'({n_families} coded variables).') code_expansions = expand_variable_codes(can) if code_expansions: st.info(f'Identified {len(code_expansions)} acronym/segment expansions.') st.session_state['code_expansions'] = code_expansions # ── Step B.5: Patch _semantic_text with acronym expansions ────────── # [C7][GON][LOB] Description text is the dominant semantic signal. # Gonçalves: N×M alignment right-hand side uses concept descriptions, not codes. # Lobo: "generate additional context for column names to aid matching." # Fix: prepend ONLY when expansion adds genuinely new semantic content. # Do NOT prepend if the expansion token is already present in the description # (prevents "DMS DMS Correct Latency..." doubling that caused "Dms Dms" labels). # [F1] Word-boundary coverage check for _patch_semantic. # Previous guard used raw substring match: "DMS Correct Latency SD" # not in description even when description has "DMS Correct Latency # Standard Deviation" — because "SD" ≠ "Standard Deviation". # New check: expansion is "covered" if ≥60% of its non-stop words # appear as whole words in the description. If covered, skip prepend. _patch_stop = {'the','a','an','is','are','was','to','of','in', 'on','at','for','with','by','and','or','as'} def _exp_covered(exp_str, base_lower): words = [w for w in exp_str.lower().split() if w not in _patch_stop and len(w) > 2] if not words: return True found = sum( 1 for w in words if re.search(r'\b' + re.escape(w) + r'\b', base_lower) ) return found / len(words) >= 0.60 def _patch_semantic(row, exps): base = str(row.get('_semantic_text', row['_text'])) base_lower = base.lower() code = str(row['_leaf_label']).strip().split('/')[0] seg_tok = re.compile(r'([A-Z]{2,}|\d+)') segments = seg_tok.findall(code) new_parts = [] for s in segments: exp = exps.get(s, {}).get('expansion', '') # Only prepend if expansion adds genuinely new information # [F1] word-boundary check: skip if ≥60% of exp words already present if exp and not _exp_covered(exp, base_lower): new_parts.append(exp) if new_parts: return f'{" ".join(new_parts)} {base}' # Family expansion — same word-boundary guard fam = str(row.get('_code_family', '')) if fam and fam in exps: fam_exp = exps[fam].get('expansion', '') if fam_exp and not _exp_covered(fam_exp, base_lower): return f'{fam_exp} {base}' return base can['_semantic_text'] = can.apply( lambda r: _patch_semantic(r, code_expansions), axis=1 ) # ── Step C: Load SBERT embedder (always attempt; fallback graceful) ─ with st.spinner('Loading SBERT embedding model...'): model_name = model_choice if model_choice != 'TF-IDF (no ST)' else 'all-MiniLM-L6-v2' emb = SemanticEmbedder(model_name=model_name) ok, msg = emb.load() # always attempt SBERT load if ok: st.success(f'SBERT loaded: {msg}') else: st.warning(f'SBERT unavailable — {msg}. Using TF-IDF+SVD fallback.') st.session_state.embedder = emb # ── Step D: Detect domain ───────────────────────────────────────── domain = detect_domain(can) st.session_state.domain = domain _bio_domains = ('biomedical', 'cognitive', 'neurological') _use_external = domain in _bio_domains st.info(f'Detected domain: **{domain}**' + (' — Wikidata / Wikipedia / PubMed activated' if _use_external else '')) # ── Step E: Extract candidate concepts from metadata ────────────── with st.spinner('Extracting candidate concepts from metadata text...'): candidates = extract_candidate_concepts_from_metadata(can, max_concepts=max_concepts) st.info(f'Extracted {len(candidates)} candidate concepts from metadata.') # ── Step F: Build concept table ──────────────────────────────────── # Biomedical / cognitive / neurological → enrich via Wikidata + PubMed # Wikipedia excluded: too slow for interactive use; Wikidata covers same ground # All other domains → local-only (no HTTP calls) if _use_external: with st.spinner(f'Enriching concept table via Wikidata / PubMed ({domain} domain)...'): pb = st.progress(0) concept_table = retrieve_concept_table( candidates, domain=domain, use_wikidata=True, use_wikipedia=False, use_wordnet=True, use_pubmed=True, bioportal_key='', progress_cb=lambda x: pb.progress(x), code_expansions=code_expansions, ) pb.empty() n_wd = sum(1 for c in concept_table if 'wikidata' in c.get('source', '')) n_pm = sum(1 for c in concept_table if 'pubmed' in c.get('source', '')) st.success(f'Concept table: {len(concept_table)} entries ' f'(Wikidata: {n_wd}, PubMed: {n_pm})') else: concept_table = [ {'label': c['label'], 'full_text': c['label'], 'source': c.get('source', 'metadata_tfidf'), 'frequency': c.get('frequency', 0), 'tfidf_score': c.get('tfidf_score', 0.0), 'broader_relations': []} for c in candidates ] st.success(f'Concept table: {len(concept_table)} entries (local metadata — no external calls)') st.session_state.concept_table = concept_table # ── Step F.5: Fit shared vector space for TF-IDF fallback ───────── # CRITICAL: must encode variables + concepts in the SAME space for # N×M cosine similarity to be valid. No-op when SBERT is active. with st.spinner('Fitting shared embedding space...'): var_texts = can['_semantic_text'].fillna('').astype(str).tolist() concept_texts = [c['full_text'] for c in concept_table] emb.fit_joint(var_texts + concept_texts) # ── Step F.6: Pre-compute Statistic and Condition facets ───────── # [F3][F5][CAS] These columns are needed inside _cluster_and_label # for facet sub-splitting. They must be computed BEFORE Step G. # detect_facets / build_castanet_facets runs AFTER hierarchy build # (Step I), so we pre-compute only _facet_cond here. The statistic tier # is produced data-drivenly later by _nest_by_measure (no hardcoded vocab). with st.spinner('Pre-computing Condition facets [CAS]...'): can = precompute_stat_cond_facets(can) n_cond = can['_facet_cond'].ne('').sum() st.info(f'Facet pre-computation: {n_cond} variables with Condition. ' f'Statistic depth is derived from concept titles (_nest_by_measure).') # ── Step G: Build concept hierarchy (N×M alignment) ────────────── with st.spinner('Building concept hierarchy via N×M alignment [GON][TAX]...'): nodes = build_concept_hierarchy( can, emb, concept_table, project=project, n_clusters_per_group=n_clusters, ) # ── Step H: HiExpan refinement (always automatic) ───────────────── with st.spinner('Running HiExpan refinement [HIE]...'): if concept_table: c_embs = emb.encode([c['full_text'] for c in concept_table]) else: c_embs = None nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs) # HiExpan's width/global passes MOVE leaves between concepts; a concept # that loses all its leaves becomes empty. build_concept_hierarchy prunes # internally, but that runs BEFORE HiExpan — so re-prune here, else empty # nodes break the Plotly branchvalues='total' sunburst/treemap (parent # value < sum(children) → blank render; node-link is unaffected). _prune_empty_aggregations(nodes) _alive = {int(n['id']) for n in nodes} for _n in nodes: _n['related'] = [x for x in dict.fromkeys(int(c) for c in _n.get('related', [])) if x in _alive] st.session_state.hiexpan_report = report wmoves = report.get('width_expansion_moves', 0) dexp = report.get('depth_expansion_nodes', 0) gmoves = report.get('global_optimization_moves', 0) st.success(f'HiExpan complete — width moves: {wmoves}, ' f'depth expansions: {dexp}, global moves: {gmoves}') # ── Step I: Castanet facets ─────────────────────────────────────── with st.spinner('Building Castanet parallel facets [CAS]...'): facets = detect_facets(can, code_expansions=code_expansions) facet_trees = {f['name']: build_facet_hierarchy(can, f, project) for f in facets} st.session_state.facet_trees = facet_trees # ── Step J: Evaluation metrics ──────────────────────────────────── n_total = len(can) n_aligned = can['_concept_score'].gt(0.08).sum() n_lowconf = can['_concept_score'].between(0, 0.25, inclusive='right').sum() n_fallbk = can['_concept_score'].eq(0.0).sum() n_family = can['_code_family'].ne('').sum() avg_conf = float(can['_concept_score'].mean()) cov_pct = round(100 * n_aligned / max(n_total, 1), 1) st.session_state['eval_metrics'] = { 'total_variables': n_total, 'alignment_coverage_%': cov_pct, 'avg_label_confidence': round(avg_conf, 3), 'fallback_count': int(n_fallbk), 'fallback_rate_%': round(100 * n_fallbk / max(n_total, 1), 1), 'low_confidence_count': int(n_lowconf), 'variables_with_family': int(n_family), 'code_family_%': round(100 * n_family / max(n_total, 1), 1), 'concept_table_size': len(concept_table), 'wikidata_entries': sum(1 for c in concept_table if 'wikidata' in c.get('source', '')), 'pubmed_entries': sum(1 for c in concept_table if 'pubmed' in c.get('source', '')), 'acronym_expansions': len(code_expansions), 'hiexpan_width_moves': wmoves, 'hiexpan_depth_exp': dexp, 'hiexpan_global_moves': gmoves, } # ── Build concept-label provenance DataFrame (4th export) ────────── prov_rows_build = [] for _n in nodes: if _n.get('type') == 'aggregation' and _n.get('concept_provenance'): _p = _n['concept_provenance'] prov_rows_build.append({ 'Node': _n['name'], 'Confidence': _p.get('confidence', ''), 'Source': ', '.join(_p.get('source_evidence', [])), 'Embedding sim': _p.get('embedding_sim', ''), 'Alternatives': ', '.join(_p.get('alternatives', [])[:3]), }) prov_df = pd.DataFrame(prov_rows_build) if prov_rows_build else pd.DataFrame() st.session_state.canonical = can st.session_state.configs = configs st.session_state.nodes = nodes st.session_state['prov_df'] = prov_df n_l = len([n for n in nodes if n.get('type') == 'attribute']) n_i = len([n for n in nodes if n.get('type') == 'aggregation']) st.success(f'Built: {n_l} variables, {n_i} internal nodes, ' f'{len(facets)} facets — {cov_pct}% concept-aligned.') except Exception as e: st.error(f'Build failed: {e}') import traceback; st.code(traceback.format_exc()) if st.session_state.nodes is None: st.info('Upload a metadata file and click **Build Approach 1 hierarchy**.') st.stop() nodes = st.session_state.nodes can = st.session_state.canonical facet_trees = st.session_state.facet_trees or {} hiexpan_report = st.session_state.hiexpan_report or {} concept_table = st.session_state.concept_table or [] # ───────────────────────────────────────────────────────────────────────────── # TABS # ───────────────────────────────────────────────────────────────────────────── tabs = st.tabs(['🌳 LoD tree', '🔲 Faceted view', '🧬 HiExpan report', '⚠️ Conflicts', '✏️ Edit', '🔍 Search', '🗺️ Semantic map', '📋 Metadata', '⬇️ Export', 'ℹ️ Method', '📊 Evaluation']) # ── Tab 0: LoD tree ─────────────────────────────────────────────────────────── with tabs[0]: # ── Visualization controls (above chart — easy to find) ─────────────────── vc1, vc2, vc3, vc4 = st.columns([2, 2, 1, 1]) with vc1: viz_mode = st.radio('View mode', ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'], horizontal=True, index=0, help='Sunburst recommended for large hierarchies [Taxonomizer]. Node-link best for exploring structure at smaller depth.') with vc2: depth = st.slider('Depth (Level of Detail)', 1, 8, 3, 1) with vc3: show_leaf_labels = st.checkbox('Leaf labels', value=False) with vc4: show_hidden = st.checkbox('Hidden nodes', value=False) st.divider() if viz_mode == 'Sunburst (drill-down)': st.caption('Hover for concept provenance (confidence, source, alternatives). Click to drill down.') st.plotly_chart(plot_sunburst(nodes, depth), use_container_width=True) elif viz_mode == 'Treemap': st.plotly_chart(plot_treemap(nodes), use_container_width=True) else: st.plotly_chart(plot_node_link(nodes, depth, show_hidden, show_leaf_labels), use_container_width=True) pr = path_rows(nodes) max_d = max((r['depth'] for r in pr), default=0) c1, c2, c3 = st.columns(3) c1.metric('Variables', len([n for n in nodes if n.get('type') == 'attribute'])) c2.metric('Internal nodes', len([n for n in nodes if n.get('type') == 'aggregation'])) c3.metric('Max depth', max_d) emb_ = st.session_state.embedder if emb_: st.caption(f'Embedding backend: **{emb_.backend}** | Domain: **{st.session_state.domain or "unknown"}**') # Code expansion table code_exp = st.session_state.get('code_expansions', {}) if code_exp: with st.expander(f'Acronym / code segment expansions ({len(code_exp)} found)', expanded=False): exp_rows = [{'Segment': seg, 'Expansion': v['expansion'], 'Evidence': ', '.join(v['evidence'])} for seg, v in code_exp.items()] st.dataframe(pd.DataFrame(exp_rows), use_container_width=True) # Concept label provenance for internal nodes prov_rows = [] for n in nodes: if n.get('type') == 'aggregation' and n.get('concept_provenance'): p = n['concept_provenance'] prov_rows.append({ 'Node': n['name'], 'Confidence': p.get('confidence', ''), 'Source': ', '.join(p.get('source_evidence', [])), 'Embedding sim': p.get('embedding_sim', ''), 'Alternatives': ', '.join(p.get('alternatives', [])[:3]), }) if prov_rows: with st.expander('Concept label provenance for internal nodes', expanded=False): st.dataframe(pd.DataFrame(prov_rows), use_container_width=True) # ── Tab 1: Faceted view ─────────────────────────────────────────────────────── with tabs[1]: st.subheader('Castanet Parallel Faceted Hierarchies') st.markdown( '**[CAS]** Each sunburst organises the same variables by a different dimension. ' 'Concept facet uses automatically assigned labels from embedding alignment.' ) if facet_trees: st.plotly_chart(plot_facets_parallel(facet_trees), use_container_width=True) st.markdown('### Per-facet detail') sel_facet = st.selectbox('Inspect facet tree', list(facet_trees.keys())) ft = facet_trees[sel_facet] st.plotly_chart(plot_sunburst(ft, max_depth=3), use_container_width=True) n_groups = len([n for n in ft if n.get('type') == 'aggregation']) st.info(f'Facet **{sel_facet}**: {n_groups} groups, ' f'{len([n for n in ft if n.get("type")=="attribute"])} variables') else: st.info('Build the hierarchy first to see faceted views.') # ── Tab 2: HiExpan report ───────────────────────────────────────────────────── with tabs[2]: st.subheader('HiExpan Refinement Report') if hiexpan_report: c1, c2, c3 = st.columns(3) c1.metric('Width expansion moves', hiexpan_report.get('width_expansion_moves', 0)) c2.metric('Depth expansion nodes', hiexpan_report.get('depth_expansion_nodes', 0)) c3.metric('Global optimization moves', hiexpan_report.get('global_optimization_moves', 0)) st.markdown('### Sibling coherence — before refinement (worst first)') before = hiexpan_report.get('coherence_before', []) if before: st.dataframe(pd.DataFrame(before), use_container_width=True) st.markdown('### Sibling coherence — after refinement') after = hiexpan_report.get('coherence_after', []) if after: st.dataframe(pd.DataFrame(after), use_container_width=True) b_mean = np.mean([r['coherence_score'] for r in before]) if before else float('nan') a_mean = np.mean([r['coherence_score'] for r in after]) st.metric('Mean coherence improvement', f'{a_mean:.3f}', delta=f'{a_mean - b_mean:+.3f}') else: st.info('HiExpan runs automatically. Build the hierarchy to see results.') # ── Evaluation metrics ───────────────────────────────────────────────────── ev = st.session_state.get('eval_metrics', {}) if ev: st.markdown('---') st.subheader('Evaluation Metrics') st.markdown( 'These metrics help evaluate how well the automatic concept alignment worked. ' 'For thesis evaluation, compare against a manually curated hierarchy.' ) ea, eb, ec, ed = st.columns(4) ea.metric('Alignment coverage', f'{ev.get("alignment_coverage_%", 0)}%', help='% of variables with concept score > 0.08 (non-fallback)') eb.metric('Avg label confidence', f'{ev.get("avg_label_confidence", 0):.3f}', help='Mean concept score across all variables (0–1)') ec.metric('Low-confidence placements', ev.get('low_confidence_count', 0), help='Variables with concept score 0–0.25 (review in Conflicts tab)') ed.metric('Fallback rate', f'{ev.get("fallback_rate_%", 0)}%', help='% of variables that got score=0 (no concept matched above threshold)') e2a, e2b, e2c = st.columns(3) e2a.metric('Concept table size', ev.get('concept_table_size', 0)) e2b.metric('Variables w/ code family', f'{ev.get("code_family_%", 0)}%') e2c.metric('Acronym expansions', ev.get('acronym_expansions', 0)) st.caption( '**Thesis interpretation:** Alignment coverage > 70% indicates the concept table ' 'adequately covers the metadata domain. Fallback rate > 30% suggests the domain vocabulary ' 'is sparse — try a biomedical/cognitive dataset to activate Wikidata + PubMed enrichment. ' 'Low-confidence > 20% suggests HiExpan depth expansion created subclusters with ambiguous boundaries.' ) # ── Tab 3: Conflict resolution ──────────────────────────────────────────────── with tabs[3]: st.subheader('Conflict Resolution — Low-Confidence Placements') st.markdown( 'Variables with concept assignment confidence < 0.25 may belong to multiple groups. ' 'Review and use the **Edit** tab to move them if needed.' ) if can is not None: conflict_df = compute_conflict_table(can, nodes) if len(conflict_df): st.dataframe(conflict_df, use_container_width=True) else: st.success('No low-confidence placements detected.') else: st.info('Build the hierarchy first.') # ── Tab 4: Edit ─────────────────────────────────────────────────────────────── with tabs[4]: rows_ = path_rows(nodes); choice_to_id = {r['choice']: r['id'] for r in rows_} selected = st.selectbox('Select node', list(choice_to_id.keys())) sid = choice_to_id[selected]; node = get_node(nodes, sid) c1, c2 = st.columns(2) with c1: st.markdown('### Node properties') new_name = st.text_input('Name', value=node.get('name', ''), key=f'nm{sid}') new_desc = st.text_area('Description', value=node.get('desc', ''), key=f'ds{sid}', height=100) dtype_vals = ['root', 'number', 'string', 'determine', 'unknown'] cur = node.get('dtype', 'determine') new_dtype = st.selectbox('Data type', dtype_vals, index=dtype_vals.index(cur) if cur in dtype_vals else 3, key=f'dt{sid}') new_shown = st.checkbox('Shown', value=bool(node.get('isShown', True)), key=f'sh{sid}') if node.get('type') == 'aggregation': rel_opts = list(RELATION_TYPES.keys()) cur_rel = node.get('info', {}).get('relation_type', 'belongs_to') new_rel = st.selectbox('Relation type', rel_opts, index=rel_opts.index(cur_rel) if cur_rel in rel_opts else 0, format_func=lambda k: f'{k} — {RELATION_TYPES[k]}', key=f'rel{sid}') if node.get('concept_provenance'): prov = node['concept_provenance'] st.markdown('**Concept provenance:**') st.json(prov) if st.button('Save changes'): info = dict(node.get('info', {})) if node.get('type') == 'aggregation': info['relation_type'] = new_rel info['relation_label'] = RELATION_TYPES.get(new_rel, '') st.session_state.nodes = update_node(nodes, sid, name=new_name, desc=new_desc, dtype=new_dtype, isShown=new_shown, info=info) st.rerun() with c2: st.markdown('### Move / add / delete') if node.get('type') in ['root', 'aggregation']: with st.form('add_grp'): cname = st.text_input('New child name', value='New Group') crel = st.selectbox('Relation type', list(RELATION_TYPES.keys())) cdesc = st.text_area('Description', value='') if st.form_submit_button('Add child'): nid_ = next_id(nodes) nodes.append(make_agg(nid_, cname, desc=cdesc, relation_type=crel)) add_child(nodes, sid, nid_) st.session_state.nodes = nodes; st.rerun() pops = [o for o in agg_opts(nodes, True) if o['id'] != sid] if sid != 0 and pops: tgt = st.selectbox('Move under', [o['label'] for o in pops]) tgt_id = next(o['id'] for o in pops if o['label'] == tgt) if st.button('Move node'): st.session_state.nodes = move_node(nodes, sid, tgt_id); st.rerun() if node.get('type') == 'aggregation': rea = st.checkbox('Reattach children when deleting', value=True) if st.button('Delete aggregation'): st.session_state.nodes = delete_agg(nodes, sid, rea); st.rerun() st.markdown('### Children') cns = [get_node(nodes, c) for c in node.get('related', [])] st.dataframe(pd.DataFrame([{'id': c.get('id'), 'name': c.get('name'), 'type': c.get('type'), 'relation': c.get('info', {}).get('relation_label', ''), 'desc': str(c.get('desc', ''))[:120]} for c in cns if c]), use_container_width=True) # ── Tab 5: Search ───────────────────────────────────────────────────────────── with tabs[5]: q = st.text_input('Search name, description, relation, type') out_ = [] for n in nodes: hay = ' '.join([str(n.get(k, '')) for k in ['name', 'desc', 'dtype', 'type']] + [n.get('info', {}).get('relation_label', '')]).lower() if not q or q.lower() in hay: out_.append({'id': n.get('id'), 'name': n.get('name'), 'type': n.get('type'), 'relation': n.get('info', {}).get('relation_label', ''), 'n_children': len(n.get('related', [])), 'desc': str(n.get('desc', ''))[:200]}) st.dataframe(pd.DataFrame(out_), use_container_width=True) # ── Tab 6: Semantic map ─────────────────────────────────────────────────────── with tabs[6]: if can is None or len(can) < 3: st.info('Semantic map available after build.') else: st.plotly_chart(semantic_map(can), use_container_width=True) # ── Tab 7: Metadata ─────────────────────────────────────────────────────────── with tabs[7]: if can is None: st.info('Available after build.') else: show_cols = [c for c in can.columns if c != '_raw'] st.dataframe(can[show_cols], use_container_width=True) # ── Tab 8: Export ───────────────────────────────────────────────────────────── with tabs[8]: # Name downloads after the uploaded CSV (fall back to the project name). if uploads: _base = safe_name(Path(uploads[0].name).stem) else: _base = safe_name(project) col1, col2 = st.columns(2) with col1: st.download_button( 'Hierarchy JSON', data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'), file_name=f'{_base}_approach1_hierarchy.json', mime='application/json', use_container_width=True, ) with col2: if facet_trees: st.download_button( 'Facets JSON', data=json.dumps(facet_trees, indent=2, ensure_ascii=False).encode('utf-8'), file_name=f'{_base}_approach1_facets.json', mime='application/json', use_container_width=True, ) col3, col4 = st.columns(2) with col3: if can is not None: st.download_button( 'Canonical CSV', data=can.drop(columns=['_raw'], errors='ignore').to_csv(index=False).encode('utf-8'), file_name=f'{_base}_approach1_canonical.csv', mime='text/csv', use_container_width=True, ) with col4: _prov_df = st.session_state.get('prov_df', pd.DataFrame()) if not _prov_df.empty: st.download_button( 'Concept labels CSV', data=_prov_df.to_csv(index=False).encode('utf-8'), file_name=f'{_base}_approach1_concept_labels.csv', mime='text/csv', use_container_width=True, ) st.divider() # ── Save directly into the project's outputs/approach_1/ folder ──────────── _out_dir = Path(__file__).resolve().parent / 'outputs' / 'approach_1' st.markdown('### Save to project folder') st.caption( 'The download buttons above go to your browser’s Downloads folder (a browser ' f'restriction). This button instead writes the files into `{_out_dir}` with the ' 'dataset name — convenient for `evaluate_all.py`.' ) if st.button('💾 Save all to outputs/approach_1/', type='primary', use_container_width=True): try: _out_dir.mkdir(parents=True, exist_ok=True) saved = [] (_out_dir / f'{_base}_approach1_hierarchy.json').write_text( json.dumps(nodes, indent=2, ensure_ascii=False), encoding='utf-8') saved.append(f'{_base}_approach1_hierarchy.json') if facet_trees: (_out_dir / f'{_base}_approach1_facets.json').write_text( json.dumps(facet_trees, indent=2, ensure_ascii=False), encoding='utf-8') saved.append(f'{_base}_approach1_facets.json') if can is not None: can.drop(columns=['_raw'], errors='ignore').to_csv( _out_dir / f'{_base}_approach1_canonical.csv', index=False) saved.append(f'{_base}_approach1_canonical.csv') _prov_df2 = st.session_state.get('prov_df', pd.DataFrame()) if not _prov_df2.empty: _prov_df2.to_csv(_out_dir / f'{_base}_approach1_concept_labels.csv', index=False) saved.append(f'{_base}_approach1_concept_labels.csv') st.success(f'Saved to `{_out_dir}`:\n\n- ' + '\n- '.join(saved)) except Exception as _e: st.error(f'Could not save: {_e}') st.divider() st.markdown('### Hierarchy preview (first 5 nodes)') st.json(nodes[:5]) # ── Tab 9: Method ───────────────────────────────────────────────────────────── with tabs[9]: st.markdown(f""" ## Method — Approach 1 ### Algorithm (no hardcoded domain labels) | Step | What happens | Paper | |---|---|---| | 1. Canonical schema | Every metadata file → unified `_text` object | [GON] | | 2. Code family detection | Variable-code structural prefix clustering (DMSL*, SWMBE*) | [GON] | | 3. Domain detection | Auto-detect biomedical / cognitive / finance / environment / general | — | | 4. Candidate concept extraction | TF-IDF n-grams + noun phrases + group-path terms from **input data only** | [GON][TAX] | | 5. External concept table | Wikidata + WordNet + Wikipedia + PubMed + BioPortal → concept TABLE | [GON][TAX][HIE] | | 6. Concept embedding | SBERT encodes variables + concept table entries | [TAX][GON] | | 7. N×M cosine similarity | Variables × concepts: embedding + string + frequency + source score | [GON] | | 8. Concept label assignment | Best-scoring concept label + confidence + alternatives + provenance | [GON] | | 9. Hierarchy construction | Task/group-first backbone + automatic concept sub-groups | [TAX][HIE] | | 10. HiExpan refinement | Sibling coherence, width expansion, depth expansion (embedding-based), global opt | [HIE] | | 11. Castanet facets | Concept · Task · Code family · Data type parallel views | [CAS] | **Detected domain:** `{st.session_state.domain or 'not yet detected'}` **Concept table size:** `{len(concept_table)} entries` **WordNet available:** `{'yes' if _WORDNET_AVAILABLE else 'no — run: pip install nltk'}` ### Why no hardcoded patterns? The previous version used regex lists (`MEASURE_PATTERNS`, `STAT_PATTERNS`, `CONDITION_PATTERNS`) to label hierarchy nodes. These only worked for the AI-Mind neuropsychology dataset. This version discovers concept labels **automatically**: - Extracts candidate terms from **whatever metadata text the user provides** - Validates them against **universal external knowledge** (Wikidata, WordNet, Wikipedia) - Selects the best label by **embedding cosine similarity** — the Gonçalves N×M alignment step For an AI-Mind dataset, the pipeline will discover "correct latency", "standard deviation", "0 second delay" — because those phrases appear in the data. For a climate dataset, it will discover "temperature anomaly", "precipitation rate" — again, from the data. No domain vocabulary is assumed or hardcoded. ### Scoring formula (per cluster) [GON] ``` score(cluster, concept) = 0.50 × SBERT cosine similarity (cluster centroid ↔ concept embedding) + 0.20 × word overlap (cluster description words ∩ concept label words) + 0.15 × frequency (concept appears in N metadata rows / max frequency) + 0.10 × source confidence (Wikidata=0.88, WordNet=0.83, Wikipedia=0.78, ...) + 0.05 × hierarchy evidence (concept has P31/P279/P361 relations in Wikidata) ``` ### External sources | Source | Domain | What it provides | |---|---|---| | **Wikidata** (always) | Any | Structured descriptions, P31/P279/P361 broader relations | | **WordNet** (default) | Any | Definitions, hypernyms, synonyms | | **Wikipedia** (optional) | Any | Full text definitions | | **PubMed** (optional) | Biomedical/Cognitive | Abstract text for domain embeddings | | **BioPortal** (optional, key) | Biomedical | Ontology class labels and definitions | """) # ── Tab 10: Evaluation ───────────────────────────────────────────────────────── with tabs[10]: import hierarchy_eval as he st.subheader('Hierarchy Quality Evaluation') can_eval = st.session_state.get('canonical', pd.DataFrame()) nodes_eval = st.session_state.get('nodes', []) if can_eval.empty or not nodes_eval: st.info('Build a hierarchy first — metrics appear here after the build completes.') else: st.caption( 'The group column is a *construction input* (Gonçalves text object + concept ' 'alignment), so it cannot be ground truth. The primary metrics below are ' '**reference-free** — they assess the hierarchy itself, no gold standard.' ) with st.spinner('Computing reference-free metrics…'): tm = he.traco_metrics(nodes_eval) npmi = he.npmi_coherence(nodes_eval, can_eval['_text'].tolist()) # ── PRIMARY: reference-free hierarchy quality ───────────────────────── st.markdown('#### Primary — reference-free hierarchy quality') p1, p2, p3 = st.columns(3) p1.metric('Parent–child coherence', tm['pc_coherence'], help='TraCo (Wu et al., AAAI 2024). Children correctly nest under parent theme.') p2.metric('Sibling diversity', tm['sibling_diversity'], help='TraCo (Wu et al., AAAI 2024). Higher = distinct siblings; LOW = redundant.') p3.metric('NPMI label coherence', npmi, help='Lau et al., EACL 2014. Label terms genuinely co-occur in the data.') st.caption(f'Embedding backend: **{tm["encoder"]}**.') # ── Approach-1-specific alignment metrics ───────────────────────────── em = st.session_state.get('eval_metrics', {}) if em: st.markdown('#### Concept-alignment metrics (Approach-1 specific)') a1, a2, a3 = st.columns(3) a1.metric('Alignment coverage', f"{em.get('alignment_coverage_pct', 0):.1f}%", help='% of variables assigned an external concept label') a2.metric('Avg label confidence', f"{em.get('avg_label_confidence', 0):.3f}", help='Mean concept-alignment cosine score across all variables') a3.metric('Fallback rate', f"{em.get('fallback_rate_pct', 0):.1f}%", help='% of variables that used TF-IDF fallback instead of external concept') # ── Structural statistics ───────────────────────────────────────────── st.markdown('#### Structural statistics') sm = he.structural_stats(nodes_eval) s1, s2, s3, s4, s5 = st.columns(5) s1.metric('Aggregation nodes', sm['n_aggregation_nodes']) s2.metric('Max leaf depth', sm['max_depth']) s3.metric('Avg leaf depth', sm['avg_leaf_depth']) s4.metric('Avg branching', sm['avg_branching_factor']) s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%") # ── SECONDARY: group preservation (caveated) ────────────────────────── st.markdown('#### Secondary — group-structure preservation *(descriptive)*') st.caption( '⚠️ The group column was an **input** to construction, so these are NOT accuracy ' 'metrics — only how much the hierarchy still reflects the pre-existing group column.' ) gp = he.group_preservation(nodes_eval, can_eval) g1, g2, g3 = st.columns(3) g1.metric('NMI', gp['NMI']); g2.metric('ARI', gp['ARI']); g3.metric('Purity', gp['Purity'])