RoophaSharon
Navigation router (branding + Demo View + collapsible Build hierarchy); full-range LoD slider (1-9, default 7); replace deprecated use_container_width
ef5da89 | # approach 1.py β Automatic Metadata Hierarchy Builder β Approach 1 | |
| # | |
# Algorithm (no hardcoded domain-specific labels):
# 1. Read metadata → detect roles → build canonical schema
# 2. Extract candidate concepts automatically from metadata text
# 3. Detect domain → select external sources
# 4. Retrieve concept TABLE from external sources (Wikidata, Wikipedia, WordNet, BioPortal)
# 5. Embed variables + concept table (SBERT or TF-IDF fallback)
# 6. Compute N×M cosine similarity matrix [GON] — variables × concepts
# 7. Score concept assignment: embedding + string + frequency + source + hierarchy
# 8. Build task/group-first hierarchy using automatically assigned concept labels
# 9. HiExpan refinement: sibling coherence, width expansion, depth expansion, global opt
# 10. VIANNA LoD tree + Castanet parallel facets
# 11. Export with label provenance
| # | |
# Papers:
# [GON] Gonçalves et al. — biomedical metadata alignment via N×M concept similarity matrix
# [TAX] Taxonomizer (Sultanum et al.) — leaf=attribute, internal node=abstract group
# [HIE] HiExpan (Shen et al.) — width/depth expansion, sibling coherence, global opt
# [CAS] Castanet — parallel faceted hierarchies over the same variable set
| from __future__ import annotations | |
| import csv, json, re, time, warnings | |
| from collections import defaultdict | |
| from difflib import SequenceMatcher | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| import requests | |
| import streamlit as st | |
| from sklearn.cluster import AgglomerativeClustering | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.decomposition import TruncatedSVD | |
| from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score | |
| from sklearn.metrics.pairwise import cosine_distances, cosine_similarity | |
| from sklearn.preprocessing import LabelEncoder | |
| try: | |
| from sentence_transformers import SentenceTransformer | |
| _ST_AVAILABLE = True | |
| except ImportError: | |
| _ST_AVAILABLE = False | |
| try: | |
| import nltk | |
| for _pkg in ('wordnet', 'omw-1.4'): | |
| try: | |
| nltk.data.find(f'corpora/{_pkg}') | |
| except LookupError: | |
| nltk.download(_pkg, quiet=True) | |
| from nltk.corpus import wordnet as wn | |
| _WORDNET_AVAILABLE = True | |
| except Exception: | |
| _WORDNET_AVAILABLE = False | |
# Suppress every Python warning for the rest of the process.
warnings.filterwarnings('ignore')
# set_page_config handled by the navigation router (demo.py)
# NOTE(review): the 'β' inside the title string below looks like a mis-encoded dash.
# It is user-facing text, so confirm against the original file before changing it.
st.title('Metadata Hierarchy Builder β Approach 1')
# Sub-title shown under the page title.
st.caption(
    'Automatic concept-label extraction from metadata text + HiExpan refinement + Castanet facets. '
    'External enrichment (Wikidata / Wikipedia / PubMed) activates automatically for biomedical, '
    'cognitive, and neurological domains.'
)
# ─────────────────────────────────────────────────────────────────────────────
# CONSTANTS
# ─────────────────────────────────────────────────────────────────────────────
# Keyword lists for column-role scoring. kscore() counts how many of these keys
# occur as SUBSTRINGS of a normalised column name; profile_columns() weights the counts.
LEAF_KEYS = 'variable var field column attribute name code id item indicator question measure concept'.split()
GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split()
TEXT_KEYS = 'description definition desc label title question meaning note notes text display full details explanation comment'.split()
META_KEYS = 'type dtype data_type datatype unit units format decimal precision values value coding codebook range min max scale'.split()
# Relation identifier → human-readable phrase.
RELATION_TYPES = {
    'has_measure': 'has measure type',
    'is_statistic_of': 'is statistic of',
    'has_condition': 'has condition',
    'part_of': 'part of',  # Wikidata P361
    'instance_of': 'instance of',  # Wikidata P31
    'subclass_of': 'subclass of',  # Wikidata P279
    'belongs_to': 'belongs to',
    'related_to': 'semantically related to',
}
# Source confidence weights for concept scoring [GON]
SOURCE_CONFIDENCE = {
    'group_path': 0.95,
    'description_title': 0.91,  # [FIX4][TAX][LOB] Text before first colon in description — highly discriminative
    'bioportal': 0.92,
    'cognitive_atlas': 0.94,  # [C5] Cognitive Atlas — domain-specific for CANTAB/cognitive, above Wikidata
    'wikidata': 0.88,
    'wordnet': 0.83,
    'pubmed': 0.82,
    'wikipedia': 0.78,
    'metadata_tfidf': 0.65,
    'noun_phrase': 0.55,
}
# English stop words (standard, domain-agnostic)
# NOTE(review): the last entries ('number', 'value', 'values', 'score') are not
# conventional stop words; they look like data-dictionary filler — confirm intent.
_STOP = {
    'the','a','an','is','are','was','were','be','been','being','have','has','had',
    'do','does','did','will','would','shall','should','may','might','must','can',
    'could','of','in','on','at','to','for','with','by','from','about','as','into',
    'through','during','before','after','above','below','between','each','all',
    'both','few','more','most','other','some','such','no','nor','not','only',
    'same','so','than','too','very','just','this','that','these','those','which',
    'who','when','where','why','how','what','and','but','or','if','then','because',
    'while','although','however','therefore','thus','hence','also','well','used',
    'using','use','based','given','defined','number','value','values','score',
}
# ─── KeyBERT / labelling configuration ───────────────────────────────────────
# These tune the KeyBERT label synthesizer used in the hybrid scorer.
#
# USE_NOUN_PHRASES — True: candidate phrases are NLTK POS-tagged noun phrases
# (needs the 'averaged_perceptron_tagger' corpus); False: plain n-gram candidates
# from tokens. False is robust for short CANTAB/AI-MIND descriptions and avoids the
# extra NLTK dependency.
USE_NOUN_PHRASES = False
# USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
# boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
USE_CTFIDF = True
# KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
# (pick the single most relevant phrase); 0.5 = standard MMR diversification.
KEYBERT_DIVERSITY = 0
# ─── Title-SEEDED KeyBERT label-scorer weights ───────────────────────────────
# Concept labels are FORMED FROM THE DESCRIPTIONS (KeyBERT candidate phrases over the
# cluster's member descriptions). The pre-colon title is a ranking SEED/anchor, not the
# label itself: LABEL_W_TITLE controls how strongly it biases the choice toward the
# human-canonical phrasing (this is "Guided/Seeded KeyBERT"). Set LABEL_W_TITLE=0 for a
# pure-description ablation. Magnitudes are relative (need not sum to 1).
LABEL_W_RELEVANCE = 0.45  # cosine(candidate, cluster centroid) — description fit (α)
LABEL_W_TITLE = 0.35  # cosine(candidate, pre-colon title) — title influence (β)
LABEL_W_CONTRAST = 0.15  # discriminativeness vs sibling clusters (γ)
# NOTE: node labels are formed from DESCRIPTIONS + pre-colon TITLE only. External
# ontology sources (Cognitive Atlas / Wikidata / WordNet / PubMed) inform the embedding
# space / semantic understanding but are never used to name a node — so there is no
# external-grounding term in the label score.
# Corpus IDF over description n-grams; populated in build_concept_hierarchy() and
# consumed by _keybert_label when USE_CTFIDF=True.
_CORPUS_IDF: dict = {}
# Active dataset domain; set in build_concept_hierarchy(), read by the hybrid label
# scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing).
# NOTE(review): the NOTE above says the label score has no external-grounding term,
# which contradicts this description — confirm whether this global is still read.
_ACTIVE_DOMAIN: str = 'general'
# Label boilerplate: web/URL artifacts and Likert response-scale tokens that leak from
# data-dictionary descriptions (e.g. HCP FreeSurfer rows embed Neurolex URLs; survey rows
# embed "strongly agree" scales). These are stripped from KeyBERT candidates AND from the
# embedding text so they can neither name a node nor distort clustering. Domain-agnostic
# documentation/scale tokens only — not concept vocabulary.
_LABEL_BOILERPLATE = {
    'http', 'https', 'href', 'www', 'org', 'com', 'net', 'wiki', 'url', 'link',
    'neurolex', 'connectomedb', 'humanconnectome', 'definition', 'category',
    'sa', 'sd', 'strongly', 'agree', 'disagree', 'neither', 'somewhat',
}
# Inline URLs in free text (http://…, www.…/…) — removed from the embedding text.
# Three alternatives: scheme-prefixed URLs, www-prefixed hosts, and bare hosts ending in
# .org/.com/.net/.gov/.edu with an optional path.
_URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
                     re.IGNORECASE)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # FILE LOADING | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def safe_name(name):
    """Return a filesystem-safe version of an uploaded file name.

    Every character that is not alphanumeric and not one of ``-``, ``_``, ``.``
    is replaced by ``_``.

    Names that end up empty or made only of dots (``''``, ``'.'``, ``'..'``) would
    resolve to a directory once joined to the upload folder, so their dots are
    replaced by underscores as well and an empty result becomes ``'_'``.
    """
    cleaned = ''.join(ch if ch.isalnum() or ch in '-_.' else '_' for ch in str(name))
    if not cleaned.strip('.'):
        # '', '.', '..' etc. are not usable as a file name inside a directory.
        return cleaned.replace('.', '_') or '_'
    return cleaned
def try_read_csv(path):
    """Read a delimited text file by trying several encodings and separators.

    Every (encoding, separator) pair is attempted with the pandas python engine and
    the parse with the highest score is kept. The score is
    ``10 * number_of_columns - mean fraction of missing cells``, so more columns win
    and missing values break ties. Ties keep the earliest attempt (strict ``>``).

    If the winning parse looks like a whole CSV row was swallowed into the first
    column (most first-column values contain a comma and the remaining columns are
    mostly empty), the file is re-read line by line and split manually.

    Args:
        path: ``pathlib.Path``-like object (``.name`` and ``.read_text`` are used).

    Returns:
        A ``pandas.DataFrame`` with stripped, semicolon-free column names.

    Raises:
        ValueError: if no (encoding, separator) combination could be parsed.
    """
    best, best_score = None, -1
    for enc in ['utf-8-sig', 'utf-8', 'latin1']:
        # sep=None lets the python engine sniff the delimiter.
        for sep in [None, ',', '\t', ';', '|']:
            try:
                df = pd.read_csv(path, sep=sep, engine='python', encoding=enc)
                score = df.shape[1] * 10 - float(df.isna().mean().mean())
                if score > best_score:
                    best, best_score = df, score
            except Exception:
                # Best-effort probing: a failed combination is simply skipped.
                pass
    if best is None:
        raise ValueError(f'Could not read {path.name}')
    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
    if len(best) > 0:
        first = best.iloc[:, 0].astype(str)
        # With a single column there are no "other" columns; treat them as fully empty.
        other_null = best.iloc[:, 1:].isna().mean().mean() if best.shape[1] > 1 else 1.0
        # Heuristic for "each row was wrapped into one cell": commas in >50% of the
        # first-column values while the other columns are >70% missing.
        if first.str.contains(',').mean() > 0.50 and other_null > 0.70:
            lines = path.read_text(encoding='utf-8-sig', errors='replace').splitlines()
            if lines:
                header = [h.strip().replace(';', '') for h in lines[0].split(',')]
                rows = []
                for line in lines[1:]:
                    line = line.strip().rstrip(';')
                    if not line:
                        continue
                    # Drop one pair of quotes wrapping the entire line.
                    if line.startswith('"') and line.endswith('"'):
                        line = line[1:-1]
                    try:
                        parts = next(csv.reader([line], quotechar='"'))
                    except Exception:
                        continue
                    # Rows shorter than the header are dropped; longer ones are truncated.
                    if len(parts) >= len(header):
                        rows.append(parts[:len(header)])
                if rows:
                    best = pd.DataFrame(rows, columns=header)
                    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
    return best
def load_any(path):
    """Load a metadata file into a DataFrame, dispatching on the file suffix.

    Supported suffixes:
      * ``.csv`` / ``.tsv`` / ``.txt`` — delegated to ``try_read_csv``.
      * ``.xlsx`` / ``.xls`` — ``pandas.read_excel``.
      * ``.json`` — a top-level list of records, or a dict whose first list-valued
        entry holds the records.
      * ``.md`` / ``.markdown`` — the pipe-table rows found in the file.

    Args:
        path: ``pathlib.Path`` of the file to read.

    Returns:
        A ``pandas.DataFrame``.

    Raises:
        ValueError: unsupported suffix, JSON without a list of records, or a
            Markdown file without a table.
    """
    suffix = path.suffix.lower()
    if suffix in ['.csv', '.tsv', '.txt']:
        return try_read_csv(path)
    if suffix in ['.xlsx', '.xls']:
        return pd.read_excel(path)
    if suffix == '.json':
        obj = json.loads(path.read_text(encoding='utf-8', errors='replace'))
        if isinstance(obj, list):
            return pd.json_normalize(obj)
        if isinstance(obj, dict):
            for v in obj.values():
                if isinstance(v, list):
                    return pd.json_normalize(v)
        # Previously this fell through and reported "Unsupported: .json", which hid
        # the real problem (the JSON parsed fine but holds no record list).
        raise ValueError(f'No list of records found in JSON file {path.name}')
    if suffix in ['.md', '.markdown']:
        rows = []
        for ln in path.read_text(encoding='utf-8', errors='replace').splitlines():
            if '|' not in ln:
                continue
            cells = [c.strip() for c in ln.strip().strip('|').split('|')]
            # Skip the header separator row (---, :---, ---:, :---:).
            if cells and not all(re.fullmatch(r':?-{2,}:?', c or '') for c in cells):
                rows.append(cells)
        if len(rows) < 2:
            raise ValueError('No Markdown table found')
        header = rows[0]
        # Truncate long rows and pad short rows so every row matches the header width.
        data = [r[:len(header)] + [''] * max(0, len(header) - len(r)) for r in rows[1:]]
        return pd.DataFrame(data, columns=header)
    raise ValueError(f'Unsupported: {suffix}')
def probably_raw(df):
    """Guess whether ``df`` is a raw data table rather than a data dictionary.

    True only when the frame has more than 20 columns and no column name
    (lower-cased) contains any of the ``TEXT_KEYS`` substrings.
    """
    if df.shape[1] <= 20:
        return False
    for column in df.columns:
        lowered = str(column).lower()
        for key in TEXT_KEYS:
            if key in lowered:
                # A description-like column means this is already metadata.
                return False
    return True
def raw_to_metadata(df):
    """Synthesise a one-row-per-column metadata table from a raw data frame.

    Each output row has ``name`` (column name), ``dtype`` (``'number'`` for numeric
    columns, otherwise ``'string'``) and ``description`` (the dtype plus up to five
    distinct non-null sample values).
    """
    def _describe(column):
        series = df[column]
        kind = 'number' if pd.api.types.is_numeric_dtype(series) else 'string'
        distinct = series.dropna().astype(str).unique()[:5]
        preview = ', '.join(str(value) for value in distinct)
        return {
            'name': str(column),
            'description': f'Column dtype:{kind}. Values:{preview}',
            'dtype': kind,
        }

    return pd.DataFrame([_describe(column) for column in df.columns])
| import tempfile | |
def save_uploads(files):
    """Write uploaded file objects to a fresh temporary directory.

    Args:
        files: iterable of objects exposing ``.name`` and ``.getbuffer()``.

    Returns:
        List of ``pathlib.Path`` objects, one per upload, in input order.

    Two uploads whose sanitised names coincide (e.g. ``'a b.csv'`` and ``'a_b.csv'``)
    used to overwrite each other; later duplicates now get a ``__<n>`` suffix before
    the extension so every upload keeps its own file.
    """
    tmp = Path(tempfile.mkdtemp(prefix='meta_app1_'))
    paths = []
    used_names = set()
    for f in files:
        fname = safe_name(f.name)
        target = tmp / fname
        attempt = 1
        while target.name in used_names:
            attempt += 1
            base = Path(fname)
            target = tmp / f'{base.stem}__{attempt}{base.suffix}'
        used_names.add(target.name)
        target.write_bytes(f.getbuffer())
        paths.append(target)
    return paths
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # ROLE DETECTION | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def norm(c):
    """Normalise a column name to lower-case snake form.

    The value is stringified, trimmed and lower-cased; every run of characters
    outside ``a-z0-9`` becomes a single underscore, and leading/trailing
    underscores are removed.
    """
    lowered = str(c).strip().lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered)
    return collapsed.strip('_')
def kscore(c, keys):
    """Count how many entries of ``keys`` occur as substrings of the normalised name ``c``."""
    normalised = norm(c)
    matches = [key for key in keys if key in normalised]
    return len(matches)
def profile_columns(df):
    """Profile every column of ``df`` and score it for each metadata role.

    Returns a DataFrame with one row per column holding basic statistics
    (``non_null``, ``unique_values``, ``unique_ratio``, ``avg_length``) and four
    heuristic scores (``leaf_score``, ``group_score``, ``text_score``,
    ``metadata_score``) built from keyword hits plus cardinality/length bonuses.
    """
    total_rows = max(len(df), 1)  # avoid division by zero on empty frames
    group_cap = min(total_rows * 0.5, 80)
    meta_cap = min(total_rows * 0.8, 100)
    records = []
    for col in df.columns:
        series = df[col]
        fill_rate = float(series.notna().mean())
        distinct = int(series.nunique(dropna=True))
        uniq_ratio = distinct / total_rows
        if series.notna().any():
            mean_len = float(series.dropna().astype(str).map(len).mean())
        else:
            mean_len = 0

        # Leaf: identifier-like name, mostly unique values, short strings.
        leaf = 4 * kscore(col, LEAF_KEYS)
        leaf += 3 if 0.5 <= uniq_ratio <= 1 else 0
        leaf += 1 if mean_len < 80 else 0

        # Group: grouping-like name, low-but-not-trivial cardinality, short strings.
        group = 4 * kscore(col, GROUP_KEYS)
        group += 3 if 1 < distinct < group_cap else 0
        group += 1 if mean_len < 60 else 0

        # Text: description-like name, long strings, mostly populated.
        text = 5 * kscore(col, TEXT_KEYS)
        text += 4 if mean_len > 50 else 0
        text += 1 if fill_rate > 0.5 else 0

        # Metadata: representation-like name, bounded cardinality.
        meta = 4 * kscore(col, META_KEYS)
        meta += 2 if 1 < distinct < meta_cap else 0

        records.append({
            'column': str(col),
            'non_null': round(fill_rate, 3),
            'unique_values': distinct,
            'unique_ratio': round(uniq_ratio, 3),
            'avg_length': round(mean_len, 1),
            'leaf_score': leaf,
            'group_score': group,
            'text_score': text,
            'metadata_score': meta,
        })
    return pd.DataFrame(records)
def detect_roles(df):
    """Pick leaf / group / text / metadata columns from the column profile.

    Returns:
        ``(roles, profile)`` where ``roles`` is a dict with the keys ``leaf_cols``,
        ``group_cols``, ``text_cols`` and ``metadata_cols`` (lists of column names)
        and ``profile`` is the DataFrame produced by ``profile_columns``.
    """
    prof = profile_columns(df)

    # Leaf: the single best column by leaf score, ties broken by uniqueness.
    by_leaf = prof.sort_values(['leaf_score', 'unique_ratio'], ascending=False)
    leaf = by_leaf.head(1)['column'].tolist()

    # Text: description-like or long columns; fall back to the leaf column.
    text_mask = (prof['text_score'] >= 4) | (prof['avg_length'] > 80)
    text = prof[text_mask].sort_values('text_score', ascending=False)['column'].tolist()
    if not text:
        text = leaf.copy()

    # Group: up to three non-leaf, non-constant columns with a grouping score.
    group_mask = (
        (prof['group_score'] >= 4)
        & (~prof['column'].isin(leaf))
        & (prof['unique_values'] > 1)
    )
    group = prof[group_mask].sort_values('group_score', ascending=False)['column'].head(3).tolist()

    # Metadata: up to five columns not already used by another role.
    meta_mask = (prof['metadata_score'] >= 4) & (~prof['column'].isin(text + leaf + group))
    meta = prof[meta_mask].sort_values('metadata_score', ascending=False)['column'].head(5).tolist()

    # DDI/CDISC: representation columns must never become structural hierarchy levels [GON][TAX]
    # These substrings identify physical metadata — universally, across any domain.
    # NOTE(review): matching is by substring on the squashed name, so 'scale' also
    # catches 'subscale' (a GROUP_KEYS entry) and 'type' catches e.g. 'phenotype' —
    # confirm that is intended.
    blocked_fragments = {
        'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
        'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
    }

    def _is_representation(col_name):
        squashed = re.sub(r'[^a-z0-9]', '', str(col_name).lower())
        for fragment in blocked_fragments:
            if fragment in squashed:
                return True
        return False

    # Force representation columns out of group and into metadata
    extra_meta = []
    for candidate in prof['column'].tolist():
        if not _is_representation(candidate):
            continue
        if candidate in text or candidate in leaf or candidate in meta:
            continue
        extra_meta.append(candidate)
    group = [g for g in group if not _is_representation(g)]
    meta = list(dict.fromkeys(meta + extra_meta))[:8]

    roles = {
        'leaf_cols': leaf,
        'group_cols': group,
        'text_cols': text,
        'metadata_cols': meta,
    }
    return roles, prof
def sv(x):
    """Return ``x`` as a stripped string, or ``''`` when ``x`` is a missing scalar.

    ``pd.isna`` returns an array for list-like cells (possible after
    ``json_normalize`` keeps nested lists), and using that array in a boolean
    context raised ``ValueError``. Only a scalar missing-value result now maps to
    ``''``; list-like values are stringified as-is.
    """
    missing = pd.isna(x)
    # Scalars give a bool / numpy bool (ndim 0); list-likes give an ndarray (ndim >= 1).
    if getattr(missing, 'ndim', 0) == 0 and bool(missing):
        return ''
    return str(x).strip()
def guess_dtype(row, dtype_cols, label):
    """Guess a coarse dtype (``'number'``, ``'string'`` or ``'determine'``) for a variable.

    The values of ``dtype_cols`` in ``row`` are joined and searched for numeric
    hints first, then textual hints. If neither matches, a label that mentions a
    typically categorical field (name, country, gender, ...) yields ``'string'``;
    otherwise the result is ``'determine'``.
    """
    declared = ' '.join(sv(row.get(col, '')) for col in dtype_cols).lower()

    numeric_hints = ('num', 'int', 'float', 'double', 'decimal', 'continuous', 'number')
    for hint in numeric_hints:
        if hint in declared:
            return 'number'

    textual_hints = ('string', 'text', 'char', 'category', 'categorical', 'nominal')
    for hint in textual_hints:
        if hint in declared:
            return 'string'

    if re.search(r'(name|country|gender|sex|site|visit|status)', label.lower()):
        return 'string'
    return 'determine'
def build_canonical(df, cfg, source):
    """[GON] Build unified metadata text object from any tabular metadata file.

    Args:
        df: metadata table, one row per variable.
        cfg: role configuration with optional keys ``leaf_cols``, ``group_cols``,
            ``text_cols`` and ``metadata_cols`` (lists of column names).
        source: value stored in ``_source_file`` for every row.

    Returns:
        DataFrame with the canonical ``_``-prefixed columns. ``_leaf_id`` values
        are made unique by appending ``__<n>`` to repeats. An empty ``df`` yields
        an empty frame that still has all canonical columns.

    Raises:
        ValueError: if ``cfg`` provides no leaf column.
    """
    leaf_cols = cfg.get('leaf_cols', [])
    group_cols = cfg.get('group_cols', [])
    text_cols = cfg.get('text_cols', [])
    meta_cols = cfg.get('metadata_cols', [])
    if not leaf_cols:
        raise ValueError('Choose at least one leaf column')

    canonical_columns = [
        '_source_file', '_row_index', '_leaf_label', '_leaf_id', '_group_path',
        '_text', '_semantic_text', '_dtype', '_raw', '_concept_label',
        '_concept_score', '_concept_source', '_code_family',
    ]
    # Order-preserving de-duplication of every role column; loop-invariant.
    text_fields = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))

    rows = []
    for position, (i, row) in enumerate(df.iterrows()):
        # The index label is not guaranteed to be an integer (e.g. when the reader
        # promoted the first column to the index); fall back to the row position.
        try:
            row_index = int(i)
        except (TypeError, ValueError):
            row_index = position

        leaf_parts = [sv(row.get(c, '')) for c in leaf_cols]
        leaf_parts = [p for p in leaf_parts if p]
        label = ' / '.join(leaf_parts) if leaf_parts else f'variable_{row_index + 1}'

        group_parts = [sv(row.get(c, '')) for c in group_cols]
        group_parts = [p for p in group_parts if p and p.lower() not in ['nan', 'none']]
        gpath = ' > '.join(group_parts) if group_parts else 'Ungrouped'

        parts = []
        for c in text_fields:
            v = sv(row.get(c, ''))
            if v:
                parts.append(f'{c}: {v}')
        text = ' | '.join(parts) if parts else label

        # _semantic_text: description values only, no "fieldname: " prefixes [TAX][GON]
        # TAX embeds description text, not the full metadata row.
        # This is the input to SBERT and TF-IDF candidate extraction.
        sem_parts = []
        for c in text_cols:
            v = sv(row.get(c, ''))
            if v:
                sem_parts.append(v)
        if not sem_parts:
            sem_parts = list(leaf_parts) if leaf_parts else []
        semantic_text = ' '.join(sem_parts) if sem_parts else text
        # Strip inline URLs (HCP FreeSurfer rows embed Neurolex links) so web tokens
        # cannot dominate either the embedding (clustering) or the KeyBERT label.
        semantic_text = _URL_RE.sub(' ', semantic_text)

        rows.append({
            '_source_file': source,
            '_row_index': row_index,
            '_leaf_label': label,
            '_leaf_id': f'{gpath}.{label}' if gpath != 'Ungrouped' else label,
            '_group_path': gpath,
            '_text': text,
            '_semantic_text': semantic_text,
            '_dtype': guess_dtype(row, meta_cols, label),
            '_raw': row.to_dict(),
            '_concept_label': '',
            '_concept_score': 0.0,
            '_concept_source': '',
            '_code_family': '',
        })

    # Passing the column list keeps the schema even when there are no rows, so the
    # '_leaf_id' lookup below cannot fail on an empty input.
    can = pd.DataFrame(rows, columns=canonical_columns)
    if can['_leaf_id'].duplicated().any():
        cnt = defaultdict(int)
        ids = []
        for lid in can['_leaf_id']:
            cnt[lid] += 1
            # First occurrence keeps its id; repeats get __2, __3, ...
            ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
        can['_leaf_id'] = ids
    return can
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # [F3] EARLY FACET PRE-COMPUTATION [CAS] | |
| # Castanet: parallel facets (Statistic, Condition) are orthogonal split dimensions. | |
| # These must be available BEFORE build_concept_hierarchy so _cluster_and_label | |
| # can use them for sub-splitting. detect_facets/build_castanet_facets is called | |
| # AFTER the hierarchy build, which is too late β so we compute them here first. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def precompute_stat_cond_facets(can):
    """
    Pre-compute _facet_cond on can (numeric experimental conditions only).
    Called before build_concept_hierarchy so that _cluster_and_label can use it to
    insert Condition sub-tiers.

    NOTE: the statistic tier (Mean / Median / SD / …) is NO LONGER computed here.
    It used to come from a hardcoded statistic vocabulary regex, which (a) is domain
    hardcoding and (b) is not derived from the data's own concept titles. Statistic
    depth is now produced data-drivenly by _nest_by_measure(), which discovers the
    shared measure phrase and keeps the residual (Mean/Median/SD) as children — no
    word list. Condition detection below stays: it is structural (a digit in the
    code validated against the description text), not a hardcoded vocabulary.
    [CAS] Castanet parallel facets · [HIE] HiExpan sub-set discovery

    Args:
        can: canonical frame with a ``_leaf_label`` column and, preferably,
            ``_semantic_text`` (``_text`` is used otherwise).

    Returns:
        A copy of ``can`` with a string column ``_facet_cond``: the first digit run
        of the variable code that also appears as a whole word in the description,
        or ``''`` when there is none. The input frame is not modified.
    """
    can = can.copy()
    sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'
    # ── Condition: digit in variable code VALIDATED by description text ──────────
    # [FIX2][GON] Gonçalves et al. (ESWC 2019): structural code alignment must be
    # validated against description text — the description is the authoritative source.
    # Previous rule: any digit in the code = condition value → caused false labels
    # like "468" (from SWMBE468) and HCP numeric suffixes that are not conditions.
    # New rule: a digit is accepted as a condition only if it ALSO appears as a
    # standalone token in the variable's description text, confirming it is a real
    # experimental parameter (delay, boxes, items, etc.).
    num_re = re.compile(r'(\d+)')

    def _extract_cond(label, description):
        code = str(label).split('/')[0].strip()
        desc_text = str(description).lower()
        for digit in num_re.findall(code):
            # Accept digit only if it appears as a whole word in the description
            if re.search(r'\b' + re.escape(digit) + r'\b', desc_text):
                return digit
        return ''

    labels = can['_leaf_label'].tolist()
    if sem_col in can.columns:
        descriptions = can[sem_col].tolist()
    else:
        descriptions = [''] * len(labels)
    # A plain list assignment replaces the former row-wise apply (which does not
    # reliably yield a Series on an empty frame) and the no-op ``.where`` call.
    can['_facet_cond'] = [_extract_cond(lab, desc) for lab, desc in zip(labels, descriptions)]
    return can
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # SEMANTIC EMBEDDER [TAX][GON] | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
class SemanticEmbedder:
    """SBERT with TF-IDF+SVD fallback. [TAX] Word2Vec→SBERT; [GON] GloVe→SBERT.

    Critical fix: in TF-IDF mode, a single vectorizer+SVD is fit jointly on ALL
    texts (variables + concept entries) so both live in the same vector space.
    Without this, N×M cosine similarity between separately-fit spaces is meaningless.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Store the model name; nothing is loaded until ``load()`` is called."""
        self.model_name = model_name
        self._model = None
        self._using_st = False
        self._joint_vec = None  # shared TF-IDF vectorizer (fallback mode only)
        self._joint_svd = None  # shared SVD (fallback mode only)
        self._joint_dim = 64  # upper bound on SVD components in fallback mode

    def load(self):
        """Try to load the sentence-transformers model.

        Returns:
            ``(ok, message)``. ``ok`` is True only when the SBERT model was loaded;
            on failure or when the package is missing the embedder stays in
            TF-IDF fallback mode and ``message`` explains why.
        """
        if _ST_AVAILABLE:
            try:
                self._model = SentenceTransformer(self.model_name)
                self._using_st = True
                return True, f'Loaded {self.model_name} (SBERT)'
            except Exception as e:
                return False, f'sentence-transformers failed: {e}'
        # NOTE(review): the 'β' in the message below looks like a mis-encoded dash.
        return False, 'sentence-transformers not installed β using TF-IDF+SVD fallback'

    def fit_joint(self, all_texts):
        """
        Call once with variable texts + concept full_texts combined BEFORE encoding.
        Ensures TF-IDF fallback uses a single shared vector space for N×M alignment.
        No-op when SBERT is active (SBERT is already a universal space).
        """
        if self._using_st:
            return
        # Blank texts are dropped; fewer than two usable texts leaves the embedder unfitted.
        clean = [str(t) for t in all_texts if str(t).strip()]
        if len(clean) < 2:
            return
        vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                              max_features=1000, min_df=1)
        X = vec.fit_transform(clean)
        # The component count is capped by _joint_dim and kept below both matrix dimensions.
        n_comp = min(self._joint_dim, X.shape[1] - 1, X.shape[0] - 1)
        # With fewer than two components nothing is stored and encode() keeps
        # fitting independently on each call.
        if n_comp >= 2:
            svd = TruncatedSVD(n_components=n_comp, random_state=42)
            svd.fit(X)
            self._joint_vec = vec
            self._joint_svd = svd

    def encode(self, texts):
        """Return one L2-normalised embedding row per entry of ``texts``.

        Uses SBERT when loaded; otherwise the jointly fitted TF-IDF+SVD space, or an
        independent TF-IDF(+SVD) fit on ``texts`` alone when ``fit_joint`` has not
        produced a shared space.
        """
        if self._using_st and self._model is not None:
            embs = self._model.encode(texts, show_progress_bar=False,
                                      batch_size=64, normalize_embeddings=True)
            return np.array(embs)
        # TF-IDF fallback — use shared space if available
        clean = [str(t) for t in texts]
        if self._joint_vec is not None and self._joint_svd is not None:
            X = self._joint_vec.transform(clean)
            embs = self._joint_svd.transform(X)
        else:
            # Independent fit (only before fit_joint is called — e.g. early pipeline stages)
            # Vectors from separate calls on this path are not in a common space.
            vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                                  max_features=1000, min_df=1)
            X = vec.fit_transform(clean)
            n_comp = min(self._joint_dim, X.shape[1] - 1, X.shape[0] - 1)
            embs = (TruncatedSVD(n_components=n_comp, random_state=42).fit_transform(X)
                    if n_comp >= 2 else X.toarray().astype(float))
        # Row-normalise; all-zero rows keep norm 1 to avoid division by zero.
        norms = np.linalg.norm(embs, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return (embs / norms).astype(float)

    def backend(self):
        """Return a display name for the active embedding backend.

        NOTE(review): the fallback label says "joint-fit" even if ``fit_joint`` was
        never called or did not store a shared space.
        """
        return self.model_name if self._using_st else 'TF-IDF+SVD (joint-fit, fallback)'
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # CODE / ACRONYM ANALYSIS [GON] | |
| # Detects coded variable names and groups them by shared structural prefix. | |
| # GonΓ§alves et al. use string-distance clustering before semantic alignment. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def detect_coded_variables(can):
    """
    Return a boolean Series marking rows whose leaf label looks like a variable code.

    Only the part of the label before the first '/' is inspected. It counts as a
    code when it starts with an uppercase letter and is followed by at least two
    uppercase letters, digits or underscores, with nothing else (e.g. DMSL0SD).
    """
    code_re = re.compile(r'^[A-Z][A-Z0-9_]{2,}$')

    def _looks_coded(label):
        head = str(label).strip().split('/')[0].strip()
        return bool(code_re.match(head))

    return can['_leaf_label'].apply(_looks_coded)
def cluster_codes_by_prefix(can):
    """
    [F7] Assign a ``_code_family`` to coded variables based on shared letter prefixes.

    For every coded variable the leading run of uppercase letters is taken (DMSL
    from DMSL0SD; the whole code when it has no digit or underscore). Each variable
    then receives the LONGEST prefix of that run, at least 3 characters long, that
    is a prefix of the letter run of at least two coded variables (itself included).
    Variables with no such shared prefix are left untouched.

    Returns a copy of ``can``; the input frame is not modified.
    """
    can = can.copy()
    coded_mask = detect_coded_variables(can)
    if not coded_mask.any():
        return can

    # Leading uppercase run of every coded variable, keyed by row index.
    alpha_by_idx = {}
    for idx, row in can[coded_mask].iterrows():
        code = str(row['_leaf_label']).strip().split('/')[0].strip()
        leading = re.match(r'^([A-Z]+)', code)
        alpha_by_idx[idx] = leading.group(1) if leading else code

    # How many variables share each prefix of length >= 3.
    shared_by = defaultdict(int)
    for alpha in alpha_by_idx.values():
        for end in range(3, len(alpha) + 1):
            shared_by[alpha[:end]] += 1

    # Longest shared prefix wins; singletons get no family.
    for idx, alpha in alpha_by_idx.items():
        end = len(alpha)
        while end > 2 and shared_by.get(alpha[:end], 0) < 2:
            end -= 1
        if end > 2:
            can.at[idx, '_code_family'] = alpha[:end]
    return can
def expand_variable_codes(can):
    """
    [GON] Automatically expand variable code segments to human-readable terms.
    Three evidence sources — all data-driven, no hardcoded domain terms:
    1. Parenthetical patterns in description text:
       'DMS (Delayed Matching to Sample)' → DMS = Delayed Matching to Sample
    2. Repeated positional suffix across a code family:
       DMSL0SD, DMSL4SD → suffix 'SD' constant → search descriptions for 'SD' expansion
    3. Group name as expansion of code prefix:
       codes in group 'Delayed Matching to Sample' → prefix DMSL → group name
    Returns dict: {segment → {'expansion': str, 'evidence': [str]}}

    The first expansion recorded for a segment wins; later sources never overwrite it.
    NOTE(review): this listing lost its indentation, so the nesting below (in
    particular the position of the ``break``) was reconstructed — confirm against
    the original file.
    """
    expansions = {}
    # Source 1: parenthetical patterns "(ABBR)" or "(Full Name)"
    # Groups 1/2 capture "ABBR (Full Name)"; groups 3/4 capture "Full Name (ABBR)".
    paren_re = re.compile(
        r'\b([A-Z]{2,8})\b\s*[\(\[]\s*([A-Za-z][^)\]]{3,80})\s*[\)\]]'
        r'|([A-Za-z][^(\[]{3,60})\s*[\(\[]\s*([A-Z]{2,8})\s*[\)\]]'
    )
    for text in can['_text'].fillna('').astype(str):
        for m in paren_re.finditer(text):
            if m.group(1):  # ABBR (Full Name)
                seg, exp = m.group(1), m.group(2).strip()
            else:  # Full Name (ABBR)
                seg, exp = m.group(4), m.group(3).strip()
            # Keep only the text before the first '.' or ';'.
            exp = exp.split('.')[0].split(';')[0].strip()
            if len(exp) >= 4 and seg not in expansions:
                expansions[seg] = {'expansion': exp,
                                   'evidence': ['description_parenthetical']}
    # Source 2: repeated positional suffix across a code family
    coded_mask = detect_coded_variables(can)
    family_rows = defaultdict(list)
    for _, row in can[coded_mask].iterrows():
        fam = str(row.get('_code_family', ''))
        if fam:
            family_rows[fam].append(row)
    # Uppercase runs of length >= 2 inside a code (digits act as separators).
    seg_tok = re.compile(r'([A-Z]{2,})')
    for fam, rows in family_rows.items():
        if len(rows) < 2:
            continue
        codes = [str(r['_leaf_label']).strip().split('/')[0] for r in rows]
        all_segs = [seg_tok.findall(c) for c in codes]
        min_len = min((len(s) for s in all_segs), default=0)
        # Walk segment positions from the last one backwards.
        for pos in range(-1, -min_len - 1, -1):
            vals = [s[pos] for s in all_segs if len(s) >= abs(pos)]
            # seg_tok only yields letters, so the isdigit() test cannot be true here.
            if not vals or vals[0].isdigit():
                continue
            seg_val = vals[0]
            if all(v == seg_val for v in vals) and seg_val not in expansions:
                # Text following the segment (after optional whitespace, hyphen,
                # colon) is taken as its expansion.
                # NOTE(review): the 'β' in the character class looks like a
                # mis-encoded dash; it is part of the runtime pattern, so confirm
                # against the original file.
                look_re = re.compile(
                    rf'\b{re.escape(seg_val)}\b[\s\-β:]*([A-Za-z][a-zA-Z ]+)',
                    re.IGNORECASE
                )
                for r in rows:
                    hit = look_re.search(str(r.get('_text', '')))
                    if hit:
                        exp = hit.group(1).strip().split('.')[0].split('(')[0].strip()
                        if 4 <= len(exp) <= 60:
                            expansions[seg_val] = {
                                'expansion': exp,
                                'evidence': [f'code_family_{fam}_positional_suffix']
                            }
                            break
    # Source 3: group name as prefix expansion
    for fam, rows in family_rows.items():
        if fam in expansions:
            continue
        # First component of each member's group path, skipping ungrouped rows.
        groups = [str(r.get('_group_path', '')).split(' > ')[0].strip()
                  for r in rows
                  if str(r.get('_group_path', '')) not in ('', 'nan', 'Ungrouped')]
        if groups and groups[0].lower() != fam.lower():
            expansions[fam] = {'expansion': groups[0],
                               'evidence': ['group_name_match']}
    return expansions
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # DOMAIN DETECTION | |
| # Routes to domain-specific external sources automatically. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Domain → signal phrases. detect_domain() counts how many phrases of each domain
# occur as substrings of the lower-cased metadata text.
# NOTE(review): short signals match inside longer words (e.g. 'miss' in 'missing',
# 'hit' in 'white') because matching is by substring — confirm this is acceptable.
_DOMAIN_SIGNALS = {
    'cognitive': [
        'reaction time', 'response time', 'memory', 'attention', 'executive',
        'cognitive', 'correct', 'error', 'delay', 'task', 'trial', 'stimulus',
        'recall', 'recognition', 'working memory', 'inhibition', 'processing speed',
        'latency', 'accuracy', 'hit', 'false alarm', 'miss',
    ],
    'biomedical': [
        'patient', 'clinical', 'diagnosis', 'treatment', 'disease', 'symptom',
        'medication', 'hospital', 'brain', 'neural', 'mri', 'fmri', 'eeg',
        'biomarker', 'genetic', 'phenotype', 'cohort', 'longitudinal',
    ],
    'finance': [
        'price', 'return', 'portfolio', 'equity', 'bond', 'yield', 'market',
        'stock', 'currency', 'gdp', 'inflation', 'revenue', 'profit', 'index',
    ],
    'environment': [
        'temperature', 'precipitation', 'climate', 'emission', 'pollution',
        'biodiversity', 'ecosystem', 'carbon', 'species', 'habitat', 'soil',
    ],
    'survey': [
        'questionnaire', 'likert', 'respondent', 'survey', 'agree', 'disagree',
        'strongly', 'satisfaction', 'attitude', 'opinion',
    ],
}
def detect_domain(can):
    """
    Guess the dataset's domain from its metadata text.

    Joins the `_text` column of `can` into one lower-cased string, counts for
    each domain in `_DOMAIN_SIGNALS` how many of its signals occur as
    substrings, and returns the domain with the most hits. Fewer than two hits
    for the winner yields 'general'. On a tie the domain listed first in
    `_DOMAIN_SIGNALS` wins.
    """
    corpus = ' '.join(can['_text'].fillna('').astype(str).tolist()).lower()

    hit_counts = {}
    for name, keywords in _DOMAIN_SIGNALS.items():
        hit_counts[name] = sum(1 for kw in keywords if kw in corpus)

    winner = max(hit_counts, key=hit_counts.get)
    if hit_counts[winner] < 2:
        return 'general'
    return winner
# ─────────────────────────────────────────────────────────────────────────────
# CANDIDATE CONCEPT EXTRACTION FROM METADATA [GON][TAX]
# Mines the metadata text itself for candidate concept labels.
# No external source is needed at this stage — it is purely data-driven.
# Sources: description titles, group path components, TF-IDF n-grams,
# repeated multi-word phrases.
# ─────────────────────────────────────────────────────────────────────────────
def extract_candidate_concepts_from_metadata(can, max_concepts=150):
    """
    Extract candidate concept labels from the metadata itself. [TAX][HIE][YAKE]

    Text is read from the `_semantic_text` column when it exists (description
    values only, to avoid field-name contamination) and from `_text` otherwise.
    Candidates are collected from four sources, in this order. A label that is
    already present (compared case-insensitively) is never replaced by a later
    source:

      0. description titles: the short text before the first ':' or '|' of a
         description, kept only when it is clearly shorter than the whole
         description, has at most 4 words and occurs at least twice;
      1. group-path components: every ' > '-separated part of `_group_path`;
      2. TF-IDF 1- to 3-grams, ranked by mean TF-IDF (single words are halved);
      3. phrases of 2-4 lower-case tokens that occur at least twice.

    When `_group_path` exists, each candidate is additionally tagged with
    `cross_group_count` (number of top-level groups whose text contains all of
    the label's content words) and `_n_top_groups`; the penalty itself is
    applied later by the scorer.

    The `_group_path` column is optional: every use of it is guarded, so a
    frame without it simply contributes no group-path candidates.

    Args:
        can: DataFrame with a `_semantic_text` or `_text` column and,
            optionally, a `_group_path` column.
        max_concepts: cap on the returned list; also bounds the TF-IDF
            vocabulary (`min(600, max_concepts * 4)` features).

    Returns:
        List of dicts with keys label, full_text, frequency, source,
        tfidf_score, cross_group_count (plus `_n_top_groups` when tagged),
        sorted by source priority (group_path, description_title, others),
        then descending tfidf_score, then descending frequency.
    """
    # No hardcoded structural/boilerplate word list: extraction keeps everything
    # that survives stop-word removal, and boilerplate is suppressed downstream
    # by the scorer's corpus-derived signals. The empty set is kept so the
    # filters below stay in place. [GON][TaxoGen]
    _STRUCT_NOISE: set = set()
    candidates = {}  # lower-cased label -> candidate dict

    # Boolean / value-state tokens: a label made only of these is a
    # documentation artefact, not a semantic concept. [FIX5][TaxoGen KDD 2018]
    _BOOL_NOISE = {
        'true', 'false', 'yes', 'no', 'completed', 'incomplete',
        'missing', 'unknown', 'none', 'other', 'na', 'n/a',
        'not', 'done', 'pending', 'available', 'unavailable',
    }

    def _is_bool_noise(label):
        """Return True if every non-stop word in label is a boolean/value-state token."""
        words = set(re.findall(r'\b[a-z]{2,}\b', label.lower())) - _STOP
        return len(words) > 0 and words.issubset(_BOOL_NOISE)

    def _new_candidate(label, frequency, source, tfidf_score):
        """Build one candidate record; cross_group_count is filled in later. [FIX6]"""
        return {
            'label': label,
            'full_text': label,
            'frequency': frequency,
            'source': source,
            'tfidf_score': tfidf_score,
            'cross_group_count': 0,
        }

    sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'
    has_group_path = '_group_path' in can.columns

    # ── Source 0: description titles (colon-structured descriptions only) ─────
    # [FIX4][TAX] The text before the first colon is the concept anchor, but
    # ONLY when a colon genuinely separates a short label from an explanation,
    # e.g. "Reaction Time: time from stimulus to response" -> "Reaction Time".
    # [FIX-R1] Plain descriptions without a colon would otherwise enter the
    # pool as whole sentences and leak across tasks, so a title is accepted
    # only when it is shorter than 80% of the full description. Further
    # guards: strip a leading task prefix, at most 4 words, frequency >= 2.

    # Top-level task tokens are discovered from _group_path (data-driven).
    top_task_tokens: set = set()
    if has_group_path:
        for _gp in can['_group_path'].dropna().astype(str):
            _first = _gp.split(' > ')[0].strip()
            if _first and _first.lower() not in ('ungrouped', 'nan', ''):
                top_task_tokens.add(_first.lower())

    _task_pfx_re = None
    if top_task_tokens:
        # Longest tokens first so the alternation prefers the longest prefix.
        _alternatives = '|'.join(
            re.escape(t) for t in sorted(top_task_tokens, key=len, reverse=True)
        )
        _task_pfx_re = re.compile(r'^(?:' + _alternatives + r')\s+', re.IGNORECASE)

    title_counts: dict = defaultdict(int)
    for raw_text in can[sem_col].fillna('').astype(str):
        desc_part = raw_text
        if 'description:' in raw_text.lower():
            desc_part = re.split(r'description\s*:', raw_text, maxsplit=1,
                                 flags=re.IGNORECASE)[-1].strip()
        full_len = len(desc_part.strip())
        if full_len < 3:
            continue
        title_raw = re.split(r'[:|]', desc_part)[0].strip()
        # Gate: a title covering >= 80% of the text means there is no
        # label/explanation structure to exploit.
        if len(title_raw) >= full_len * 0.80:
            continue
        title_clean = re.sub(r'^[\s\d\W]+', '', title_raw).strip()
        if len(title_clean) < 3 or title_clean.replace(' ', '').isdigit():
            continue
        # Strip a leading task prefix (data-driven).
        if _task_pfx_re is not None:
            title_clean = _task_pfx_re.sub('', title_clean).strip()
        if len(title_clean) < 3:
            continue
        # A concept anchor must be a short label, not a sentence.
        if len(title_clean.split()) > 4:
            continue
        if _is_bool_noise(title_clean):  # [FIX5]
            continue
        title_counts[title_clean] += 1

    for title, cnt in title_counts.items():
        if cnt < 2:  # must appear in >= 2 variables to be a shared concept
            continue
        if '_' in title:  # underscores mark raw variable/column names
            continue
        key = title.lower()
        if key not in candidates:
            candidates[key] = _new_candidate(title, cnt, 'description_title', 0.95)
        else:
            candidates[key]['frequency'] = max(candidates[key]['frequency'], cnt)

    # ── Source 1: group path components (already structured) ─────────────────
    # Guarded like the other two uses of `_group_path`; the unguarded lookup
    # raised KeyError for metadata without a grouping column.
    if has_group_path:
        for gpath in can['_group_path'].dropna().unique():
            for part in str(gpath).split(' > '):
                part = part.strip()
                if len(part) < 3 or part.lower() in ('ungrouped', 'nan', 'none', ''):
                    continue
                key = part.lower()
                if key not in candidates:
                    candidates[key] = _new_candidate(part, 0, 'group_path', 1.0)
                # Counted once per distinct path that contains the component.
                candidates[key]['frequency'] += 1

    # ── Source 2: TF-IDF n-grams from the semantic text [TAX][YAKE] ──────────
    texts = can[sem_col].fillna('').astype(str).tolist()
    if texts:
        try:
            vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 3),
                                  max_features=min(600, max_concepts * 4), min_df=1)
            X = vec.fit_transform(texts)
            terms = vec.get_feature_names_out()
            mean_tfidf = np.asarray(X.mean(axis=0)).flatten()
            freq_arr = np.asarray((X > 0).sum(axis=0)).flatten()
            for i in np.argsort(mean_tfidf)[::-1][:max_concepts]:
                term = terms[i]
                words = term.split()
                if not (len(term) >= 4 and any(c.isalpha() for c in term)
                        and not term.replace(' ', '').isdigit()):
                    continue
                # Single-word filter: stop words and structural noise.
                if len(words) == 1 and (term in _STOP or term.lower() in _STRUCT_NOISE):
                    continue
                if _is_bool_noise(term):  # [FIX5][TaxoGen]
                    continue
                # YAKE: single words are penalised in favour of phrases.
                score_mult = 0.5 if len(words) == 1 else 1.0
                key = term.lower()
                if key not in candidates:
                    candidates[key] = _new_candidate(
                        term, int(freq_arr[i]), 'metadata_tfidf',
                        float(mean_tfidf[i]) * score_mult,
                    )
        except Exception:
            # Best effort: this source is optional (e.g. the vectorizer rejects
            # a corpus that yields no vocabulary); the other sources still apply.
            pass

    # ── Source 3: repeated multi-word phrases from the semantic text [TAX] ───
    phrase_re = re.compile(r'\b([a-z][a-z0-9]{1,}(?:\s+[a-z][a-z0-9]{1,}){1,3})\b')
    phrase_counts = defaultdict(int)
    for text in texts:
        for m in phrase_re.finditer(text.lower()):
            phrase = m.group(1)
            words = phrase.split()
            if any(w not in _STOP and w not in _STRUCT_NOISE and len(w) >= 3 for w in words):
                phrase_counts[phrase] += 1
    for phrase, count in sorted(phrase_counts.items(), key=lambda x: -x[1]):
        if count < 2 or len(phrase) < 5:
            continue
        if _is_bool_noise(phrase):  # [FIX5]
            continue
        key = phrase.lower()
        if key not in candidates:
            candidates[key] = _new_candidate(phrase, count, 'noun_phrase', 0.0)

    # ── [FIX6][TaxoGen][CAS] Cross-group boilerplate tagging ─────────────────
    # A concept whose words appear in EVERY top-level group is dataset-wide
    # boilerplate. Only the count is recorded here; the penalty is applied
    # later by the scorer. TaxoGen prefers locally dominant, globally rare
    # terms; Castanet requires facet labels to discriminate between categories.
    if has_group_path:
        top_groups = (
            can['_group_path'].fillna('').astype(str)
            .apply(lambda p: p.split(' > ')[0].strip().lower())
        )
        all_top_groups = [g for g in top_groups.unique() if g not in ('', 'ungrouped', 'nan')]
        n_top_groups = max(1, len(all_top_groups))
        # One concatenated text per group for membership testing.
        group_texts = {}
        for grp in all_top_groups:
            mask = top_groups == grp
            group_texts[grp] = ' '.join(can.loc[mask, sem_col].fillna('').astype(str)).lower()
        for cand in candidates.values():
            cand_words = set(re.findall(r'\b[a-z]{3,}\b', cand['label'].lower())) - _STOP
            if not cand_words:
                continue
            cand['cross_group_count'] = sum(
                1 for grp_text in group_texts.values()
                if all(w in grp_text for w in cand_words)
            )
            cand['_n_top_groups'] = n_top_groups  # stored for the scorer

    # Sort: group_path first, then description_title, then everything else;
    # within a source by descending tfidf_score, then descending frequency.
    _src_priority = {'group_path': 0, 'description_title': 1}
    result = sorted(
        candidates.values(),
        key=lambda x: (_src_priority.get(x['source'], 2),
                       -x['tfidf_score'], -x['frequency'])
    )
    return result[:max_concepts]
# ─────────────────────────────────────────────────────────────────────────────
# EXTERNAL CONCEPT SOURCES
# Build a concept TABLE (not just appended text). Each entry has a full_text
# that is encoded by SBERT for the N×M alignment matrix.
# ─────────────────────────────────────────────────────────────────────────────
def wikidata_search(term):
    """
    Wikidata entity search. [GON][HIE]

    Asks `wbsearchentities` for the top three English matches of `term` and
    returns the first two non-empty descriptions joined by a space. Any
    failure (network, timeout, malformed reply) yields ''.
    """
    query = {'action': 'wbsearchentities', 'search': term,
             'language': 'en', 'format': 'json', 'limit': 3}
    try:
        reply = requests.get(
            'https://www.wikidata.org/w/api.php',
            params=query,
            timeout=6, headers={'User-Agent': 'MetadataHierarchyTool/1.0'}
        )
        described = []
        for hit in reply.json().get('search', []):
            if hit.get('description'):
                described.append(hit.get('description', ''))
        return ' '.join(described[:2])
    except Exception:
        # Enrichment is optional; an empty string means "nothing found".
        return ''
def wikidata_broader(term):
    """
    Broader concepts of `term` from Wikidata. [HIE]

    Resolves `term` to its best-matching entity, then queries the SPARQL
    endpoint for that entity's P31 (instance_of), P279 (subclass_of) and
    P361 (part_of) statements, at most five rows.

    Returns a list of (relation_name, broader_label) tuples; rows without a
    label are dropped. Returns [] when nothing matches or on any failure.
    """
    relation_names = {'P31': 'instance_of', 'P279': 'subclass_of', 'P361': 'part_of'}
    try:
        lookup = requests.get(
            'https://www.wikidata.org/w/api.php',
            params={'action': 'wbsearchentities', 'search': term,
                    'language': 'en', 'format': 'json', 'limit': 1},
            timeout=5, headers={'User-Agent': 'MetadataHierarchyTool/1.0'}
        )
        matches = lookup.json().get('search', [])
        if not matches:
            return []
        qid = matches[0]['id']
        # REPLACE(..., '.*P', 'P') reduces the property IRI to its bare P-id.
        sparql = f"""
        SELECT ?rel ?broaderLabel WHERE {{
          VALUES ?prop {{ wdt:P31 wdt:P279 wdt:P361 }}
          wd:{qid} ?prop ?broader .
          BIND(REPLACE(STR(?prop),'.*P','P') AS ?rel)
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language 'en' . }}
        }} LIMIT 5
        """
        answer = requests.get(
            'https://query.wikidata.org/sparql',
            params={'query': sparql, 'format': 'json'},
            headers={'Accept': 'application/json', 'User-Agent': 'MetadataHierarchyTool/1.0'},
            timeout=10
        )
        bindings = answer.json().get('results', {}).get('bindings', [])
        pairs = [
            (relation_names.get(row.get('rel', {}).get('value', ''), 'related_to'),
             row.get('broaderLabel', {}).get('value', ''))
            for row in bindings
        ]
        return [(relation, label) for relation, label in pairs if label]
    except Exception:
        # Enrichment is optional; no relations is a valid outcome.
        return []
def wikipedia_summary(term):
    """
    Wikipedia intro paragraph for `term`. [TAX]

    Calls the REST `page/summary/{title}` endpoint and returns the first 300
    characters of the page extract. Returns '' when `term` is blank, when the
    page has no extract, or on any failure (network, timeout, malformed reply).

    The title is percent-encoded before it is placed in the URL path:
    otherwise a term containing '/', '?' or '#' would be read as an extra path
    segment, a query string or a fragment and the wrong resource would be
    requested.
    """
    from urllib.parse import quote

    try:
        # safe='' so that '/' is encoded too; spaces become '_' per wiki titles.
        title = quote(term.strip().replace(' ', '_'), safe='')
        if not title:
            return ''
        resp = requests.get(
            'https://en.wikipedia.org/api/rest_v1/page/summary/' + title,
            timeout=6, headers={'User-Agent': 'MetadataHierarchyTool/1.0'}
        )
        extract = resp.json().get('extract', '')
        return extract[:300] if extract else ''
    except Exception:
        # Enrichment is optional; an empty string means "nothing found".
        return ''
def pubmed_keywords(query):
    """
    PubMed enrichment, intended for the biomedical domain. [GON]

    Searches PubMed for the first 100 characters of `query`, fetches the
    abstracts of up to three hits as plain text and returns the first 400
    characters of that text. Returns '' when there are no hits or on any
    failure.
    """
    try:
        found = requests.get(
            'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
            params={'db': 'pubmed', 'term': query[:100], 'retmax': 3, 'retmode': 'json'},
            timeout=8
        )
        pmids = found.json().get('esearchresult', {}).get('idlist', [])
        if pmids:
            abstracts = requests.get(
                'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
                params={'db': 'pubmed', 'id': ','.join(pmids),
                        'rettype': 'abstract', 'retmode': 'text'},
                timeout=10
            )
            return abstracts.text[:400]
        return ''
    except Exception:
        # Enrichment is optional; an empty string means "nothing found".
        return ''
def bioportal_search(phrase, api_key):
    """
    BioPortal ontology search; needs an API key. [GON]

    Without `api_key` no request is made and [] is returned. Otherwise up to
    three search hits are turned into concept-table entries (source
    'bioportal'); hits without a `prefLabel` are dropped. `full_text` is the
    label followed by at most 200 characters of the joined definitions, or
    the label alone when there is no definition. Any failure yields [].
    """
    if not api_key:
        return []
    try:
        reply = requests.get(
            'https://data.bioontology.org/search',
            params={'q': phrase, 'pagesize': 3, 'display_links': 'false'},
            headers={'Authorization': f'apikey token={api_key}'},
            timeout=8
        )
        entries = []
        for hit in reply.json().get('collection', []):
            name = hit.get('prefLabel', '')
            definition = ' '.join(hit.get('definition', []))[:200]
            if not name:
                continue
            text = f'{name}. {definition}' if definition else name
            entries.append({
                'label': name, 'full_text': text,
                'source': 'bioportal', 'frequency': 0, 'tfidf_score': 0.0,
                'broader_relations': [],
            })
        return entries
    except Exception:
        # Enrichment is optional; no entries is a valid outcome.
        return []
def wordnet_lookup(phrase):
    """
    WordNet definition and hypernyms for `phrase`. [GON]

    Looks the phrase up with spaces replaced by underscores; if that finds
    nothing, retries with the first word only. Uses the first synset.

    Returns {'definition': str, 'hypernyms': [up to 3 names]} or None when
    WordNet is unavailable, nothing matches, or the lookup fails.
    """
    if not _WORDNET_AVAILABLE:
        return None
    try:
        senses = wn.synsets(phrase.replace(' ', '_'))
        if not senses:
            tokens = phrase.split()
            senses = wn.synsets(tokens[0]) if tokens else []
        if not senses:
            return None
        sense = senses[0]
        return {
            'definition': sense.definition(),
            # Keep only the lemma part of the synset name, with spaces restored.
            'hypernyms': [parent.name().replace('_', ' ').split('.')[0]
                          for parent in sense.hypernyms()[:3]],
        }
    except Exception:
        return None
| def _is_acronym(phrase): | |
| """True if phrase is a short all-caps token β high Wikidata polysemy risk. [GON][BLINK] | |
| These tokens must not be queried raw; use the expanded form instead.""" | |
| p = phrase.strip() | |
| return p.isupper() and 2 <= len(p) <= 6 and sum(c.isalpha() for c in p) >= 2 | |
def cognitive_atlas_search(term):
    """
    Cognitive Atlas task search, used for cognitive/neurological domains. [GON]

    Takes the first task returned for `term` and returns its name followed by
    at most 250 characters of its definition (or alias when there is no
    definition), or the name alone. Returns '' when the reply is not a
    non-empty list, the task has no name, or the request fails.
    """
    try:
        reply = requests.get(
            'https://www.cognitiveatlas.org/api/v-alpha/task',
            params={'search': term, 'format': 'json'},
            timeout=8, headers={'User-Agent': 'MetadataHierarchyTool/1.0'}
        )
        payload = reply.json()
        if not isinstance(payload, list) or not payload:
            return ''
        first = payload[0]
        title = first.get('name', '')
        blurb = first.get('definition_text', '') or first.get('alias', '')
        if not title:
            return ''
        return f'{title}. {blurb[:250]}' if blurb else title
    except Exception:
        # Enrichment is optional; an empty string means "nothing found".
        return ''
def retrieve_concept_table(candidates, domain='general',
                           use_wikidata=True, use_wikipedia=False,
                           use_wordnet=True, use_pubmed=False,
                           bioportal_key='', progress_cb=None,
                           code_expansions=None):
    """
    Build the concept TABLE from candidates plus external sources. [GON]

    Every candidate becomes a base entry keyed by its lower-cased label, with
    `full_text` equal to the label. The 60 best candidates (group_path first,
    then by tfidf_score and frequency) are then enriched:

      * Cognitive Atlas (cognitive / neurological / biomedical domains) and
        Wikidata rewrite the base entry's full_text and source; a Cognitive
        Atlas text is not overwritten by Wikidata. Wikidata's broader
        relations are stored on the base entry.
      * WordNet, Wikipedia (first 20 candidates), PubMed (first 8) and
        BioPortal (first 20) add separate entries under prefixed keys.

    Acronym labels are never sent raw to Cognitive Atlas or Wikidata: their
    expansion from `code_expansions` is used, or those two sources are skipped.

    Args:
        candidates: dicts with at least 'label' and 'source'.
        domain: detected domain string; gates the domain-specific sources.
        use_wikidata, use_wikipedia, use_wordnet, use_pubmed: source switches.
        bioportal_key: BioPortal API key; empty disables BioPortal.
        progress_cb: optional callable receiving the fraction of the shortlist
            already processed, before each candidate.
        code_expansions: optional {code: {'expansion': str, ...}} mapping.

    Returns:
        List of entries {label, full_text, source, frequency, tfidf_score,
        broader_relations}: the right-hand side of the N×M alignment matrix.
    """
    def _side_entry(cand, text, origin, relations):
        """Entry for a secondary source: label plus the externally found text."""
        return {
            'label': cand['label'],
            'full_text': f"{cand['label']}. {text}",
            'source': origin,
            'frequency': cand.get('frequency', 0),
            'tfidf_score': cand.get('tfidf_score', 0.0),
            'broader_relations': relations,
        }

    def _api_rank(cand):
        """Sort key: group_path first, then high tfidf_score, then high frequency."""
        return (0 if cand['source'] == 'group_path' else 1,
                -cand.get('tfidf_score', 0), -cand.get('frequency', 0))

    # Base entries: one per candidate label (lower-cased key).
    table = {}
    for cand in candidates:
        table[cand['label'].lower()] = {
            'label': cand['label'],
            'full_text': cand['label'],
            'source': cand['source'],
            'frequency': cand.get('frequency', 0),
            'tfidf_score': cand.get('tfidf_score', 0.0),
            'broader_relations': [],
        }

    shortlist = sorted(candidates, key=_api_rank)[:60]
    shortlist_size = len(shortlist)
    code_expansions = code_expansions or {}
    bio_domain = domain in ('biomedical', 'cognitive')

    for position, cand in enumerate(shortlist):
        if progress_cb:
            progress_cb(position / shortlist_size)
        phrase = cand['label']
        key = phrase.lower()

        # Never query a raw acronym: use its expansion, or skip (None). [GON][BLINK]
        if _is_acronym(phrase):
            query_phrase = code_expansions.get(phrase, {}).get('expansion', '') or None
        else:
            query_phrase = phrase

        # Cognitive Atlas runs before Wikidata so its text takes precedence. [GON]
        if query_phrase and domain in ('cognitive', 'neurological', 'biomedical'):
            atlas_text = cognitive_atlas_search(query_phrase)
            if atlas_text and key in table:
                table[key]['full_text'] = f'{phrase}. {atlas_text}'
                table[key]['source'] = 'cognitive_atlas'

        # Wikidata: description unless Cognitive Atlas already supplied one.
        if use_wikidata and query_phrase is not None:
            description = wikidata_search(query_phrase)
            relations = wikidata_broader(query_phrase)
            entry = table.get(key)
            if entry is not None:
                if description and entry['source'] != 'cognitive_atlas':
                    entry['full_text'] = f'{phrase}. {description}'
                    entry['source'] = 'wikidata'
                entry['broader_relations'] = relations

        # WordNet: definition plus hypernyms as generic relations.
        if use_wordnet and _WORDNET_AVAILABLE:
            sense = wordnet_lookup(phrase)
            if sense:
                table[f'wordnet_{key}'] = _side_entry(
                    cand, sense["definition"], 'wordnet',
                    [('related_to', h) for h in sense.get('hypernyms', [])],
                )

        # Wikipedia (optional).
        if use_wikipedia and position < 20:
            summary = wikipedia_summary(phrase)
            if summary:
                table[f'wikipedia_{key}'] = _side_entry(
                    cand, summary[:200], 'wikipedia', [])

        # PubMed (optional).
        if use_pubmed and bio_domain and position < 8:
            abstract = pubmed_keywords(phrase)
            if abstract:
                table[f'pubmed_{key}'] = _side_entry(
                    cand, abstract[:200], 'pubmed', [])
            time.sleep(0.35)  # NCBI rate limit

        # BioPortal (optional, needs an API key).
        if bioportal_key and bio_domain and position < 20:
            for hit in bioportal_search(phrase, bioportal_key):
                table[f"bioportal_{hit['label'].lower()}"] = hit

    return list(table.values())
# ─────────────────────────────────────────────────────────────────────────────
# CONCEPT ALIGNMENT — N×M COSINE SIMILARITY [GON]
# Gonçalves et al. build an N×M similarity matrix between metadata field
# embeddings and ontology term embeddings, then rank the alignments.
# Here: N = variable clusters, M = concept table entries.
# ─────────────────────────────────────────────────────────────────────────────
def _string_overlap(cluster_texts, concept_label):
    """
    Share of the concept label's content words found in the cluster texts.

    Content words are lower-case alphabetic tokens of three or more letters
    that are not in `_STOP`. Returns a float in [0, 1]; 0.0 when the label has
    no content words.
    """
    token_pattern = r'\b[a-z]{3,}\b'
    wanted = set(re.findall(token_pattern, concept_label.lower())) - _STOP
    if not wanted:
        return 0.0
    joined = ' '.join(cluster_texts).lower()
    present = set(re.findall(token_pattern, joined)) - _STOP
    return float(len(wanted & present) / len(wanted))
| def _seq_sim(a, b): | |
| """SequenceMatcher ratio between two strings β for code/label similarity.""" | |
| return SequenceMatcher(None, a.lower(), b.lower()).ratio() | |
def score_concepts_for_cluster(cluster_emb, concept_embs, concept_table, cluster_texts,
                               n_total_vars=None, member_embs=None,
                               sibling_centroids=None, ref_centroids=None,
                               corpus_centroid=None, own_group_centroid=None):
    """
    Multi-signal, embedding-based concept scoring for one cluster.
    [GON] IDF-weighted embeddings + cosine; [TaxoGen] contrastive term
    selection; [CAS] facet contrast.

        score = 0.30 × fit       (mean cosine of the label to this cluster's members)
              + 0.35 × contrast  (fit − best cosine to a sibling cluster)
              + 0.25 × home      (cosine to own-task centroid − mean over all tasks)
              + 0.10 × source_conf
              − url_noise_penalty

    What each signal is for:
      * fit:      the label must actually describe this cluster.
      * contrast: suppresses dataset-wide boilerplate, which sits equally close
                  to every sibling cluster, so fit − sibling similarity ≈ 0.
      * home:     suppresses cross-task leakage; a label nearer to other tasks
                  than to the current one goes negative and is clipped to 0.

    Candidates with fit < 0.12 are dropped, as are (when a task reference is
    available) candidates whose home signal is 0. Without any task reference
    home is a neutral 0.34 and nothing is filtered on it.

    Args:
        cluster_emb: centroid embedding of the cluster.
        concept_embs: embeddings of the concept table, one row per entry.
        concept_table: entries with at least 'label'; 'source' and
            'broader_relations' are read when present.
        cluster_texts: member descriptions, used only for the reported
            string_sim.
        n_total_vars: accepted for interface compatibility; not used here.
        member_embs: embeddings of the cluster members (fit falls back to the
            centroid similarity when absent).
        sibling_centroids: centroids of sibling clusters in the same task.
        ref_centroids: centroids of all top-level tasks.
        corpus_centroid: fallback reference when fewer than two task
            centroids are available.
        own_group_centroid: centroid of the current task's variables.

    Returns:
        List of result dicts sorted by score, highest first; [] when there
        are no concept embeddings or the table is empty.
    """
    if concept_embs is None or len(concept_table) == 0:
        return []
    concept_embs = np.asarray(concept_embs, dtype=float)
    n_concepts = len(concept_table)

    # Similarity of every candidate to the cluster centroid (reported, and the
    # fallback for fit).
    centroid_sims = cosine_similarity([cluster_emb], concept_embs)[0]

    # fit: mean cosine to the member embeddings (robust to outliers).
    if member_embs is not None and len(member_embs) > 0:
        members = np.asarray(member_embs, dtype=float)
        fit = cosine_similarity(concept_embs, members).mean(axis=1)
    else:
        fit = centroid_sims

    # contrast: fit minus closeness to the nearest sibling cluster.
    if sibling_centroids is not None and len(sibling_centroids) > 0:
        siblings = np.asarray(sibling_centroids, dtype=float)
        nearest_sibling = cosine_similarity(concept_embs, siblings).max(axis=1)
        contrast = np.clip(fit - nearest_sibling, 0.0, 1.0)
    else:
        contrast = np.zeros(n_concepts)

    # home: is the label closer to THIS task than to a reference baseline?
    # Baseline = mean over all task centroids, else the corpus centroid.
    has_own = own_group_centroid is not None
    if has_own and ref_centroids is not None and len(ref_centroids) >= 2:
        references = np.asarray(ref_centroids, dtype=float)
        baseline = cosine_similarity(concept_embs, references).mean(axis=1)
    elif has_own and corpus_centroid is not None:
        baseline = cosine_similarity(concept_embs, [corpus_centroid])[:, 0]
    else:
        baseline = None

    home_active = baseline is not None
    if home_active:
        own_sim = cosine_similarity(concept_embs, [own_group_centroid])[:, 0]
        home = np.clip((own_sim - baseline) * 3.0, 0.0, 1.0)
    else:
        # No task reference (e.g. a single Ungrouped bucket): stay neutral.
        home = np.full(n_concepts, 0.34)

    source_conf = np.array([
        SOURCE_CONFIDENCE.get(entry.get('source', 'noun_phrase'), 0.55)
        for entry in concept_table
    ])

    # URL / HTML artefact penalty: targets documentation junk, not domain terms.
    _url_noise_re = re.compile(
        r'\b(http|href|wiki|neurolex|org|www|definition|category|link|url)\b',
        re.IGNORECASE
    )
    noise_penalty = np.array([
        0.35 if _url_noise_re.search(entry['label']) else 0.0
        for entry in concept_table
    ])

    total = (0.30 * fit + 0.35 * contrast + 0.25 * home
             + 0.10 * source_conf - noise_penalty)

    # Reported for provenance/debugging only; not part of the score.
    overlap = np.array([_string_overlap(cluster_texts, entry['label'])
                        for entry in concept_table])

    ranked = []
    for idx, entry in enumerate(concept_table):
        # Drop labels that do not fit, or that belong to a different task.
        if float(fit[idx]) < 0.12 or (home_active and float(home[idx]) <= 0.0):
            continue
        ranked.append({
            'label': entry['label'],
            'score': float(total[idx]),
            'embedding_sim': float(centroid_sims[idx]),
            'coverage': float(fit[idx]),
            'contrast': float(contrast[idx]),
            'specificity': float(home[idx]),
            'string_sim': float(overlap[idx]),
            'source': entry.get('source', 'unknown'),
            'broader_relations': entry.get('broader_relations', []),
            '_emb': concept_embs[idx],
        })
    ranked.sort(key=lambda row: -row['score'])
    return ranked
def assign_concept_label(scores, fallback='Group', min_score=0.08,
                         ancestor_names=None, used_sibling_labels=None,
                         top_level_tasks=None, ancestor_embs=None,
                         sibling_label_embs=None, dup_sim=0.82):
    """
    Pick the best concept label from ranked scores.

    `scores` is walked in order; the walk stops at the first entry scoring
    below `min_score`, and the first non-rejected label wins. A label is
    rejected when any of these holds:

      Structural (domain-agnostic):
        - a token repeats inside the label ("Dms Dms");
        - the label equals, contains, or is contained in an ancestor name;
        - all of its content words already occur in one ancestor name;
        - it exactly matches an already used sibling label;
        - it contains a top-level task name other than the current task
          (the ancestor that is itself a top-level task).
      Semantic (only when the label has an embedding):
        - cosine to any ancestor embedding exceeds `dup_sim`;
        - cosine to any chosen sibling label embedding exceeds `dup_sim`.

    Returns:
        (label, provenance). The label is title-cased. When nothing
        qualifies, `fallback` is returned with zeroed provenance and
        source_evidence ['tfidf_fallback'].
    """
    ancestor_set = {str(name).lower().strip() for name in (ancestor_names or [])}
    used_set = {str(name).lower().strip() for name in (used_sibling_labels or [])}
    anc_embs = (np.asarray(ancestor_embs, dtype=float)
                if ancestor_embs is not None and len(ancestor_embs) else None)
    sib_embs = list(sibling_label_embs) if sibling_label_embs else []

    # Current task = the ancestor that is itself a top-level task (data-driven).
    task_names = {str(task).lower() for task in (top_level_tasks or [])}
    current_task = next((str(name).lower() for name in (ancestor_names or [])
                         if str(name).lower() in task_names), None)

    def _is_degenerate(lbl, emb=None):
        """True if the label should be rejected."""
        text = lbl.strip().lower()

        # Structural 1: token self-repetition.
        tokens = text.split()
        if len(tokens) >= 2 and len(set(tokens)) < len(tokens):
            return True

        # Structural 2: equal to / substring of an ancestor, or vice versa.
        if any(text == anc or text in anc or anc in text for anc in ancestor_set):
            return True

        # Structural 3: every content word already present in one ancestor.
        content = set(re.findall(r'\b[a-z]{3,}\b', text)) - _STOP
        for anc in ancestor_set:
            anc_content = set(re.findall(r'\b[a-z]{3,}\b', anc)) - _STOP
            if content and content.issubset(anc_content):
                return True

        # Structural 4: already used by a sibling group.
        if text in used_set:
            return True

        # Structural 5: names a different top-level task.
        if current_task and task_names:
            for task in task_names:
                if task == current_task:
                    continue
                if re.search(r'\b' + re.escape(task) + r'\b', text):
                    return True

        if emb is not None:
            # Semantic: paraphrase of an ancestor label.
            if anc_embs is not None:
                if float(cosine_similarity([emb], anc_embs).max()) > dup_sim:
                    return True
            # Semantic: paraphrase of an already chosen sibling label.
            if sib_embs:
                sibling_matrix = np.asarray(sib_embs, dtype=float)
                if float(cosine_similarity([emb], sibling_matrix).max()) > dup_sim:
                    return True
        return False

    # First acceptable candidate above the score floor.
    winner = None
    for entry in scores:
        if entry['score'] < min_score:
            break
        if _is_degenerate(entry['label'].strip().title(), entry.get('_emb')):
            continue
        winner = entry
        break

    if winner is None:
        return fallback, {
            'node_label': fallback, 'confidence': 0.0,
            'alternatives': [], 'source_evidence': ['tfidf_fallback'],
            'embedding_sim': 0.0, 'string_sim': 0.0,
            'coverage': 0.0, 'contrast': 0.0, 'specificity': 0.0,
        }

    label = winner['label'].strip().title()
    # Alternatives come from ranks 2-4 of the input, minus rejected labels.
    alternatives = []
    for entry in scores[1:4]:
        if entry['label'] == winner['label']:
            continue
        if _is_degenerate(entry['label'].strip().title(), entry.get('_emb')):
            continue
        alternatives.append(entry['label'])

    provenance = {
        'node_label': label,
        'confidence': round(winner['score'], 3),
        'alternatives': alternatives,
        'source_evidence': [winner['source']],
        'embedding_sim': round(winner['embedding_sim'], 3),
        'coverage': round(winner.get('coverage', 0.0), 3),
        'contrast': round(winner.get('contrast', 0.0), 3),
        'specificity': round(winner.get('specificity', 0.0), 3),
        'string_sim': round(winner['string_sim'], 3),
    }
    return label, provenance
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # TF-IDF FALLBACK CLUSTERING | |
| # Used when concept table is unavailable or similarity is too low. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def tfidf_cluster_labels(texts, max_clusters=8):
    """[GON] Cluster texts on TF-IDF vectors and name each cluster.

    Texts are vectorised with uni-/bi-gram TF-IDF, grouped with average-linkage
    agglomerative clustering over cosine distance, and each cluster is named
    after the term whose mean weight inside the cluster most exceeds its mean
    weight outside the cluster (terms of 3 characters or fewer are ignored).

    Args:
        texts: Sequence of strings, one per variable.
        max_clusters: Upper bound on the number of clusters.

    Returns:
        A list with one title-cased label per input text. Every entry is ''
        when fewer than two texts are given or when any step raises.
    """
    count = len(texts)
    if count <= 1:
        return [''] * count
    try:
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                                     max_features=500, min_df=1)
        matrix = vectorizer.fit_transform(texts)
        k_total = min(max_clusters, max(2, count // 3), matrix.shape[0])

        distances = cosine_distances(matrix).astype(float)
        np.fill_diagonal(distances, 0.0)
        model = AgglomerativeClustering(n_clusters=k_total, metric='precomputed',
                                        linkage='average')
        assignment = model.fit_predict(distances)

        vocab = vectorizer.get_feature_names_out()
        dense = matrix.toarray()
        names = {}
        for k in range(k_total):
            inside = assignment == k
            placeholder = f'Group {k+1}'
            if not inside.any():
                names[k] = placeholder
                continue
            outside = ~inside
            # Contrast = mean weight in the cluster minus mean weight elsewhere.
            baseline = dense[outside].mean(axis=0) if outside.any() else 0
            contrast = dense[inside].mean(axis=0) - baseline
            ranked = [j for j in np.argsort(contrast)[::-1] if len(vocab[j]) > 3]
            names[k] = vocab[ranked[0]].title() if ranked else placeholder
        return [names[int(c)] for c in assignment]
    except Exception:
        # Best-effort labeller: any failure degrades to blank labels.
        return [''] * count
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # [C8] WORDNET HYPERNYM CHAIN FALLBACK [CAS][TAX] | |
| # Castanet: "carves out a structure from WordNet that reflects the collection." | |
| # Taxonomizer: "labeling inner nodes requires the identification of hypernyms." | |
| # Walks IS-A chain upward from the dominant noun in cluster texts. | |
| # Returns the highest-confidence hypernym that is not in excluded_names. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def wordnet_hypernym_fallback(cluster_texts, excluded_names=None):
    """[C8][CAS][TAX] Propose a WordNet hypernym as a label for a cluster.

    The 8 most frequent content words (4+ letters, not stop words, not in the
    generic list) are looked up as nouns. For the first sense of each word,
    every hypernym path is walked upward from the word itself and the first
    acceptable hypernym on that path is considered.

    A hypernym is acceptable when it is not in ``excluded_names``, not a stop
    word, not in the generic list and longer than 3 characters.

    Args:
        cluster_texts: Iterable of strings describing the cluster members.
        excluded_names: Names that must not be returned (e.g. ancestors).

    Returns:
        A title-cased hypernym, or None when WordNet is unavailable or no
        acceptable hypernym is found.
    """
    if not _WORDNET_AVAILABLE:
        return None

    banned = {str(name).lower().strip() for name in (excluded_names or [])}
    _too_generic = {'entity', 'object', 'thing', 'abstraction', 'whole',
                    'physical entity', 'psychological feature', 'group',
                    'attribute', 'measure', 'amount', 'number', 'quantity'}

    corpus = ' '.join(cluster_texts).lower()
    tokens = [tok for tok in re.findall(r'\b[a-z]{4,}\b', corpus)
              if tok not in _STOP and tok not in _too_generic]
    if not tokens:
        return None

    from collections import Counter
    frequent = [tok for tok, _ in Counter(tokens).most_common(8)]

    def _acceptable(name):
        lowered = name.lower()
        return (lowered not in banned
                and lowered not in _too_generic
                and lowered not in _STOP
                and len(name) > 3)

    best_label, best_steps = None, 0
    for tok in frequent:
        try:
            senses = wn.synsets(tok, pos=wn.NOUN)
            if not senses:
                continue
            for chain in senses[0].hypernym_paths():
                # reversed(chain) starts at the word's own synset (step 0) and
                # moves toward the WordNet root.
                for steps_up, synset in enumerate(reversed(chain)):
                    if steps_up == 0:
                        continue  # skip the synset itself
                    name = synset.name().split('.')[0].replace('_', ' ')
                    if not _acceptable(name):
                        continue
                    # NOTE(review): a candidate only replaces the current best
                    # when it lies MORE steps above its seed word, i.e. it is
                    # further from the word, not more specific. Confirm that
                    # this matches the intended "prefer specific" policy.
                    if steps_up > best_steps:
                        best_steps = steps_up
                        best_label = name.title()
                    break  # only the first acceptable hypernym per path
        except Exception:
            continue
    return best_label if best_steps > 0 else None
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # [C3] DISCRIMINATIVE TF-IDF LABEL PER GROUP [GON][TAX] | |
| # Gonçalves: N×M alignment uses text signals from descriptions, not raw codes. | |
| # TaxoGen: "local embedding module for discriminative power at each level." | |
| # Computes TF-IDF across ALL groups so terms unique to THIS group score high. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def get_discriminative_tfidf_label(cluster_texts, all_groups_texts):
    """[C3][GON][TAX] Pick the term that best separates a cluster from its siblings.

    One document is built for the cluster and one for every non-empty entry
    of ``all_groups_texts``. All documents are TF-IDF vectorised together
    (uni-/bi-grams) and each term is scored as its weight in the cluster
    document minus its mean weight over the other documents.

    Args:
        cluster_texts: Strings belonging to the cluster being labelled.
        all_groups_texts: List of string lists, one per comparison group.

    Returns:
        The title-cased highest-contrast term that is longer than 3 characters
        and has contrast above 0.01, or None when there is nothing to compare
        against, no term qualifies, or any step raises.
    """
    try:
        docs = [' '.join(cluster_texts)]
        docs.extend(' '.join(group) for group in all_groups_texts if group)
        if len(docs) < 2:
            return None

        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                                     max_features=300, min_df=1)
        weights = vectorizer.fit_transform(docs)
        vocab = vectorizer.get_feature_names_out()

        own = weights[0].toarray()[0]
        if weights.shape[0] > 1:
            others = weights[1:].toarray().mean(axis=0)
        else:
            others = np.zeros(len(vocab))
        gap = own - others

        # Walk terms from highest to lowest contrast; first qualifying wins.
        for idx in np.argsort(gap)[::-1]:
            if len(vocab[idx]) > 3 and gap[idx] > 0.01:
                return vocab[idx].title()
    except Exception:
        pass
    return None
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # NODE MANIPULATION UTILITIES | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def nmap(nodes):
    """Index ``nodes`` by integer id.

    When several nodes share an id, the last one in ``nodes`` wins.
    """
    lookup = {}
    for node in nodes:
        lookup[int(node['id'])] = node
    return lookup
def next_id(nodes):
    """Return one more than the largest integer node id (1 for an empty list)."""
    return max((int(node['id']) for node in nodes), default=0) + 1
def add_child(nodes, parent, child):
    """Append ``child`` to the ``related`` list of node ``parent`` (in place).

    Does nothing when ``parent`` is not found or the child id is already
    listed. The parent's ``related`` value is replaced by a fresh list.
    """
    target = nmap(nodes).get(int(parent))
    if not target:
        return
    links = list(target.get('related', []))
    child_id = int(child)
    if child_id not in links:
        links.append(child_id)
    target['related'] = links
def remove_child(nodes, parent, child):
    """Drop every occurrence of ``child`` from node ``parent``'s ``related`` list.

    Does nothing when ``parent`` is not found. Mutates the parent in place.
    """
    target = nmap(nodes).get(int(parent))
    if not target:
        return
    target['related'] = [kid for kid in target.get('related', [])
                         if int(kid) != int(child)]
def make_agg(id, name, related=None, op='concat', dtype='determine', desc='',
             shown=True, relation_type='belongs_to', provenance=None):
    """Build the dict for an aggregation (internal) hierarchy node.

    Args:
        id: Node id (coerced to int).
        name: Display name (coerced to str).
        related: Child ids (each coerced to int); None means no children.
        op: Value stored under ``info['operation']``.
        dtype: Value stored under ``dtype``.
        desc: Description; falsy values are stored as ''.
        shown: Stored as a bool under ``isShown``.
        relation_type: Key looked up in ``RELATION_TYPES`` for the human label
            (falls back to 'belongs to').
        provenance: Optional value stored under ``concept_provenance``; the
            key is omitted when this is falsy.

    Returns:
        The new node dict.
    """
    info = {
        'operation': op,
        'usedAttributes': [],
        'formula': '',
        'exec': '',
        'relation_type': relation_type,
        'relation_label': RELATION_TYPES.get(relation_type, 'belongs to'),
    }
    children = [int(x) for x in (related or [])]
    node = {
        'id': int(id),
        'name': str(name),
        'related': children,
        'type': 'aggregation',
        'info': info,
        'isShown': bool(shown),
        'desc': desc or '',
        'dtype': dtype,
        'recover': True,
    }
    if provenance:
        node['concept_provenance'] = provenance
    return node
def get_node(nodes, id):
    """Return the node whose integer id equals ``id``, or None if absent."""
    by_id = nmap(nodes)
    return by_id.get(int(id))
def update_node(nodes, id, **upd):
    """Merge ``upd`` into every node whose id equals ``id`` (in place).

    Returns:
        The same ``nodes`` list that was passed in.
    """
    # Lazy generator: the id comparison happens as each node is reached.
    matching = (node for node in nodes if int(node['id']) == int(id))
    for node in matching:
        node.update(upd)
    return nodes
def parents(nodes, child):
    """Return the ids of all nodes that list ``child`` in ``related``.

    Ids are returned as ints, in the order the nodes appear in ``nodes``.
    """
    owners = []
    for node in nodes:
        linked = [int(x) for x in node.get('related', [])]
        if int(child) in linked:
            owners.append(int(node['id']))
    return owners
def ancestor_names(nodes, nid):
    """[FIX1][HIE] Collect the names of the ancestors of node ``nid``.

    Follows the FIRST parent (as returned by ``parents``) at every step, from
    ``nid`` upward, and stops when a node has no parent or a node id repeats.
    Nodes of type 'root' and nodes without a truthy name are skipped.

    HiExpan (Shen et al., KDD 2018), conflict resolution: a label already on
    the path from the root should not be reused further down. Callers use the
    returned names as that "already on the path" context.

    Returns:
        List of ancestor names as strings, nearest ancestor first.
    """
    by_id = nmap(nodes)
    collected = []
    seen = set()
    current = int(nid)
    while current not in seen:
        seen.add(current)
        owners = parents(nodes, current)
        if not owners:
            break
        current = owners[0]
        holder = by_id.get(current)
        if holder and holder.get('name') and holder.get('type') != 'root':
            collected.append(str(holder['name']))
    return collected
def descendants(nodes, id):
    """Return the ids of every node reachable from ``id`` through ``related``.

    Ids are returned as ints in depth-first pre-order, each at most once. The
    start id itself appears only if a cycle leads back to it.

    Membership is tracked in a set (the previous list lookup made the walk
    quadratic) and the walk is iterative, so very deep hierarchies do not
    hit the interpreter recursion limit. The visiting order is unchanged.

    Args:
        nodes: List of node dicts with an ``id`` and optional ``related``.
        id: Id of the start node.

    Returns:
        List of descendant ids; empty when the start node is unknown.
    """
    # Same id -> node index that nmap() builds (last duplicate wins).
    by_id = {int(n['id']): n for n in nodes}
    start = by_id.get(int(id))
    if not start:
        return []

    order = []
    visited = set()
    # Each stack entry is an iterator over one node's children, so a parent's
    # remaining children are resumed after a child subtree is finished.
    stack = [iter(start.get('related', []))]
    while stack:
        for raw in stack[-1]:
            kid = int(raw)
            if kid in visited:
                continue
            visited.add(kid)
            order.append(kid)
            node = by_id.get(kid)
            if node:
                stack.append(iter(node.get('related', [])))
                break
        else:
            stack.pop()
    return order
def leaf_ids(nodes, id):
    """Return the ids of all 'attribute' nodes at or below node ``id``.

    Ids are returned as ints, each once, in depth-first left-to-right order
    of first occurrence. A node of type 'attribute' is a leaf and its own
    ``related`` list is not followed.

    Every node is expanded at most once, so a cyclic ``related`` graph
    terminates instead of recursing forever and shared sub-trees are not
    walked repeatedly. The walk is iterative, so depth is not bounded by the
    recursion limit.

    Args:
        nodes: List of node dicts with ``id``, ``type`` and ``related``.
        id: Id of the start node.

    Returns:
        List of leaf ids; empty when the start node is unknown.
    """
    # Same id -> node index that nmap() builds (last duplicate wins).
    by_id = {int(n['id']): n for n in nodes}
    leaves = []
    visited = set()
    stack = [int(id)]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        node = by_id.get(current)
        if not node:
            continue
        if node.get('type') == 'attribute':
            leaves.append(current)
            continue
        # Push children reversed so they are popped in their listed order.
        children = [int(c) for c in node.get('related', [])]
        stack.extend(reversed(children))
    return leaves
def move_node(nodes, child, new_parent):
    """Re-parent ``child`` under ``new_parent`` (in place) and return ``nodes``.

    The move is refused (nodes returned untouched) when ``child`` is node 0,
    when it would become its own parent, or when ``new_parent`` is one of its
    descendants. Otherwise the child is detached from all current parents
    before being attached to the new one.
    """
    if int(child) == 0:
        return nodes
    if int(new_parent) == int(child):
        return nodes
    if int(new_parent) in descendants(nodes, child):
        return nodes
    for owner in parents(nodes, child):
        remove_child(nodes, owner, child)
    add_child(nodes, new_parent, child)
    return nodes
def delete_agg(nodes, id, reattach=True):
    """Delete a non-attribute node and optionally hand its children upward.

    Node 0, unknown ids and nodes of type 'attribute' are left alone and the
    original list is returned. Otherwise the node is unlinked from each of
    its parents; with ``reattach`` its children are appended to each of those
    parents.

    Returns:
        The original list when nothing is deleted, else a NEW list without
        the deleted node (parents are still mutated in place).
    """
    if int(id) == 0:
        return nodes
    victim = get_node(nodes, id)
    if not victim or victim.get('type') == 'attribute':
        return nodes
    owners = parents(nodes, id)
    orphans = list(victim.get('related', []))
    for owner in owners:
        remove_child(nodes, owner, id)
        if not reattach:
            continue
        for kid in orphans:
            add_child(nodes, owner, kid)
    return [node for node in nodes if int(node['id']) != int(id)]
def build_parent_map(nodes):
    """Map each child id to the id of the FIRST node that lists it.

    Nodes are scanned in list order; a child listed by several nodes keeps
    the first one. Ids are ints.
    """
    first_owner = {}
    for node in nodes:
        for raw in node.get('related', []):
            kid = int(raw)
            if kid in first_owner:
                continue
            first_owner[kid] = int(node['id'])
    return first_owner
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # EVALUATION HELPERS | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def _eval_cluster_assignments(nodes: list, can: pd.DataFrame) -> list:
    """Map each canonical row to the id of its depth-1 ancestor.

    For every value in ``can['_leaf_id']`` the matching attribute node is
    found through ``node['metadata']['leaf_id']`` and then followed upward
    until its parent is node 0 or it has no parent.

    Returns:
        One id per row of ``can``; -1 for rows without a matching node.
    """
    parent_of = build_parent_map(nodes)

    def _top_group(node_id: int) -> int:
        # Climb until the next step up would be the root (0) or nothing.
        while True:
            above = parent_of.get(node_id, -1)
            if above in (-1, 0):
                return node_id
            node_id = parent_of[node_id]

    node_for_leaf = {}
    for node in nodes:
        if node.get('type') == 'attribute' and 'metadata' in node:
            node_for_leaf[node['metadata']['leaf_id']] = int(node['id'])

    assigned = []
    for leaf_key in can['_leaf_id']:
        if leaf_key in node_for_leaf:
            assigned.append(_top_group(node_for_leaf[leaf_key]))
        else:
            assigned.append(-1)
    return assigned
| def _purity_score(y_true, y_pred) -> float: | |
| from collections import Counter | |
| clusters: dict = {} | |
| for t, p in zip(y_true, y_pred): | |
| clusters.setdefault(p, []).append(t) | |
| correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values()) | |
| return correct / max(len(y_true), 1) | |
def _structural_stats(nodes: list) -> dict:
    """Summarise the shape of the hierarchy.

    Returns:
        Dict with the number of aggregation nodes, the maximum and mean depth
        of attribute nodes (depth = parent links up to a parentless node),
        the mean child count of aggregation nodes, and the percentage of
        aggregation nodes that have exactly one child.
    """
    parent_of = build_parent_map(nodes)

    def _hops_to_top(node_id: int) -> int:
        hops = 0
        while node_id in parent_of:
            node_id = parent_of[node_id]
            hops += 1
        return hops

    groups = [n for n in nodes if n.get('type') == 'aggregation']
    leaf_depths = [_hops_to_top(int(n['id']))
                   for n in nodes if n.get('type') == 'attribute']
    fanouts = [len(g.get('related', [])) for g in groups]
    single_child = fanouts.count(1)

    mean_depth = round(float(np.mean(leaf_depths)), 2) if leaf_depths else 0.0
    mean_fanout = round(float(np.mean(fanouts)), 2) if fanouts else 0.0
    return {
        'n_aggregation_nodes': len(groups),
        'max_depth': int(max(leaf_depths, default=0)),
        'avg_leaf_depth': mean_depth,
        'avg_branching_factor': mean_fanout,
        'singleton_nodes_%': round(100.0 * single_child / max(len(groups), 1), 1),
    }
def path_rows(nodes):
    """Flatten the tree under node 0 into one row dict per visited node.

    Nodes are visited depth-first from id 0. Each row carries the node id,
    name, ' / '-joined path, depth, type, number of leaves below it, its
    relation label and a ready-made display string under ``choice``.
    """
    by_id = nmap(nodes)
    rows = []

    def _visit(node_id, prefix, level):
        node = by_id.get(int(node_id))
        if not node:
            return
        name = node.get('name', str(node_id))
        trail = f'{prefix} / {name}' if prefix else name
        leaf_total = len(leaf_ids(nodes, node_id))
        rows.append({
            'id': int(node_id),
            'name': name,
            'path': trail,
            'depth': level,
            'type': node.get('type', ''),
            'leaf_count': leaf_total,
            'relation': node.get('info', {}).get('relation_label', ''),
            'choice': f'{trail} [{node.get("type","")}, {leaf_total} vars]',
        })
        for kid in node.get('related', []):
            _visit(int(kid), trail, level + 1)

    _visit(0, '', 0)
    return rows
def attr_opts(nodes):
    """Build option dicts (label, id, node) for every 'attribute' node."""
    options = []
    for node in nodes:
        if node.get('type') != 'attribute':
            continue
        options.append({
            'label': f'{node["name"]} (id={node["id"]})',
            'id': int(node['id']),
            'node': node,
        })
    return options
def agg_opts(nodes, root=True):
    """Build option dicts (label, id, node) for aggregation nodes.

    Nodes of type 'root' are included as well unless ``root`` is falsy.
    """
    wanted = ['aggregation']
    if root:
        wanted.append('root')
    options = []
    for node in nodes:
        if node.get('type') in wanted:
            options.append({
                'label': f'{node["name"]} (id={node["id"]})',
                'id': int(node['id']),
                'node': node,
            })
    return options
| def _centroid(embs): | |
| if embs is None or len(embs) == 0: return None | |
| c = np.mean(embs, axis=0) | |
| norm = np.linalg.norm(c) | |
| return c / norm if norm > 0 else c | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # [F4] FACET-GUIDED SUB-SPLITTING HELPERS [CAS][HIE] | |
| # | |
| # Castanet (Stoica & Hearst, 2007): "a single hierarchy conflates dimensions | |
| # and produces hard-to-label, counter-intuitive sub-hierarchies." | |
| # These helpers insert Statistic and Condition tiers below each concept node, | |
| # using the _facet_stat and _facet_cond columns pre-computed by | |
| # precompute_stat_cond_facets(). Only splits when ≥2 valid sub-groups exist | |
| # with ≥2 variables each — consistent with HiExpan's coherence requirement. | |
| # No hardcoding: the column values (Mean/SD/Median, 0/4/12/...) come from data. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Minimum number of variables a facet sub-group must hold before the facet
# helpers below create a node for it.
_MIN_FACET_GROUP = 2  # minimum variables per facet sub-group
def _do_facet_subsplit(sub_can, parent_id, current_path,
                       nodes, leaf_to_id, ensure_path_fn):
    """[F4][CAS] Attach the rows of ``sub_can`` under ``parent_id``, split by facet.

    If a ``_facet_stat`` column is present and yields at least two statistic
    groups of at least ``_MIN_FACET_GROUP`` rows, one tier per statistic value
    is created through ``ensure_path_fn`` and each tier is passed on to
    ``_do_cond_subsplit``; rows outside those groups go directly under
    ``parent_id``. Otherwise the whole frame is passed to
    ``_do_cond_subsplit`` at the current level.

    Statistic values that are empty, 'nan', or that repeat the parent label
    (equal to it, contained in it, or containing it) never form a tier.

    Args:
        sub_can: DataFrame of variables; must have a ``_leaf_id`` column.
        parent_id: Id of the node the rows currently belong under.
        current_path: List of labels leading to ``parent_id``.
        nodes: Node list, mutated in place.
        leaf_to_id: Mapping from ``_leaf_id`` value to node id.
        ensure_path_fn: Callable ``(path, relation=...)`` returning a node id.
    """
    parent_label = str(current_path[-1]).lower() if current_path else ''

    def _echoes_parent(value):
        text = str(value).lower().strip()
        if not text:
            return False
        return text == parent_label or text in parent_label or parent_label in text

    if '_facet_stat' in sub_can.columns:
        stat_tiers = {}
        for value in sub_can['_facet_stat'].fillna('').unique():
            if str(value) in ('', 'nan') or _echoes_parent(value):
                continue
            rows = sub_can[sub_can['_facet_stat'] == value]
            if len(rows) >= _MIN_FACET_GROUP:
                stat_tiers[value] = rows

        if len(stat_tiers) >= 2:
            # Rows that did not land in any valid statistic tier.
            placed = pd.concat(stat_tiers.values()).index
            remainder = sub_can[~sub_can.index.isin(placed)]
            for value, rows in stat_tiers.items():
                tier_label = str(value)
                tier_id = ensure_path_fn(current_path + [tier_label],
                                         relation='is_statistic_of')
                _do_cond_subsplit(rows, tier_id, current_path + [tier_label],
                                  nodes, leaf_to_id, ensure_path_fn)
            for _, row in remainder.iterrows():
                add_child(nodes, parent_id, leaf_to_id[row['_leaf_id']])
            return

    # No usable statistic split: try the condition split at this level.
    _do_cond_subsplit(sub_can, parent_id, current_path,
                      nodes, leaf_to_id, ensure_path_fn)
def _do_cond_subsplit(sub_can, parent_id, current_path,
                      nodes, leaf_to_id, ensure_path_fn):
    """[F4][CAS] Attach the rows of ``sub_can``, split by ``_facet_cond`` if possible.

    A condition value forms a tier only when at least ``_MIN_FACET_GROUP``
    rows share it, and tiers are created only when at least two such values
    exist. Tier nodes are obtained from ``ensure_path_fn`` with
    ``relation='has_condition'``. Rows outside every tier, and all rows when
    no split happens, are attached directly to ``parent_id``.

    [FIX8] The minimum group size applies unconditionally (HiExpan sibling
    coherence: a node wrapping a single variable is not a group). An earlier
    rule that allowed single-variable tiers produced one node per condition
    value with bare-digit labels.

    Args:
        sub_can: DataFrame of variables; must have a ``_leaf_id`` column.
        parent_id: Id of the node the rows belong under.
        current_path: List of labels leading to ``parent_id``.
        nodes: Node list, mutated in place.
        leaf_to_id: Mapping from ``_leaf_id`` value to node id.
        ensure_path_fn: Callable ``(path, relation=...)`` returning a node id.
    """
    def _attach(frame, target_id):
        for _, row in frame.iterrows():
            add_child(nodes, target_id, leaf_to_id[row['_leaf_id']])

    if '_facet_cond' in sub_can.columns:
        # Distinct non-empty condition values present in this sub-group.
        distinct = [value for value in sub_can['_facet_cond'].fillna('').unique()
                    if str(value) not in ('', 'nan')]
        cond_tiers = {}
        for value in distinct:
            rows = sub_can[sub_can['_facet_cond'] == value]
            if len(rows) >= _MIN_FACET_GROUP:
                cond_tiers[value] = rows

        if len(cond_tiers) >= 2:
            placed = pd.concat(cond_tiers.values()).index
            remainder = sub_can[~sub_can.index.isin(placed)]
            for value, rows in cond_tiers.items():
                tier_id = ensure_path_fn(current_path + [str(value)],
                                         relation='has_condition')
                _attach(rows, tier_id)
            _attach(remainder, parent_id)
            return

    # No facet split possible: attach everything directly.
    _attach(sub_can, parent_id)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # MAIN HIERARCHY BUILDER [GON][TAX][HIE][CAS] | |
| # | |
| # Algorithm: | |
| # 1. Create leaf nodes (all variables) | |
| # 2. Group by top-level _group_path (task/domain β structural backbone) | |
| # 3. For each group: embed variables β agglomerative cluster β for each cluster | |
| # compute centroid β score NΓM against concept table β assign best label | |
| # 4. [F4] For each concept cluster: facet sub-split by Statistic β Condition | |
| # 5. Store concept assignment back on each variable in can | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _noun_phrases(text, max_words=4): | |
| """ | |
| Grammatical noun phrases via NLTK POS tagging (used when USE_NOUN_PHRASES=True). | |
| Returns [] if NLTK / the tagger is unavailable, so the caller falls back to | |
| n-grams. Phrases are contiguous runs of adjectives/nouns up to max_words long. | |
| """ | |
| try: | |
| import nltk | |
| for _pkg in ('averaged_perceptron_tagger', 'punkt'): | |
| try: | |
| nltk.data.find(f'taggers/{_pkg}' if 'tagger' in _pkg else f'tokenizers/{_pkg}') | |
| except LookupError: | |
| nltk.download(_pkg, quiet=True) | |
| toks = nltk.word_tokenize(str(text)) | |
| tags = nltk.pos_tag(toks) | |
| except Exception: | |
| return [] | |
| phrases, cur = [], [] | |
| for w, t in tags: | |
| if t.startswith('NN') or t.startswith('JJ'): | |
| cur.append(w) | |
| if len(cur) > max_words: | |
| cur = cur[-max_words:] | |
| else: | |
| if len(cur) >= 1: | |
| phrases.append(' '.join(cur)) | |
| cur = [] | |
| if cur: | |
| phrases.append(' '.join(cur)) | |
| return [p for p in phrases if len(p) >= 3] | |
def _keybert_label(member_texts, cluster_centroid, embedder, ancestor_words=None,
                   corpus_centroid=None, used_labels=None, max_words=4,
                   gen_weight=0.0, diversity=KEYBERT_DIVERSITY, cap=500):
    """KeyBERT-style extractive labeller for a cluster.

    Candidate phrases are taken from the members' descriptions (noun phrases
    when USE_NOUN_PHRASES is on and any are found, otherwise 1..max_words
    n-grams), embedded, and ranked by

        score = (1 - diversity) * cos(phrase, cluster_centroid)
                - diversity * cos(phrase, mean candidate embedding)

    With diversity == 0 this is plain cosine-to-centroid. When USE_CTFIDF is
    on and a corpus IDF table exists, relevance is scaled by 0.5 + 0.5 * idf
    (idf normalised to at most 1). When ``gen_weight`` is non-zero and a
    ``corpus_centroid`` is given, similarity to the corpus centroid is
    subtracted with that weight.

    Candidates are kept in first-seen order (member order, then n-gram length,
    then position) rather than in a set. String-set iteration order changes
    between interpreter runs, which made both the ``cap`` truncation and
    tie-breaking in the final argmax irreproducible.

    Args:
        member_texts: Descriptions of the cluster members.
        cluster_centroid: Embedding vector of the cluster.
        embedder: Object with ``encode(list_of_str)`` returning embeddings.
        ancestor_words: Lower-case words to strip from candidates.
        corpus_centroid: Optional embedding used with ``gen_weight``.
        used_labels: Labels that must not be returned (case-insensitive).
        max_words: Maximum candidate length in words.
        gen_weight: Weight of the corpus-centroid penalty.
        diversity: Weight of the redundancy penalty.
        cap: Maximum number of candidates that are embedded.

    Returns:
        The best candidate, title-cased, or '' when no candidate survives.
    """
    ancestor_words = ancestor_words or set()
    used = {str(u).lower() for u in (used_labels or [])}

    def _content_words(words):
        return [w for w in words if w not in _STOP and w not in ancestor_words]

    # dict used as an insertion-ordered set of candidate phrases.
    ordered = {}
    for t in member_texts:
        raw = re.sub(r'\([^)]*\)', ' ', str(t))  # drop parentheticals
        nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
        if nps:
            for p in nps:
                toks = _content_words(p.lower().split())
                if toks:
                    ordered[' '.join(toks)] = None
        else:
            toks = _content_words(re.findall(r'[a-z][a-z\-]+', raw.lower()))
            for nlen in range(1, max_words + 1):
                for i in range(len(toks) - nlen + 1):
                    ordered[' '.join(toks[i:i + nlen])] = None

    # Junk filter: used labels, pure-number phrases, immediately repeated words.
    cand = [c for c in ordered
            if len(c) >= 4 and c.lower() not in used
            and not c.replace(' ', '').isdigit()
            and not re.search(r'\b(\w+)\s+\1\b', c.lower())]
    if not cand:
        return ''
    cand = cand[:cap]

    embs = np.asarray(embedder.encode(cand), dtype=float)
    sims = cosine_similarity([cluster_centroid], embs)[0]  # relevance
    if USE_CTFIDF and _CORPUS_IDF:
        mx = max(_CORPUS_IDF.values()) or 1.0
        idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), mx) / mx) for c in cand])
        sims = sims * (0.5 + 0.5 * idf)
    if gen_weight and corpus_centroid is not None:
        sims = sims - gen_weight * cosine_similarity([corpus_centroid], embs)[0]
    if diversity > 0 and len(embs) > 1:
        # Redundancy penalty: similarity to the mean of all candidates.
        generic = cosine_similarity(embs.mean(axis=0, keepdims=True), embs)[0]
        score = (1.0 - diversity) * sims - diversity * generic
    else:
        score = sims
    return cand[int(np.argmax(score))].title()
def _keybert_candidates(member_texts, ancestor_words=None, used_labels=None,
                        max_words=3, cap=500):
    """Return the un-ranked candidate phrases for a cluster's descriptions.

    Parentheticals and URLs are removed from each text. Candidates are noun
    phrases (when USE_NOUN_PHRASES is on and any are found) or 1..max_words
    n-grams, with stop words, ancestor words and label boilerplate removed.

    A candidate is dropped when it is shorter than 4 characters, already
    used, purely numeric, repeats a word back to back, is an "x y x y"
    repeat, or is a single word recognised by ``_is_acronym``.

    Candidates are returned in first-seen order (member order, then n-gram
    length, then position). They were previously collected in a set, whose
    iteration order for strings varies between interpreter runs, so the
    ``cap`` truncation kept an arbitrary subset.

    Args:
        member_texts: Descriptions of the cluster members.
        ancestor_words: Set of lower-case words to strip from candidates.
        used_labels: Labels that must not be returned (case-insensitive).
        max_words: Maximum candidate length in words.
        cap: Maximum number of candidates returned.

    Returns:
        List of at most ``cap`` candidate phrases (lower-case).
    """
    ancestor_words = ancestor_words or set()
    used = {str(u).lower() for u in (used_labels or [])}
    block = _STOP | ancestor_words | _LABEL_BOILERPLATE

    # dict used as an insertion-ordered set of candidate phrases.
    ordered = {}
    for t in member_texts:
        raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
        nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
        if nps:
            for p in nps:
                toks = [w for w in p.lower().split() if w not in block]
                if toks:
                    ordered[' '.join(toks)] = None
        else:
            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
                    if w not in block]
            for nlen in range(1, max_words + 1):
                for i in range(len(toks) - nlen + 1):
                    ordered[' '.join(toks[i:i + nlen])] = None

    def _ok(c):
        words = c.split()
        if len(c) < 4 or c.lower() in used or c.replace(' ', '').isdigit():
            return False
        if re.search(r'\b(\w+)\s+\1\b', c.lower()):  # adjacent word repeat
            return False
        if len(words) == 4 and words[:2] == words[2:]:  # phrase repeat "x y x y"
            return False
        # Fragments of 3 characters or fewer are already rejected above, so
        # only the acronym test remains for single words.
        if len(words) == 1 and _is_acronym(c):
            return False
        return True

    return [c for c in ordered if _ok(c)][:cap]
| def _concept_title(text): | |
| """ | |
| Extract the human-written concept TITLE from a metadata description. | |
| Data-dictionary descriptions read `Title: long definition sentence`. The title | |
| is the colon-segment immediately BEFORE the longest segment (the definition). | |
| Structural, no vocabulary β works whether the title is segment 0 | |
| ("DMS Correct Latency SD: The standard deviation...") or later | |
| ("KEY: SWM Between errors: The number of times..."). Returns the title only. | |
| """ | |
| t = str(text).strip() | |
| if not t: | |
| return '' | |
| segs = t.split(':') | |
| if len(segs) < 2: | |
| title = t | |
| else: | |
| lens = [len(s.strip()) for s in segs] | |
| def_i = max(range(1, len(segs)), key=lambda i: lens[i]) # longest = definition | |
| title = segs[def_i - 1].strip() or t | |
| # A genuine concept title is short. If what we extracted is a full SENTENCE | |
| # (e.g. MOT has no "Title: def" structure β just prose), it is not a title; | |
| # return '' so the caller falls back to the embedding concept scorer instead of | |
| # labelling from a sentence. Length-based, no vocabulary. | |
| if len(re.findall(r'[A-Za-z]+', title)) > 9: | |
| return '' | |
| return title | |
def _title_cluster_label(member_titles, sibling_title_lists, ancestor_words=None,
                         max_words=4, used_labels=None):
    """Label a cluster from the title phrases its members share.

    Every title is lower-cased, stripped of parentheticals and URLs, and
    tokenised; stop words, ancestor words and label boilerplate are removed,
    so a phrase can never consist of ancestor words. All 1..max_words n-grams
    are candidate phrases. A phrase must occur in at least two member titles
    and must not be a used label. It is scored as

        (member share - sibling share) * (1 + 0.25 * (words - 1))

    so phrases that are common here, rare among siblings and longer win.

    Phrases are generated in a fixed order (title order, then n-gram length,
    then position) and the first phrase reaching the best score wins.
    Previously phrases came out of a set, whose iteration order for strings
    varies between interpreter runs, so equal-scoring phrases could yield a
    different label on every run.

    Args:
        member_titles: Titles of the cluster members.
        sibling_title_lists: List of title lists, one per sibling cluster.
        ancestor_words: Words to strip from titles (case-insensitive).
        max_words: Maximum phrase length in words.
        used_labels: Labels that must not be returned (case-insensitive).

    Returns:
        The winning phrase, title-cased, or '' when nothing qualifies.
    """
    ancestor_words = {w.lower() for w in (ancestor_words or [])}
    used_labels = {str(u).lower() for u in (used_labels or [])}

    def _phrases(title):
        # Ordered, de-duplicated phrases of one title (dict keys keep order).
        t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', title.lower()))
        toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
                if w not in _STOP and w not in ancestor_words
                and w not in _LABEL_BOILERPLATE]
        out = {}
        for nlen in range(1, max_words + 1):
            for i in range(len(toks) - nlen + 1):
                out[' '.join(toks[i:i + nlen])] = None
        return out

    M = len(member_titles)
    if M == 0:
        return ''

    # Document frequency of each phrase among members and among siblings.
    member_df = defaultdict(int)
    for title in member_titles:
        for ph in _phrases(title):
            member_df[ph] += 1

    sib_flat = [t for lst in (sibling_title_lists or []) for t in lst]
    S = max(1, len(sib_flat))
    sib_df = defaultdict(int)
    for t in sib_flat:
        for ph in _phrases(t):
            sib_df[ph] += 1

    best, best_score = '', -1.0
    for ph, mdf in member_df.items():
        if mdf < 2:  # must be shared by at least two members
            continue
        if ph in used_labels:  # do not repeat a sibling
            continue
        n_words = len(ph.split())
        score = (mdf / M - sib_df.get(ph, 0) / S) * (1.0 + 0.25 * (n_words - 1))
        if score > best_score:
            best_score, best = score, ph
    return best.title() if best else ''
| def _raw_title(text): | |
| """Title segment, keeping parentheticals (the error TYPE lives in them).""" | |
| segs = str(text).split(':') | |
| if len(segs) < 2: | |
| return str(text).strip() | |
| lens = [len(s.strip()) for s in segs] | |
| di = max(range(1, len(segs)), key=lambda i: lens[i]) | |
| return segs[di - 1].strip() | |
def _label_from_own_title(title, ancestor_words, max_words=4):
    """[Fix5] Label a singleton variable from its OWN title.

    The title is lower-cased and stripped of parentheticals and URLs; stop
    words, ancestor words and label boilerplate are removed from its tokens.

    Returns:
        The first ``max_words`` remaining words, title-cased, or '' when no
        word remains or more than 7 remain (prose rather than a title).
    """
    lowered = str(title).lower()
    without_parens = re.sub(r'\([^)]*\)', ' ', lowered)
    cleaned = _URL_RE.sub(' ', without_parens)

    kept = []
    for word in re.findall(r'[a-z][a-z\-]+', cleaned):
        if word in _STOP or word in ancestor_words or word in _LABEL_BOILERPLATE:
            continue
        kept.append(word)

    if not kept or len(kept) > 7:
        return ''
    return ' '.join(kept[:max_words]).title()
def _strip_leading_prose(label):
    """Drop leading words that WordNet knows as verbs but not as nouns.

    Example: 'Include Shapes' -> 'Shapes'. At least one word is always kept.
    Stripping stops at the first word that has a noun sense, or as soon as a
    WordNet lookup raises.

    Returns:
        ``label`` unchanged when WordNet is unavailable; otherwise the
        remaining words joined by single spaces.
    """
    if not _WORDNET_AVAILABLE:
        return label
    remaining = label.split()
    while len(remaining) > 1:
        head = remaining[0].lower()
        try:
            verb_only = bool(wn.synsets(head, pos=wn.VERB)) and not wn.synsets(head, pos=wn.NOUN)
        except Exception:
            break
        if not verb_only:
            break
        remaining = remaining[1:]
    return ' '.join(remaining)
def _subsplit_concept_by_title(nodes):
    """[Fix2] Split a concept node's leaves by a distinctive title descriptor.

    For every aggregation node with at least 4 direct attribute children, the
    children's raw titles are tokenised. Stop words, words from the node's own
    and ancestors' names, and tokens present in more than 70% of the leaves
    are dropped. The remaining 1-3 word phrases are assigned greedily, longer
    phrases first, then phrases covering more leaves. A phrase forms a group
    when at least 2 still-unassigned leaves carry it. The node is split only
    when at least 2 groups result; each group becomes a new aggregation child
    that takes over its leaves.

    Ties between phrases of equal length and equal coverage are broken
    alphabetically. Without this the order came from iterating a set of
    strings, which varies between interpreter runs, so the greedy pass could
    produce different groups each time. Leaves are moved in their original
    child order.

    Args:
        nodes: Node list; mutated in place (new nodes are appended).

    Returns:
        The same ``nodes`` list.
    """
    pm = build_parent_map(nodes)

    def _anc_words(nid):
        # Words (3+ letters) from the node's own name and every ancestor
        # name, excluding nodes of type 'root'.
        w, x = set(), nid
        m = nmap(nodes)
        while True:
            nd = m.get(x)
            if nd and nd.get('type') != 'root':
                w |= set(re.findall(r'[a-z]{3,}', str(nd.get('name', '')).lower()))
            if x not in pm:
                break
            x = pm[x]
        return w

    # Snapshot: sub-group nodes created below are not themselves re-split.
    for node in [n for n in nodes if n.get('type') == 'aggregation']:
        nid = int(node['id'])
        m = nmap(nodes)
        leaf_children = [int(c) for c in node.get('related', [])
                         if m.get(int(c), {}).get('type') == 'attribute']
        if len(leaf_children) < 4:
            continue
        aw = _anc_words(nid)

        # Tokenise each leaf title once.
        title_tokens = {}
        for cid in leaf_children:
            ln = m[cid]
            title = _raw_title(ln.get('semantic_desc', ln.get('desc', '')))
            title_tokens[cid] = re.findall(r'[a-z][a-z\-]+', title.lower())

        # Tokens present in (nearly) all leaves describe the parent, not a
        # sub-category (e.g. "sd" under a Standard Deviation node).
        nL = len(leaf_children)
        tok_df = defaultdict(int)
        for cid in leaf_children:
            for w in set(title_tokens[cid]):
                if w not in _STOP and w not in aw:
                    tok_df[w] += 1
        common = {w for w, c in tok_df.items() if c > 0.7 * nL}

        # phrase -> set of leaves whose title contains it
        p2l = defaultdict(set)
        for cid in leaf_children:
            toks = [w for w in title_tokens[cid]
                    if w not in _STOP and w not in aw and w not in common]
            phs = set()
            for nl in (3, 2, 1):
                for i in range(len(toks) - nl + 1):
                    phs.add(' '.join(toks[i:i + nl]))
            for p in phs:
                p2l[p].add(cid)

        covered, groups = set(), []
        ranked = sorted(p2l.items(),
                        key=lambda kv: (-len(kv[0].split()), -len(kv[1]), kv[0]))
        for p, cids in ranked:
            avail = cids - covered
            if len(avail) >= 2:
                groups.append((p, avail))
                covered |= avail
        if len(groups) < 2:
            continue

        for p, cids in groups:
            sub_lbl = _strip_leading_prose(p.title()) or p.title()
            snid = next_id(nodes)
            nodes.append(make_agg(snid, sub_lbl, desc=f'Sub-group: {sub_lbl}',
                                  relation_type='belongs_to'))
            add_child(nodes, nid, snid)
            for cid in leaf_children:
                if cid not in cids:
                    continue
                remove_child(nodes, nid, cid)
                add_child(nodes, snid, cid)
    return nodes
def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
                       concept_table, concept_embs, ensure_path_fn,
                       n_clusters_max, can, ref_centroids=None,
                       corpus_centroid=None):
    """
    Cluster the variables in ``tdf`` and attach them under labelled concept nodes.

    Modifies ``nodes`` in place (through ``add_child`` / ``ensure_path_fn``) and,
    in the clustered path, writes ``_concept_label`` / ``_concept_score`` /
    ``_concept_source`` into ``can`` for each variable. Returns ``None``.

    Flow:
      1. Empty ``tdf`` -> return immediately.
      2. Small-group path (fewer than 3 rows, ``concept_embs`` is None, or an
         empty ``concept_table``): every variable is labelled on its own from its
         title (KeyBERT over its text as fallback) and attached under
         ``path_prefix + [label]``, or directly under ``path_prefix`` when no
         usable label exists. NOTE(review): this path does not write the
         ``_concept_*`` columns of ``can``.
      3. Otherwise: agglomerative clustering (average linkage) on a precomputed
         cosine-distance matrix, with same-``_code_family`` pairs pulled together
         by a factor of 0.80. Any exception during clustering falls back to one
         single cluster.
      4. Per cluster: singletons are labelled from their own title; larger
         clusters get a label chosen from KeyBERT phrases plus the title-derived
         phrase, ranked by a weighted sum of relevance, title similarity and
         sibling contrast, then passed through ``assign_concept_label`` [HIE].
         Colliding sibling labels are qualified so no two siblings share a name.

    Args:
        tdf: DataFrame slice of the variables to place (reads ``_semantic_text``
            or ``_text``, ``_leaf_id`` and optionally ``_code_family``).
        path_prefix: list of ancestor labels under which to attach.
        nodes: node list, mutated in place.
        leaf_to_id: mapping from ``_leaf_id`` to node id.
        embedder: object exposing ``encode(list_of_str)``.
        concept_table, concept_embs: external concept table and its embeddings;
            in this function they gate the small-group path and are forwarded to
            ``score_concepts_for_cluster``.
        ensure_path_fn: callable returning (creating if needed) the node id for a
            label path; called with ``relation=`` / ``provenance=`` keywords.
        n_clusters_max: upper bound on the number of clusters.
        can: full canonical DataFrame; read for ``_group_path`` and updated via ``.at``.
        ref_centroids, corpus_centroid: forwarded unchanged to
            ``score_concepts_for_cluster``.
    """
    sem_col = '_semantic_text' if '_semantic_text' in tdf.columns else '_text'
    texts = tdf[sem_col].fillna('').tolist()
    # Concept TITLES (pre-definition) - clean label vocabulary, never boilerplate.
    titles = [_concept_title(t) for t in texts]
    n = len(tdf)
    if n == 0:
        return
    # Ancestor names + their embeddings for semantic parent-duplication filter [HIE]
    ancestor_names = list(path_prefix)
    ancestor_embs = (embedder.encode(ancestor_names)
                     if ancestor_names else None)
    # [ChangeC] Discover top-level task tokens from the full dataset (data-driven):
    # the lower-cased first segment of every '_group_path', minus placeholders.
    _top_level_tasks: set = set()
    if '_group_path' in can.columns:
        for _gp in can['_group_path'].dropna().astype(str):
            _f = _gp.split(' > ')[0].strip()
            if _f and _f.lower() not in ('ungrouped', 'nan', ''):
                _top_level_tasks.add(_f.lower())
    # Words a label must not merely repeat: ancestor-name words + task names.
    _aw_base = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks
    if n < 3 or concept_embs is None or len(concept_table) == 0:
        # Too few variables to cluster - label each from its own title [Fix5], or
        # KeyBERT over its description when no title exists. ensure_path merges it
        # into an existing concept of the same name.
        pid = ensure_path_fn(path_prefix)
        _small = embedder.encode(texts) if texts else None
        for i, (_, row) in enumerate(tdf.iterrows()):
            lbl = _label_from_own_title(titles[i], _aw_base)
            if not lbl and _small is not None:
                lbl = _keybert_label([texts[i]], _small[i], embedder,
                                     ancestor_words=_aw_base, used_labels=set(),
                                     max_words=2, gen_weight=0.3,
                                     diversity=KEYBERT_DIVERSITY)
            # A label equal to an ancestor name adds no level: attach to the parent.
            tgt = ensure_path_fn(path_prefix + [lbl]) if lbl and lbl.lower() not in \
                {a.lower() for a in ancestor_names} else pid
            add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
        return
    # Embed variables
    var_embs = embedder.encode(texts)
    # Centroid of THIS task/subgroup - reference for the scorer's `home` signal.
    own_group_centroid = _centroid(var_embs)
    # Choose number of clusters adaptively (n >= 3 here, so the result is >= 1).
    n_clust = min(n_clusters_max, max(2, n // 3), n)
    # Agglomerative clustering on variable embeddings [TAX][GON]
    # [FIX7][GON][TAX] Code-family cohesion bias:
    #   Variables sharing the same _code_family (e.g. DMSL, SWMBE) are structurally
    #   related by the instrument's own naming convention. Reduce their pairwise
    #   cosine distance by a factor of 0.80 so the clusterer prefers to keep them
    #   together. Rationale - Goncalves ESWC 2019: structural prefix affinity;
    #   Taxonomizer IEEE TVCG 2019: compound labels align with code morphology.
    #   Factor 0.80 is a cohesion weight, not a fixed threshold; it is applied
    #   multiplicatively to same-family pairs only.
    try:
        dist = cosine_distances(var_embs).astype(float)
        np.fill_diagonal(dist, 0.0)
        # Apply code-family cohesion if _code_family is available
        if '_code_family' in tdf.columns:
            families = tdf['_code_family'].fillna('').astype(str).tolist()
            _COHESION_FACTOR = 0.80  # same-family pairs: distance x 0.80 (pulled together)
            for ii in range(n):
                for jj in range(ii + 1, n):
                    if families[ii] and families[ii] == families[jj]:
                        dist[ii, jj] *= _COHESION_FACTOR
                        dist[jj, ii] *= _COHESION_FACTOR
        c_lbls = AgglomerativeClustering(n_clusters=n_clust, metric='precomputed',
                                         linkage='average').fit_predict(dist)
    except Exception:
        # Best-effort: if clustering fails for any reason, treat all rows as one cluster.
        c_lbls = np.zeros(n, dtype=int)
        n_clust = 1
    rows_list = list(tdf.iterrows())
    # [C3] Pre-compute all cluster text groups for discriminative TF-IDF [GON][TAX]
    # and each cluster's centroid (used as sibling-contrast references [CAS]).
    all_cluster_texts = []
    all_cluster_titles = []
    all_centroids = []
    for k in range(n_clust):
        mask = c_lbls == k
        cluster_idxs = [i for i, m in enumerate(mask) if m]
        all_cluster_texts.append([texts[i] for i in cluster_idxs] if cluster_idxs else [])
        all_cluster_titles.append([titles[i] for i in cluster_idxs] if cluster_idxs else [])
        all_centroids.append(_centroid(var_embs[mask]) if cluster_idxs else None)
    # Track used sibling labels (string) and their embeddings (semantic dedup) [TAX][GON]
    used_sibling_labels = set()
    sibling_label_embs = []
    parent_pid = ensure_path_fn(path_prefix)  # get parent node id upfront
    for k in range(n_clust):
        mask = c_lbls == k
        cluster_idxs = [i for i, m in enumerate(mask) if m]
        if not cluster_idxs:
            continue
        cluster_texts_k = [texts[i] for i in cluster_idxs]
        cluster_emb = _centroid(var_embs[mask])
        # [Fix5] Singleton: label it from its OWN title and attach under that concept
        # (ensure_path merges it into an existing same-named concept if one exists),
        # instead of dumping it unclassified under the task.
        if len(cluster_idxs) == 1:
            _, row = rows_list[cluster_idxs[0]]
            lbl = _label_from_own_title(titles[cluster_idxs[0]], _aw_base)
            src = 'singleton_title'
            if not lbl and cluster_emb is not None:
                lbl = _keybert_label([cluster_texts_k[0]], cluster_emb, embedder,
                                     ancestor_words=_aw_base,
                                     used_labels=used_sibling_labels,
                                     max_words=2, gen_weight=0.3,
                                     diversity=KEYBERT_DIVERSITY)
                src = 'singleton_keybert'
            if lbl and lbl.lower() not in {a.lower() for a in ancestor_names}:
                tgt = ensure_path_fn(path_prefix + [lbl], relation='belongs_to')
                can.at[row.name, '_concept_label'] = lbl
            else:
                # No usable label: attach to the parent and record the parent's name.
                tgt = parent_pid
                can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root'
            add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
            can.at[row.name, '_concept_score'] = 0.0
            can.at[row.name, '_concept_source'] = src
            continue
        if cluster_emb is not None:
            # Sibling centroids = every OTHER cluster in this parent (contrast ref) [CAS]
            sibling_centroids = [all_centroids[j] for j in range(n_clust)
                                 if j != k and all_centroids[j] is not None]
            # NOTE(review): `scores` is assigned here and in the else-branch but is
            # not read anywhere later in this function.
            scores = score_concepts_for_cluster(
                cluster_emb, concept_embs, concept_table, cluster_texts_k,
                n_total_vars=len(can),
                member_embs=var_embs[mask],
                sibling_centroids=np.array(sibling_centroids) if sibling_centroids else None,
                ref_centroids=ref_centroids,  # all top-level task centroids
                corpus_centroid=corpus_centroid,
                own_group_centroid=own_group_centroid,  # current task -> home signal
            )
        else:
            scores = []
        # -- TITLE-SEEDED LABEL SELECTION (Guided KeyBERT) --------------------------
        # The label is FORMED FROM THE DESCRIPTIONS: candidates are KeyBERT phrases
        # extracted from the cluster's member descriptions. The pre-colon TITLE does
        # NOT override - it is a ranking SEED. The score computed below is:
        #     LABEL_W_RELEVANCE * cos(cand, cluster centroid)   # description fit
        #   + LABEL_W_TITLE     * cos(cand, title embedding)    # title influence
        #   + LABEL_W_CONTRAST  * contrast(vs siblings)
        # So the displayed label is a description-derived phrase, pulled toward the
        # human-canonical title phrasing. Set LABEL_W_TITLE=0 for a pure-description
        # ablation. The title phrase is also added as ONE candidate so a clean title
        # can still win on merit.
        # NOTE(review): this set is built from the same inputs as `_aw_base` above.
        ancestor_words = set(re.findall(r'[a-z]{3,}',
                                        ' '.join(ancestor_names).lower())) | _top_level_tasks
        member_titles_k = [titles[i] for i in cluster_idxs]
        sibling_title_lists = [all_cluster_titles[j] for j in range(n_clust) if j != k]
        sibling_texts = [all_cluster_texts[j] for j in range(n_clust) if j != k]
        # Pre-colon title - used only as the SEED ANCHOR (and one candidate), never a
        # direct override.
        title_label = _title_cluster_label(member_titles_k, sibling_title_lists,
                                           ancestor_words=ancestor_words,
                                           used_labels=used_sibling_labels)
        title_emb = (embedder.encode([title_label])[0]
                     if title_label else None)
        # Candidate phrases drawn ONLY from the cluster's DESCRIPTIONS (KeyBERT) plus
        # the pre-colon title. External ontology sources (Cognitive Atlas / Wikidata /
        # WordNet / PubMed) are deliberately NOT candidates - per design they inform the
        # embedding space / semantic understanding only, and must never name a node.
        kb_cands = _keybert_candidates(cluster_texts_k, ancestor_words=ancestor_words,
                                       used_labels=used_sibling_labels, max_words=3)
        pool_src = [(c, 'keybert') for c in kb_cands]
        if title_label:
            pool_src.append((title_label, 'description_title'))
        # Dedup; title's source tag takes priority over keybert when the phrase matches.
        seen_pool = {}
        for lbl, src in pool_src:
            key = lbl.lower()
            if key not in seen_pool or src == 'description_title':
                seen_pool[key] = (lbl, src)
        pool = [v[0] for v in seen_pool.values()]
        pool_srcs = [v[1] for v in seen_pool.values()]
        keybert_label = kb_cands[0] if kb_cands else ''  # for fallback only
        candidate_scores = []
        # `sibling_centroids` is only bound when cluster_emb is not None, which is
        # exactly the condition guarding its use below.
        if pool and cluster_emb is not None:
            cand_embs = np.asarray(embedder.encode(pool), dtype=float)
            relevance = cosine_similarity([cluster_emb], cand_embs)[0]
            # c-TF-IDF: down-weight dataset-wide boilerplate (low corpus IDF) so generic
            # phrases ("test", "description", "measure", "scores") lose to distinctive ones.
            # Phrases missing from the corpus IDF table get the maximum weight.
            if USE_CTFIDF and _CORPUS_IDF:
                _mx = max(_CORPUS_IDF.values()) or 1.0
                _idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), _mx) / _mx) for c in pool])
                relevance = relevance * (0.5 + 0.5 * _idf)
            if sibling_centroids:
                sib_sim = cosine_similarity(cand_embs,
                                            np.asarray(sibling_centroids, dtype=float)).max(axis=1)
                contrast = np.clip(relevance - sib_sim, 0.0, 1.0)
            else:
                contrast = np.zeros(len(pool))
            # Title SEED: cosine of each description-derived candidate to the title.
            if title_emb is not None:
                title_sim = cosine_similarity(cand_embs, [title_emb])[:, 0]
            else:
                title_sim = np.zeros(len(pool))
            for i, cand in enumerate(pool):
                hyb = (LABEL_W_RELEVANCE * float(relevance[i])
                       + LABEL_W_TITLE * float(title_sim[i])
                       + LABEL_W_CONTRAST * float(contrast[i]))
                candidate_scores.append({
                    'label': cand,
                    'score': hyb,
                    'embedding_sim': float(relevance[i]),
                    'coverage': float(relevance[i]),
                    'contrast': float(contrast[i]),
                    'specificity': 0.0,
                    'string_sim': float(title_sim[i]),  # title seed alignment
                    'source': pool_srcs[i],
                    'broader_relations': [],
                    '_emb': cand_embs[i],
                })
            # Best hybrid score first.
            candidate_scores.sort(key=lambda x: -x['score'])
        fallback_label = (title_label
                          or keybert_label
                          or get_discriminative_tfidf_label(cluster_texts_k, sibling_texts)
                          or f'Group {k+1}')
        label, provenance = assign_concept_label(
            candidate_scores,
            fallback=fallback_label,
            min_score=0.0,
            ancestor_names=ancestor_names,
            used_sibling_labels=used_sibling_labels,
            top_level_tasks=_top_level_tasks,
            ancestor_embs=ancestor_embs,
            sibling_label_embs=sibling_label_embs,
        )
        # Skip the node only when there is truly NO concept name (empty title, no
        # scored candidate -> a bare "Group k"). Title labels are trusted and kept.
        if (not title_label) and (not candidate_scores) and label.startswith('Group '):
            for ci in cluster_idxs:
                _, row = rows_list[ci]
                add_child(nodes, parent_pid, leaf_to_id[row['_leaf_id']])
                can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root'
                can.at[row.name, '_concept_score'] = 0.0
                can.at[row.name, '_concept_source'] = 'weak_label_direct'
            continue
        # WordNet hypernym - ONLY when there is no title concept name.
        if (not title_label) and (label == fallback_label
                                  or label.lower() in {a.lower() for a in ancestor_names}):
            wn_label = wordnet_hypernym_fallback(cluster_texts_k, excluded_names=ancestor_names)
            if wn_label:
                label = wn_label
                provenance['node_label'] = label
                provenance['source_evidence'] = ['wordnet_hypernym']
        # Guarantee distinct siblings: qualify a colliding label with a distinguishing
        # word from this cluster's own titles (never emit a duplicate sibling).
        if label.lower() in used_sibling_labels:
            from collections import Counter as _Counter
            _cnt = _Counter()
            for _tt in member_titles_k:
                for _w in re.findall(r'[a-z]{3,}', _tt.lower()):
                    # NOTE(review): `_w not in label.lower()` is a substring test on
                    # the label string, not a whole-word test.
                    if _w not in _STOP and _w not in ancestor_words and _w not in label.lower():
                        _cnt[_w] += 1
            _extra = next((w for w, _ in _cnt.most_common()
                           if f'{label} {w}'.lower() not in used_sibling_labels), None)
            if _extra:
                label = f'{label} {_extra.title()}'
            else:
                # No distinguishing word available: fall back to a numeric suffix.
                _i = 2
                while f'{label} {_i}'.lower() in used_sibling_labels:
                    _i += 1
                label = f'{label} {_i}'
        used_sibling_labels.add(label.lower())  # register for sibling dedup (string + emb)
        try:
            sibling_label_embs.append(embedder.encode([label])[0])
        except Exception:
            # Best-effort: without the embedding only the string-level dedup applies.
            pass
        pid = ensure_path_fn(path_prefix + [label],
                             relation='belongs_to', provenance=provenance)
        # Store concept assignment on can (needed by Castanet facets later).
        # Provenance reflects the HYBRID winner (title / keybert), not the old
        # semantic-only scorer - so the exported labels CSV is accurate.
        for ci in cluster_idxs:
            _, row = rows_list[ci]
            can.at[row.name, '_concept_label'] = label
            can.at[row.name, '_concept_score'] = provenance.get('confidence', 0.0)
            can.at[row.name, '_concept_source'] = (provenance.get('source_evidence') or ['fallback'])[0]
        # Attach the cluster's variables directly under the concept node. The former
        # Statistic/Condition facet sub-split is removed: the statistic tier came from
        # a hardcoded vocabulary (now produced data-drivenly by _nest_by_measure), and
        # the numeric Condition tier produced bare-digit nodes (0/4/12) that inflated
        # singleton%/n_agg and moved the tree away from gold. Castanet's Condition facet
        # still exists as a separate parallel view via detect_facets() - not a tier.
        for ci in cluster_idxs:
            _, row = rows_list[ci]
            add_child(nodes, pid, leaf_to_id[row['_leaf_id']])
| def _remove_phrase(tokens, phrase_tokens): | |
| """Remove the first contiguous occurrence of phrase_tokens from tokens.""" | |
| nlen = len(phrase_tokens) | |
| for i in range(len(tokens) - nlen + 1): | |
| if tokens[i:i + nlen] == phrase_tokens: | |
| return tokens[:i] + tokens[i + nlen:] | |
| return [t for t in tokens if t not in phrase_tokens] | |
def _nest_by_measure(nodes):
    """
    [Fix2] Group concept siblings that SHARE a measure phrase under a new Measure
    parent, renaming each child to its residual statistic. Example under DMS:

        Mean Correct Latency, Median Correct Latency, Correct Latency Standard Deviation
            -> Correct Latency -> { Mean, Median, Standard Deviation }

    For each top-level task (aggregation whose parent is node 0) with at least 3
    aggregation children, the 2- or 3-word phrase shared by the most children
    (ties: the longer phrase) becomes the measure; this repeats until no phrase
    is shared by two or more children. A child whose label equals the measure is
    dissolved and its children move under the measure node. No statistic list is
    hardcoded. ``nodes`` is modified in place and returned.
    """
    def _content_tokens(text):
        # Lower-cased word tokens with stop words removed.
        return [w for w in re.findall(r'[a-z][a-z\-]+', text.lower()) if w not in _STOP]

    parent_of = build_parent_map(nodes)
    top_level = [int(nd['id']) for nd in nodes
                 if nd.get('type') == 'aggregation' and parent_of.get(int(nd['id'])) == 0]
    for task_id in top_level:
        while True:
            by_id = nmap(nodes)
            task = by_id.get(task_id)
            if not task:
                break
            concept_ids = [int(c) for c in task.get('related', [])
                           if by_id.get(int(c), {}).get('type') == 'aggregation']
            if len(concept_ids) < 3:
                break
            names = {cid: str(by_id[cid]['name']) for cid in concept_ids}

            # phrase -> set of sibling ids whose label contains that phrase
            sharers = defaultdict(set)
            for cid, name in names.items():
                toks = _content_tokens(name)
                for width in (3, 2):
                    for start in range(len(toks) - width + 1):
                        sharers[' '.join(toks[start:start + width])].add(cid)
            shared = [(phrase, ids) for phrase, ids in sharers.items() if len(ids) >= 2]
            if not shared:
                break

            # Most widely shared phrase wins; ties go to the phrase with more words.
            measure, members = max(shared, key=lambda pair: (len(pair[1]), len(pair[0].split())))
            measure_toks = measure.split()
            measure_title = measure.title()
            measure_id = next_id(nodes)
            nodes.append(make_agg(measure_id, measure_title, desc=f'Measure: {measure_title}',
                                  relation_type='belongs_to'))
            add_child(nodes, task_id, measure_id)

            for cid in list(members):
                remove_child(nodes, task_id, cid)
                residual = _remove_phrase(_content_tokens(names[cid]), measure_toks)
                if len(residual) == 1:
                    # A lone modifier ("Double", "Within") reads poorly on its own:
                    # qualify it with the measure's longest word (ties -> last),
                    # e.g. "Double" -> "Double Errors".
                    anchor = max(measure_toks, key=lambda w: (len(w), measure_toks.index(w)))
                    if anchor not in residual:
                        residual = residual + [anchor]
                if not residual:
                    # Child label == measure: dissolve it, its children go under
                    # the new measure node.
                    for grandchild in list(by_id[cid].get('related', [])):
                        add_child(nodes, measure_id, int(grandchild))
                    nodes[:] = [x for x in nodes if int(x['id']) != cid]
                    continue
                by_id[cid]['name'] = ' '.join(residual).title()
                add_child(nodes, measure_id, cid)
    return nodes
| def _singular(w): | |
| return w[:-1] if (len(w) > 3 and w.endswith('s') and not w.endswith('ss')) else w | |
def _nest_by_category(nodes):
    """
    [Fix3] Add a Measure-CATEGORY tier: group a task's concept siblings by their
    HEAD noun (last non-stop word, singularised) when two or more share it, e.g.

        Total Correct, Percent Correct   -> Correct -> { Total, Percent }
        Total Errors, Probability Error  -> Errors  -> { Total, Probability }

    Only the HEAD is used (not any shared word) so that "Correct Latency"
    (head = Latency) is NOT pulled under "Correct". Children are renamed to the
    residual (label minus the head); a child with an empty residual is dissolved
    and its children move under the category node. Applies to top-level tasks
    with at least 3 aggregation children. No category list is hardcoded.
    ``nodes`` is modified in place and returned.
    """
    def _content_tokens(text):
        # Lower-cased word tokens with stop words removed.
        return [w for w in re.findall(r'[a-z][a-z\-]+', text.lower()) if w not in _STOP]

    parent_of = build_parent_map(nodes)
    top_level = [int(nd['id']) for nd in nodes
                 if nd.get('type') == 'aggregation' and parent_of.get(int(nd['id'])) == 0]
    for task_id in top_level:
        by_id = nmap(nodes)
        task = by_id.get(task_id)
        if not task:
            continue
        concept_ids = [int(c) for c in task.get('related', [])
                       if by_id.get(int(c), {}).get('type') == 'aggregation']
        if len(concept_ids) < 3:
            continue

        members_by_head = defaultdict(list)   # singular head -> sibling ids
        surface_forms = defaultdict(list)     # singular head -> head words as written
        names = {}
        for cid in concept_ids:
            names[cid] = str(by_id[cid]['name'])
            words = _content_tokens(names[cid])
            if words:
                head_word = words[-1]
                head_sg = _singular(head_word)
                members_by_head[head_sg].append(cid)
                surface_forms[head_sg].append(head_word)

        for head_sg, cids in list(members_by_head.items()):
            if len(cids) < 2:
                continue
            category = max(surface_forms[head_sg], key=len).title()  # nicest surface form
            category_id = next_id(nodes)
            nodes.append(make_agg(category_id, category, desc=f'Category: {category}',
                                  relation_type='belongs_to'))
            add_child(nodes, task_id, category_id)
            for cid in cids:
                remove_child(nodes, task_id, cid)
                residual = [t for t in _content_tokens(names[cid]) if _singular(t) != head_sg]
                if residual:
                    by_id[cid]['name'] = ' '.join(residual).title()
                    add_child(nodes, category_id, cid)
                else:
                    # Label was only the head noun: dissolve the child.
                    for grandchild in list(by_id[cid].get('related', [])):
                        add_child(nodes, category_id, int(grandchild))
                    nodes[:] = [x for x in nodes if int(x['id']) != cid]
            by_id = nmap(nodes)
    return nodes
def _merge_duplicate_concepts(nodes):
    """
    [Fix] Merge aggregation nodes that carry the SAME name (case-insensitive)
    within the same top-level task, keeping the shallowest one. Example: SWM had a
    singleton 'Within Errors' AND an 'Errors Boxes > Within Errors'; both become a
    single 'Within Errors'.

    The children of every duplicate are moved to the keeper, the duplicate is
    detached from whichever node currently lists it, and finally any child
    reference to a node that no longer exists is dropped. ``nodes`` is modified
    in place and returned.
    """
    parent_of = build_parent_map(nodes)

    def _depth(nid):
        # Number of parent links between nid and the top of the tree.
        steps = 0
        while nid in parent_of:
            nid = parent_of[nid]
            steps += 1
        return steps

    def _task_of(nid):
        # Climb until the parent is missing or is the root (id 0).
        cur = nid
        parent = parent_of.get(cur)
        while parent is not None and parent != 0:
            cur = parent
            parent = parent_of.get(cur)
        return cur

    same_name = defaultdict(list)
    for nd in nodes:
        if nd.get('type') != 'aggregation':
            continue
        nid = int(nd['id'])
        if nid not in parent_of:
            continue
        same_name[(_task_of(nid), str(nd['name']).lower())].append(nid)

    removed = set()
    for candidates in same_name.values():
        ids = [i for i in candidates if i not in removed]
        if len(ids) < 2:
            continue
        keeper = min(ids, key=_depth)
        by_id = nmap(nodes)
        for dup in ids:
            if dup == keeper:
                continue
            for child in list(by_id[dup].get('related', [])):
                remove_child(nodes, dup, int(child))
                add_child(nodes, keeper, int(child))
            # Detach dup from whatever node CURRENTLY references it (the parent map
            # may be stale - earlier post-processes may have re-parented it). A
            # leftover reference is a dangling child that breaks the Plotly sunburst.
            for holder in nodes:
                if dup in {int(x) for x in holder.get('related', [])}:
                    remove_child(nodes, int(holder['id']), dup)
            removed.add(dup)

    nodes[:] = [nd for nd in nodes if int(nd['id']) not in removed]
    # Defensive: drop any child reference to a node that no longer exists.
    survivors = {int(nd['id']) for nd in nodes}
    for nd in nodes:
        nd['related'] = [int(x) for x in nd.get('related', []) if int(x) in survivors]
    return nodes
def _prune_empty_aggregations(nodes):
    """Remove aggregation nodes whose subtree contains NO variable (leaf).

    Empty concept nodes are meaningless AND break the Plotly sunburst: every node
    gets a minimum value of 1, so an empty child makes its parent's value smaller
    than the sum of its children and branchvalues='total' refuses to render.

    When anything is removed, every remaining node's ``related`` list is rebuilt
    as ints restricted to surviving ids. ``nodes`` is modified in place and
    returned.
    """
    by_id = nmap(nodes)

    def _reaches_leaf(start):
        # Depth-first walk over `related`; the visited set makes cycles harmless.
        visited = set()
        stack = [start]
        while stack:
            cur = stack.pop()
            if cur in visited:
                continue
            visited.add(cur)
            nd = by_id.get(cur)
            if not nd:
                continue
            if nd.get('type') == 'attribute':
                return True
            stack.extend(int(c) for c in nd.get('related', []))
        return False

    doomed = {int(nd['id']) for nd in nodes
              if nd.get('type') == 'aggregation' and not _reaches_leaf(int(nd['id']))}
    if not doomed:
        return nodes
    nodes[:] = [nd for nd in nodes if int(nd['id']) not in doomed]
    survivors = {int(nd['id']) for nd in nodes}
    for nd in nodes:
        nd['related'] = [int(c) for c in nd.get('related', []) if int(c) in survivors]
    return nodes
def _dissolve_facet_singletons(nodes):
    """
    Dissolve FACET tier nodes (Statistic / Condition) that wrap a single variable.

    A facet node with exactly one child, where that child is a leaf attribute,
    carries no grouping value - e.g. `Standard Deviation > 0 > DMSL0SD`. Such a
    node is removed and its child is re-attached to the node's parent.

    Scope is deliberately narrow: only aggregation nodes whose
    ``info['relation_type']`` is 'has_condition' or 'is_statistic_of' are
    touched, so single-member CONCEPT nodes are preserved. The parent map is
    rebuilt after every removal, and the scan repeats until nothing changes.
    ``nodes`` is modified in place and returned.
    """
    facet_relations = {'has_condition', 'is_statistic_of'}

    def _dissolve_one():
        # Remove the first eligible facet node; report whether one was found.
        parent_of = build_parent_map(nodes)
        by_id = nmap(nodes)
        for nd in list(nodes):
            if nd.get('type') != 'aggregation':
                continue
            if nd['info'].get('relation_type') not in facet_relations:
                continue
            nid = int(nd['id'])
            kids = [int(c) for c in nd.get('related', [])]
            # "Single variable" = exactly one child and that child is a leaf attribute.
            if len(kids) != 1 or by_id.get(kids[0], {}).get('type') != 'attribute':
                continue
            parent = parent_of.get(nid)
            if parent is None:
                continue
            add_child(nodes, parent, kids[0])
            remove_child(nodes, parent, nid)
            nodes[:] = [x for x in nodes if int(x['id']) != nid]
            return True
        return False

    while _dissolve_one():
        pass
    return nodes
| def build_concept_hierarchy(can, embedder, concept_table, project='metadata_project', | |
| n_clusters_per_group=8): | |
| """ | |
| Build hierarchy using automatic concept label assignment. | |
| No hardcoded patterns. Labels come from metadata + external concept table. | |
| [GON] NΓM alignment Β· [TAX] leaf=attribute, node=abstract concept Β· [HIE] task-first | |
| """ | |
| nodes = [{'id': 0, 'name': project, 'desc': 'Root node', 'type': 'root', | |
| 'dtype': 'root', 'isShown': True, 'related': []}] | |
| leaf_to_id = {} | |
| for i, (_, r) in enumerate(can.iterrows(), start=1): | |
| leaf_to_id[r['_leaf_id']] = i | |
| nodes.append({ | |
| 'id': i, | |
| 'name': r['_leaf_label'], | |
| 'dtype': r['_dtype'], | |
| 'related': [], | |
| 'isShown': True, | |
| 'type': 'attribute', | |
| 'desc': r['_text'], | |
| 'semantic_desc': r.get('_semantic_text', r['_text']), | |
| 'source_file': r['_source_file'], | |
| 'metadata': {'leaf_id': r['_leaf_id'], 'group_path': r['_group_path']}, | |
| }) | |
| # Embed concept table once for the whole hierarchy build | |
| if concept_table: | |
| concept_texts = [c['full_text'] for c in concept_table] | |
| concept_embs = embedder.encode(concept_texts) | |
| else: | |
| concept_embs = None | |
| # ββ Dataset-wide reference embeddings for semantic IDF / specificity [GON] ββ | |
| # Encode every variable once, then build one centroid per top-level group. | |
| # A candidate label that is close to ONE group centroid and far from the rest | |
| # is discriminative; one close to ALL of them is boilerplate. corpus_centroid | |
| # is the global mean (generic = central). Both are derived purely from data. | |
| sem_col_all = '_semantic_text' if '_semantic_text' in can.columns else '_text' | |
| # Active domain β used by the hybrid label scorer's external-grounding signal. | |
| global _ACTIVE_DOMAIN | |
| _ACTIVE_DOMAIN = detect_domain(can) | |
| # Corpus IDF over description n-grams β KeyBERT c-TF-IDF distinctiveness weight | |
| # (only consulted when USE_CTFIDF=True). Data-derived, dataset-agnostic. | |
| global _CORPUS_IDF | |
| _CORPUS_IDF = {} | |
| try: | |
| from sklearn.feature_extraction.text import CountVectorizer as _CV | |
| _docs = can[sem_col_all].fillna('').astype(str).tolist() | |
| _cv = _CV(ngram_range=(1, 3), binary=True, lowercase=True, | |
| token_pattern=r'[a-z][a-z\-]+') | |
| _dt = _cv.fit_transform(_docs) | |
| _dfa = np.asarray(_dt.sum(axis=0)).ravel(); _N = _dt.shape[0] | |
| _CORPUS_IDF = {p: float(np.log((_N + 1) / (_dfa[i] + 1)) + 1.0) | |
| for p, i in _cv.vocabulary_.items()} | |
| except Exception: | |
| _CORPUS_IDF = {} | |
| ref_centroids = corpus_centroid = None | |
| try: | |
| all_var_embs = embedder.encode(can[sem_col_all].fillna('').astype(str).tolist()) | |
| corpus_centroid = _centroid(all_var_embs) | |
| _tops = can['_group_path'].fillna('Ungrouped').apply( | |
| lambda x: str(x).split(' > ')[0].strip() or 'Ungrouped') | |
| _cent = [] | |
| for g in _tops.unique(): | |
| gm = (_tops == g).to_numpy() | |
| if gm.sum() >= 1: | |
| _cent.append(_centroid(all_var_embs[gm])) | |
| ref_centroids = np.array(_cent) if len(_cent) >= 2 else None | |
| except Exception: | |
| pass | |
| path_ids = {} | |
| def ensure_path(parts, relation='belongs_to', provenance=None): | |
| key = tuple(str(p) for p in parts) | |
| if key in path_ids: | |
| return path_ids[key] | |
| nid = next_id(nodes) | |
| path_ids[key] = nid | |
| nodes.append(make_agg(nid, parts[-1], | |
| desc=f'Concept group: {" > ".join(str(p) for p in parts)}', | |
| relation_type=relation, | |
| provenance=provenance)) | |
| parent = 0 if len(parts) == 1 else ensure_path(parts[:-1]) | |
| add_child(nodes, parent, nid) | |
| return nid | |
| # Group variables by top-level group path | |
| work = can.copy() | |
| work['_top'] = work['_group_path'].apply( | |
| lambda x: str(x).split(' > ')[0].strip() | |
| if str(x) not in ('', 'nan', 'Ungrouped') else 'Ungrouped' | |
| ) | |
| for top_label, tdf in work.groupby('_top', dropna=False, sort=False): | |
| top_label = str(top_label) | |
| # Check if sub-group paths already exist (level 2+) | |
| subgroup_paths = tdf['_group_path'].apply( | |
| lambda x: ' > '.join(str(x).split(' > ')[1:]).strip() | |
| if len(str(x).split(' > ')) > 1 else '' | |
| ) | |
| has_subgroups = subgroup_paths.str.strip().str.len().gt(0).any() | |
| if has_subgroups: | |
| # [C4][CAS] UnaryPenalty: count distinct subgroup paths under this top group. | |
| # Castanet: "eliminate a child whose name appears within the parent's name" | |
| # and nodes that create unary (1-child) chains weaken the hierarchy. | |
| # If a structural column creates only ONE branch under this parent, | |
| # it is a pass-through (e.g. "DMS Recommended Standard" under "DMS") | |
| # and should be demoted β cluster directly under the top-level node instead. | |
| distinct_subpaths = subgroup_paths[subgroup_paths.str.strip().str.len().gt(0)].unique() | |
| n_distinct_subpaths = len(distinct_subpaths) | |
| if n_distinct_subpaths <= 1: | |
| # UnaryPenalty triggered β structural column creates only 1 branch. | |
| # Cluster directly under top_label, skip the variant pass-through. [C4] | |
| _cluster_and_label( | |
| tdf, [top_label], nodes, leaf_to_id, embedder, | |
| concept_table, concept_embs, ensure_path, | |
| n_clusters_per_group, can, ref_centroids, corpus_centroid | |
| ) | |
| else: | |
| # Multiple distinct subgroups β structural column is meaningful, keep it. | |
| for subpath, sdf in tdf.groupby(subgroup_paths, dropna=False, sort=False): | |
| subpath = str(subpath).strip() | |
| if subpath: | |
| parts = [top_label] + [p.strip() for p in subpath.split(' > ') if p.strip()] | |
| else: | |
| parts = [top_label] | |
| # Cluster and label within this subgroup | |
| _cluster_and_label( | |
| sdf, parts, nodes, leaf_to_id, embedder, | |
| concept_table, concept_embs, ensure_path, | |
| n_clusters_per_group, can, ref_centroids, corpus_centroid | |
| ) | |
| else: | |
| # No pre-existing subgroups β cluster all variables under this top group | |
| _cluster_and_label( | |
| tdf, [top_label], nodes, leaf_to_id, embedder, | |
| concept_table, concept_embs, ensure_path, | |
| n_clusters_per_group, can, ref_centroids, corpus_centroid | |
| ) | |
| # [Fix2] Nest statistics under their shared measure (Correct Latency β Mean/β¦), | |
| # then merge same-named duplicates, THEN sub-split the consolidated concept | |
| # nodes by a distinctive title descriptor (Error β Incorrect Colour / β¦). Order | |
| # matters: sub-splitting last avoids the merge re-parenting sub-nodes oddly. | |
| _nest_by_measure(nodes) | |
| _merge_duplicate_concepts(nodes) | |
| _subsplit_concept_by_title(nodes) | |
| # Remove empty concept nodes (no variables) β meaningless and they break the | |
| # branchvalues='total' sunburst (parent value < sum of children β blank render). | |
| _prune_empty_aggregations(nodes) | |
| # Dissolve 1-variable Statistic/Condition facet nodes (no grouping value). | |
| _dissolve_facet_singletons(nodes) | |
| _prune_empty_aggregations(nodes) | |
| # NOTE: a head-noun "category" tier (Errors/Correct) was tried and reverted β | |
| # it regressed setOverlap (0.914β0.836: mis-grouping) and added depth beyond gold. | |
| # _nest_by_category() is kept defined but intentionally NOT called. | |
| _alive = {int(n['id']) for n in nodes} | |
| for n in nodes: | |
| n['related'] = [x for x in dict.fromkeys(int(x) for x in n.get('related', [])) | |
| if x in _alive] # dedup + drop dangling refs (sunburst safety) | |
| return nodes | |
# ─────────────────────────────────────────────────────────────────────────────
# HIEXPAN-INSPIRED REFINEMENT [HIE]
# ─────────────────────────────────────────────────────────────────────────────
def _leaf_texts(nodes, nid, text_cache):
    """Collect the cached text of every leaf below node ``nid``.

    Leaves are visited in the order returned by ``leaf_ids(nodes, nid)``;
    leaves that have no entry in ``text_cache`` are skipped.
    """
    collected = []
    for leaf in leaf_ids(nodes, nid):
        if leaf in text_cache:
            collected.append(text_cache[leaf])
    return collected
| def _build_emb_cache(nodes, embedder, text_cache): | |
| """ | |
| [HIE] Pre-compute ALL leaf embeddings in one batch call. | |
| Paper-correct: HiExpan pre-computes entity representations once upfront, | |
| then all expansion passes reuse the cache β no re-encoding per leaf per pass. | |
| """ | |
| ids = [int(n['id']) for n in nodes | |
| if n['type'] == 'attribute' and text_cache.get(int(n['id']), '').strip()] | |
| if not ids: | |
| return {} | |
| texts = [text_cache[i] for i in ids] | |
| embs = embedder.encode(texts) # ONE batch call for everything | |
| return {nid: embs[i] for i, nid in enumerate(ids)} | |
def hiexpan_sibling_coherence(nodes, embedder, text_cache, emb_cache=None):
    """[HIE] Score how similar the attribute children of each group node are.

    For every ``aggregation``/``root`` node with at least two directly attached
    attribute children that have cached embeddings, the mean of the pairwise
    cosine similarities (upper triangle, diagonal excluded) is computed.

    Args:
        nodes: Hierarchy node dicts.
        embedder, text_cache: Only used to build ``emb_cache`` when it is None.
        emb_cache: Optional precomputed mapping node id -> embedding.

    Returns:
        List of report dicts (``node_id``, ``name``, ``n_attr_children``,
        ``coherence_score``, ``is_incoherent``) sorted by ascending score.
        ``is_incoherent`` is True when the unrounded mean is below 0.25.
    """
    if emb_cache is None:
        emb_cache = _build_emb_cache(nodes, embedder, text_cache)
    by_id = nmap(nodes)
    rows = []
    for group in nodes:
        if group['type'] not in ('aggregation', 'root'):
            continue
        attr_children = [
            int(child) for child in group.get('related', [])
            if by_id.get(int(child), {}).get('type') == 'attribute'
        ]
        if len(attr_children) < 2:
            continue
        vectors = np.array([emb_cache[cid] for cid in attr_children if cid in emb_cache])
        count = len(vectors)
        if count < 2:
            continue
        pairwise = cosine_similarity(vectors)
        # Strict upper triangle: each unordered pair once, no self-similarity.
        upper = np.triu(np.ones((count, count), dtype=bool), k=1)
        mean_sim = float(pairwise[upper].mean()) if upper.any() else 1.0
        rows.append({
            'node_id': int(group['id']),
            'name': group['name'],
            'n_attr_children': count,
            'coherence_score': round(mean_sim, 3),
            'is_incoherent': mean_sim < 0.25,
        })
    rows.sort(key=lambda entry: entry['coherence_score'])
    return rows
def hiexpan_width_expansion(nodes, embedder, text_cache, threshold=0.45, emb_cache=None):
    """[HIE] Re-home leaves to the group whose centroid they match best.

    Works on a deep copy of ``nodes``. Candidate groups are aggregation nodes
    with at least one directly attached attribute child; each group's centroid
    is computed once, up front, from the cached embeddings of all its leaves.
    A leaf is moved when another group's centroid scores more than 0.02 higher
    (dot product) than the centroid of its current group.

    Args:
        nodes: Hierarchy node dicts (not mutated).
        embedder, text_cache: Only used to build ``emb_cache`` when it is None.
        threshold: Accepted for interface compatibility; the body never reads it.
        emb_cache: Optional precomputed mapping node id -> embedding.

    Returns:
        ``(new_nodes, number_of_moves)``.
    """
    import copy

    nodes = copy.deepcopy(nodes)
    by_id = nmap(nodes)
    if emb_cache is None:
        emb_cache = _build_emb_cache(nodes, embedder, text_cache)

    candidate_groups = [
        node for node in nodes
        if node['type'] == 'aggregation'
        and any(by_id.get(int(child), {}).get('type') == 'attribute'
                for child in node.get('related', []))
    ]
    if len(candidate_groups) < 2:
        return nodes, 0

    # Centroids come from cached embeddings only; groups with no cached leaf get none.
    node_centroids = {}
    for group in candidate_groups:
        group_id = int(group['id'])
        member_ids = [int(leaf) for leaf in leaf_ids(nodes, group_id)
                      if int(leaf) in emb_cache]
        if not member_ids:
            continue
        node_centroids[group_id] = _centroid(np.array([emb_cache[i] for i in member_ids]))
    if len(node_centroids) < 2:
        return nodes, 0

    centroid_ids = list(node_centroids.keys())
    centroid_matrix = np.stack([node_centroids[i] for i in centroid_ids])

    moved = 0
    # Snapshot the leaves first: ``nodes`` is rebound by move_node inside the loop.
    attribute_leaves = [node for node in nodes if node['type'] == 'attribute']
    for leaf in attribute_leaves:
        leaf_id = int(leaf['id'])
        if leaf_id not in emb_cache:
            continue
        vector = emb_cache[leaf_id]
        owning_groups = [p for p in parents(nodes, leaf_id) if p in node_centroids]
        if not owning_groups:
            continue
        current = owning_groups[0]
        scores = centroid_matrix.dot(vector)
        top = int(np.argmax(scores))
        candidate = centroid_ids[top]
        if candidate == current:
            continue
        # Require a 0.02 margin over the current group to avoid churn on near-ties.
        if float(scores[top]) > node_centroids[current].dot(vector) + 0.02:
            nodes = move_node(nodes, leaf_id, candidate)
            moved += 1
    return nodes, moved
def hiexpan_depth_expansion_semantic(nodes, embedder, text_cache, concept_table,
                                     concept_embs, n_subclusters=3, emb_cache=None,
                                     coherence_threshold=0.45, top_level_tasks=None):
    """[HIE][C1][C2] Split aggregation nodes into embedding-based sub-groups.

    Works on a deep copy of ``nodes``. For every aggregation node with at least
    three directly attached attribute children, the children that have cached
    embeddings are re-clustered (average-linkage agglomerative clustering on
    cosine distance) and moved under newly created sub-nodes.

    A node is left untouched when:
      * it is coherent (mean pairwise cosine >= ``coherence_threshold``) and
        either has fewer than 6 attribute children or the spread of per-leaf
        vocabulary sizes is below 3;
      * clustering fails;
      * the largest sub-cluster holds more than 70% of the leaves;
      * any sub-cluster has fewer than 2 leaves.

    Sub-node labels come from the leaf titles when a distinctive one is found,
    otherwise from concept scoring, otherwise ``"<parent name> <k>"``.

    Args:
        nodes: Hierarchy node dicts (not mutated).
        embedder, text_cache: ``text_cache`` supplies leaf texts; ``embedder``
            is only used to build ``emb_cache`` when it is None.
        concept_table, concept_embs: Candidate concepts for fallback labelling.
        n_subclusters: Upper bound on sub-clusters per node.
        emb_cache: Optional precomputed mapping node id -> embedding.
        coherence_threshold: Coherence at or above which the extra size and
            vocabulary checks apply.
        top_level_tasks: Passed through to ``assign_concept_label``.

    Returns:
        ``(new_nodes, n_expanded)``.
    """
    import copy

    nodes = copy.deepcopy(nodes)
    m = nmap(nodes)
    if emb_cache is None:
        emb_cache = _build_emb_cache(nodes, embedder, text_cache)
    n_exp = 0

    # Iterate over a snapshot: sub-nodes appended below are not revisited in this pass.
    for agg in [n for n in nodes if n['type'] == 'aggregation']:
        attr_ch = [int(c) for c in agg.get('related', [])
                   if m.get(int(c), {}).get('type') == 'attribute']
        if len(attr_ch) < 3:
            continue

        # Coherence is measured on cached embeddings only (no new encode() calls).
        cached_ids = [cid for cid in attr_ch if cid in emb_cache]
        if len(cached_ids) < 2:
            continue
        embs = np.array([emb_cache[cid] for cid in cached_ids])
        sims = cosine_similarity(embs)
        n_ = len(cached_ids)
        mask_ = np.triu(np.ones((n_, n_), dtype=bool), k=1)
        coherence = float(sims[mask_].mean()) if mask_.any() else 1.0

        # A coherent node is still split when it is large AND its leaves differ
        # noticeably in vocabulary size; otherwise it is considered homogeneous.
        if coherence >= coherence_threshold:
            if len(attr_ch) < 6:
                continue
            vocab_sizes = []
            for cid in cached_ids:
                txt = text_cache.get(cid, '').lower()
                toks = set(re.findall(r'\b[a-z]{4,}\b', txt)) - _STOP
                vocab_sizes.append(len(toks))
            vocab_range = max(vocab_sizes) - min(vocab_sizes) if vocab_sizes else 0
            if vocab_range < 3:
                continue

        # Re-cluster the leaves.
        k_sub = min(n_subclusters, max(2, len(attr_ch) // 3))
        try:
            dist = cosine_distances(embs).astype(float)
            np.fill_diagonal(dist, 0.0)
            sub_lbs = AgglomerativeClustering(n_clusters=k_sub, metric='precomputed',
                                              linkage='average').fit_predict(dist)
        except Exception:
            # Deliberate best-effort: a node that cannot be clustered (e.g. more
            # clusters requested than embedded leaves) is simply left as it is.
            continue

        # Quality gate: reject unbalanced or degenerate splits.
        cluster_sizes = [int((sub_lbs == sk).sum()) for sk in range(k_sub)]
        total_size = sum(cluster_sizes)
        max_cluster = max(cluster_sizes) if cluster_sizes else 0
        if total_size > 0 and max_cluster / total_size > 0.70:
            continue
        if any(s < 2 for s in cluster_sizes):
            continue

        # Detach only the leaves that were clustered and will be re-attached
        # under a sub-node. Leaves without a cached embedding are not part of
        # any sub-cluster; detaching them too would drop them from the tree.
        agg_id = int(agg['id'])
        for cid in cached_ids:
            remove_child(nodes, agg_id, cid)

        # Per sub-cluster leaf ids and their titles (labels are derived from titles first).
        sub_cids_by_sk = [[cached_ids[i] for i, m_ in enumerate(sub_lbs == sk) if m_]
                          for sk in range(k_sub)]
        sub_titles_by_sk = [[_concept_title(text_cache.get(cid, '')) for cid in cids]
                            for cids in sub_cids_by_sk]
        agg_ancestors = ancestor_names(nodes, agg_id) + [agg['name']]
        _anc_words = set(re.findall(r'[a-z]{3,}', ' '.join(agg_ancestors).lower()))
        _used_sub = set()

        # Create the sub-nodes: title label wins, concept scoring is the fallback.
        for sk in range(k_sub):
            sub_cids = sub_cids_by_sk[sk]
            if not sub_cids:
                continue
            sub_mask = sub_lbs == sk
            sub_texts = [text_cache.get(cid, '') for cid in sub_cids]
            sub_emb = _centroid(embs[sub_mask])
            title_label = _title_cluster_label(
                sub_titles_by_sk[sk],
                [sub_titles_by_sk[j] for j in range(k_sub) if j != sk],
                ancestor_words=_anc_words, used_labels=_used_sub)
            if title_label:
                label = title_label
                provenance = {'node_label': label, 'source_evidence': ['description_title'],
                              'confidence': 0.0, 'alternatives': []}
            elif sub_emb is not None and concept_embs is not None and concept_table:
                scores = score_concepts_for_cluster(sub_emb, concept_embs, concept_table, sub_texts)
                label, provenance = assign_concept_label(
                    scores, fallback=f'{agg["name"]} {sk+1}',
                    ancestor_names=agg_ancestors, top_level_tasks=top_level_tasks,
                )
            else:
                label = f'{agg["name"]} {sk+1}'
                provenance = None
            _used_sub.add(str(label).lower())
            nid = next_id(nodes)
            nodes.append(make_agg(nid, label,
                                  desc=f'Sub-group of {agg["name"]}: {label}',
                                  relation_type='belongs_to',
                                  provenance=provenance))
            add_child(nodes, agg_id, nid)
            for cid in sub_cids:
                add_child(nodes, nid, cid)

        # NOTE(review): the reviewed copy of this file had lost its indentation;
        # n_exp is assumed to count split aggregation nodes (one increment per
        # node, not per sub-node). Confirm against the caller's expectations.
        n_exp += 1
    return nodes, n_exp
def hiexpan_global_optimization(nodes, embedder, text_cache, n_passes=2, emb_cache=None):
    """[HIE] Repeat width expansion until nothing moves or ``n_passes`` is reached.

    Each pass calls ``hiexpan_width_expansion`` with ``threshold=0.40`` and the
    shared ``emb_cache``.

    Returns:
        ``(nodes_after_last_pass, total_moves_across_passes)``.
    """
    moves_so_far = 0
    for _pass in range(n_passes):
        nodes, moved = hiexpan_width_expansion(
            nodes, embedder, text_cache, threshold=0.40, emb_cache=emb_cache)
        if moved == 0:
            # Converged: a pass that moves nothing cannot change later passes.
            break
        moves_so_far += moved
    return nodes, moves_so_far
def run_hiexpan(nodes, can, embedder, concept_table=None, concept_embs=None,
                max_depth_passes=4):
    """[HIE] Run the HiExpan-style refinement passes over one shared embedding cache.

    Sequence: sibling coherence (before) -> width expansion -> global
    optimization -> sibling coherence (after). All leaves are embedded once and
    the cache is reused by every pass.

    Depth expansion is intentionally NOT run here ([FIX1]): it split concept
    nodes into sub-clusters whose titles differed only by a parenthetical
    number, producing repetitive tiers with boilerplate labels. The report
    therefore always records 0 depth-expansion nodes and 0 passes.

    Args:
        nodes: Hierarchy node dicts.
        can, concept_table, concept_embs, max_depth_passes: Kept for interface
            compatibility; not read while depth expansion is disabled.
        embedder: Object exposing ``encode``.

    Returns:
        ``(refined_nodes, report_dict)``.
    """
    # Leaf text: prefer 'semantic_desc', fall back to 'desc'.
    text_cache = {}
    for node in nodes:
        if node['type'] == 'attribute':
            text_cache[int(node['id'])] = str(node.get('semantic_desc', node.get('desc', '')))

    emb_cache = _build_emb_cache(nodes, embedder, text_cache)

    coherence_before = hiexpan_sibling_coherence(
        nodes, embedder, text_cache, emb_cache=emb_cache)
    nodes, width_moves = hiexpan_width_expansion(
        nodes, embedder, text_cache, emb_cache=emb_cache)
    nodes, global_moves = hiexpan_global_optimization(
        nodes, embedder, text_cache, emb_cache=emb_cache)
    coherence_after = hiexpan_sibling_coherence(
        nodes, embedder, text_cache, emb_cache=emb_cache)

    report = {
        'coherence_before': coherence_before,
        'width_expansion_moves': width_moves,
        # Depth expansion is disabled, so both counters are fixed at zero.
        'depth_expansion_nodes': 0,
        'depth_expansion_passes': 0,
        'global_optimization_moves': global_moves,
        'coherence_after': coherence_after,
    }
    return nodes, report
# ─────────────────────────────────────────────────────────────────────────────
# CONFLICT RESOLUTION TABLE [HIE]
# Variables where the top-2 concept assignments differ by < 0.05 in score.
# ─────────────────────────────────────────────────────────────────────────────
def compute_conflict_table(can, nodes):
    """[HIE] Build a review table for low-confidence variable placements.

    A variable is included when its ``_concept_score`` lies strictly between 0
    and 0.25 and a node with a matching ``metadata['leaf_id']`` exists. For each
    such variable the table shows a word-overlap similarity to its current
    parent group and to the two best sibling groups (aggregation nodes sharing
    the parent's parent), plus a suggested decision.

    Similarity is a lexical proxy (no embedder is available here): the fraction
    of the variable's non-stop words (3+ letters) that also occur in the
    descriptions of the group's directly attached attribute children.

    Args:
        can: DataFrame with ``_leaf_id``, ``_leaf_label`` and optionally
            ``_concept_score``, ``_text``, ``_concept_label``, ``_concept_source``.
        nodes: Hierarchy node dicts.

    Returns:
        DataFrame with one row per flagged variable (possibly empty) and a
        fixed column order.
    """
    pm = build_parent_map(nodes)
    m = nmap(nodes)
    agg_nodes = [n for n in nodes if n.get('type') == 'aggregation']
    text_cache = {int(n['id']): str(n.get('desc', ''))
                  for n in nodes if n.get('type') == 'attribute'}
    word_re = re.compile(r'\b[a-z]{3,}\b')

    # Index nodes by leaf_id once instead of scanning every node for every row.
    # setdefault keeps the FIRST node per leaf_id, matching a front-to-back scan.
    first_node_by_leaf_id = {}
    for n in nodes:
        first_node_by_leaf_id.setdefault(n.get('metadata', {}).get('leaf_id'), n)

    def _sim_to_group(var_text, agg_node):
        """Share of the variable's words that appear in the group's leaf descriptions."""
        agg_texts = [text_cache.get(int(c), '')
                     for c in agg_node.get('related', [])
                     if m.get(int(c), {}).get('type') == 'attribute']
        if not agg_texts:
            return 0.0
        var_words = set(word_re.findall(var_text.lower())) - _STOP
        if not var_words:
            return 0.0
        group_words = set(word_re.findall(' '.join(agg_texts).lower())) - _STOP
        return len(var_words & group_words) / len(var_words)

    no_alternative = {'name': '—', 'sim': 0.0}
    rows = []
    for _, row in can.iterrows():
        score = float(row.get('_concept_score', 0.0))
        if not (0 < score < 0.25):
            continue
        leaf_node = first_node_by_leaf_id.get(row['_leaf_id'])
        if leaf_node is None:
            continue
        nid = int(leaf_node['id'])
        parent_id = pm.get(nid)
        parent_n = m.get(parent_id, {})
        var_text = str(row.get('_text', ''))

        # Sibling groups: aggregation nodes under the same grandparent, excluding the parent.
        grandparent_id = pm.get(parent_id)
        sibling_groups = [
            n for n in agg_nodes
            if pm.get(int(n['id'])) == grandparent_id and int(n['id']) != parent_id
        ]
        sib_sims = sorted(
            [{'name': sn['name'], 'sim': round(_sim_to_group(var_text, sn), 3)}
             for sn in sibling_groups],
            key=lambda x: -x['sim']
        )
        cur_sim = round(_sim_to_group(var_text, parent_n), 3) if parent_n else 0.0
        alt1 = sib_sims[0] if len(sib_sims) > 0 else no_alternative
        alt2 = sib_sims[1] if len(sib_sims) > 1 else no_alternative

        # Only a clear margin (> 0.10) yields a move suggestion; any smaller
        # advantage is flagged for manual review.
        if alt1['sim'] > cur_sim + 0.10:
            decision = 'Move to alt-1'
        elif alt1['sim'] > cur_sim:
            decision = 'Review manually'
        else:
            decision = 'Keep current'

        rows.append({
            'variable': row['_leaf_label'],
            'concept_label': row.get('_concept_label', ''),
            'concept_score': round(score, 3),
            'current_parent': parent_n.get('name', ''),
            'current_sim': cur_sim,
            'alt_parent_1': alt1['name'],
            'alt_sim_1': alt1['sim'],
            'alt_parent_2': alt2['name'],
            'alt_sim_2': alt2['sim'],
            'decision': decision,
            'source': row.get('_concept_source', ''),
        })

    cols = ['variable', 'concept_label', 'concept_score',
            'current_parent', 'current_sim',
            'alt_parent_1', 'alt_sim_1',
            'alt_parent_2', 'alt_sim_2',
            'decision', 'source']
    return pd.DataFrame(rows, columns=cols) if rows else pd.DataFrame(columns=cols)
# ─────────────────────────────────────────────────────────────────────────────
# CASTANET PARALLEL FACETED HIERARCHIES [CAS]
# Uses automatic concept assignments — no hardcoded dimension patterns.
# ─────────────────────────────────────────────────────────────────────────────
def detect_facets(can, code_expansions=None):
    """[CAS] Auto-detect facet dimensions from the canonical metadata table.

    A facet is emitted only when the data supports it (more than one distinct
    value). Detection order:

        Task              first segment of ``_group_path``
        Variant           second segment of ``_group_path``
        Measure Type      ``_concept_label``
        Statistic         first statistic keyword in the description text
        <Unit> Condition  first run of digits in ``_leaf_label`` (part before '/')
        Outcome Type      first outcome/error keyword in the description text
        Scale/Precision   precision-like keys of the per-row ``_raw`` dict
        Code Family       ``_code_family``

    When fewer than two facets are found, two TF-IDF cluster facets (k=5 and
    k=8) are appended as a fallback.

    Side effect: helper columns (``_facet_task``, ``_facet_variant``,
    ``_facet_stat``, ``_facet_cond``, ``_facet_outcome``, ``_facet_prec``,
    ``_tfidf_cluster_<k>``) are written into ``can`` in place; the returned
    ``fn`` callables read them from each row.

    Args:
        can: DataFrame with ``_group_path``, ``_concept_label``, ``_leaf_label``,
            ``_code_family`` and ``_text`` (``_semantic_text`` is preferred for
            text detection when present; ``_raw`` is optional).
        code_expansions: Accepted for interface compatibility; not used here.

    Returns:
        List of facet dicts with keys ``name``, ``desc``, ``fn`` (row -> group
        label) and ``relation``.
    """
    from collections import Counter

    facets = []
    code_expansions = code_expansions or {}
    sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'

    def _column_facet(name, desc, column, default, relation):
        """Facet whose group label is read from a precomputed helper column."""
        def _group_of(row):
            return str(row.get(column, default))
        return {'name': name, 'desc': desc, 'fn': _group_of, 'relation': relation}

    def _label_facet(name, desc, column, default, relation):
        """Facet read straight from an existing column; blank/'nan' maps to ``default``."""
        def _group_of(row):
            value = str(row[column])
            return value if value not in ('', 'nan') else default
        return {'name': name, 'desc': desc, 'fn': _group_of, 'relation': relation}

    def _row_text(row):
        """Lower-cased description text used for keyword detection."""
        return str(row.get(sem_col, row.get('_text', ''))).lower()

    # -- Facet: Task (top-level group path) ----------------------------------
    top_groups = can['_group_path'].apply(
        lambda x: str(x).split(' > ')[0].strip()
        if str(x) not in ('', 'nan', 'Ungrouped') else 'Ungrouped'
    )
    if top_groups.nunique() > 1:
        can['_facet_task'] = top_groups
        facets.append(_column_facet(
            'Task', 'Variables grouped by their top-level task or domain.',
            '_facet_task', 'Ungrouped', 'belongs_to'))

    # -- Facet: Variant (second-level group path, if present) ----------------
    def _second_level(gpath):
        parts = str(gpath).split(' > ')
        return parts[1].strip() if len(parts) > 1 else ''

    second = can['_group_path'].apply(_second_level)
    if second.ne('').any() and second.nunique() > 1:
        can['_facet_variant'] = second.where(second != '', 'No Variant')
        facets.append(_column_facet(
            'Variant', 'Variables grouped by their second-level structural variant.',
            '_facet_variant', 'No Variant', 'belongs_to'))

    # -- Facet: Measure Type (concept label from the N×M alignment) ----------
    has_concepts = can['_concept_label'].fillna('').ne('').any()
    if has_concepts and can['_concept_label'].nunique() > 1:
        facets.append(_label_facet(
            'Measure Type',
            'Variables grouped by automatically discovered concept label (N×M embedding alignment).',
            '_concept_label', 'Unclassified', 'related_to'))

    # -- Facet: Statistic (detected from description text) -------------------
    _stat_re = re.compile(
        r'\b(mean|average|median|standard deviation|std|percent|proportion|'
        r'probability|total|sum|count|maximum|minimum|range|variance|'
        r'coefficient|ratio|rate|frequency)\b', re.IGNORECASE
    )
    # Synonyms collapsed onto one display label.
    _stat_norm = {
        'average': 'Mean', 'std': 'Standard Deviation', 'proportion': 'Percent',
        'sum': 'Total', 'count': 'Total', 'frequency': 'Rate',
    }

    def _extract_stat(row):
        hits = _stat_re.findall(_row_text(row))
        if not hits:
            return ''
        h = hits[0].lower()
        return _stat_norm.get(h, h.title())

    stat_col = can.apply(_extract_stat, axis=1)
    if stat_col.ne('').any() and stat_col.nunique() > 1:
        can['_facet_stat'] = stat_col.where(stat_col != '', 'Other')
        facets.append(_column_facet(
            'Statistic',
            'Variables grouped by statistical summary type detected from descriptions.',
            '_facet_stat', 'Other', 'is_statistic_of'))

    # -- Facet: Condition (number embedded in the variable code) -------------
    _num_re = re.compile(r'(\d+)')

    def _extract_cond(row):
        hits = _num_re.findall(str(row['_leaf_label']).split('/')[0].strip())
        return hits[0] if hits else ''

    cond_col = can.apply(_extract_cond, axis=1)
    if cond_col.ne('').any() and cond_col.nunique() > 1:
        can['_facet_cond'] = cond_col.where(cond_col != '', 'No Condition')
        # Name the facet after the most frequent unit word in the descriptions.
        _unit_re = re.compile(
            r'\b(second|msec|millisecond|month|week|day|year|trial|block|session|delay)\b',
            re.IGNORECASE)
        all_text = ' '.join(can[sem_col].fillna('').astype(str).tolist()).lower()
        unit_counts = Counter(_unit_re.findall(all_text))
        if unit_counts:
            # max() over the counter keeps first-seen order on ties, so the
            # facet name is the same on every run.
            fname = max(unit_counts, key=unit_counts.get).title() + ' Condition'
        else:
            fname = 'Condition'
        facets.append(_column_facet(
            fname, 'Variables grouped by numeric condition variant in variable codes.',
            '_facet_cond', 'No Condition', 'has_condition'))

    # -- Facet: Outcome/Error Type (from description keywords) ---------------
    _out_re = re.compile(
        r'\b(error|errors|miss|false alarm|omission|commission|incorrect|outcome|penalty)\b',
        re.IGNORECASE)

    def _extract_outcome(row):
        hits = _out_re.findall(_row_text(row))
        return hits[0].title() if hits else ''

    out_col = can.apply(_extract_outcome, axis=1)
    if out_col.ne('').any() and out_col.nunique() > 1:
        can['_facet_outcome'] = out_col.where(out_col != '', 'Other')
        facets.append(_column_facet(
            'Outcome Type',
            'Variables grouped by outcome/error type detected from description text.',
            '_facet_outcome', 'Other', 'has_measure'))

    # -- Facet: Scale/Precision (from _raw metadata columns) -----------------
    _prec_re = re.compile(r'\b(decimal|precision|unit|scale|format)\b', re.IGNORECASE)
    if '_raw' in can.columns:
        # Sample the first non-null _raw entry. A column holding only nulls
        # yields no sample rather than an IndexError.
        raw_present = can['_raw'].dropna()
        sample_raw = raw_present.iloc[0] if len(raw_present) > 0 else {}
        prec_cols = [c for c in (sample_raw.keys() if isinstance(sample_raw, dict) else [])
                     if _prec_re.search(str(c))]
        if prec_cols:
            def _extract_prec(row):
                raw = row.get('_raw', {})
                if not isinstance(raw, dict):
                    return ''
                for pc in prec_cols:
                    v = str(raw.get(pc, '')).strip()
                    if v and v.lower() not in ('nan', 'none', ''):
                        return v
                return ''

            prec_col = can.apply(_extract_prec, axis=1)
            if prec_col.ne('').any() and prec_col.nunique() > 1:
                can['_facet_prec'] = prec_col.where(prec_col != '', 'Unspecified')
                facets.append(_column_facet(
                    'Scale/Precision',
                    'Variables grouped by decimal places or unit of measurement.',
                    '_facet_prec', 'Unspecified', 'belongs_to'))

    # -- Facet: Code Family ---------------------------------------------------
    has_families = can['_code_family'].fillna('').ne('').any()
    if has_families and can['_code_family'].nunique() > 1:
        facets.append(_label_facet(
            'Code Family', 'Variables grouped by variable-code structural prefix.',
            '_code_family', 'Other', 'belongs_to'))

    # Fallback: TF-IDF semantic clusters if fewer than 2 facets were detected.
    if len(facets) < 2:
        texts = can[sem_col].fillna('').tolist()
        for nc in [5, 8]:
            lbl_col = f'_tfidf_cluster_{nc}'
            can[lbl_col] = tfidf_cluster_labels(texts, max_clusters=nc)
            facets.append(_column_facet(
                f'Semantic Cluster (k={nc})',
                f'TF-IDF agglomerative clustering into {nc} groups.',
                lbl_col, 'Other', 'related_to'))
    return facets
def build_facet_hierarchy(can, facet, project='root'):
    """[CAS] Build a single-level facet tree: root -> group -> leaf.

    Node ids are list positions: the root is 0, and every group or leaf gets
    ``len(nodes)`` at the moment it is appended. Groups are created lazily, in
    order of first appearance of their label.

    Args:
        can: DataFrame with ``_leaf_label``, ``_dtype``, ``_text``,
            ``_source_file``, ``_leaf_id`` and ``_group_path``.
        facet: Facet dict with ``name``, ``fn`` (row -> group label) and
            ``relation``.
        project: Display name of the root node.

    Returns:
        List of node dicts whose ``related`` lists hold de-duplicated integer ids.
    """
    root = {'id': 0, 'name': project, 'type': 'root', 'dtype': 'root',
            'isShown': True, 'related': [], 'desc': f"Facet: {facet['name']}"}
    nodes = [root]
    group_ids = {}

    for _, row in can.iterrows():
        group_label = str(facet['fn'](row))
        group_id = group_ids.get(group_label)
        if group_id is None:
            # First variable of this group: create the group node under the root.
            group_id = len(nodes)
            group_ids[group_label] = group_id
            nodes.append(make_agg(group_id, group_label,
                                  desc=f"{facet['name']}: {group_label}",
                                  relation_type=facet['relation']))
            add_child(nodes, 0, group_id)

        leaf_id = len(nodes)
        leaf = {
            'id': leaf_id,
            'name': str(row['_leaf_label']),
            'dtype': str(row['_dtype']),
            'related': [],
            'isShown': True,
            'type': 'attribute',
            'desc': str(row['_text']),
            'source_file': str(row['_source_file']),
            'metadata': {'leaf_id': str(row['_leaf_id']),
                         'group_path': str(row['_group_path'])},
        }
        nodes.append(leaf)
        add_child(nodes, group_id, leaf_id)

    # Normalise child lists: integer ids, duplicates removed, order kept.
    for node in nodes:
        node['related'] = list(dict.fromkeys(int(child) for child in node.get('related', [])))
    return nodes
# ─────────────────────────────────────────────────────────────────────────────
# VISUALIZATION
# ─────────────────────────────────────────────────────────────────────────────
| RELATION_COLORS = { | |
| 'has_measure': '#4C72B0', 'is_statistic_of': '#DD8452', 'has_condition': '#55A868', | |
| 'part_of': '#C44E52', 'instance_of': '#8172B2', 'subclass_of': '#937860', | |
| 'belongs_to': '#8C8C8C', 'related_to': '#CCB974', '': '#8C8C8C', | |
| } | |
| def _node_color(n): | |
| if n.get('type') == 'root': return '#c44e52' | |
| if n.get('type') == 'attribute': return '#4C72B0' | |
| if n.get('type') == 'collapsed': return '#bbbbbb' | |
| return RELATION_COLORS.get(n.get('info', {}).get('relation_type', ''), '#8C8C8C') | |
| def _wrap_hover(text, width=90): | |
| """Word-wrap plain text into HTML lines for Plotly hover tooltips.""" | |
| text = str(text).replace('<', '<').replace('>', '>') | |
| words, lines, line, length = text.split(), [], [], 0 | |
| for w in words: | |
| if length + len(w) + 1 > width and line: | |
| lines.append(' '.join(line)) | |
| line, length = [w], len(w) | |
| else: | |
| line.append(w); length += len(w) + 1 | |
| if line: | |
| lines.append(' '.join(line)) | |
| return '<br>'.join(lines) | |
def _rich_hover(n, nodes):
    """Build the full, word-wrapped hover tooltip for node ``n``.

    The tooltip shows the node name, type, relation label (aggregation nodes
    only), number of leaves below the node, concept provenance when present,
    and the complete description. Attribute nodes prefer ``semantic_desc`` over
    ``desc``. Shared by the sunburst and treemap views.

    Args:
        n: Node dict; must contain ``id``.
        nodes: Full node list, used to count the leaves under ``n``.

    Returns:
        HTML string for Plotly ``hovertext``.
    """
    nid = int(n['id'])
    leaf_count = len(leaf_ids(nodes, nid))
    node_type = n.get('type')

    # ``or {}`` guards against keys that are present but explicitly None
    # (e.g. a node created without provenance), which .get(key, {}) lets through.
    info = n.get('info') or {}
    prov = n.get('concept_provenance') or {}

    rel = info.get('relation_label', '') if node_type == 'aggregation' else ''
    if node_type == 'attribute':
        raw_desc = n.get('semantic_desc') or n.get('desc', '')
    else:
        raw_desc = n.get('desc', '')
    desc_html = _wrap_hover(raw_desc)

    # str() keeps the join safe if an entry is not already a string.
    alts = ', '.join(str(a) for a in (prov.get('alternatives') or []))
    src = ', '.join(str(s) for s in (prov.get('source_evidence') or []))

    provenance_html = ''
    if prov:
        provenance_html = (f'<br>Confidence: {prov.get("confidence","")} | Source: {src}'
                           f'<br>Alternatives: {alts}')
    return (f'<b>{n.get("name","")}</b><br>Type: {n.get("type","")}<br>'
            f'Relation: {rel}<br>Variables: {leaf_count}'
            + provenance_html
            + f'<br><br>{desc_html}')
def plot_sunburst(nodes, max_depth=4):
    """Render the hierarchy as a Plotly sunburst.

    Each sector's value is the number of leaves under the node (at least 1),
    with ``branchvalues='total'``. Labels are truncated to 40 characters; the
    hover text comes from ``_rich_hover``.

    Args:
        nodes: Hierarchy node dicts; the node with id 0 is treated as the root.
        max_depth: Number of levels shown at once (Plotly ``maxdepth``).

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    pm = build_parent_map(nodes)
    ids, labels, parents_, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n['id'])
        lc = len(leaf_ids(nodes, nid))
        ids.append(str(nid))
        labels.append(str(n.get('name', ''))[:40])
        # The root has no parent; nodes missing from the parent map hang off the root.
        parents_.append('' if nid == 0 else str(pm.get(nid, 0)))
        values.append(max(1, lc))
        hover.append(_rich_hover(n, nodes))
    fig = go.Figure(go.Sunburst(ids=ids, labels=labels, parents=parents_, values=values,
                                branchvalues='total', hovertext=hover, hoverinfo='text',
                                maxdepth=max_depth, insidetextorientation='radial',
                                marker=dict(colorscale='Blues', line=dict(width=1, color='white')),
                                leaf=dict(opacity=0.85)))
    # Title separator repaired: it was a mis-encoded dash that rendered as a stray Greek letter.
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
                      title='Click sector to drill down — hover for concept provenance')
    return fig
def plot_treemap(nodes):
    """Render the hierarchy as a Plotly treemap.

    Tile values are the number of leaves under each node (at least 1) with
    ``branchvalues='total'``; labels are truncated to 40 characters and the
    hover text comes from ``_rich_hover``.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    parent_of = build_parent_map(nodes)
    tile_ids, tile_labels, tile_parents, tile_values, tile_hover = [], [], [], [], []
    for node in nodes:
        node_id = int(node['id'])
        leaf_count = len(leaf_ids(nodes, node_id))
        tile_ids.append(str(node_id))
        tile_labels.append(str(node.get('name', ''))[:40])
        if node_id == 0:
            tile_parents.append('')
        else:
            # Nodes missing from the parent map are attached to the root.
            tile_parents.append(str(parent_of.get(node_id, 0)))
        tile_values.append(max(1, leaf_count))
        tile_hover.append(_rich_hover(node, nodes))

    trace = go.Treemap(
        ids=tile_ids, labels=tile_labels, parents=tile_parents, values=tile_values,
        branchvalues='total', hovertext=tile_hover, hoverinfo='text',
        textinfo='label+value',
        marker=dict(colorscale='Blues', line=dict(width=1, color='white')),
    )
    fig = go.Figure(trace)
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
    return fig
def plot_facets_parallel(facet_trees):
    """[CAS] Draw one sunburst per facet in a grid of up to three columns.

    Args:
        facet_trees: Mapping of facet name -> node list for that facet.

    Returns:
        A ``plotly.graph_objects.Figure``; an empty figure when there are no facets.
    """
    names = list(facet_trees.keys())
    nf = len(names)
    if nf == 0:
        return go.Figure()
    cols = min(3, nf)
    rows = (nf + cols - 1) // cols  # ceiling division
    specs = [[{'type': 'sunburst'} for _ in range(cols)] for _ in range(rows)]
    fig = make_subplots(rows=rows, cols=cols, specs=specs, subplot_titles=names)

    for idx, (fname, fnodes) in enumerate(facet_trees.items()):
        grid_row, grid_col = divmod(idx, cols)
        pm = build_parent_map(fnodes)
        ids, labels, parents_, values, hover = [], [], [], [], []
        for n_ in fnodes:
            nid = int(n_['id'])
            lc = len(leaf_ids(fnodes, nid))
            # Prefix ids with the facet name so they stay unique across subplots.
            ids.append(f'{fname}_{nid}')
            labels.append(str(n_.get('name', ''))[:28])
            parents_.append('' if nid == 0 else f'{fname}_{pm.get(nid, 0)}')
            values.append(max(1, lc))
            hover.append(f'<b>{n_.get("name","")}</b><br>Variables: {lc}')
        fig.add_trace(go.Sunburst(ids=ids, labels=labels, parents=parents_, values=values,
                                  branchvalues='total', hovertext=hover, hoverinfo='text',
                                  maxdepth=2, leaf=dict(opacity=0.8),
                                  marker=dict(line=dict(width=1, color='white'))),
                      row=grid_row + 1, col=grid_col + 1)

    # Title separator repaired: it was a mis-encoded dash that rendered as a stray Greek letter.
    fig.update_layout(height=420 * rows, margin=dict(l=10, r=10, t=50, b=10),
                      title_text='Castanet Parallel Faceted Hierarchies — same variables, different views',
                      title_font_size=13)
    return fig
def display_graph(nodes, max_depth=4, show_hidden=False):
    """Select the nodes and edges to draw in the node-link view.

    Walks the hierarchy from node 0 following each node's ``related`` list.

    Args:
        nodes: Hierarchy node dicts.
        max_depth: Depth at which a node with children is collapsed into a
            single synthetic "… N variables" placeholder instead of expanded.
        show_hidden: When False, nodes whose ``isShown`` is exactly ``False``
            are skipped (the root is always kept).

    Returns:
        ``(display_nodes, edges)`` where *display_nodes* is a list of node
        dicts (including placeholders) and *edges* is a list of
        ``(parent_id, child_id)`` pairs.

    Each node is expanded at most once, so the returned edges always form a
    tree: a cyclic or shared ``related`` entry (possible in a hand-edited
    hierarchy JSON) is ignored instead of recursing without end.
    """
    by_id = nmap(nodes)
    shown = {}
    edges = []
    # Placeholder ids start above 10**9 — presumably out of range of real node ids.
    next_placeholder_id = 10 ** 9

    def visit(nid, depth):
        nonlocal next_placeholder_id
        nid = int(nid)
        node = by_id.get(nid)
        if not node:
            return
        if not show_hidden and node.get('isShown') is False and depth > 0:
            return
        shown[nid] = node
        kids = node.get('related') or []
        if depth >= max_depth and kids:
            next_placeholder_id += 1
            ph = next_placeholder_id
            shown[ph] = {'id': ph, 'name': f'… {len(leaf_ids(nodes, nid))} variables',
                         'type': 'collapsed', 'dtype': 'determine', 'related': [],
                         'desc': f'Collapsed: {node.get("name")}', 'isShown': True}
            edges.append((nid, ph))
            return
        for c in kids:
            cid = int(c)
            child = by_id.get(cid)
            if not child:
                continue
            if not show_hidden and child.get('isShown') is False:
                continue
            if cid in shown:
                # Already placed (ancestor or shared child): skipping keeps the
                # edge list acyclic for the layout pass.
                continue
            edges.append((nid, cid))
            visit(cid, depth + 1)

    visit(0, 0)
    return list(shown.values()), edges
def positions(dnodes, edges):
    """Compute (x, y) coordinates for a left-to-right tree layout.

    x is the node's depth times a fixed horizontal gap. Leaves take
    consecutive vertical slots in depth-first order, and every internal node
    sits at the mean y of its direct children. The walk starts at node id 0.

    Args:
        dnodes: Unused; kept for interface compatibility.
        edges: Iterable of ``(parent_id, child_id)`` pairs.

    Returns:
        Dict mapping node id -> ``(x, y)``.
    """
    level_gap = 3.0  # horizontal distance between depth levels
    slot_gap = 1.8   # vertical distance between consecutive leaves

    kids_of = defaultdict(list)
    for parent, child in edges:
        kids_of[parent].append(child)

    layout = {}
    next_slot = 0

    def place(node_id, level):
        nonlocal next_slot
        kids = kids_of.get(node_id, [])
        if kids:
            # Internal node: centre on the children's y positions.
            y = float(np.mean([place(k, level + 1) for k in kids]))
        else:
            # Leaf: take the next free vertical slot.
            y = next_slot * slot_gap
            next_slot += 1
        layout[node_id] = (level * level_gap, y)
        return y

    place(0, 0)
    return layout
def plot_node_link(nodes, max_depth, show_hidden, show_leaf_labels):
    """Draw the hierarchy as a left-to-right node-link tree.

    Supplementary to the sunburst view; intended for exploring structure at
    moderate depth.

    Args:
        nodes: Hierarchy node dicts.
        max_depth: Depth beyond which subtrees are collapsed (see ``display_graph``).
        show_hidden: Whether nodes flagged ``isShown=False`` are drawn.
        show_leaf_labels: Whether non-aggregation nodes get a text label.

    Returns:
        A Plotly figure with one line trace for edges and up to two marker
        traces (root/aggregation nodes and all other nodes).
    """
    dnodes, edges = display_graph(nodes, max_depth, show_hidden)
    pos = positions(dnodes, edges)

    # Edges are drawn elbow-style: horizontal to the midpoint, vertical, horizontal.
    ex, ey = [], []
    for p, c in edges:
        if p not in pos or c not in pos:
            continue
        x0, y0 = pos[p]
        x1, y1 = pos[c]
        xm = (x0 + x1) / 2
        ex += [x0, xm, xm, x1, None]
        ey += [y0, y0, y1, y1, None]
    traces = [go.Scatter(x=ex, y=ey, mode='lines',
                         line=dict(width=1, color='#c8c8c8'),
                         hoverinfo='skip', showlegend=False)]

    # Aggregation and leaf nodes go into separate traces so they can be styled apart.
    agg_xs, agg_ys, agg_labels, agg_colors, agg_hover = [], [], [], [], []
    lf_xs, lf_ys, lf_labels, lf_colors, lf_hover = [], [], [], [], []
    for n in dnodes:
        nid = int(n['id'])
        if nid not in pos:
            continue
        x, y = pos[nid]
        # Coerce to str: a hand-edited hierarchy may carry a non-string name.
        lab = str(n.get('name', str(nid)))
        htxt = _rich_hover(n, nodes)
        col = _node_color(n)
        if n.get('type') in ('root', 'aggregation'):
            # The leaf count is only shown on aggregation labels, so compute it here only.
            lc = len(leaf_ids(nodes, nid))
            display_lab = (lab + (f' ({lc})' if lc else ''))[:50]
            agg_xs.append(x); agg_ys.append(y)
            agg_labels.append(display_lab)
            agg_colors.append(col); agg_hover.append(htxt)
        else:
            display_lab = lab[:40] if show_leaf_labels else ''
            lf_xs.append(x); lf_ys.append(y)
            lf_labels.append(display_lab)
            lf_colors.append(col); lf_hover.append(htxt)

    if agg_xs:
        traces.append(go.Scatter(
            x=agg_xs, y=agg_ys, mode='markers+text',
            text=agg_labels, textposition='middle right',
            hovertext=agg_hover, hoverinfo='text',
            marker=dict(size=16, color=agg_colors,
                        line=dict(color='white', width=2)),
            showlegend=False
        ))
    if lf_xs:
        traces.append(go.Scatter(
            x=lf_xs, y=lf_ys, mode='markers+text',
            text=lf_labels, textposition='middle right',
            hovertext=lf_hover, hoverinfo='text',
            marker=dict(size=7, color=lf_colors,
                        symbol='circle', opacity=0.75,
                        line=dict(color='white', width=1)),
            showlegend=False
        ))

    # Figure height scales with the number of drawn leaves, clamped to [700, 4000] px.
    n_leaves = max(12, len(lf_xs))
    fig = go.Figure(traces)
    fig.update_layout(
        height=max(700, min(4000, int(n_leaves * 32))),
        margin=dict(l=20, r=220, t=40, b=20),
        plot_bgcolor='white', paper_bgcolor='white',
        xaxis=dict(visible=False, fixedrange=False),
        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
        dragmode='pan',
        annotations=[dict(
            text='Tip: Sunburst (above) is better for large hierarchies [Taxonomizer]',
            xref='paper', yref='paper', x=0.0, y=1.01,
            showarrow=False, font=dict(size=11, color='grey'),
            align='left'
        )]
    )
    return fig
def semantic_map(can):
    """Scatter the variables in a 2-D TF-IDF + SVD projection, coloured by top-level group.

    Args:
        can: Canonical DataFrame with ``_text``, ``_leaf_label`` and
            ``_group_path`` columns.

    Returns:
        A Plotly figure with one marker trace per top-level group (the part of
        ``_group_path`` before the first `` > ``). When the text yields fewer
        than two TF-IDF features, or no vocabulary at all, every point is
        placed at the origin instead of raising.
    """
    texts = can['_text'].fillna('').astype(str).tolist()
    labels = can['_leaf_label'].astype(str).tolist()
    groups = (can['_group_path'].fillna('Ungrouped').astype(str)
              .apply(lambda x: x.split(' > ')[0]))

    coords = np.zeros((len(texts), 2))
    try:
        X = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                            max_features=1000).fit_transform(texts)
    except ValueError:
        # Empty vocabulary (no rows, or only blank/stop-word text).
        X = None
    if X is not None and X.shape[1] >= 2:
        coords = TruncatedSVD(n_components=2, random_state=42).fit_transform(X)

    # Positional indices are used for coords, labels and texts alike, so the
    # result does not depend on the DataFrame's index labels.
    group_arr = groups.to_numpy()
    fig = go.Figure()
    for g in dict.fromkeys(group_arr):
        idx = np.flatnonzero(group_arr == g)
        fig.add_trace(go.Scatter(x=coords[idx, 0], y=coords[idx, 1], mode='markers',
                                 name=str(g), text=[labels[i] for i in idx],
                                 hovertext=[f'<b>{labels[i]}</b><br>{texts[i][:400]}' for i in idx],
                                 hoverinfo='text', marker=dict(size=8, opacity=0.85)))
    fig.update_layout(height=600, plot_bgcolor='white', paper_bgcolor='white')
    return fig
# -----------------------------------------------------------------------------
# STREAMLIT STATE
# -----------------------------------------------------------------------------
# Session keys read with attribute access further down; each is created as
# None on the first run so those reads never fail.
_SESSION_DEFAULT_KEYS = (
    'nodes', 'canonical', 'configs', 'embedder', 'facet_trees',
    'hiexpan_report', 'concept_table', 'domain',
)
for _key in _SESSION_DEFAULT_KEYS:
    if _key not in st.session_state:
        st.session_state[_key] = None
# -----------------------------------------------------------------------------
# SIDEBAR
# -----------------------------------------------------------------------------
with st.sidebar:
    # 1. Input: metadata files to build from, or an existing hierarchy to reload.
    st.header('1. Input')
    uploads = st.file_uploader(
        'Upload metadata file(s)',
        type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json', 'md', 'markdown'],
        accept_multiple_files=True,
    )
    existing = st.file_uploader('Load existing hierarchy JSON', type=['json'])

    # 2. Generation: project name, row cap, merge switch and cluster cap.
    st.header('2. Generation')
    project = st.text_input('Root / project name', value='metadata_project')
    max_rows = st.slider('Max variables',
                         min_value=10, max_value=3000, value=600, step=10)
    merge_files = st.checkbox('Merge uploaded files', value=True)
    n_clusters = st.slider(
        'Max clusters per group',
        min_value=2, max_value=16, value=8, step=1,
        help='Maximum number of concept sub-groups per top-level group.',
    )

    # 3. Embedding backend and size of the candidate concept pool.
    st.header('3. Semantic embedder')
    model_choice = st.selectbox(
        'Embedding model',
        options=['all-MiniLM-L6-v2', 'all-mpnet-base-v2',
                 'paraphrase-MiniLM-L6-v2', 'TF-IDF (no ST)'],
        help='[TAX][GON] Sentence-BERT for dense semantic embeddings.',
    )
    max_concepts = st.slider(
        'Max candidate concepts',
        min_value=30, max_value=300, value=120, step=10,
        help='How many candidate concepts to extract from metadata text.',
    )
    st.caption(
        'HiExpan refinement runs automatically after every build. '
        'Wikidata / Wikipedia / PubMed activate automatically for biomedical, '
        'cognitive, and neurological domains.'
    )
# -----------------------------------------------------------------------------
# LOAD EXISTING HIERARCHY
# -----------------------------------------------------------------------------
# The uploader keeps returning the same file on every Streamlit rerun. Loading
# it unconditionally would overwrite st.session_state.nodes each time and throw
# away Edit-tab changes or a freshly built hierarchy, so the JSON is applied
# only once per distinct upload (tracked by a name/size/content signature).
if existing is not None:
    _raw = existing.getvalue()
    _sig = (existing.name, len(_raw), hash(_raw))
    if st.session_state.get('_loaded_hierarchy_sig') != _sig:
        try:
            obj = json.loads(_raw.decode('utf-8', errors='replace'))
        except (ValueError, RecursionError) as e:
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested documents.
            st.error(f'Could not load: {e}')
        else:
            if isinstance(obj, list):
                st.session_state.nodes = obj
                st.session_state['_loaded_hierarchy_sig'] = _sig
                st.success('Loaded hierarchy JSON.')
            else:
                st.error('Hierarchy JSON must be a list of nodes.')
elif '_loaded_hierarchy_sig' in st.session_state:
    # File removed from the uploader: forget it so re-uploading reloads it.
    del st.session_state['_loaded_hierarchy_sig']
# -----------------------------------------------------------------------------
# MAIN PIPELINE
# -----------------------------------------------------------------------------
# Runs on every rerun while at least one metadata file is uploaded:
#   Step 1 loads and previews each file, Step 2 lets the user confirm column
#   roles, and the build button runs Steps A-J and stores the results in
#   st.session_state for the tabs below.
if uploads:
    paths = save_uploads(uploads)
    raw_by = {}; cfg_by = {}; prof_by = {}
    st.subheader('Step 1 β Inspect files')
    for p in paths:
        try:
            df = load_any(p); warn = False
            # Raw data tables are converted so that each column becomes one metadata row.
            if probably_raw(df):
                df = raw_to_metadata(df); warn = True
            cfg, prof = detect_roles(df)
            raw_by[p.name] = df; cfg_by[p.name] = cfg; prof_by[p.name] = prof
            with st.expander(f'π {p.name}', expanded=False):
                if warn:
                    st.warning('Looked like raw data β columns converted to metadata rows.')
                st.write(f'Rows: **{len(df):,}**, Columns: **{len(df.columns)}**')
                st.dataframe(df.head(10), width='stretch')
        except Exception as e:
            # A file that fails to load is reported and left out of raw_by.
            st.error(f'Failed to load {p.name}: {e}')
    st.subheader('Step 2 β Confirm column roles')
    configs = {}
    for name, df in raw_by.items():
        with st.expander(f'βοΈ {name}', expanded=True):
            cols = list(df.columns); auto = cfg_by[name]
            c1, c2 = st.columns(2)
            # Defaults come from detect_roles, filtered to columns that exist.
            with c1:
                leaf = st.multiselect('Leaf column(s)', cols,
                                      default=[c for c in auto['leaf_cols'] if c in cols],
                                      key=f'leaf_{name}')
                group = st.multiselect('Group/Task column(s)', cols,
                                       default=[c for c in auto['group_cols'] if c in cols],
                                       key=f'group_{name}')
            with c2:
                text = st.multiselect('Description column(s)', cols,
                                      default=[c for c in auto['text_cols'] if c in cols],
                                      key=f'text_{name}')
                meta = st.multiselect('Type/unit column(s)', cols,
                                      default=[c for c in auto['metadata_cols'] if c in cols],
                                      key=f'meta_{name}')
            # Preview the selected columns, de-duplicated in selection order.
            prev = list(dict.fromkeys(leaf + group + text + meta))
            if prev:
                st.dataframe(df[prev].head(6), width='stretch')
        configs[name] = {'leaf_cols': leaf, 'group_cols': group,
                         'text_cols': text, 'metadata_cols': meta}
    if st.button('π³ Build Approach 1 hierarchy', type='primary'):
        try:
            # -- Step A: Build canonical schemas per file --
            # NOTE(review): if every upload failed to load, raw_by is empty and
            # cans[0] below raises IndexError (reported as "Build failed").
            cans = [build_canonical(df.head(max_rows), configs[name], name)
                    for name, df in raw_by.items()]
            # -- Step A.5: Domain check before merging multiple files --
            if merge_files and len(cans) > 1:
                file_domains = [detect_domain(c) for c in cans]
                unique_domains = list(dict.fromkeys(file_domains))
                if len(unique_domains) > 1:
                    names_str = ', '.join(
                        f'{n} β {d}' for n, d in zip(list(raw_by.keys()), file_domains)
                    )
                    st.warning(
                        f'Files appear to be from different domains: {names_str}. '
                        f'Merging may produce a mixed hierarchy. '
                        f'Uncheck **Merge uploaded files** to process separately.'
                    )
                can = pd.concat(cans, ignore_index=True)
            else:
                # NOTE(review): without merging only the first file is used;
                # any further uploads are ignored here.
                can = cans[0]
            if len(can) > max_rows:
                can = can.head(max_rows).copy()
            # -- Step B: Code family + acronym expansion --
            with st.spinner('Detecting variable code families and expanding acronyms...'):
                can = cluster_codes_by_prefix(can)
                n_families = can['_code_family'].ne('').sum()
                if n_families > 0:
                    st.info(f'Detected {can["_code_family"].nunique()} code families '
                            f'({n_families} coded variables).')
                code_expansions = expand_variable_codes(can)
                if code_expansions:
                    st.info(f'Identified {len(code_expansions)} acronym/segment expansions.')
                st.session_state['code_expansions'] = code_expansions
            # -- Step B.5: Patch _semantic_text with acronym expansions --
            # [C7][GON][LOB] Description text is the dominant semantic signal.
            # Gonçalves: N×M alignment right-hand side uses concept descriptions, not codes.
            # Lobo: "generate additional context for column names to aid matching."
            # Fix: prepend ONLY when expansion adds genuinely new semantic content.
            # Do NOT prepend if the expansion token is already present in the description
            # (prevents "DMS DMS Correct Latency..." doubling that caused "Dms Dms" labels).
            # [F1] Word-boundary coverage check for _patch_semantic.
            # Previous guard used raw substring match: "DMS Correct Latency SD"
            # not in description even when description has "DMS Correct Latency
            # Standard Deviation" — because "SD" != "Standard Deviation".
            # New check: expansion is "covered" if >=60% of its non-stop words
            # appear as whole words in the description. If covered, skip prepend.
            _patch_stop = {'the','a','an','is','are','was','to','of','in',
                           'on','at','for','with','by','and','or','as'}
            def _exp_covered(exp_str, base_lower):
                """Return True when at least 60% of the expansion's content words
                (non-stop words longer than two characters) occur as whole words
                in *base_lower*; an expansion with no content words counts as covered."""
                words = [w for w in exp_str.lower().split()
                         if w not in _patch_stop and len(w) > 2]
                if not words:
                    return True
                found = sum(
                    1 for w in words
                    if re.search(r'\b' + re.escape(w) + r'\b', base_lower)
                )
                return found / len(words) >= 0.60
            def _patch_semantic(row, exps):
                """Return the row's semantic text, prefixed with any acronym
                expansions from *exps* that the text does not already cover."""
                base = str(row.get('_semantic_text', row['_text']))
                base_lower = base.lower()
                # Only the part of the leaf label before the first '/' is split into segments.
                code = str(row['_leaf_label']).strip().split('/')[0]
                # Segments are runs of 2+ capital letters or runs of digits.
                seg_tok = re.compile(r'([A-Z]{2,}|\d+)')
                segments = seg_tok.findall(code)
                new_parts = []
                for s in segments:
                    exp = exps.get(s, {}).get('expansion', '')
                    # Only prepend if expansion adds genuinely new information
                    # [F1] word-boundary check: skip if >=60% of exp words already present
                    if exp and not _exp_covered(exp, base_lower):
                        new_parts.append(exp)
                if new_parts:
                    return f'{" ".join(new_parts)} {base}'
                # Family expansion — same word-boundary guard
                fam = str(row.get('_code_family', ''))
                if fam and fam in exps:
                    fam_exp = exps[fam].get('expansion', '')
                    if fam_exp and not _exp_covered(fam_exp, base_lower):
                        return f'{fam_exp} {base}'
                return base
            can['_semantic_text'] = can.apply(
                lambda r: _patch_semantic(r, code_expansions), axis=1
            )
            # -- Step C: Load SBERT embedder (always attempt; fallback graceful) --
            # NOTE(review): choosing 'TF-IDF (no ST)' still requests
            # 'all-MiniLM-L6-v2' and calls load(); confirm this is intended.
            with st.spinner('Loading SBERT embedding model...'):
                model_name = model_choice if model_choice != 'TF-IDF (no ST)' else 'all-MiniLM-L6-v2'
                emb = SemanticEmbedder(model_name=model_name)
                ok, msg = emb.load()  # always attempt SBERT load
            if ok:
                st.success(f'SBERT loaded: {msg}')
            else:
                st.warning(f'SBERT unavailable β {msg}. Using TF-IDF+SVD fallback.')
            st.session_state.embedder = emb
            # -- Step D: Detect domain --
            domain = detect_domain(can)
            st.session_state.domain = domain
            _bio_domains = ('biomedical', 'cognitive', 'neurological')
            _use_external = domain in _bio_domains
            st.info(f'Detected domain: **{domain}**'
                    + (' β Wikidata / Wikipedia / PubMed activated' if _use_external else ''))
            # -- Step E: Extract candidate concepts from metadata --
            with st.spinner('Extracting candidate concepts from metadata text...'):
                candidates = extract_candidate_concepts_from_metadata(can, max_concepts=max_concepts)
            st.info(f'Extracted {len(candidates)} candidate concepts from metadata.')
            # -- Step F: Build concept table --
            # Biomedical / cognitive / neurological -> enrich via Wikidata + PubMed
            # Wikipedia excluded: too slow for interactive use; Wikidata covers same ground
            # All other domains -> local-only (no HTTP calls)
            if _use_external:
                with st.spinner(f'Enriching concept table via Wikidata / PubMed ({domain} domain)...'):
                    pb = st.progress(0)
                    concept_table = retrieve_concept_table(
                        candidates, domain=domain,
                        use_wikidata=True, use_wikipedia=False,
                        use_wordnet=True, use_pubmed=True,
                        bioportal_key='',
                        progress_cb=lambda x: pb.progress(x),
                        code_expansions=code_expansions,
                    )
                    pb.empty()
                n_wd = sum(1 for c in concept_table if 'wikidata' in c.get('source', ''))
                n_pm = sum(1 for c in concept_table if 'pubmed' in c.get('source', ''))
                st.success(f'Concept table: {len(concept_table)} entries '
                           f'(Wikidata: {n_wd}, PubMed: {n_pm})')
            else:
                # Local table: each candidate's label doubles as its full text.
                concept_table = [
                    {'label': c['label'], 'full_text': c['label'],
                     'source': c.get('source', 'metadata_tfidf'),
                     'frequency': c.get('frequency', 0),
                     'tfidf_score': c.get('tfidf_score', 0.0),
                     'broader_relations': []}
                    for c in candidates
                ]
                st.success(f'Concept table: {len(concept_table)} entries (local metadata β no external calls)')
            st.session_state.concept_table = concept_table
            # -- Step F.5: Fit shared vector space for TF-IDF fallback --
            # CRITICAL: must encode variables + concepts in the SAME space for
            # N×M cosine similarity to be valid. No-op when SBERT is active.
            with st.spinner('Fitting shared embedding space...'):
                var_texts = can['_semantic_text'].fillna('').astype(str).tolist()
                concept_texts = [c['full_text'] for c in concept_table]
                emb.fit_joint(var_texts + concept_texts)
            # -- Step F.6: Pre-compute Statistic and Condition facets --
            # [F3][F5][CAS] These columns are needed inside _cluster_and_label
            # for facet sub-splitting. They must be computed BEFORE Step G.
            # detect_facets / build_castanet_facets runs AFTER hierarchy build
            # (Step I), so we pre-compute only _facet_cond here. The statistic tier
            # is produced data-drivenly later by _nest_by_measure (no hardcoded vocab).
            with st.spinner('Pre-computing Condition facets [CAS]...'):
                can = precompute_stat_cond_facets(can)
            n_cond = can['_facet_cond'].ne('').sum()
            st.info(f'Facet pre-computation: {n_cond} variables with Condition. '
                    f'Statistic depth is derived from concept titles (_nest_by_measure).')
            # -- Step G: Build concept hierarchy (N×M alignment) --
            with st.spinner('Building concept hierarchy via NΓM alignment [GON][TAX]...'):
                nodes = build_concept_hierarchy(
                    can, emb, concept_table,
                    project=project,
                    n_clusters_per_group=n_clusters,
                )
            # -- Step H: HiExpan refinement (always automatic) --
            with st.spinner('Running HiExpan refinement [HIE]...'):
                if concept_table:
                    c_embs = emb.encode([c['full_text'] for c in concept_table])
                else:
                    c_embs = None
                nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
            # HiExpan's width/global passes MOVE leaves between concepts; a concept
            # that loses all its leaves becomes empty. build_concept_hierarchy prunes
            # internally, but that runs BEFORE HiExpan — so re-prune here, else empty
            # nodes break the Plotly branchvalues='total' sunburst/treemap (parent
            # value < sum(children) -> blank render; node-link is unaffected).
            _prune_empty_aggregations(nodes)
            # Drop duplicate child ids and references to nodes that no longer exist.
            _alive = {int(n['id']) for n in nodes}
            for _n in nodes:
                _n['related'] = [x for x in dict.fromkeys(int(c) for c in _n.get('related', []))
                                 if x in _alive]
            st.session_state.hiexpan_report = report
            wmoves = report.get('width_expansion_moves', 0)
            dexp = report.get('depth_expansion_nodes', 0)
            gmoves = report.get('global_optimization_moves', 0)
            st.success(f'HiExpan complete β width moves: {wmoves}, '
                       f'depth expansions: {dexp}, global moves: {gmoves}')
            # -- Step I: Castanet facets --
            with st.spinner('Building Castanet parallel facets [CAS]...'):
                facets = detect_facets(can, code_expansions=code_expansions)
                facet_trees = {f['name']: build_facet_hierarchy(can, f, project)
                               for f in facets}
            st.session_state.facet_trees = facet_trees
            # -- Step J: Evaluation metrics --
            # Reads can['_concept_score']; presumably written during Steps G/H — verify.
            n_total = len(can)
            n_aligned = can['_concept_score'].gt(0.08).sum()
            # Low confidence is the half-open interval (0, 0.25].
            n_lowconf = can['_concept_score'].between(0, 0.25, inclusive='right').sum()
            n_fallbk = can['_concept_score'].eq(0.0).sum()
            n_family = can['_code_family'].ne('').sum()
            avg_conf = float(can['_concept_score'].mean())
            cov_pct = round(100 * n_aligned / max(n_total, 1), 1)
            st.session_state['eval_metrics'] = {
                'total_variables': n_total,
                'alignment_coverage_%': cov_pct,
                'avg_label_confidence': round(avg_conf, 3),
                'fallback_count': int(n_fallbk),
                'fallback_rate_%': round(100 * n_fallbk / max(n_total, 1), 1),
                'low_confidence_count': int(n_lowconf),
                'variables_with_family': int(n_family),
                'code_family_%': round(100 * n_family / max(n_total, 1), 1),
                'concept_table_size': len(concept_table),
                'wikidata_entries': sum(1 for c in concept_table if 'wikidata' in c.get('source', '')),
                'pubmed_entries': sum(1 for c in concept_table if 'pubmed' in c.get('source', '')),
                'acronym_expansions': len(code_expansions),
                'hiexpan_width_moves': wmoves,
                'hiexpan_depth_exp': dexp,
                'hiexpan_global_moves': gmoves,
            }
            # -- Build concept-label provenance DataFrame (4th export) --
            prov_rows_build = []
            for _n in nodes:
                if _n.get('type') == 'aggregation' and _n.get('concept_provenance'):
                    _p = _n['concept_provenance']
                    prov_rows_build.append({
                        'Node': _n['name'],
                        'Confidence': _p.get('confidence', ''),
                        'Source': ', '.join(_p.get('source_evidence', [])),
                        'Embedding sim': _p.get('embedding_sim', ''),
                        'Alternatives': ', '.join(_p.get('alternatives', [])[:3]),
                    })
            prov_df = pd.DataFrame(prov_rows_build) if prov_rows_build else pd.DataFrame()
            # Persist the build results so they survive Streamlit reruns.
            st.session_state.canonical = can
            st.session_state.configs = configs
            st.session_state.nodes = nodes
            st.session_state['prov_df'] = prov_df
            n_l = len([n for n in nodes if n.get('type') == 'attribute'])
            n_i = len([n for n in nodes if n.get('type') == 'aggregation'])
            st.success(f'Built: {n_l} variables, {n_i} internal nodes, '
                       f'{len(facets)} facets β {cov_pct}% concept-aligned.')
        except Exception as e:
            # Top-level boundary for the build: show the error and the traceback in the UI.
            st.error(f'Build failed: {e}')
            import traceback; st.code(traceback.format_exc())
# Nothing to display until a hierarchy has been built or loaded: show a hint
# and stop this script run here.
if st.session_state.nodes is None:
    st.info('Upload a metadata file and click **Build Approach 1 hierarchy**.')
    st.stop()
# Local aliases for the tabs below; optional entries fall back to empty containers.
nodes = st.session_state.nodes
can = st.session_state.canonical  # stays None when the hierarchy came from a loaded JSON
facet_trees = st.session_state.facet_trees or {}
hiexpan_report = st.session_state.hiexpan_report or {}
concept_table = st.session_state.concept_table or []
# -----------------------------------------------------------------------------
# TABS
# -----------------------------------------------------------------------------
# Tab order matters: the sections below address the tabs by index.
tabs = st.tabs(['π³ LoD tree', 'π² Faceted view', '𧬠HiExpan report',
                'β οΈ Conflicts', 'βοΈ Edit', 'π Search',
                'πΊοΈ Semantic map', 'π Metadata', 'β¬οΈ Export', 'βΉοΈ Method',
                'π Evaluation'])
# -- Tab 0: LoD tree --
with tabs[0]:
    # Visualization controls sit above the chart so they are easy to find.
    vc1, vc2, vc3, vc4 = st.columns([2, 2, 1, 1])
    with vc1:
        viz_mode = st.radio(
            'View mode',
            ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
            horizontal=True, index=0,
            help='Sunburst recommended for large hierarchies [Taxonomizer]. Node-link best for exploring structure at smaller depth.',
        )
    with vc2:
        depth = st.slider('Depth (Level of Detail)',
                          min_value=1, max_value=8, value=3, step=1)
    with vc3:
        show_leaf_labels = st.checkbox('Leaf labels', value=False)
    with vc4:
        show_hidden = st.checkbox('Hidden nodes', value=False)
    st.divider()

    # Render the chart for the selected view mode.
    if viz_mode == 'Treemap':
        st.plotly_chart(plot_treemap(nodes), width='stretch')
    elif viz_mode == 'Sunburst (drill-down)':
        st.caption('Hover for concept provenance (confidence, source, alternatives). Click to drill down.')
        st.plotly_chart(plot_sunburst(nodes, depth), width='stretch')
    else:
        st.plotly_chart(
            plot_node_link(nodes, depth, show_hidden, show_leaf_labels),
            width='stretch',
        )

    # Summary metrics: variable count, internal-node count, maximum depth.
    pr = path_rows(nodes)
    max_d = max((r['depth'] for r in pr), default=0)
    c1, c2, c3 = st.columns(3)
    c1.metric('Variables', sum(1 for n in nodes if n.get('type') == 'attribute'))
    c2.metric('Internal nodes', sum(1 for n in nodes if n.get('type') == 'aggregation'))
    c3.metric('Max depth', max_d)

    emb_ = st.session_state.embedder
    if emb_:
        st.caption(f'Embedding backend: **{emb_.backend}** | Domain: **{st.session_state.domain or "unknown"}**')

    # Table of acronym / code-segment expansions found during the build.
    code_exp = st.session_state.get('code_expansions', {})
    if code_exp:
        with st.expander(f'Acronym / code segment expansions ({len(code_exp)} found)', expanded=False):
            exp_rows = []
            for seg, v in code_exp.items():
                exp_rows.append({'Segment': seg,
                                 'Expansion': v['expansion'],
                                 'Evidence': ', '.join(v['evidence'])})
            st.dataframe(pd.DataFrame(exp_rows), width='stretch')

    # Concept-label provenance for internal nodes that carry it.
    prov_rows = []
    for n in nodes:
        if n.get('type') != 'aggregation' or not n.get('concept_provenance'):
            continue
        p = n['concept_provenance']
        prov_rows.append({
            'Node': n['name'],
            'Confidence': p.get('confidence', ''),
            'Source': ', '.join(p.get('source_evidence', [])),
            'Embedding sim': p.get('embedding_sim', ''),
            'Alternatives': ', '.join(p.get('alternatives', [])[:3]),
        })
    if prov_rows:
        with st.expander('Concept label provenance for internal nodes', expanded=False):
            st.dataframe(pd.DataFrame(prov_rows), width='stretch')
# -- Tab 1: Faceted view --
with tabs[1]:
    st.subheader('Castanet Parallel Faceted Hierarchies')
    st.markdown(
        '**[CAS]** Each sunburst organises the same variables by a different dimension. '
        'Concept facet uses automatically assigned labels from embedding alignment.'
    )
    if not facet_trees:
        st.info('Build the hierarchy first to see faceted views.')
    else:
        # Overview of all facets, then a larger sunburst for the selected one.
        st.plotly_chart(plot_facets_parallel(facet_trees), width='stretch')
        st.markdown('### Per-facet detail')
        sel_facet = st.selectbox('Inspect facet tree', list(facet_trees.keys()))
        ft = facet_trees[sel_facet]
        st.plotly_chart(plot_sunburst(ft, max_depth=3), width='stretch')
        n_groups = sum(1 for n in ft if n.get('type') == 'aggregation')
        n_vars = sum(1 for n in ft if n.get("type") == "attribute")
        st.info(f'Facet **{sel_facet}**: {n_groups} groups, '
                f'{n_vars} variables')
# -- Tab 2: HiExpan report --
with tabs[2]:
    st.subheader('HiExpan Refinement Report')
    if hiexpan_report:
        c1, c2, c3 = st.columns(3)
        c1.metric('Width expansion moves', hiexpan_report.get('width_expansion_moves', 0))
        c2.metric('Depth expansion nodes', hiexpan_report.get('depth_expansion_nodes', 0))
        c3.metric('Global optimization moves', hiexpan_report.get('global_optimization_moves', 0))
        st.markdown('### Sibling coherence — before refinement (worst first)')
        before = hiexpan_report.get('coherence_before', [])
        if before:
            st.dataframe(pd.DataFrame(before), width='stretch')
        st.markdown('### Sibling coherence — after refinement')
        after = hiexpan_report.get('coherence_after', [])
        if after:
            st.dataframe(pd.DataFrame(after), width='stretch')
            a_mean = np.mean([r['coherence_score'] for r in after])
            # Only show a delta when there is a baseline to compare against;
            # otherwise the metric would display "+nan".
            if before:
                b_mean = np.mean([r['coherence_score'] for r in before])
                _coh_delta = f'{a_mean - b_mean:+.3f}'
            else:
                _coh_delta = None
            st.metric('Mean coherence improvement',
                      f'{a_mean:.3f}', delta=_coh_delta)
    else:
        st.info('HiExpan runs automatically. Build the hierarchy to see results.')
    # -- Evaluation metrics (stored by the build step) --
    ev = st.session_state.get('eval_metrics', {})
    if ev:
        st.markdown('---')
        st.subheader('Evaluation Metrics')
        st.markdown(
            'These metrics help evaluate how well the automatic concept alignment worked. '
            'For thesis evaluation, compare against a manually curated hierarchy.'
        )
        ea, eb, ec, ed = st.columns(4)
        ea.metric('Alignment coverage', f'{ev.get("alignment_coverage_%", 0)}%',
                  help='% of variables with concept score > 0.08 (non-fallback)')
        eb.metric('Avg label confidence', f'{ev.get("avg_label_confidence", 0):.3f}',
                  help='Mean concept score across all variables (0–1)')
        ec.metric('Low-confidence placements', ev.get('low_confidence_count', 0),
                  help='Variables with concept score 0–0.25 (review in Conflicts tab)')
        ed.metric('Fallback rate', f'{ev.get("fallback_rate_%", 0)}%',
                  help='% of variables that got score=0 (no concept matched above threshold)')
        e2a, e2b, e2c = st.columns(3)
        e2a.metric('Concept table size', ev.get('concept_table_size', 0))
        e2b.metric('Variables w/ code family', f'{ev.get("code_family_%", 0)}%')
        e2c.metric('Acronym expansions', ev.get('acronym_expansions', 0))
        st.caption(
            '**Thesis interpretation:** Alignment coverage > 70% indicates the concept table '
            'adequately covers the metadata domain. Fallback rate > 30% suggests the domain vocabulary '
            'is sparse — try a biomedical/cognitive dataset to activate Wikidata + PubMed enrichment. '
            'Low-confidence > 20% suggests HiExpan depth expansion created subclusters with ambiguous boundaries.'
        )
# -- Tab 3: Conflict resolution --
with tabs[3]:
    st.subheader('Conflict Resolution β Low-Confidence Placements')
    st.markdown(
        'Variables with concept assignment confidence < 0.25 may belong to multiple groups. '
        'Review and use the **Edit** tab to move them if needed.'
    )
    if can is None:
        # No canonical table is available (e.g. hierarchy loaded from JSON).
        st.info('Build the hierarchy first.')
    else:
        conflict_df = compute_conflict_table(can, nodes)
        if not len(conflict_df):
            st.success('No low-confidence placements detected.')
        else:
            st.dataframe(conflict_df, width='stretch')
# ── Tab 4: Edit ───────────────────────────────────────────────────────────────
with tabs[4]:
    # Node editor: pick a node, edit its properties (left column), restructure
    # the tree around it (right column), then list its direct children.
    # NOTE(review): block nesting was reconstructed from a flattened listing.
    # The "Children" table is placed below both columns and the provenance
    # panel is shown for every node type — confirm both against the running app.
    rows_ = path_rows(nodes)
    choice_to_id = {r['choice']: r['id'] for r in rows_}
    selected = st.selectbox('Select node', list(choice_to_id))

    # With no selectable rows the selectbox yields None; look the id up
    # defensively instead of indexing, and tolerate a node that cannot be found.
    sid = choice_to_id.get(selected)
    node = get_node(nodes, sid) if sid is not None else None

    if not node:
        st.info('No node to edit yet — build the hierarchy first.')
    else:
        is_aggregation = node.get('type') == 'aggregation'
        c1, c2 = st.columns(2)

        with c1:
            st.markdown('### Node properties')
            # Widget keys embed the node id so each node keeps its own widget state.
            new_name = st.text_input('Name', value=node.get('name', ''), key=f'nm{sid}')
            new_desc = st.text_area('Description', value=node.get('desc', ''),
                                    key=f'ds{sid}', height=100)
            dtype_vals = ['root', 'number', 'string', 'determine', 'unknown']
            cur = node.get('dtype', 'determine')
            # Unknown stored dtypes fall back to the 'determine' entry.
            new_dtype = st.selectbox(
                'Data type', dtype_vals,
                index=dtype_vals.index(cur) if cur in dtype_vals
                else dtype_vals.index('determine'),
                key=f'dt{sid}')
            new_shown = st.checkbox('Shown', value=bool(node.get('isShown', True)),
                                    key=f'sh{sid}')

            # Only aggregation nodes carry a relation type.
            new_rel = None
            if is_aggregation:
                rel_opts = list(RELATION_TYPES.keys())
                cur_rel = node.get('info', {}).get('relation_type', 'belongs_to')
                new_rel = st.selectbox(
                    'Relation type', rel_opts,
                    index=rel_opts.index(cur_rel) if cur_rel in rel_opts else 0,
                    format_func=lambda k: f'{k} — {RELATION_TYPES[k]}',
                    key=f'rel{sid}')

            if node.get('concept_provenance'):
                prov = node['concept_provenance']
                st.markdown('**Concept provenance:**')
                st.json(prov)

            if st.button('Save changes'):
                # Copy `info` so the stored node is only changed through update_node().
                info = dict(node.get('info', {}))
                if is_aggregation:
                    info['relation_type'] = new_rel
                    info['relation_label'] = RELATION_TYPES.get(new_rel, '')
                st.session_state.nodes = update_node(
                    nodes, sid, name=new_name, desc=new_desc,
                    dtype=new_dtype, isShown=new_shown, info=info)
                st.rerun()

        with c2:
            st.markdown('### Move / add / delete')

            # Only container nodes (root / aggregation) can receive a new child group.
            if node.get('type') in ['root', 'aggregation']:
                with st.form('add_grp'):
                    cname = st.text_input('New child name', value='New Group')
                    crel = st.selectbox('Relation type', list(RELATION_TYPES.keys()))
                    cdesc = st.text_area('Description', value='')
                    if st.form_submit_button('Add child'):
                        nid_ = next_id(nodes)
                        nodes.append(make_agg(nid_, cname, desc=cdesc, relation_type=crel))
                        add_child(nodes, sid, nid_)
                        st.session_state.nodes = nodes
                        st.rerun()

            # Candidate parents: every option except the node itself. Node id 0
            # is never offered for moving.
            pops = [o for o in agg_opts(nodes, True) if o['id'] != sid]
            if sid != 0 and pops:
                tgt = st.selectbox('Move under', [o['label'] for o in pops])
                tgt_id = next(o['id'] for o in pops if o['label'] == tgt)
                if st.button('Move node'):
                    st.session_state.nodes = move_node(nodes, sid, tgt_id)
                    st.rerun()

            if is_aggregation:
                rea = st.checkbox('Reattach children when deleting', value=True)
                if st.button('Delete aggregation'):
                    st.session_state.nodes = delete_agg(nodes, sid, rea)
                    st.rerun()

        st.markdown('### Children')
        # Ids in `related` that no longer resolve to a node are skipped.
        cns = [get_node(nodes, c) for c in node.get('related', [])]
        st.dataframe(
            pd.DataFrame([
                {'id': c.get('id'),
                 'name': c.get('name'),
                 'type': c.get('type'),
                 'relation': c.get('info', {}).get('relation_label', ''),
                 'desc': str(c.get('desc', ''))[:120]}
                for c in cns if c
            ]),
            width='stretch')
# ── Tab 5: Search ─────────────────────────────────────────────────────────────
with tabs[5]:
    # Case-insensitive substring search over each node's name, description,
    # dtype, type and relation label. An empty query lists every node.
    q = st.text_input('Search name, description, relation, type')
    needle = q.lower() if q else ''  # lowercase once, not once per node

    out_ = []
    for n in nodes:
        # `info` / `relation_label` / `related` may be missing or None on some
        # nodes; normalise them so the join and len() below cannot raise.
        relation = (n.get('info') or {}).get('relation_label') or ''
        children = n.get('related') or []
        fields = [n.get(k, '') for k in ['name', 'desc', 'dtype', 'type']] + [relation]
        hay = ' '.join(str(v) for v in fields).lower()
        if not needle or needle in hay:
            out_.append({'id': n.get('id'),
                         'name': n.get('name'),
                         'type': n.get('type'),
                         'relation': relation,
                         'n_children': len(children),
                         'desc': str(n.get('desc', ''))[:200]})
    st.dataframe(pd.DataFrame(out_), width='stretch')
# ── Tab 6: Semantic map ───────────────────────────────────────────────────────
with tabs[6]:
    # The chart is drawn only once `can` exists and holds at least three rows;
    # otherwise a placeholder message is shown instead.
    map_ready = can is not None and len(can) >= 3
    if map_ready:
        st.plotly_chart(semantic_map(can), width='stretch')
    else:
        st.info('Semantic map available after build.')
# ── Tab 7: Metadata ───────────────────────────────────────────────────────────
with tabs[7]:
    # Shows the `can` table with its `_raw` column hidden.
    if can is not None:
        visible_cols = [col for col in can.columns if col != '_raw']
        st.dataframe(can[visible_cols], width='stretch')
    else:
        st.info('Available after build.')
# ── Tab 8: Export ─────────────────────────────────────────────────────────────
with tabs[8]:
    # Offers up to four artifacts (hierarchy JSON, facets JSON, canonical CSV,
    # concept-label CSV) both as browser downloads and as files written next to
    # this script. Each artifact is serialized once so both routes emit the
    # same bytes.

    # Name downloads after the uploaded CSV (fall back to the project name).
    _base = safe_name(Path(uploads[0].name).stem) if uploads else safe_name(project)

    # The session entry may be absent or None before a build; treat both as "no data".
    _prov_df = st.session_state.get('prov_df')
    _has_prov = _prov_df is not None and not _prov_df.empty

    # (button label, file name, MIME type, payload bytes or None when unavailable),
    # listed in grid order: row 1 = hierarchy | facets, row 2 = canonical | labels.
    _artifacts = [
        ('Hierarchy JSON',
         f'{_base}_approach1_hierarchy.json',
         'application/json',
         json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8')),
        ('Facets JSON',
         f'{_base}_approach1_facets.json',
         'application/json',
         json.dumps(facet_trees, indent=2, ensure_ascii=False).encode('utf-8')
         if facet_trees else None),
        ('Canonical CSV',
         f'{_base}_approach1_canonical.csv',
         'text/csv',
         can.drop(columns=['_raw'], errors='ignore').to_csv(index=False).encode('utf-8')
         if can is not None else None),
        ('Concept labels CSV',
         f'{_base}_approach1_concept_labels.csv',
         'text/csv',
         _prov_df.to_csv(index=False).encode('utf-8') if _has_prov else None),
    ]

    # Two rows of two columns; an unavailable artifact leaves its cell empty so
    # the remaining buttons keep their positions.
    for _row_start in (0, 2):
        _cells = st.columns(2)
        for _cell, (_label, _fname, _mime, _payload) in zip(
                _cells, _artifacts[_row_start:_row_start + 2]):
            if _payload is None:
                continue
            with _cell:
                st.download_button(
                    _label,
                    data=_payload,
                    file_name=_fname,
                    mime=_mime,
                    width='stretch',
                )

    st.divider()

    # ── Save directly into the project's outputs/approach_1/ folder ────────────
    _out_dir = Path(__file__).resolve().parent / 'outputs' / 'approach_1'
    st.markdown('### Save to project folder')
    st.caption(
        'The download buttons above go to your browser’s Downloads folder (a browser '
        f'restriction). This button instead writes the files into `{_out_dir}` with the '
        'dataset name — convenient for `evaluate_all.py`.'
    )
    if st.button('💾 Save all to outputs/approach_1/', type='primary',
                 width='stretch'):
        try:
            _out_dir.mkdir(parents=True, exist_ok=True)
            saved = []
            for _label, _fname, _mime, _payload in _artifacts:
                if _payload is None:
                    continue
                # write_bytes keeps the on-disk file byte-identical to the download.
                (_out_dir / _fname).write_bytes(_payload)
                saved.append(_fname)
            st.success(f'Saved to `{_out_dir}`:\n\n- ' + '\n- '.join(saved))
        except OSError as _e:
            # Payloads are already serialized, so only filesystem errors
            # (permissions, missing drive, full disk) can occur here.
            st.error(f'Could not save: {_e}')

    st.divider()
    st.markdown('### Hierarchy preview (first 5 nodes)')
    st.json(nodes[:5])
# ── Tab 9: Method ─────────────────────────────────────────────────────────────
with tabs[9]:
    # Static description of the pipeline, plus three live values (detected
    # domain, concept-table size, WordNet availability) interpolated below.
    # `.get` avoids an error when the 'domain' entry has not been created yet.
    _domain = st.session_state.get('domain') or 'not yet detected'
    # NOTE(review): assumes `concept_table` is a sized container or None — confirm.
    _n_concepts = len(concept_table) if concept_table is not None else 0
    _wordnet = 'yes' if _WORDNET_AVAILABLE else 'no — run: pip install nltk'

    # Blank lines inside the Markdown are significant: without one after the
    # table, the following paragraphs are parsed as extra table rows.
    st.markdown(f"""
## Method — Approach 1

### Algorithm (no hardcoded domain labels)

| Step | What happens | Paper |
|---|---|---|
| 1. Canonical schema | Every metadata file → unified `_text` object | [GON] |
| 2. Code family detection | Variable-code structural prefix clustering (DMSL*, SWMBE*) | [GON] |
| 3. Domain detection | Auto-detect biomedical / cognitive / finance / environment / general | — |
| 4. Candidate concept extraction | TF-IDF n-grams + noun phrases + group-path terms from **input data only** | [GON][TAX] |
| 5. External concept table | Wikidata + WordNet + Wikipedia + PubMed + BioPortal → concept TABLE | [GON][TAX][HIE] |
| 6. Concept embedding | SBERT encodes variables + concept table entries | [TAX][GON] |
| 7. N×M cosine similarity | Variables × concepts: embedding + string + frequency + source score | [GON] |
| 8. Concept label assignment | Best-scoring concept label + confidence + alternatives + provenance | [GON] |
| 9. Hierarchy construction | Task/group-first backbone + automatic concept sub-groups | [TAX][HIE] |
| 10. HiExpan refinement | Sibling coherence, width expansion, depth expansion (embedding-based), global opt | [HIE] |
| 11. Castanet facets | Concept · Task · Code family · Data type parallel views | [CAS] |

**Detected domain:** `{_domain}`

**Concept table size:** `{_n_concepts} entries`

**WordNet available:** `{_wordnet}`

### Why no hardcoded patterns?

The previous version used regex lists (`MEASURE_PATTERNS`, `STAT_PATTERNS`, `CONDITION_PATTERNS`)
to label hierarchy nodes. These only worked for the AI-Mind neuropsychology dataset.

This version discovers concept labels **automatically**:

- Extracts candidate terms from **whatever metadata text the user provides**
- Validates them against **universal external knowledge** (Wikidata, WordNet, Wikipedia)
- Selects the best label by **embedding cosine similarity** — the Gonçalves N×M alignment step

For an AI-Mind dataset, the pipeline will discover "correct latency", "standard deviation",
"0 second delay" — because those phrases appear in the data. For a climate dataset,
it will discover "temperature anomaly", "precipitation rate" — again, from the data.
No domain vocabulary is assumed or hardcoded.

### Scoring formula (per cluster) [GON]

```
score(cluster, concept) =
    0.50 × SBERT cosine similarity (cluster centroid ↔ concept embedding)
  + 0.20 × word overlap (cluster description words ∩ concept label words)
  + 0.15 × frequency (concept appears in N metadata rows / max frequency)
  + 0.10 × source confidence (Wikidata=0.88, WordNet=0.83, Wikipedia=0.78, ...)
  + 0.05 × hierarchy evidence (concept has P31/P279/P361 relations in Wikidata)
```

### External sources

| Source | Domain | What it provides |
|---|---|---|
| **Wikidata** (always) | Any | Structured descriptions, P31/P279/P361 broader relations |
| **WordNet** (default) | Any | Definitions, hypernyms, synonyms |
| **Wikipedia** (optional) | Any | Full text definitions |
| **PubMed** (optional) | Biomedical/Cognitive | Abstract text for domain embeddings |
| **BioPortal** (optional, key) | Biomedical | Ontology class labels and definitions |
""")
# ── Tab 10: Evaluation ────────────────────────────────────────────────────────
with tabs[10]:
    # Hierarchy quality report: reference-free metrics first, then the
    # concept-alignment numbers recorded at build time, structural statistics,
    # and finally a caveated comparison with the input group column.
    # Imported here so the evaluation module is only loaded when this block runs.
    import hierarchy_eval as he

    st.subheader('Hierarchy Quality Evaluation')

    # Either entry may be absent or None before the first build; a plain
    # `.get(key, default)` would hand back the stored None and `.empty` would raise.
    can_eval = st.session_state.get('canonical')
    nodes_eval = st.session_state.get('nodes') or []

    if can_eval is None or can_eval.empty or not nodes_eval:
        st.info('Build a hierarchy first — metrics appear here after the build completes.')
    else:
        st.caption(
            'The group column is a *construction input* (Gonçalves text object + concept '
            'alignment), so it cannot be ground truth. The primary metrics below are '
            '**reference-free** — they assess the hierarchy itself, no gold standard.'
        )
        with st.spinner('Computing reference-free metrics…'):
            tm = he.traco_metrics(nodes_eval)
            npmi = he.npmi_coherence(nodes_eval, can_eval['_text'].tolist())

        # ── PRIMARY: reference-free hierarchy quality ─────────────────────────
        st.markdown('#### Primary — reference-free hierarchy quality')
        p1, p2, p3 = st.columns(3)
        p1.metric('Parent–child coherence', tm['pc_coherence'],
                  help='TraCo (Wu et al., AAAI 2024). Children correctly nest under parent theme.')
        p2.metric('Sibling diversity', tm['sibling_diversity'],
                  help='TraCo (Wu et al., AAAI 2024). Higher = distinct siblings; LOW = redundant.')
        p3.metric('NPMI label coherence', npmi,
                  help='Lau et al., EACL 2014. Label terms genuinely co-occur in the data.')
        st.caption(f'Embedding backend: **{tm["encoder"]}**.')

        # ── Approach-1-specific alignment metrics ─────────────────────────────
        em = st.session_state.get('eval_metrics', {})
        if em:
            # The metrics panel earlier in this file reads these percentages
            # from the same dict under '..._%' keys; prefer that spelling and
            # keep '..._pct' as a fallback so either producer format works.
            coverage_pct = em.get('alignment_coverage_%',
                                  em.get('alignment_coverage_pct', 0))
            fallback_pct = em.get('fallback_rate_%',
                                  em.get('fallback_rate_pct', 0))

            st.markdown('#### Concept-alignment metrics (Approach-1 specific)')
            a1, a2, a3 = st.columns(3)
            a1.metric('Alignment coverage', f"{coverage_pct:.1f}%",
                      help='% of variables assigned an external concept label')
            a2.metric('Avg label confidence', f"{em.get('avg_label_confidence', 0):.3f}",
                      help='Mean concept-alignment cosine score across all variables')
            a3.metric('Fallback rate', f"{fallback_pct:.1f}%",
                      help='% of variables that used TF-IDF fallback instead of external concept')

        # ── Structural statistics ─────────────────────────────────────────────
        st.markdown('#### Structural statistics')
        sm = he.structural_stats(nodes_eval)
        s1, s2, s3, s4, s5 = st.columns(5)
        s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
        s2.metric('Max leaf depth', sm['max_depth'])
        s3.metric('Avg leaf depth', sm['avg_leaf_depth'])
        s4.metric('Avg branching', sm['avg_branching_factor'])
        s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%")

        # ── SECONDARY: group preservation (caveated) ──────────────────────────
        st.markdown('#### Secondary — group-structure preservation *(descriptive)*')
        st.caption(
            '⚠️ The group column was an **input** to construction, so these are NOT accuracy '
            'metrics — only how much the hierarchy still reflects the pre-existing group column.'
        )
        gp = he.group_preservation(nodes_eval, can_eval)
        g1, g2, g3 = st.columns(3)
        g1.metric('NMI', gp['NMI'])
        g2.metric('ARI', gp['ARI'])
        g3.metric('Purity', gp['Purity'])