Spaces:

rufasharon
/

metadata_hierarchy_tfm2026

Sleeping

App Files Files Community

RoophaSharon committed 6 days ago

Commit

2b56f2e

1 Parent(s): 51c62ea

Sync demo (downloads, build summary, HCP depth fix) + latest approach_1; clean canonical outputs

Browse files

Files changed (7) hide show

approach_1.py +394 -66
demo.py +111 -7
outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json → HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json} +0 -0
outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json → HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json} +0 -0
outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json +182 -142
outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json +353 -255
pages/2_Approach_1.py +57 -14

approach_1.py CHANGED Viewed

@@ -111,6 +111,57 @@ _STOP = {
     'using','use','based','given','defined','number','value','values','score',
 }
 # ─────────────────────────────────────────────────────────────────────────────
 # FILE LOADING
 # ─────────────────────────────────────────────────────────────────────────────
@@ -308,6 +359,9 @@ def build_canonical(df, cfg, source):
         if not sem_parts:
             sem_parts = list(leaf_parts) if leaf_parts else []
         semantic_text = ' '.join(sem_parts) if sem_parts else text
         rows.append({
             '_source_file':    source,
             '_row_index':      int(i),
@@ -342,34 +396,22 @@ def build_canonical(df, cfg, source):
 # ─────────────────────────────────────────────────────────────────────────────
 def precompute_stat_cond_facets(can):
     """
-    Pre-compute _facet_stat and _facet_cond on can.
-    Called before build_concept_hierarchy so that _cluster_and_label can use
-    these columns to insert Statistic and Condition sub-tiers.
-    No hardcoding: all patterns are learned from the data descriptions.
     [CAS] Castanet parallel facets · [HIE] HiExpan sub-set discovery
     """
     can = can.copy()
     sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'
-    # ── Statistic type: detected from description text ─────────────────────────
-    _stat_re = re.compile(
-        r'\b(mean|average|median|standard deviation|std|percent|proportion|'
-        r'probability|total|sum|count|maximum|minimum|range|variance|'
-        r'coefficient|ratio|rate|frequency)\b', re.IGNORECASE
-    )
-    _stat_norm = {
-        'average': 'Mean', 'std': 'Standard Deviation', 'proportion': 'Percent',
-        'sum': 'Total', 'count': 'Total', 'frequency': 'Rate',
-    }
-    def _extract_stat(row):
-        hits = _stat_re.findall(str(row.get(sem_col, row.get('_text', ''))).lower())
-        if not hits:
-            return ''
-        h = hits[0].lower()
-        return _stat_norm.get(h, h.title())
-    stat_col = can.apply(_extract_stat, axis=1)
-    can['_facet_stat'] = stat_col.where(stat_col != '', '')
     # ── Condition: digit in variable code VALIDATED by description text ──────────
     # [FIX2][GON] Gonçalves et al. (ESWC 2019): structural code alignment must be
     # validated against description text — the description is the authoritative source.
@@ -1756,8 +1798,11 @@ _MIN_FACET_GROUP = 2  # minimum variables per facet sub-group
 def _do_facet_subsplit(sub_can, parent_id, current_path,
                        nodes, leaf_to_id, ensure_path_fn):
     """
-    [F4][CAS] Split by _facet_stat first, then delegate to _do_cond_subsplit.
-    If fewer than 2 valid stat groups, skip stat and go straight to cond.
     """
     # A facet tier that merely repeats the parent concept label (e.g. a "Total"
     # statistic under a "Total" concept) is redundant — skip it.
@@ -1858,6 +1903,133 @@ def _do_cond_subsplit(sub_can, parent_id, current_path,
 #   4. [F4] For each concept cluster: facet sub-split by Statistic → Condition
 #   5. Store concept assignment back on each variable in can
 # ─────────────────────────────────────────────────────────────────────────────
 def _concept_title(text):
     """
     Extract the human-written concept TITLE from a metadata description.
@@ -1900,9 +2072,10 @@ def _title_cluster_label(member_titles, sibling_title_lists, ancestor_words=None
     used_labels    = {str(u).lower() for u in (used_labels or [])}
     def _phrases(title):
-        t = re.sub(r'\([^)]*\)', ' ', title.lower())      # drop parenthetical conditions
         toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
-                if w not in _STOP and w not in ancestor_words]
         out = set()
         for nlen in range(1, max_words + 1):
             for i in range(len(toks) - nlen + 1):
@@ -1951,9 +2124,10 @@ def _raw_title(text):
 def _label_from_own_title(title, ancestor_words, max_words=4):
     """[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
     words and parentheticals). Returns '' for sentence-like / empty titles."""
-    t = re.sub(r'\([^)]*\)', ' ', str(title).lower())
     toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
-            if w not in _STOP and w not in ancestor_words]
     if not toks or len(toks) > 7:          # >7 words ⇒ prose, not a concept title
         return ''
     return ' '.join(toks[:max_words]).title()
@@ -2092,11 +2266,18 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
     _aw_base = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks
     if n < 3 or concept_embs is None or len(concept_table) == 0:
-        # Too few variables to cluster — label each from its own title [Fix5];
-        # ensure_path merges it into an existing concept of the same name.
         pid = ensure_path_fn(path_prefix)
         for i, (_, row) in enumerate(tdf.iterrows()):
             lbl = _label_from_own_title(titles[i], _aw_base)
             tgt = ensure_path_fn(path_prefix + [lbl]) if lbl and lbl.lower() not in \
                   {a.lower() for a in ancestor_names} else pid
             add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
@@ -2174,6 +2355,14 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
         if len(cluster_idxs) == 1:
             _, row = rows_list[cluster_idxs[0]]
             lbl = _label_from_own_title(titles[cluster_idxs[0]], _aw_base)
             if lbl and lbl.lower() not in {a.lower() for a in ancestor_names}:
                 tgt = ensure_path_fn(path_prefix + [lbl], relation='belongs_to')
                 can.at[row.name, '_concept_label'] = lbl
@@ -2182,7 +2371,7 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
                 can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root'
             add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
             can.at[row.name, '_concept_score']  = 0.0
-            can.at[row.name, '_concept_source'] = 'singleton_title'
             continue
         if cluster_emb is not None:
@@ -2201,32 +2390,100 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
         else:
             scores = []
-        # PRIMARY LABEL = the concept shared by the cluster's member TITLES, chosen
-        # contrastively against siblings (tree-based local-IDF). Reads the data's own
-        # human-written names — never the boilerplate definition text — so
-        # "Calculated Assessed Trials" can no longer be a label. No hardcoding.
         ancestor_words = set(re.findall(r'[a-z]{3,}',
                                         ' '.join(ancestor_names).lower())) | _top_level_tasks
         member_titles_k     = [titles[i] for i in cluster_idxs]
         sibling_title_lists = [all_cluster_titles[j] for j in range(n_clust) if j != k]
         title_label = _title_cluster_label(member_titles_k, sibling_title_lists,
-                                            ancestor_words=ancestor_words,
-                                            used_labels=used_sibling_labels)
-        # The TITLE wins whenever it exists. External enrichment only attaches a
-        # definition to a metadata candidate — it does NOT give it a cleaner NAME,
-        # so a 'cognitive_atlas'-sourced candidate can still be boilerplate like
-        # "Calculated Assessed Trials". Scored candidates are therefore only a
-        # FALLBACK used when the cluster has no shared title concept at all.
-        sibling_texts  = [all_cluster_texts[j] for j in range(n_clust) if j != k]
         fallback_label = (title_label
                           or get_discriminative_tfidf_label(cluster_texts_k, sibling_texts)
                           or f'Group {k+1}')
-        candidate_scores = [] if title_label else scores
         label, provenance = assign_concept_label(
             candidate_scores,
             fallback=fallback_label,
             ancestor_names=ancestor_names,
             used_sibling_labels=used_sibling_labels,
             top_level_tasks=_top_level_tasks,
@@ -2282,24 +2539,24 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
         pid = ensure_path_fn(path_prefix + [label],
                               relation='belongs_to', provenance=provenance)
-        # Store concept assignment on can (needed by Castanet facets later)
         for ci in cluster_idxs:
             _, row = rows_list[ci]
             can.at[row.name, '_concept_label']  = label
-            can.at[row.name, '_concept_score']  = round(scores[0]['score'], 3) if scores else 0.0
-            can.at[row.name, '_concept_source'] = scores[0]['source'] if scores else 'fallback'
-        # [F4][CAS][HIE] Facet-guided sub-splitting: Statistic → Condition tiers.
-        # NOTE: this uses a small hardcoded statistic/condition word list
-        # (precompute_stat_cond_facets). Removing it measurably degraded the
-        # structure (it is what separates Mean/Median/SD), so it is kept. The
-        # parent-duplicate guard inside prevents redundant "Total > Total" tiers.
-        cluster_idx_list = [rows_list[ci][0] for ci in cluster_idxs]
-        cluster_can      = can.loc[cluster_idx_list]
-        _do_facet_subsplit(
-            cluster_can, pid, path_prefix + [label],
-            nodes, leaf_to_id, ensure_path_fn
-        )
 def _remove_phrase(tokens, phrase_tokens):
@@ -2511,6 +2768,43 @@ def _prune_empty_aggregations(nodes):
     return nodes
 def build_concept_hierarchy(can, embedder, concept_table, project='metadata_project',
                              n_clusters_per_group=8):
     """
@@ -2550,6 +2844,27 @@ def build_concept_hierarchy(can, embedder, concept_table, project='metadata_proj
     # is discriminative; one close to ALL of them is boilerplate. corpus_centroid
     # is the global mean (generic = central). Both are derived purely from data.
     sem_col_all = '_semantic_text' if '_semantic_text' in can.columns else '_text'
     ref_centroids = corpus_centroid = None
     try:
         all_var_embs = embedder.encode(can[sem_col_all].fillna('').astype(str).tolist())
@@ -2648,6 +2963,9 @@ def build_concept_hierarchy(can, embedder, concept_table, project='metadata_proj
     # Remove empty concept nodes (no variables) — meaningless and they break the
     # branchvalues='total' sunburst (parent value < sum of children → blank render).
     _prune_empty_aggregations(nodes)
     # NOTE: a head-noun "category" tier (Errors/Correct) was tried and reverted —
     # it regressed setOverlap (0.914→0.836: mis-grouping) and added depth beyond gold.
     # _nest_by_category() is kept defined but intentionally NOT called.
@@ -3745,13 +4063,13 @@ if uploads:
             # [F3][F5][CAS] These columns are needed inside _cluster_and_label
             # for facet sub-splitting. They must be computed BEFORE Step G.
             # detect_facets / build_castanet_facets runs AFTER hierarchy build
-            # (Step I), so we pre-compute only _facet_stat and _facet_cond here.
-            with st.spinner('Pre-computing Statistic and Condition facets [CAS]...'):
                 can = precompute_stat_cond_facets(can)
-                n_stat  = can['_facet_stat'].ne('').sum()
                 n_cond  = can['_facet_cond'].ne('').sum()
-                st.info(f'Facet pre-computation: {n_stat} variables with Statistic, '
-                        f'{n_cond} with Condition.')
             # ── Step G: Build concept hierarchy (N×M alignment) ──────────────
             with st.spinner('Building concept hierarchy via N×M alignment [GON][TAX]...'):
@@ -3768,6 +4086,16 @@ if uploads:
                 else:
                     c_embs = None
                 nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
                 st.session_state.hiexpan_report = report
                 wmoves = report.get('width_expansion_moves', 0)
                 dexp   = report.get('depth_expansion_nodes', 0)

     'using','use','based','given','defined','number','value','values','score',
 }
+# ─── KeyBERT / labelling configuration ───────────────────────────────────────
+# These tune the KeyBERT label synthesizer used in the hybrid scorer.
+#
+# USE_NOUN_PHRASES — True: candidate phrases are NLTK POS-tagged noun phrases
+#   (needs the 'averaged_perceptron_tagger' corpus); False: plain n-gram candidates
+#   from tokens. False is robust for short CANTAB/AI-MIND descriptions and avoids the
+#   extra NLTK dependency.
+USE_NOUN_PHRASES  = False
+# USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
+#   boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
+USE_CTFIDF        = True
+# KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
+#   (pick the single most relevant phrase); 0.5 = standard MMR diversification.
+KEYBERT_DIVERSITY = 0
+# ─── Title-SEEDED KeyBERT label-scorer weights ───────────────────────────────
+# Concept labels are FORMED FROM THE DESCRIPTIONS (KeyBERT candidate phrases over the
+# cluster's member descriptions). The pre-colon title is a ranking SEED/anchor, not the
+# label itself: LABEL_W_TITLE controls how strongly it biases the choice toward the
+# human-canonical phrasing (this is "Guided/Seeded KeyBERT"). Set LABEL_W_TITLE=0 for a
+# pure-description ablation. Magnitudes are relative (need not sum to 1).
+LABEL_W_RELEVANCE = 0.45   # cosine(candidate, cluster centroid)  — description fit (α)
+LABEL_W_TITLE     = 0.35   # cosine(candidate, pre-colon title)   — title influence (β)
+LABEL_W_CONTRAST  = 0.15   # discriminativeness vs sibling clusters (γ)
+# NOTE: node labels are formed from DESCRIPTIONS + pre-colon TITLE only. External
+# ontology sources (Cognitive Atlas / Wikidata / WordNet / PubMed) inform the embedding
+# space / semantic understanding but are never used to name a node — so there is no
+# external-grounding term in the label score.
+# Corpus IDF over description n-grams; populated in build_concept_hierarchy() and
+# consumed by _keybert_label when USE_CTFIDF=True.
+_CORPUS_IDF: dict = {}
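+# Active dataset domain (Cognitive Atlas vs Wikidata routing); set in
+# build_concept_hierarchy(). NOTE(review): this was described as feeding the hybrid
+# label scorer's external-grounding signal, but per the NOTE above the label score has
+# no external-grounding term — confirm which code still reads this value.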
+_ACTIVE_DOMAIN: str = 'general'
+# Label boilerplate: web/URL artifacts and Likert response-scale tokens that leak from
+# data-dictionary descriptions (e.g. HCP FreeSurfer rows embed Neurolex URLs; survey rows
+# embed "strongly agree" scales). These are stripped from KeyBERT candidates AND from the
+# embedding text so they can neither name a node nor distort clustering. Domain-agnostic
+# documentation/scale tokens only — not concept vocabulary.
+_LABEL_BOILERPLATE = {
+    'http', 'https', 'href', 'www', 'org', 'com', 'net', 'wiki', 'url', 'link',
+    'neurolex', 'connectomedb', 'humanconnectome', 'definition', 'category',
+    'sa', 'sd', 'strongly', 'agree', 'disagree', 'neither', 'somewhat',
+}
+# Inline URLs in free text (http://…, www.…/…) — removed from the embedding text.
+_URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
+                     re.IGNORECASE)
 # ─────────────────────────────────────────────────────────────────────────────
 # FILE LOADING
 # ─────────────────────────────────────────────────────────────────────────────
         if not sem_parts:
             sem_parts = list(leaf_parts) if leaf_parts else []
         semantic_text = ' '.join(sem_parts) if sem_parts else text
+        # Strip inline URLs (HCP FreeSurfer rows embed Neurolex links) so web tokens
+        # cannot dominate either the embedding (clustering) or the KeyBERT label.
+        semantic_text = _URL_RE.sub(' ', semantic_text)
         rows.append({
             '_source_file':    source,
             '_row_index':      int(i),
 # ─────────────────────────────────────────────────────────────────────────────
 def precompute_stat_cond_facets(can):
     """
+    Pre-compute _facet_cond on can (numeric experimental conditions only).
+    Called before build_concept_hierarchy so that _cluster_and_label can use it to
+    insert Condition sub-tiers.
+    NOTE: the statistic tier (Mean / Median / SD / …) is NO LONGER computed here.
+    It used to come from a hardcoded statistic vocabulary regex, which (a) is domain
+    hardcoding and (b) is not derived from the data's own concept titles. Statistic
+    depth is now produced data-drivenly by _nest_by_measure(), which discovers the
+    shared measure phrase and keeps the residual (Mean/Median/SD) as children — no
+    word list. Condition detection below stays: it is structural (a digit in the
+    code validated against the description text), not a hardcoded vocabulary.
     [CAS] Castanet parallel facets · [HIE] HiExpan sub-set discovery
     """
     can = can.copy()
     sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'
     # ── Condition: digit in variable code VALIDATED by description text ──────────
     # [FIX2][GON] Gonçalves et al. (ESWC 2019): structural code alignment must be
     # validated against description text — the description is the authoritative source.
 def _do_facet_subsplit(sub_can, parent_id, current_path,
                        nodes, leaf_to_id, ensure_path_fn):
     """
+    [F4][CAS] Facet sub-split by _facet_cond (numeric condition) only.
+    The statistic tier is no longer inserted here — it came from a hardcoded
+    statistic vocabulary and is now produced data-drivenly by _nest_by_measure().
+    Kept defensive: if a legacy _facet_stat column is present it is still honoured,
+    but precompute_stat_cond_facets() no longer produces one.
     """
     # A facet tier that merely repeats the parent concept label (e.g. a "Total"
     # statistic under a "Total" concept) is redundant — skip it.
 #   4. [F4] For each concept cluster: facet sub-split by Statistic → Condition
 #   5. Store concept assignment back on each variable in can
 # ─────────────────────────────────────────────────────────────────────────────
+def _noun_phrases(text, max_words=4):
+    """
+    Grammatical noun phrases via NLTK POS tagging (used when USE_NOUN_PHRASES=True).
+    Returns [] if NLTK / the tagger is unavailable, so the caller falls back to
+    n-grams. Phrases are contiguous runs of adjectives/nouns up to max_words long.
+    """
+    try:
+        import nltk
+        for _pkg in ('averaged_perceptron_tagger', 'punkt'):
+            try:
+                nltk.data.find(f'taggers/{_pkg}' if 'tagger' in _pkg else f'tokenizers/{_pkg}')
+            except LookupError:
+                nltk.download(_pkg, quiet=True)
+        toks = nltk.word_tokenize(str(text))
+        tags = nltk.pos_tag(toks)
+    except Exception:
+        return []
+    phrases, cur = [], []
+    for w, t in tags:
+        if t.startswith('NN') or t.startswith('JJ'):
+            cur.append(w)
+            if len(cur) > max_words:
+                cur = cur[-max_words:]
+        else:
+            if len(cur) >= 1:
+                phrases.append(' '.join(cur))
+            cur = []
+    if cur:
+        phrases.append(' '.join(cur))
+    return [p for p in phrases if len(p) >= 3]
def _keybert_label(member_texts, cluster_centroid, embedder, ancestor_words=None,
                   corpus_centroid=None, used_labels=None, max_words=4,
                   gen_weight=0.0, diversity=KEYBERT_DIVERSITY, cap=500):
    """
    KeyBERT-style extractive labeller. Extract candidate phrases from the cluster's
    DESCRIPTIONS, embed them, and pick by:
        score = (1 − diversity)·cos(phrase, cluster_centroid)
              −      diversity ·cos(phrase, mean candidate phrase)   # MMR redundancy
    With diversity=0 this is plain cosine-to-centroid (argmax relevance). When
    USE_CTFIDF=True the relevance is modulated by corpus IDF so boilerplate (low IDF)
    is suppressed. Candidates come from noun phrases (USE_NOUN_PHRASES=True) or
    n-grams. Extractive — never hallucinates a label.

    Args:
        member_texts: descriptions of the cluster's members.
        cluster_centroid: embedding vector the candidates are compared against.
        embedder: object whose encode(list_of_str) returns one vector per string.
        ancestor_words: words already used by ancestors; removed from candidates.
        corpus_centroid: optional global embedding; used only when gen_weight != 0.
        used_labels: labels (any case) that must not be returned again.
        max_words: longest candidate phrase, in words.
        gen_weight: weight of the "close to the corpus centroid" penalty.
        diversity: MMR redundancy weight (0 disables the penalty).
        cap: maximum number of candidates embedded.

    Returns:
        The best candidate, title-cased, or '' when no candidate survives filtering.

    Candidate handling matches _keybert_candidates: inline URLs and
    _LABEL_BOILERPLATE tokens are stripped so web / Likert-scale tokens cannot name
    a node, and the candidates are sorted before the `cap` truncation and argmax so
    the chosen label does not depend on set iteration order (which varies between
    runs under string-hash randomization).
    """
    used = {str(u).lower() for u in (used_labels or [])}
    block = _STOP | set(ancestor_words or ()) | _LABEL_BOILERPLATE
    cand = set()
    for t in member_texts:
        # Drop parentheticals, then inline URLs, before tokenizing.
        raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
        nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
        if nps:
            for p in nps:
                toks = [w for w in p.lower().split() if w not in block]
                if toks:
                    cand.add(' '.join(toks))
        else:
            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
                    if w not in block]
            for nlen in range(1, max_words + 1):
                for i in range(len(toks) - nlen + 1):
                    cand.add(' '.join(toks[i:i + nlen]))
    # Junk filter: drop used labels, pure-number phrases, immediately-repeated words.
    # Candidates are already lower-case. sorted() fixes the order for cap/argmax.
    cand = sorted(
        c for c in cand
        if len(c) >= 4 and c not in used
        and not c.replace(' ', '').isdigit()
        and not re.search(r'\b(\w+)\s+\1\b', c)
    )
    if not cand:
        return ''
    cand = cand[:cap]
    embs = np.asarray(embedder.encode(cand), dtype=float)
    sims = cosine_similarity([cluster_centroid], embs)[0]          # relevance
    if USE_CTFIDF and _CORPUS_IDF:
        # Phrases absent from the IDF table get the maximum weight (treated as rare).
        mx = max(_CORPUS_IDF.values()) or 1.0
        idf = np.array([min(1.0, _CORPUS_IDF.get(c, mx) / mx) for c in cand])
        sims = sims * (0.5 + 0.5 * idf)
    if gen_weight and corpus_centroid is not None:
        sims = sims - gen_weight * cosine_similarity([corpus_centroid], embs)[0]
    if diversity > 0 and len(embs) > 1:                            # MMR penalty
        generic = cosine_similarity(embs.mean(axis=0, keepdims=True), embs)[0]
        score = (1.0 - diversity) * sims - diversity * generic
    else:
        score = sims
    return cand[int(np.argmax(score))].title()
def _keybert_candidates(member_texts, ancestor_words=None, used_labels=None,
                        max_words=3, cap=500):
    """
    Extract the KeyBERT CANDIDATE phrases from a cluster's member descriptions so
    the caller can score them with the title-seeded scorer.

    Phrases are noun phrases (USE_NOUN_PHRASES=True) or n-grams of up to max_words
    words, taken after parentheticals and inline URLs are removed and after stop
    words, ancestor/task words and _LABEL_BOILERPLATE tokens are dropped.

    Args:
        member_texts: descriptions of the cluster's members.
        ancestor_words: words already used by ancestors (any iterable of str).
        used_labels: labels (any case) that must not be proposed again.
        max_words: longest candidate phrase, in words.
        cap: maximum number of candidates returned.

    Returns:
        Un-ranked list of lower-case candidate phrases, in sorted order. Sorting
        makes both the `cap` truncation and the caller's "first candidate" fallback
        reproducible; plain set iteration order changes between runs under
        string-hash randomization.
    """
    used = {str(u).lower() for u in (used_labels or [])}
    # boilerplate/web/Likert tokens out
    block = _STOP | set(ancestor_words or ()) | _LABEL_BOILERPLATE
    cand = set()
    for t in member_texts:
        raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
        nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
        if nps:
            for p in nps:
                toks = [w for w in p.lower().split() if w not in block]
                if toks:
                    cand.add(' '.join(toks))
        else:
            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
                    if w not in block]
            for nlen in range(1, max_words + 1):
                for i in range(len(toks) - nlen + 1):
                    cand.add(' '.join(toks[i:i + nlen]))

    def _ok(c):
        """Reject junk: too short, already used, numeric, repeats, bare acronyms."""
        words = c.split()
        if len(c) < 4 or c.lower() in used or c.replace(' ', '').isdigit():
            return False
        if re.search(r'\b(\w+)\s+\1\b', c.lower()):        # adjacent word repeat
            return False
        if len(words) == 4 and words[:2] == words[2:]:     # phrase repeat "x y x y"
            return False
        # Single words of <= 3 chars were already rejected by the length check above.
        if len(words) == 1 and _is_acronym(c):             # bare acronym
            return False
        return True

    return sorted(c for c in cand if _ok(c))[:cap]
 def _concept_title(text):
     """
     Extract the human-written concept TITLE from a metadata description.
     used_labels    = {str(u).lower() for u in (used_labels or [])}
     def _phrases(title):
+        t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', title.lower()))   # drop parens + URLs
         toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
+                if w not in _STOP and w not in ancestor_words
+                and w not in _LABEL_BOILERPLATE]                          # web/Likert out
         out = set()
         for nlen in range(1, max_words + 1):
             for i in range(len(toks) - nlen + 1):
 def _label_from_own_title(title, ancestor_words, max_words=4):
     """[Fix5] Derive a label for a singleton variable from its OWN title.

     Parenthetical asides and inline URLs are removed from the lower-cased title,
     then stop words, ancestor/task words and label-boilerplate tokens are dropped.
     Returns '' when nothing is left or when more than 7 words remain (prose, not a
     concept title); otherwise the first ``max_words`` remaining words, title-cased.
     """
     lowered = str(title).lower()
     without_parens = re.sub(r'\([^)]*\)', ' ', lowered)
     cleaned = _URL_RE.sub(' ', without_parens)
     kept = []
     for word in re.findall(r'[a-z][a-z\-]+', cleaned):
         if word in _STOP or word in ancestor_words or word in _LABEL_BOILERPLATE:
             continue
         kept.append(word)
     if not 1 <= len(kept) <= 7:            # empty, or >7 words ⇒ prose
         return ''
     return ' '.join(kept[:max_words]).title()
     _aw_base = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks
     if n < 3 or concept_embs is None or len(concept_table) == 0:
+        # Too few variables to cluster — label each from its own title [Fix5], or
+        # KeyBERT over its description when no title exists. ensure_path merges it
+        # into an existing concept of the same name.
         pid = ensure_path_fn(path_prefix)
+        _small = embedder.encode(texts) if texts else None
         for i, (_, row) in enumerate(tdf.iterrows()):
             lbl = _label_from_own_title(titles[i], _aw_base)
+            if not lbl and _small is not None:
+                lbl = _keybert_label([texts[i]], _small[i], embedder,
+                                     ancestor_words=_aw_base, used_labels=set(),
+                                     max_words=2, gen_weight=0.3,
+                                     diversity=KEYBERT_DIVERSITY)
             tgt = ensure_path_fn(path_prefix + [lbl]) if lbl and lbl.lower() not in \
                   {a.lower() for a in ancestor_names} else pid
             add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
         if len(cluster_idxs) == 1:
             _, row = rows_list[cluster_idxs[0]]
             lbl = _label_from_own_title(titles[cluster_idxs[0]], _aw_base)
+            src = 'singleton_title'
+            if not lbl and cluster_emb is not None:
+                lbl = _keybert_label([cluster_texts_k[0]], cluster_emb, embedder,
+                                     ancestor_words=_aw_base,
+                                     used_labels=used_sibling_labels,
+                                     max_words=2, gen_weight=0.3,
+                                     diversity=KEYBERT_DIVERSITY)
+                src = 'singleton_keybert'
             if lbl and lbl.lower() not in {a.lower() for a in ancestor_names}:
                 tgt = ensure_path_fn(path_prefix + [lbl], relation='belongs_to')
                 can.at[row.name, '_concept_label'] = lbl
                 can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root'
             add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
             can.at[row.name, '_concept_score']  = 0.0
+            can.at[row.name, '_concept_source'] = src
             continue
         if cluster_emb is not None:
         else:
             scores = []
+        # ── TITLE-SEEDED LABEL SELECTION (Guided KeyBERT) ─────────────────────
+        # The label is FORMED FROM THE DESCRIPTIONS: candidates are KeyBERT phrases
+        # extracted from the cluster's member descriptions, plus the pre-colon title
+        # phrase as ONE extra candidate. The TITLE does NOT override — it is a
+        # ranking SEED:
+        #   score = α·cos(cand, cluster centroid)   # description fit (LABEL_W_RELEVANCE)
+        #         + β·cos(cand, title embedding)     # title INFLUENCE (LABEL_W_TITLE)
+        #         + γ·contrast(vs siblings)          # LABEL_W_CONTRAST
+        # There is no external-grounding (δ) term and no concept-table candidate:
+        # external ontology sources never name a node.
+        # So the displayed label is always a description-derived phrase, pulled toward
+        # the human-canonical title phrasing. Set LABEL_W_TITLE=0 for a pure-description
+        # ablation. Because the title phrase is in the pool, a clean title can still
+        # win on merit (it is usually present verbatim in the descriptions anyway).
         ancestor_words = set(re.findall(r'[a-z]{3,}',
                                         ' '.join(ancestor_names).lower())) | _top_level_tasks
         member_titles_k     = [titles[i] for i in cluster_idxs]
         sibling_title_lists = [all_cluster_titles[j] for j in range(n_clust) if j != k]
+        sibling_texts       = [all_cluster_texts[j] for j in range(n_clust) if j != k]
+        # Pre-colon title → used only as the SEED ANCHOR (and one candidate), never a
+        # direct override.
         title_label = _title_cluster_label(member_titles_k, sibling_title_lists,
+                                           ancestor_words=ancestor_words,
+                                           used_labels=used_sibling_labels)
+        title_emb = (embedder.encode([title_label])[0]
+                     if title_label else None)
+        # Candidate phrases drawn ONLY from the cluster's DESCRIPTIONS (KeyBERT) plus
+        # the pre-colon title. External ontology sources (Cognitive Atlas / Wikidata /
+        # WordNet / PubMed) are deliberately NOT candidates — per design they inform the
+        # embedding space / semantic understanding only, and must never name a node.
+        kb_cands = _keybert_candidates(cluster_texts_k, ancestor_words=ancestor_words,
+                                       used_labels=used_sibling_labels, max_words=3)
+        pool_src = [(c, 'keybert') for c in kb_cands]
+        if title_label:
+            pool_src.append((title_label, 'description_title'))
+        # Dedup; title's source tag takes priority over keybert when the phrase matches.
+        seen_pool = {}
+        for lbl, src in pool_src:
+            key = lbl.lower()
+            if key not in seen_pool or src == 'description_title':
+                seen_pool[key] = (lbl, src)
+        pool      = [v[0] for v in seen_pool.values()]
+        pool_srcs = [v[1] for v in seen_pool.values()]
+        keybert_label = kb_cands[0] if kb_cands else ''  # for fallback only
+        candidate_scores = []
+        if pool and cluster_emb is not None:
+            cand_embs = np.asarray(embedder.encode(pool), dtype=float)
+            relevance = cosine_similarity([cluster_emb], cand_embs)[0]
+            # c-TF-IDF: down-weight dataset-wide boilerplate (low corpus IDF) so generic
+            # phrases ("test", "description", "measure", "scores") lose to distinctive ones.
+            if USE_CTFIDF and _CORPUS_IDF:
+                _mx  = max(_CORPUS_IDF.values()) or 1.0
+                _idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), _mx) / _mx) for c in pool])
+                relevance = relevance * (0.5 + 0.5 * _idf)
+            if sibling_centroids:
+                sib_sim  = cosine_similarity(cand_embs,
+                                             np.asarray(sibling_centroids, dtype=float)).max(axis=1)
+                contrast = np.clip(relevance - sib_sim, 0.0, 1.0)
+            else:
+                contrast = np.zeros(len(pool))
+            # Title SEED: cosine of each description-derived candidate to the title.
+            if title_emb is not None:
+                title_sim = cosine_similarity(cand_embs, [title_emb])[:, 0]
+            else:
+                title_sim = np.zeros(len(pool))
+            for i, cand in enumerate(pool):
+                hyb = (LABEL_W_RELEVANCE * float(relevance[i])
+                       + LABEL_W_TITLE    * float(title_sim[i])
+                       + LABEL_W_CONTRAST * float(contrast[i]))
+                candidate_scores.append({
+                    'label':             cand,
+                    'score':             hyb,
+                    'embedding_sim':     float(relevance[i]),
+                    'coverage':          float(relevance[i]),
+                    'contrast':          float(contrast[i]),
+                    'specificity':       0.0,
+                    'string_sim':        float(title_sim[i]),  # title seed alignment
+                    'source':            pool_srcs[i],
+                    'broader_relations': [],
+                    '_emb':              cand_embs[i],
+                })
+            candidate_scores.sort(key=lambda x: -x['score'])
         fallback_label = (title_label
+                          or keybert_label
                           or get_discriminative_tfidf_label(cluster_texts_k, sibling_texts)
                           or f'Group {k+1}')
         label, provenance = assign_concept_label(
             candidate_scores,
             fallback=fallback_label,
+            min_score=0.0,
             ancestor_names=ancestor_names,
             used_sibling_labels=used_sibling_labels,
             top_level_tasks=_top_level_tasks,
         pid = ensure_path_fn(path_prefix + [label],
                               relation='belongs_to', provenance=provenance)
+        # Store concept assignment on can (needed by Castanet facets later).
+        # Provenance reflects the HYBRID winner (title / keybert / concept_table),
+        # not the old semantic-only scorer — so the exported labels CSV is accurate.
         for ci in cluster_idxs:
             _, row = rows_list[ci]
             can.at[row.name, '_concept_label']  = label
+            can.at[row.name, '_concept_score']  = provenance.get('confidence', 0.0)
+            can.at[row.name, '_concept_source'] = (provenance.get('source_evidence') or ['fallback'])[0]
+        # Attach the cluster's variables directly under the concept node. The former
+        # Statistic/Condition facet sub-split is removed: the statistic tier came from
+        # a hardcoded vocabulary (now produced data-drivenly by _nest_by_measure), and
+        # the numeric Condition tier produced bare-digit nodes (0/4/12) that inflated
+        # singleton%/n_agg and moved the tree away from gold. Castanet's Condition facet
+        # still exists as a separate parallel view via detect_facets() — not a tier.
+        for ci in cluster_idxs:
+            _, row = rows_list[ci]
+            add_child(nodes, pid, leaf_to_id[row['_leaf_id']])
 def _remove_phrase(tokens, phrase_tokens):
     return nodes
+def _dissolve_facet_singletons(nodes):
+    """
+    Dissolve FACET tier nodes (Statistic / Condition) that wrap a single variable.
+    A condition or statistic node with exactly one leaf child carries no grouping
+    value — e.g. `Standard Deviation > 0 > DMSL0SD`. We remove such nodes and
+    reattach their single child to the node's parent, keeping siblings together.
+    Scope is deliberately narrow: only nodes whose relation_type is 'has_condition'
+    or 'is_statistic_of' are touched, so genuine single-member CONCEPT nodes that
+    carry a distinctive name are preserved (per the chosen policy).
+    """
+    _FACET_RELS = {'has_condition', 'is_statistic_of'}
+    changed = True
+    while changed:
+        changed = False
+        pm = build_parent_map(nodes)
+        m  = nmap(nodes)
+        for n in list(nodes):
+            if n.get('type') != 'aggregation':
+                continue
+            if n['info'].get('relation_type') not in _FACET_RELS:
+                continue
+            nid      = int(n['id'])
+            children = [int(c) for c in n.get('related', [])]
+            # "Single variable" = exactly one child and that child is a leaf attribute.
+            if len(children) == 1 and m.get(children[0], {}).get('type') == 'attribute':
+                parent = pm.get(nid)
+                if parent is None:
+                    continue
+                add_child(nodes, parent, children[0])
+                remove_child(nodes, parent, nid)
+                nodes[:] = [x for x in nodes if int(x['id']) != nid]
+                changed = True
+                break
+    return nodes
 def build_concept_hierarchy(can, embedder, concept_table, project='metadata_project',
                              n_clusters_per_group=8):
     """
     # is discriminative; one close to ALL of them is boilerplate. corpus_centroid
     # is the global mean (generic = central). Both are derived purely from data.
     sem_col_all = '_semantic_text' if '_semantic_text' in can.columns else '_text'
+    # Active domain — used by the hybrid label scorer's external-grounding signal.
+    global _ACTIVE_DOMAIN
+    _ACTIVE_DOMAIN = detect_domain(can)
+    # Corpus IDF over description n-grams — KeyBERT c-TF-IDF distinctiveness weight
+    # (only consulted when USE_CTFIDF=True). Data-derived, dataset-agnostic.
+    global _CORPUS_IDF
+    _CORPUS_IDF = {}
+    try:
+        from sklearn.feature_extraction.text import CountVectorizer as _CV
+        _docs = can[sem_col_all].fillna('').astype(str).tolist()
+        _cv = _CV(ngram_range=(1, 3), binary=True, lowercase=True,
+                  token_pattern=r'[a-z][a-z\-]+')
+        _dt = _cv.fit_transform(_docs)
+        _dfa = np.asarray(_dt.sum(axis=0)).ravel(); _N = _dt.shape[0]
+        _CORPUS_IDF = {p: float(np.log((_N + 1) / (_dfa[i] + 1)) + 1.0)
+                       for p, i in _cv.vocabulary_.items()}
+    except Exception:
+        _CORPUS_IDF = {}
     ref_centroids = corpus_centroid = None
     try:
         all_var_embs = embedder.encode(can[sem_col_all].fillna('').astype(str).tolist())
     # Remove empty concept nodes (no variables) — meaningless and they break the
     # branchvalues='total' sunburst (parent value < sum of children → blank render).
     _prune_empty_aggregations(nodes)
+    # Dissolve 1-variable Statistic/Condition facet nodes (no grouping value).
+    _dissolve_facet_singletons(nodes)
+    _prune_empty_aggregations(nodes)
     # NOTE: a head-noun "category" tier (Errors/Correct) was tried and reverted —
     # it regressed setOverlap (0.914→0.836: mis-grouping) and added depth beyond gold.
     # _nest_by_category() is kept defined but intentionally NOT called.
             # [F3][F5][CAS] These columns are needed inside _cluster_and_label
             # for facet sub-splitting. They must be computed BEFORE Step G.
             # detect_facets / build_castanet_facets runs AFTER hierarchy build
+            # (Step I), so we pre-compute only _facet_cond here. The statistic tier
+            # is produced data-drivenly later by _nest_by_measure (no hardcoded vocab).
+            with st.spinner('Pre-computing Condition facets [CAS]...'):
                 can = precompute_stat_cond_facets(can)
                 n_cond  = can['_facet_cond'].ne('').sum()
+                st.info(f'Facet pre-computation: {n_cond} variables with Condition. '
+                        f'Statistic depth is derived from concept titles (_nest_by_measure).')
             # ── Step G: Build concept hierarchy (N×M alignment) ──────────────
             with st.spinner('Building concept hierarchy via N×M alignment [GON][TAX]...'):
                 else:
                     c_embs = None
                 nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
+                # HiExpan's width/global passes MOVE leaves between concepts; a concept
+                # that loses all its leaves becomes empty. build_concept_hierarchy prunes
+                # internally, but that runs BEFORE HiExpan — so re-prune here, else empty
+                # nodes break the Plotly branchvalues='total' sunburst/treemap (parent
+                # value < sum(children) → blank render; node-link is unaffected).
+                _prune_empty_aggregations(nodes)
+                _alive = {int(n['id']) for n in nodes}
+                for _n in nodes:
+                    _n['related'] = [x for x in dict.fromkeys(int(c) for c in _n.get('related', []))
+                                     if x in _alive]
                 st.session_state.hiexpan_report = report
                 wmoves = report.get('width_expansion_moves', 0)
                 dexp   = report.get('depth_expansion_nodes', 0)

demo.py CHANGED Viewed

@@ -46,8 +46,8 @@ PREBUILT = {
             "facets":    ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
         },
         "HCP": {
-            "hierarchy": ROOT / "approach_1" / "keybert" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
-            "facets":    ROOT / "approach_1" / "keybert" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
         },
     },
     "Approach 2": {
@@ -211,7 +211,7 @@ def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
                                  font=dict(size=13), x=0.5))
     return fig
-def plot_treemap(nodes: list, color: str):
     nodes = _filter_dissolved(nodes)
     pm = _parent_map(nodes)
     vm = _tree_value_map(nodes, pm)
@@ -228,7 +228,7 @@ def plot_treemap(nodes: list, color: str):
     fig = go.Figure(go.Treemap(
         ids=ids, labels=labels, parents=parents, values=values,
         branchvalues="total", hovertext=hover, hoverinfo="text",
-        textinfo="label+value",
         marker=dict(colorscale=color, line=dict(width=1, color="white"))))
     fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
     return fig
@@ -359,6 +359,33 @@ def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_lab
     )
     return fig
 # ─────────────────────────────────────────────────────────────────────────────
 # IO
 # ─────────────────────────────────────────────────────────────────────────────
@@ -367,12 +394,37 @@ def _load_json(path_str: str):
     with open(path_str, encoding="utf-8") as f:
         return json.load(f)
 def count_nodes(nodes: list) -> tuple[int, int]:
     nodes = _filter_dissolved(nodes)
     leaves = sum(1 for n in nodes if n.get("type") == "attribute")
     aggs = sum(1 for n in nodes if n.get("type") == "aggregation")
     return leaves, aggs
 # ─────────────────────────────────────────────────────────────────────────────
 # SIDEBAR
 # ─────────────────────────────────────────────────────────────────────────────
@@ -413,6 +465,51 @@ c1, c2, c3 = st.columns(3)
 c1.metric("Leaf Variables", leaves)
 c2.metric("Aggregation Nodes", aggs)
 c3.metric("Total Nodes", leaves + aggs)
 st.markdown("---")
 # ── Level-of-Detail controls (above chart — matches the apps) ────────────────
@@ -450,15 +547,22 @@ st.divider()
 display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
 if viz_mode == "Sunburst (drill-down)":
-    st.plotly_chart(plot_sunburst(display_nodes, color, depth), use_container_width=True)
 elif viz_mode == "Treemap":
-    st.plotly_chart(plot_treemap(display_nodes, color), use_container_width=True)
 else:
     st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
                     use_container_width=True)
 # ── Facets (Approach 1 only) ─────────────────────────────────────────────────
-facet_path = paths.get("facets")
 if facet_path is not None and facet_path.exists():
     st.markdown("---")
     st.subheader("🔀 Parallel facets")

             "facets":    ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
         },
         "HCP": {
+            "hierarchy": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
+            "facets":    ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
         },
     },
     "Approach 2": {
                                  font=dict(size=13), x=0.5))
     return fig
+def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
     nodes = _filter_dissolved(nodes)
     pm = _parent_map(nodes)
     vm = _tree_value_map(nodes, pm)
     fig = go.Figure(go.Treemap(
         ids=ids, labels=labels, parents=parents, values=values,
         branchvalues="total", hovertext=hover, hoverinfo="text",
+        textinfo="label+value", maxdepth=max_depth,
         marker=dict(colorscale=color, line=dict(width=1, color="white"))))
     fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
     return fig
     )
     return fig
+# ─────────────────────────────────────────────────────────────────────────────
+# STATS / SAFE RENDERING
+# ─────────────────────────────────────────────────────────────────────────────
+def _tree_depth(nodes: list) -> int:
+    """Max depth of the rendered single-parent tree (root = depth 0)."""
+    nodes = _filter_dissolved(nodes)
+    m = {int(n["id"]): n for n in nodes}
+    best = {"d": 0}
+    def rec(nid, d):
+        best["d"] = max(best["d"], d)
+        for c in m.get(int(nid), {}).get("related", []):
+            if int(c) in m:
+                rec(int(c), d + 1)
+    rec(0, 0)
+    return best["d"]
+def safe_render_depth(nodes: list, requested: int) -> int:
+    """Plotly sunburst/treemap silently blank when asked to draw too many sectors
+    at once (large hierarchies like HCP). Cap the *initial* render depth — the
+    chart stays fully drillable by clicking, so no data is lost."""
+    n = len(_filter_dissolved(nodes))
+    if n > 400:
+        return min(requested, 3)
+    if n > 150:
+        return min(requested, 4)
+    return requested
 # ─────────────────────────────────────────────────────────────────────────────
 # IO
 # ─────────────────────────────────────────────────────────────────────────────
     with open(path_str, encoding="utf-8") as f:
         return json.load(f)
+def _read_bytes(path_str: str) -> bytes:
+    with open(path_str, "rb") as f:
+        return f.read()
+@st.cache_data(show_spinner=False)
+def _outputs_zip(root_str: str) -> bytes:
+    """Zip the entire bundled outputs/ folder for one-click download."""
+    import io, zipfile
+    root = Path(root_str)
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
+        for p in sorted(root.rglob("*")):
+            if p.is_file():
+                zf.write(p, arcname=p.relative_to(root.parent).as_posix())
+    return buf.getvalue()
 def count_nodes(nodes: list) -> tuple[int, int]:
     nodes = _filter_dissolved(nodes)
     leaves = sum(1 for n in nodes if n.get("type") == "attribute")
     aggs = sum(1 for n in nodes if n.get("type") == "aggregation")
     return leaves, aggs
+def concept_aligned_pct(nodes: list) -> float | None:
+    """% of aggregation nodes that carry a concept/provenance label (Approach 1)."""
+    aggs = [n for n in _filter_dissolved(nodes) if n.get("type") == "aggregation"]
+    if not aggs:
+        return None
+    aligned = sum(1 for n in aggs
+                  if n.get("provenance") or n.get("concept") or n.get("source_evidence"))
+    return 100.0 * aligned / len(aggs) if aligned else None
 # ─────────────────────────────────────────────────────────────────────────────
 # SIDEBAR
 # ─────────────────────────────────────────────────────────────────────────────
 c1.metric("Leaf Variables", leaves)
 c2.metric("Aggregation Nodes", aggs)
 c3.metric("Total Nodes", leaves + aggs)
+# ── Build summary (collapsed) ────────────────────────────────────────────────
+facet_path = paths.get("facets")
+n_facets = None
+if facet_path is not None and facet_path.exists():
+    try:
+        n_facets = len(_load_json(str(facet_path)))
+    except Exception:
+        n_facets = None
+with st.expander("ℹ️ Build summary", expanded=False):
+    bs1, bs2, bs3, bs4 = st.columns(4)
+    bs1.metric("Variables", leaves)
+    bs2.metric("Internal nodes", aggs)
+    bs3.metric("Tree depth", _tree_depth(raw_nodes))
+    bs4.metric("Facets", n_facets if n_facets is not None else "—")
+    pct = concept_aligned_pct(raw_nodes)
+    if pct is not None:
+        st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
+    st.caption(
+        f"Source file: `{hier_path.name}` · "
+        f"Approach: **{approach}** · Dataset: **{dataset}**. "
+        "Tree topology and labels are reproduced exactly from the pre-built "
+        "thesis output (the algorithms are not re-run in this viewer)."
+    )
+# ── Downloads ────────────────────────────────────────────────────────────────
+d1, d2, d3 = st.columns(3)
+with d1:
+    st.download_button("⬇️ Hierarchy JSON", data=_read_bytes(str(hier_path)),
+                       file_name=hier_path.name, mime="application/json",
+                       use_container_width=True)
+with d2:
+    if facet_path is not None and facet_path.exists():
+        st.download_button("⬇️ Facets JSON", data=_read_bytes(str(facet_path)),
+                           file_name=facet_path.name, mime="application/json",
+                           use_container_width=True)
+    else:
+        st.button("⬇️ Facets JSON", disabled=True, use_container_width=True,
+                  help="This approach/dataset has no facet tree.")
+with d3:
+    st.download_button("⬇️ All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
+                       file_name="metadata_hierarchy_outputs.zip",
+                       mime="application/zip", use_container_width=True)
 st.markdown("---")
 # ── Level-of-Detail controls (above chart — matches the apps) ────────────────
 display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
 if viz_mode == "Sunburst (drill-down)":
+    eff = safe_render_depth(display_nodes, depth)
+    if eff < depth:
+        st.caption(f"Large hierarchy — showing {eff} levels initially to render "
+                   "reliably. **Click any sector to drill deeper.**")
+    st.plotly_chart(plot_sunburst(display_nodes, color, eff), use_container_width=True)
 elif viz_mode == "Treemap":
+    eff = safe_render_depth(display_nodes, depth)
+    if eff < depth:
+        st.caption(f"Large hierarchy — showing {eff} levels initially to render "
+                   "reliably. **Click a tile to drill deeper.**")
+    st.plotly_chart(plot_treemap(display_nodes, color, eff), use_container_width=True)
 else:
     st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
                     use_container_width=True)
 # ── Facets (Approach 1 only) ─────────────────────────────────────────────────
 if facet_path is not None and facet_path.exists():
     st.markdown("---")
     st.subheader("🔀 Parallel facets")

outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json → HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json} RENAMED Viewed

The diff for this file is too large to render. See raw diff

outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json → HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json} RENAMED Viewed

The diff for this file is too large to render. See raw diff

outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json CHANGED Viewed

@@ -3527,16 +3527,18 @@
         62,
         69,
         76,
         91,
-        93,
-        95,
-        99,
-        105,
-        111,
         114,
-        118,
-        120,
-        129,
         132
       ],
       "desc": "Facet: Measure Type"
@@ -3686,7 +3688,7 @@
     },
     {
       "id": 10,
-      "name": "Correct Latency",
       "related": [
         11,
         12,
@@ -3699,13 +3701,7 @@
         19,
         20,
         21,
-        22,
-        83,
-        84,
-        85,
-        86,
-        87,
-        88
       ],
       "type": "aggregation",
       "info": {
@@ -3717,7 +3713,7 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Correct Latency",
       "dtype": "determine",
       "recover": true
     },
@@ -3891,16 +3887,14 @@
     },
     {
       "id": 23,
-      "name": "Percent Correct",
       "related": [
         24,
         25,
         26,
         27,
         28,
-        29,
-        89,
-        90
       ],
       "type": "aggregation",
       "info": {
@@ -3912,7 +3906,7 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Percent Correct",
       "dtype": "determine",
       "recover": true
     },
@@ -4002,7 +3996,7 @@
     },
     {
       "id": 30,
-      "name": "Probability Error",
       "related": [
         31,
         32
@@ -4017,7 +4011,7 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Probability Error",
       "dtype": "determine",
       "recover": true
     },
@@ -4160,16 +4154,15 @@
     },
     {
       "id": 40,
-      "name": "Total Errors",
       "related": [
         41,
         42,
-        79,
-        124,
-        125,
-        126,
         127,
-        128
       ],
       "type": "aggregation",
       "info": {
@@ -4181,7 +4174,7 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Total Errors",
       "dtype": "determine",
       "recover": true
     },
@@ -4524,7 +4517,7 @@
     },
     {
       "id": 62,
-      "name": "Total Attempts Patterns",
       "related": [
         63,
         64,
@@ -4543,7 +4536,7 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Total Attempts Patterns",
       "dtype": "determine",
       "recover": true
     },
@@ -4746,9 +4739,9 @@
       "related": [
         77,
         78,
-        80,
         81,
-        82
       ],
       "type": "aggregation",
       "info": {
@@ -4794,6 +4787,26 @@
     },
     {
       "id": 79,
       "name": "PALTEA28",
       "dtype": "determine",
       "related": [],
@@ -4807,7 +4820,7 @@
       }
     },
     {
-      "id": 80,
       "name": "PALTEA4",
       "dtype": "determine",
       "related": [],
@@ -4821,7 +4834,7 @@
       }
     },
     {
-      "id": 81,
       "name": "PALTEA6",
       "dtype": "determine",
       "related": [],
@@ -4835,7 +4848,7 @@
       }
     },
     {
-      "id": 82,
       "name": "PALTEA8",
       "dtype": "determine",
       "related": [],
@@ -4849,7 +4862,32 @@
       }
     },
     {
-      "id": 83,
       "name": "PRMCLSDD",
       "dtype": "determine",
       "related": [],
@@ -4863,7 +4901,7 @@
       }
     },
     {
-      "id": 84,
       "name": "PRMCLSDI",
       "dtype": "determine",
       "related": [],
@@ -4877,7 +4915,7 @@
       }
     },
     {
-      "id": 85,
       "name": "PRMMCLD",
       "dtype": "determine",
       "related": [],
@@ -4891,7 +4929,7 @@
       }
     },
     {
-      "id": 86,
       "name": "PRMMCLI",
       "dtype": "determine",
       "related": [],
@@ -4905,7 +4943,7 @@
       }
     },
     {
-      "id": 87,
       "name": "PRMMDCLD",
       "dtype": "determine",
       "related": [],
@@ -4919,7 +4957,7 @@
       }
     },
     {
-      "id": 88,
       "name": "PRMMDCLI",
       "dtype": "determine",
       "related": [],
@@ -4933,7 +4971,28 @@
       }
     },
     {
-      "id": 89,
       "name": "PRMPCD",
       "dtype": "determine",
       "related": [],
@@ -4947,7 +5006,7 @@
       }
     },
     {
-      "id": 90,
       "name": "PRMPCI",
       "dtype": "determine",
       "related": [],
@@ -4961,10 +5020,10 @@
       }
     },
     {
-      "id": 91,
       "name": "Time Since Delayed Stimuli",
       "related": [
-        92
       ],
       "type": "aggregation",
       "info": {
@@ -4981,7 +5040,7 @@
       "recover": true
     },
     {
-      "id": 92,
       "name": "PRMTSDSP",
       "dtype": "determine",
       "related": [],
@@ -4995,10 +5054,10 @@
       }
     },
     {
-      "id": 93,
       "name": "Detection Measure",
       "related": [
-        94
       ],
       "type": "aggregation",
       "info": {
@@ -5015,7 +5074,7 @@
       "recover": true
     },
     {
-      "id": 94,
       "name": "RVPA",
       "dtype": "determine",
       "related": [],
@@ -5029,12 +5088,12 @@
       }
     },
     {
-      "id": 95,
-      "name": "Response Latency",
       "related": [
-        96,
-        97,
-        98
       ],
       "type": "aggregation",
       "info": {
@@ -5046,12 +5105,12 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Response Latency",
       "dtype": "determine",
       "recover": true
     },
     {
-      "id": 96,
       "name": "RVPLSD",
       "dtype": "determine",
       "related": [],
@@ -5065,7 +5124,7 @@
       }
     },
     {
-      "id": 97,
       "name": "RVPMDL",
       "dtype": "determine",
       "related": [],
@@ -5079,7 +5138,7 @@
       }
     },
     {
-      "id": 98,
       "name": "RVPML",
       "dtype": "determine",
       "related": [],
@@ -5093,14 +5152,14 @@
       }
     },
     {
-      "id": 99,
       "name": "Total",
       "related": [
-        100,
-        101,
-        102,
         103,
-        104
       ],
       "type": "aggregation",
       "info": {
@@ -5117,7 +5176,7 @@
       "recover": true
     },
     {
-      "id": 100,
       "name": "RVPPFA",
       "dtype": "determine",
       "related": [],
@@ -5131,7 +5190,7 @@
       }
     },
     {
-      "id": 101,
       "name": "RVPPH",
       "dtype": "determine",
       "related": [],
@@ -5145,7 +5204,7 @@
       }
     },
     {
-      "id": 102,
       "name": "RVPTFA",
       "dtype": "determine",
       "related": [],
@@ -5159,7 +5218,7 @@
       }
     },
     {
-      "id": 103,
       "name": "RVPTH",
       "dtype": "determine",
       "related": [],
@@ -5173,7 +5232,7 @@
       }
     },
     {
-      "id": 104,
       "name": "RVPTM",
       "dtype": "determine",
       "related": [],
@@ -5187,14 +5246,14 @@
       }
     },
     {
-      "id": 105,
-      "name": "Errors Boxes",
       "related": [
-        106,
-        107,
-        108,
         109,
-        110
       ],
       "type": "aggregation",
       "info": {
@@ -5206,12 +5265,12 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Errors Boxes",
       "dtype": "determine",
       "recover": true
     },
     {
-      "id": 106,
       "name": "SWMBE12",
       "dtype": "determine",
       "related": [],
@@ -5225,7 +5284,7 @@
       }
     },
     {
-      "id": 107,
       "name": "SWMBE4",
       "dtype": "determine",
       "related": [],
@@ -5239,7 +5298,7 @@
       }
     },
     {
-      "id": 108,
       "name": "SWMBE468",
       "dtype": "determine",
       "related": [],
@@ -5253,7 +5312,7 @@
       }
     },
     {
-      "id": 109,
       "name": "SWMBE6",
       "dtype": "determine",
       "related": [],
@@ -5267,7 +5326,7 @@
       }
     },
     {
-      "id": 110,
       "name": "SWMBE8",
       "dtype": "determine",
       "related": [],
@@ -5281,13 +5340,13 @@
       }
     },
     {
-      "id": 111,
       "name": "Double Errors Boxes",
       "related": [
-        112,
-        113,
         116,
-        117
       ],
       "type": "aggregation",
       "info": {
@@ -5304,7 +5363,7 @@
       "recover": true
     },
     {
-      "id": 112,
       "name": "SWMDE12",
       "dtype": "determine",
       "related": [],
@@ -5318,7 +5377,7 @@
       }
     },
     {
-      "id": 113,
       "name": "SWMDE4",
       "dtype": "determine",
       "related": [],
@@ -5332,10 +5391,10 @@
       }
     },
     {
-      "id": 114,
       "name": "Double Errors",
       "related": [
-        115
       ],
       "type": "aggregation",
       "info": {
@@ -5352,7 +5411,7 @@
       "recover": true
     },
     {
-      "id": 115,
       "name": "SWMDE468",
       "dtype": "determine",
       "related": [],
@@ -5366,7 +5425,7 @@
       }
     },
     {
-      "id": 116,
       "name": "SWMDE6",
       "dtype": "determine",
       "related": [],
@@ -5380,7 +5439,7 @@
       }
     },
     {
-      "id": 117,
       "name": "SWMDE8",
       "dtype": "determine",
       "related": [],
@@ -5394,10 +5453,10 @@
       }
     },
     {
-      "id": 118,
       "name": "Problem Reached",
       "related": [
-        119
       ],
       "type": "aggregation",
       "info": {
@@ -5414,7 +5473,7 @@
       "recover": true
     },
     {
-      "id": 119,
       "name": "SWMPR",
       "dtype": "determine",
       "related": [],
@@ -5428,12 +5487,12 @@
       }
     },
     {
-      "id": 120,
-      "name": "Strategy",
       "related": [
-        121,
-        122,
-        123
       ],
       "type": "aggregation",
       "info": {
@@ -5445,12 +5504,12 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Strategy",
       "dtype": "determine",
       "recover": true
     },
     {
-      "id": 121,
       "name": "SWMS",
       "dtype": "determine",
       "related": [],
@@ -5464,7 +5523,7 @@
       }
     },
     {
-      "id": 122,
       "name": "SWMS6",
       "dtype": "determine",
       "related": [],
@@ -5478,7 +5537,7 @@
       }
     },
     {
-      "id": 123,
       "name": "SWMSX",
       "dtype": "determine",
       "related": [],
@@ -5492,7 +5551,7 @@
       }
     },
     {
-      "id": 124,
       "name": "SWMTE12",
       "dtype": "determine",
       "related": [],
@@ -5506,7 +5565,7 @@
       }
     },
     {
-      "id": 125,
       "name": "SWMTE4",
       "dtype": "determine",
       "related": [],
@@ -5520,7 +5579,7 @@
       }
     },
     {
-      "id": 126,
       "name": "SWMTE468",
       "dtype": "determine",
       "related": [],
@@ -5534,7 +5593,7 @@
       }
     },
     {
-      "id": 127,
       "name": "SWMTE6",
       "dtype": "determine",
       "related": [],
@@ -5548,7 +5607,7 @@
       }
     },
     {
-      "id": 128,
       "name": "SWMTE8",
       "dtype": "determine",
       "related": [],
@@ -5562,13 +5621,14 @@
       }
     },
     {
-      "id": 129,
-      "name": "Within Errors Boxes",
       "related": [
-        130,
-        131,
         134,
-        135
       ],
       "type": "aggregation",
       "info": {
@@ -5580,12 +5640,12 @@
         "relation_label": "semantically related to"
       },
       "isShown": true,
-      "desc": "Measure Type: Within Errors Boxes",
       "dtype": "determine",
       "recover": true
     },
     {
-      "id": 130,
       "name": "SWMWE12",
       "dtype": "determine",
       "related": [],
@@ -5599,7 +5659,7 @@
       }
     },
     {
-      "id": 131,
       "name": "SWMWE4",
       "dtype": "determine",
       "related": [],
@@ -5613,27 +5673,7 @@
       }
     },
     {
-      "id": 132,
-      "name": "Within Errors",
-      "related": [
-        133
-      ],
-      "type": "aggregation",
-      "info": {
-        "operation": "concat",
-        "usedAttributes": [],
-        "formula": "",
-        "exec": "",
-        "relation_type": "related_to",
-        "relation_label": "semantically related to"
-      },
-      "isShown": true,
-      "desc": "Measure Type: Within Errors",
-      "dtype": "determine",
-      "recover": true
-    },
-    {
-      "id": 133,
       "name": "SWMWE468",
       "dtype": "determine",
       "related": [],
@@ -5647,7 +5687,7 @@
       }
     },
     {
-      "id": 134,
       "name": "SWMWE6",
       "dtype": "determine",
       "related": [],
@@ -5661,7 +5701,7 @@
       }
     },
     {
-      "id": 135,
       "name": "SWMWE8",
       "dtype": "determine",
       "related": [],

         62,
         69,
         76,
+        79,
+        84,
         91,
+        94,
+        96,
+        98,
+        102,
+        108,
         114,
+        117,
+        121,
+        123,
         132
       ],
       "desc": "Facet: Measure Type"
     },
     {
       "id": 10,
+      "name": "Correct Latency Mean",
       "related": [
         11,
         12,
         19,
         20,
         21,
+        22
       ],
       "type": "aggregation",
       "info": {
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Correct Latency Mean",
       "dtype": "determine",
       "recover": true
     },
     },
     {
       "id": 23,
+      "name": "Percent Correct Percentage",
       "related": [
         24,
         25,
         26,
         27,
         28,
+        29
       ],
       "type": "aggregation",
       "info": {
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Percent Correct Percentage",
       "dtype": "determine",
       "recover": true
     },
     },
     {
       "id": 30,
+      "name": "Probability Error Occurring",
       "related": [
         31,
         32
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Probability Error Occurring",
       "dtype": "determine",
       "recover": true
     },
     },
     {
       "id": 40,
+      "name": "Errors Total",
       "related": [
         41,
         42,
         127,
+        128,
+        129,
+        130,
+        131
       ],
       "type": "aggregation",
       "info": {
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Errors Total",
       "dtype": "determine",
       "recover": true
     },
     },
     {
       "id": 62,
+      "name": "Attempts Patterns Total",
       "related": [
         63,
         64,
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Attempts Patterns Total",
       "dtype": "determine",
       "recover": true
     },
       "related": [
         77,
         78,
         81,
+        82,
+        83
       ],
       "type": "aggregation",
       "info": {
     },
     {
       "id": 79,
+      "name": "Total Errors",
+      "related": [
+        80
+      ],
+      "type": "aggregation",
+      "info": {
+        "operation": "concat",
+        "usedAttributes": [],
+        "formula": "",
+        "exec": "",
+        "relation_type": "related_to",
+        "relation_label": "semantically related to"
+      },
+      "isShown": true,
+      "desc": "Measure Type: Total Errors",
+      "dtype": "determine",
+      "recover": true
+    },
+    {
+      "id": 80,
       "name": "PALTEA28",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 81,
       "name": "PALTEA4",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 82,
       "name": "PALTEA6",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 83,
       "name": "PALTEA8",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 84,
+      "name": "Latency Immediate Standard",
+      "related": [
+        85,
+        86,
+        87,
+        88,
+        89,
+        90
+      ],
+      "type": "aggregation",
+      "info": {
+        "operation": "concat",
+        "usedAttributes": [],
+        "formula": "",
+        "exec": "",
+        "relation_type": "related_to",
+        "relation_label": "semantically related to"
+      },
+      "isShown": true,
+      "desc": "Measure Type: Latency Immediate Standard",
+      "dtype": "determine",
+      "recover": true
+    },
+    {
+      "id": 85,
       "name": "PRMCLSDD",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 86,
       "name": "PRMCLSDI",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 87,
       "name": "PRMMCLD",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 88,
       "name": "PRMMCLI",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 89,
       "name": "PRMMDCLD",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 90,
       "name": "PRMMDCLI",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 91,
+      "name": "Percent Correct Immediate",
+      "related": [
+        92,
+        93
+      ],
+      "type": "aggregation",
+      "info": {
+        "operation": "concat",
+        "usedAttributes": [],
+        "formula": "",
+        "exec": "",
+        "relation_type": "related_to",
+        "relation_label": "semantically related to"
+      },
+      "isShown": true,
+      "desc": "Measure Type: Percent Correct Immediate",
+      "dtype": "determine",
+      "recover": true
+    },
+    {
+      "id": 92,
       "name": "PRMPCD",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 93,
       "name": "PRMPCI",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 94,
       "name": "Time Since Delayed Stimuli",
       "related": [
+        95
       ],
       "type": "aggregation",
       "info": {
       "recover": true
     },
     {
+      "id": 95,
       "name": "PRMTSDSP",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 96,
       "name": "Detection Measure",
       "related": [
+        97
       ],
       "type": "aggregation",
       "info": {
       "recover": true
     },
     {
+      "id": 97,
       "name": "RVPA",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 98,
+      "name": "Response Latency Mean",
       "related": [
+        99,
+        100,
+        101
       ],
       "type": "aggregation",
       "info": {
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Response Latency Mean",
       "dtype": "determine",
       "recover": true
     },
     {
+      "id": 99,
       "name": "RVPLSD",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 100,
       "name": "RVPMDL",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 101,
       "name": "RVPML",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 102,
       "name": "Total",
       "related": [
         103,
+        104,
+        105,
+        106,
+        107
       ],
       "type": "aggregation",
       "info": {
       "recover": true
     },
     {
+      "id": 103,
       "name": "RVPPFA",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 104,
       "name": "RVPPH",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 105,
       "name": "RVPTFA",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 106,
       "name": "RVPTH",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 107,
       "name": "RVPTM",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 108,
+      "name": "Errors Boxes Times",
       "related": [
         109,
+        110,
+        111,
+        112,
+        113
       ],
       "type": "aggregation",
       "info": {
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Errors Boxes Times",
       "dtype": "determine",
       "recover": true
     },
     {
+      "id": 109,
       "name": "SWMBE12",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 110,
       "name": "SWMBE4",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 111,
       "name": "SWMBE468",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 112,
       "name": "SWMBE6",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 113,
       "name": "SWMBE8",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 114,
       "name": "Double Errors Boxes",
       "related": [
+        115,
         116,
+        119,
+        120
       ],
       "type": "aggregation",
       "info": {
       "recover": true
     },
     {
+      "id": 115,
       "name": "SWMDE12",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 116,
       "name": "SWMDE4",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 117,
       "name": "Double Errors",
       "related": [
+        118
       ],
       "type": "aggregation",
       "info": {
       "recover": true
     },
     {
+      "id": 118,
       "name": "SWMDE468",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 119,
       "name": "SWMDE6",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 120,
       "name": "SWMDE8",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 121,
       "name": "Problem Reached",
       "related": [
+        122
       ],
       "type": "aggregation",
       "info": {
       "recover": true
     },
     {
+      "id": 122,
       "name": "SWMPR",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 123,
+      "name": "Strategy High",
       "related": [
+        124,
+        125,
+        126
       ],
       "type": "aggregation",
       "info": {
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Strategy High",
       "dtype": "determine",
       "recover": true
     },
     {
+      "id": 124,
       "name": "SWMS",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 125,
       "name": "SWMS6",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 126,
       "name": "SWMSX",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 127,
       "name": "SWMTE12",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 128,
       "name": "SWMTE4",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 129,
       "name": "SWMTE468",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 130,
       "name": "SWMTE6",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 131,
       "name": "SWMTE8",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 132,
+      "name": "Within Errors",
       "related": [
+        133,
         134,
+        135,
+        136,
+        137
       ],
       "type": "aggregation",
       "info": {
         "relation_label": "semantically related to"
       },
       "isShown": true,
+      "desc": "Measure Type: Within Errors",
       "dtype": "determine",
       "recover": true
     },
     {
+      "id": 133,
       "name": "SWMWE12",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 134,
       "name": "SWMWE4",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 135,
       "name": "SWMWE468",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 136,
       "name": "SWMWE6",
       "dtype": "determine",
       "related": [],
       }
     },
     {
+      "id": 137,
       "name": "SWMWE8",
       "dtype": "determine",
       "related": [],

outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json CHANGED Viewed

@@ -1645,7 +1645,7 @@
       115,
       116,
       117,
-      146
     ],
     "type": "aggregation",
     "info": {
@@ -1687,16 +1687,19 @@
     "recover": true,
     "concept_provenance": {
       "node_label": "Total Correct",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -1722,21 +1725,65 @@
     "recover": true,
     "concept_provenance": {
       "node_label": "Error",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
       "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
     "id": 113,
-    "name": "Total Errors",
     "related": [
       34,
       35
@@ -1751,21 +1798,24 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: DMS > Total Errors",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Total Errors",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -1794,21 +1844,24 @@
     "recover": true,
     "concept_provenance": {
       "node_label": "Correct Latency Standard Deviation",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
     "id": 115,
-    "name": "Probability Error",
     "related": [
       26,
       27
@@ -1823,26 +1876,30 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: DMS > Probability Error",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Probability Error",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
     "id": 116,
-    "name": "Percent Correct",
     "related": [
       20,
       21,
@@ -1861,21 +1918,25 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: DMS > Percent Correct",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Percent Correct",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -1941,11 +2002,11 @@
     "recover": true,
     "concept_provenance": {
       "node_label": "Latency Display Stimulus",
-      "confidence": 0.678,
       "alternatives": [
-        "Mean Latency",
-        "Latency",
-        "Response Latency"
       ],
       "source_evidence": [
         "keybert"
@@ -1953,7 +2014,7 @@
       "embedding_sim": 0.732,
       "coverage": 0.732,
       "contrast": 0.595,
-      "specificity": 1.0,
       "string_sim": 0.0
     }
   },
@@ -1979,10 +2040,11 @@
     "recover": true,
     "concept_provenance": {
       "node_label": "Total Assessment Trials",
-      "confidence": 0.534,
       "alternatives": [
-        "Assessment Trials",
-        "Assessed Trials Simultaneous"
       ],
       "source_evidence": [
         "keybert"
@@ -1990,7 +2052,7 @@
       "embedding_sim": 0.629,
       "coverage": 0.629,
       "contrast": 0.204,
-      "specificity": 1.0,
       "string_sim": 0.0
     }
   },
@@ -1998,11 +2060,10 @@
     "id": 121,
     "name": "PAL",
     "related": [
-      122,
       123,
-      124,
       125,
       126,
       147
     ],
     "type": "aggregation",
@@ -2021,7 +2082,7 @@
   },
   {
     "id": 122,
-    "name": "Total Attempts Patterns",
     "related": [
       49,
       50,
@@ -2040,21 +2101,24 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: PAL > Total Attempts Patterns",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Total Attempts Patterns",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -2079,7 +2143,7 @@
   },
   {
     "id": 124,
-    "name": "Errors Patterns Total",
     "related": [
       55,
       56,
@@ -2103,11 +2167,11 @@
     "recover": true,
     "concept_provenance": {
       "node_label": "Errors Patterns Total",
-      "confidence": 0.507,
       "alternatives": [
-        "Correct Box Attempt",
-        "Subject Revisits Box",
-        "Box Attempt"
       ],
       "source_evidence": [
         "keybert"
@@ -2115,7 +2179,7 @@
       "embedding_sim": 0.619,
       "coverage": 0.619,
       "contrast": 0.115,
-      "specificity": 1.0,
       "string_sim": 0.0
     }
   },
@@ -2184,16 +2248,20 @@
     "recover": true,
     "concept_provenance": {
       "node_label": "Include Total Errors Shapes",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -2220,10 +2288,14 @@
   },
   {
     "id": 130,
-    "name": "Correct Latency",
     "related": [
-      152,
-      153
     ],
     "type": "aggregation",
     "info": {
@@ -2235,21 +2307,25 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: PRM > Correct Latency",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Correct Latency",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -2274,7 +2350,7 @@
   },
   {
     "id": 132,
-    "name": "Percent Correct",
     "related": [
       73,
       74
@@ -2289,21 +2365,25 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: PRM > Percent Correct",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Percent Correct",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -2353,21 +2433,24 @@
     "recover": true,
     "concept_provenance": {
       "node_label": "Total",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
     "id": 135,
-    "name": "Response Latency",
     "related": [
       77,
       78,
@@ -2383,21 +2466,25 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: RVP > Response Latency",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Response Latency",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -2426,9 +2513,9 @@
     "related": [
       140,
       141,
       143,
       144,
-      145,
       148
     ],
     "type": "aggregation",
@@ -2445,9 +2532,50 @@
     "dtype": "determine",
     "recover": true
   },
   {
     "id": 140,
-    "name": "Strategy",
     "related": [
       96,
       97,
@@ -2463,21 +2591,25 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: SWM > Strategy",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Strategy",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
-      "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
@@ -2501,14 +2633,14 @@
     "recover": true
   },
   {
-    "id": 143,
-    "name": "Total Errors",
     "related": [
-      99,
-      100,
-      101,
-      102,
-      103
     ],
     "type": "aggregation",
     "info": {
@@ -2520,32 +2652,34 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: SWM > Total Errors",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
-      "node_label": "Total Errors",
-      "confidence": 0.0,
-      "alternatives": [],
       "source_evidence": [
-        "tfidf_fallback"
       ],
-      "embedding_sim": 0.0,
-      "string_sim": 0.0,
-      "coverage": 0.0,
       "contrast": 0.0,
-      "specificity": 0.0
     }
   },
   {
-    "id": 144,
-    "name": "Within Errors",
     "related": [
-      106,
-      104,
-      105,
-      107,
-      108
     ],
     "type": "aggregation",
     "info": {
@@ -2557,12 +2691,28 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Concept group: SWM > Within Errors",
     "dtype": "determine",
-    "recover": true
   },
   {
-    "id": 145,
     "name": "Double Errors",
     "related": [
       92,
@@ -2586,14 +2736,11 @@
     "recover": true
   },
   {
-    "id": 146,
     "name": "Correct Latency",
     "related": [
-      114,
-      154,
-      155,
-      156,
-      157
     ],
     "type": "aggregation",
     "info": {
@@ -2609,6 +2756,27 @@
     "dtype": "determine",
     "recover": true
   },
   {
     "id": 147,
     "name": "Total Errors",
@@ -2634,11 +2802,7 @@
     "id": 148,
     "name": "Errors Boxes",
     "related": [
-      85,
-      86,
-      87,
-      88,
-      89
     ],
     "type": "aggregation",
     "info": {
@@ -2719,50 +2883,6 @@
   },
   {
     "id": 152,
-    "name": "Delayed",
-    "related": [
-      67,
-      69,
-      71
-    ],
-    "type": "aggregation",
-    "info": {
-      "operation": "concat",
-      "usedAttributes": [],
-      "formula": "",
-      "exec": "",
-      "relation_type": "belongs_to",
-      "relation_label": "belongs to"
-    },
-    "isShown": true,
-    "desc": "Sub-group: Delayed",
-    "dtype": "determine",
-    "recover": true
-  },
-  {
-    "id": 153,
-    "name": "Immediate",
-    "related": [
-      72,
-      68,
-      70
-    ],
-    "type": "aggregation",
-    "info": {
-      "operation": "concat",
-      "usedAttributes": [],
-      "formula": "",
-      "exec": "",
-      "relation_type": "belongs_to",
-      "relation_label": "belongs to"
-    },
-    "isShown": true,
-    "desc": "Sub-group: Immediate",
-    "dtype": "determine",
-    "recover": true
-  },
-  {
-    "id": 154,
     "name": "Median Seconds Delay",
     "related": [
       9,
@@ -2784,8 +2904,8 @@
     "recover": true
   },
   {
-    "id": 155,
-    "name": "Mean Seconds Delay",
     "related": [
       16,
       17,
@@ -2801,12 +2921,12 @@
       "relation_label": "belongs to"
     },
     "isShown": true,
-    "desc": "Sub-group: Mean Seconds Delay",
     "dtype": "determine",
     "recover": true
   },
   {
-    "id": 156,
     "name": "Median",
     "related": [
       8,
@@ -2826,27 +2946,5 @@
     "desc": "Sub-group: Median",
     "dtype": "determine",
     "recover": true
-  },
-  {
-    "id": 157,
-    "name": "Mean",
-    "related": [
-      18,
-      19,
-      14
-    ],
-    "type": "aggregation",
-    "info": {
-      "operation": "concat",
-      "usedAttributes": [],
-      "formula": "",
-      "exec": "",
-      "relation_type": "belongs_to",
-      "relation_label": "belongs to"
-    },
-    "isShown": true,
-    "desc": "Sub-group: Mean",
-    "dtype": "determine",
-    "recover": true
   }
 ]

       115,
       116,
       117,
+      145
     ],
     "type": "aggregation",
     "info": {
     "recover": true,
     "concept_provenance": {
       "node_label": "Total Correct",
+      "confidence": 0.507,
+      "alternatives": [
+        "correct total",
+        "correct total times"
+      ],
       "source_evidence": [
+        "description_title"
       ],
+      "embedding_sim": 0.319,
+      "coverage": 0.319,
+      "contrast": 0.086,
+      "specificity": 0.0,
+      "string_sim": 1.0
     }
   },
   {
     "recover": true,
     "concept_provenance": {
       "node_label": "Error",
+      "confidence": 0.447,
+      "alternatives": [
+        "error times subject",
+        "error times",
+        "failed"
+      ],
       "source_evidence": [
+        "description_title"
       ],
+      "embedding_sim": 0.216,
+      "coverage": 0.216,
       "contrast": 0.0,
+      "specificity": 0.0,
+      "string_sim": 1.0
+    }
+  },
+  {
+    "id": 112,
+    "name": "Mean Latency",
+    "related": [
+      14,
+      18,
+      19,
+      152,
+      153,
+      154
+    ],
+    "type": "aggregation",
+    "info": {
+      "operation": "concat",
+      "usedAttributes": [],
+      "formula": "",
+      "exec": "",
+      "relation_type": "belongs_to",
+      "relation_label": "belongs to"
+    },
+    "isShown": true,
+    "desc": "Concept group: DMS > Correct Latency Mean",
+    "dtype": "determine",
+    "recover": true,
+    "concept_provenance": {
+      "node_label": "Correct Latency Mean",
+      "confidence": 0.625,
+      "alternatives": [
+        "latency mean"
+      ],
+      "source_evidence": [
+        "keybert"
+      ],
+      "embedding_sim": 0.676,
+      "coverage": 0.676,
+      "contrast": 0.076,
+      "specificity": 0.0,
+      "string_sim": 0.884
     }
   },
   {
     "id": 113,
+    "name": "Errors Total",
     "related": [
       34,
       35
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: DMS > Errors Total",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Errors Total",
+      "confidence": 0.604,
+      "alternatives": [
+        "errors total times",
+        "Total Errors"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.543,
+      "coverage": 0.543,
+      "contrast": 0.125,
+      "specificity": 0.0,
+      "string_sim": 0.974
     }
   },
   {
     "recover": true,
     "concept_provenance": {
       "node_label": "Correct Latency Standard Deviation",
+      "confidence": 0.687,
+      "alternatives": [
+        "latency standard deviation",
+        "deviation response latencies"
+      ],
       "source_evidence": [
+        "description_title"
       ],
+      "embedding_sim": 0.684,
+      "coverage": 0.684,
+      "contrast": 0.193,
+      "specificity": 0.0,
+      "string_sim": 1.0
     }
   },
   {
     "id": 115,
+    "name": "Probability Error Occurring",
     "related": [
       26,
       27
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: DMS > Probability Error Occurring",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Probability Error Occurring",
+      "confidence": 0.619,
+      "alternatives": [
+        "Probability Error",
+        "probability error made",
+        "reports probability error"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.578,
+      "coverage": 0.578,
+      "contrast": 0.142,
+      "specificity": 0.0,
+      "string_sim": 0.966
     }
   },
   {
     "id": 116,
+    "name": "Percent Correct Percentage",
     "related": [
       20,
       21,
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: DMS > Percent Correct Percentage",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Percent Correct Percentage",
+      "confidence": 0.54,
+      "alternatives": [
+        "correct percentage assessment",
+        "correct percentage",
+        "Percent Correct"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.473,
+      "coverage": 0.473,
+      "contrast": 0.156,
+      "specificity": 0.0,
+      "string_sim": 0.868
     }
   },
   {
     "recover": true,
     "concept_provenance": {
       "node_label": "Latency Display Stimulus",
+      "confidence": 0.418,
       "alternatives": [
+        "mean latency display",
+        "standard deviation latency",
+        "deviation latency calculated"
       ],
       "source_evidence": [
         "keybert"
       "embedding_sim": 0.732,
       "coverage": 0.732,
       "contrast": 0.595,
+      "specificity": 0.0,
       "string_sim": 0.0
     }
   },
     "recover": true,
     "concept_provenance": {
       "node_label": "Total Assessment Trials",
+      "confidence": 0.313,
       "alternatives": [
+        "assessment trials subject",
+        "trials subject failed",
+        "trials subject"
       ],
       "source_evidence": [
         "keybert"
       "embedding_sim": 0.629,
       "coverage": 0.629,
       "contrast": 0.204,
+      "specificity": 0.0,
       "string_sim": 0.0
     }
   },
     "id": 121,
     "name": "PAL",
     "related": [
       123,
       125,
       126,
+      146,
       147
     ],
     "type": "aggregation",
   },
   {
     "id": 122,
+    "name": "Attempts Patterns",
     "related": [
       49,
       50,
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: PAL > Attempts Patterns Total",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Attempts Patterns Total",
+      "confidence": 0.633,
+      "alternatives": [
+        "patterns total attempts",
+        "Total Attempts Patterns"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.598,
+      "coverage": 0.598,
+      "contrast": 0.151,
+      "specificity": 0.0,
+      "string_sim": 0.975
     }
   },
   {
   },
   {
     "id": 124,
+    "name": "Errors Patterns",
     "related": [
       55,
       56,
     "recover": true,
     "concept_provenance": {
       "node_label": "Errors Patterns Total",
+      "confidence": 0.296,
       "alternatives": [
+        "box stimulus assessment",
+        "stimulus assessment problems",
+        "incorrect box stimulus"
       ],
       "source_evidence": [
         "keybert"
       "embedding_sim": 0.619,
       "coverage": 0.619,
       "contrast": 0.115,
+      "specificity": 0.0,
       "string_sim": 0.0
     }
   },
     "recover": true,
     "concept_provenance": {
       "node_label": "Include Total Errors Shapes",
+      "confidence": 0.609,
+      "alternatives": [
+        "total errors shapes",
+        "errors shapes times",
+        "errors shapes"
+      ],
       "source_evidence": [
+        "description_title"
       ],
+      "embedding_sim": 0.549,
+      "coverage": 0.549,
+      "contrast": 0.08,
+      "specificity": 0.0,
+      "string_sim": 1.0
     }
   },
   {
   },
   {
     "id": 130,
+    "name": "Latency Immediate Standard",
     "related": [
+      67,
+      68,
+      69,
+      70,
+      71,
+      72
     ],
     "type": "aggregation",
     "info": {
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: PRM > Latency Immediate Standard",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Latency Immediate Standard",
+      "confidence": 0.653,
+      "alternatives": [
+        "correct latency immediate",
+        "latency immediate",
+        "correct latency delayed"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.715,
+      "coverage": 0.715,
+      "contrast": 0.34,
+      "specificity": 0.0,
+      "string_sim": 0.801
     }
   },
   {
   },
   {
     "id": 132,
+    "name": "Percent Correct Immediate",
     "related": [
       73,
       74
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: PRM > Percent Correct Immediate",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Percent Correct Immediate",
+      "confidence": 0.596,
+      "alternatives": [
+        "Percent Correct",
+        "key percent correct",
+        "percent correct delayed"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.671,
+      "coverage": 0.671,
+      "contrast": 0.245,
+      "specificity": 0.0,
+      "string_sim": 0.735
     }
   },
   {
     "recover": true,
     "concept_provenance": {
       "node_label": "Total",
+      "confidence": 0.407,
+      "alternatives": [
+        "total hits",
+        "hits total"
+      ],
       "source_evidence": [
+        "description_title"
       ],
+      "embedding_sim": 0.111,
+      "coverage": 0.111,
+      "contrast": 0.05,
+      "specificity": 0.0,
+      "string_sim": 1.0
     }
   },
   {
     "id": 135,
+    "name": "Response Latency Mean",
     "related": [
       77,
       78,
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: RVP > Response Latency Mean",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Response Latency Mean",
+      "confidence": 0.676,
+      "alternatives": [
+        "Response Latency",
+        "response latency trials",
+        "latency mean response"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.683,
+      "coverage": 0.683,
+      "contrast": 0.311,
+      "specificity": 0.0,
+      "string_sim": 0.92
     }
   },
   {
     "related": [
       140,
       141,
+      142,
       143,
       144,
       148
     ],
     "type": "aggregation",
     "dtype": "determine",
     "recover": true
   },
+  {
+    "id": 138,
+    "name": "Times Errors",
+    "related": [
+      85,
+      86,
+      87,
+      88,
+      89
+    ],
+    "type": "aggregation",
+    "info": {
+      "operation": "concat",
+      "usedAttributes": [],
+      "formula": "",
+      "exec": "",
+      "relation_type": "belongs_to",
+      "relation_label": "belongs to"
+    },
+    "isShown": true,
+    "desc": "Concept group: SWM > Errors Boxes Times",
+    "dtype": "determine",
+    "recover": true,
+    "concept_provenance": {
+      "node_label": "Errors Boxes Times",
+      "confidence": 0.515,
+      "alternatives": [
+        "Errors Boxes",
+        "key errors boxes",
+        "errors times"
+      ],
+      "source_evidence": [
+        "keybert"
+      ],
+      "embedding_sim": 0.447,
+      "coverage": 0.447,
+      "contrast": 0.0,
+      "specificity": 0.0,
+      "string_sim": 0.896
+    }
+  },
   {
     "id": 140,
+    "name": "Strategy High",
     "related": [
       96,
       97,
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: SWM > Strategy High",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Strategy High",
+      "confidence": 0.569,
+      "alternatives": [
+        "Strategy",
+        "strategy finding",
+        "high strategy"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.509,
+      "coverage": 0.509,
+      "contrast": 0.362,
+      "specificity": 0.0,
+      "string_sim": 0.814
     }
   },
   {
     "recover": true
   },
   {
+    "id": 142,
+    "name": "Within Errors",
     "related": [
+      104,
+      105,
+      107,
+      108,
+      106
     ],
     "type": "aggregation",
     "info": {
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: SWM > Within Errors",
     "dtype": "determine",
     "recover": true,
     "concept_provenance": {
+      "node_label": "Within Errors",
+      "confidence": 0.412,
+      "alternatives": [
+        "boxes times subject"
+      ],
       "source_evidence": [
+        "keybert"
       ],
+      "embedding_sim": 0.303,
+      "coverage": 0.303,
       "contrast": 0.0,
+      "specificity": 0.0,
+      "string_sim": 0.787
     }
   },
   {
+    "id": 143,
+    "name": "Errors Total",
     "related": [
+      99,
+      100,
+      101,
+      102,
+      103
     ],
     "type": "aggregation",
     "info": {
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Concept group: SWM > Errors Total",
     "dtype": "determine",
+    "recover": true,
+    "concept_provenance": {
+      "node_label": "Errors Total",
+      "confidence": 0.593,
+      "alternatives": [
+        "errors total times",
+        "Total Errors"
+      ],
+      "source_evidence": [
+        "keybert"
+      ],
+      "embedding_sim": 0.537,
+      "coverage": 0.537,
+      "contrast": 0.07,
+      "specificity": 0.0,
+      "string_sim": 0.974
+    }
   },
   {
+    "id": 144,
     "name": "Double Errors",
     "related": [
       92,
     "recover": true
   },
   {
+    "id": 145,
     "name": "Correct Latency",
     "related": [
+      112,
+      114
     ],
     "type": "aggregation",
     "info": {
     "dtype": "determine",
     "recover": true
   },
+  {
+    "id": 146,
+    "name": "Patterns Total",
+    "related": [
+      122,
+      124
+    ],
+    "type": "aggregation",
+    "info": {
+      "operation": "concat",
+      "usedAttributes": [],
+      "formula": "",
+      "exec": "",
+      "relation_type": "belongs_to",
+      "relation_label": "belongs to"
+    },
+    "isShown": true,
+    "desc": "Measure: Patterns Total",
+    "dtype": "determine",
+    "recover": true
+  },
   {
     "id": 147,
     "name": "Total Errors",
     "id": 148,
     "name": "Errors Boxes",
     "related": [
+      138
     ],
     "type": "aggregation",
     "info": {
   },
   {
     "id": 152,
     "name": "Median Seconds Delay",
     "related": [
       9,
     "recover": true
   },
   {
+    "id": 153,
+    "name": "Seconds Delay",
     "related": [
       16,
       17,
       "relation_label": "belongs to"
     },
     "isShown": true,
+    "desc": "Sub-group: Seconds Delay",
     "dtype": "determine",
     "recover": true
   },
   {
+    "id": 154,
     "name": "Median",
     "related": [
       8,
     "desc": "Sub-group: Median",
     "dtype": "determine",
     "recover": true
   }
 ]

pages/2_Approach_1.py CHANGED Viewed

@@ -121,7 +121,7 @@ _STOP = {
 USE_NOUN_PHRASES  = False
 # USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
 #   boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
-USE_CTFIDF        = False
 # KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
 #   (pick the single most relevant phrase); 0.5 = standard MMR diversification.
 KEYBERT_DIVERSITY = 0
@@ -148,6 +148,20 @@ _CORPUS_IDF: dict = {}
 # scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing).
 _ACTIVE_DOMAIN: str = 'general'
 # ─────────────────────────────────────────────────────────────────────────────
 # FILE LOADING
 # ─────────────────────────────────────────────────────────────────────────────
@@ -345,6 +359,9 @@ def build_canonical(df, cfg, source):
         if not sem_parts:
             sem_parts = list(leaf_parts) if leaf_parts else []
         semantic_text = ' '.join(sem_parts) if sem_parts else text
         rows.append({
             '_source_file':    source,
             '_row_index':      int(i),
@@ -1983,26 +2000,34 @@ def _keybert_candidates(member_texts, ancestor_words=None, used_labels=None,
     """
     ancestor_words = ancestor_words or set()
     used = {str(u).lower() for u in (used_labels or [])}
     cand = set()
     for t in member_texts:
-        raw = re.sub(r'\([^)]*\)', ' ', str(t))
         nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
         if nps:
             for p in nps:
-                toks = [w for w in p.lower().split()
-                        if w not in _STOP and w not in ancestor_words]
                 if toks:
                     cand.add(' '.join(toks))
         else:
-            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
-                    if w not in _STOP and w not in ancestor_words]
             for nlen in range(1, max_words + 1):
                 for i in range(len(toks) - nlen + 1):
                     cand.add(' '.join(toks[i:i + nlen]))
-    cand = [c for c in cand if len(c) >= 4 and c.lower() not in used
-            and not c.replace(' ', '').isdigit()
-            and not re.search(r'\b(\w+)\s+\1\b', c.lower())]
-    return cand[:cap]
 def _concept_title(text):
@@ -2047,9 +2072,10 @@ def _title_cluster_label(member_titles, sibling_title_lists, ancestor_words=None
     used_labels    = {str(u).lower() for u in (used_labels or [])}
     def _phrases(title):
-        t = re.sub(r'\([^)]*\)', ' ', title.lower())      # drop parenthetical conditions
         toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
-                if w not in _STOP and w not in ancestor_words]
         out = set()
         for nlen in range(1, max_words + 1):
             for i in range(len(toks) - nlen + 1):
@@ -2098,9 +2124,10 @@ def _raw_title(text):
 def _label_from_own_title(title, ancestor_words, max_words=4):
     """[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
     words and parentheticals). Returns '' for sentence-like / empty titles."""
-    t = re.sub(r'\([^)]*\)', ' ', str(title).lower())
     toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
-            if w not in _STOP and w not in ancestor_words]
     if not toks or len(toks) > 7:          # >7 words ⇒ prose, not a concept title
         return ''
     return ' '.join(toks[:max_words]).title()
@@ -2413,6 +2440,12 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
         if pool and cluster_emb is not None:
             cand_embs = np.asarray(embedder.encode(pool), dtype=float)
             relevance = cosine_similarity([cluster_emb], cand_embs)[0]
             if sibling_centroids:
                 sib_sim  = cosine_similarity(cand_embs,
                                              np.asarray(sibling_centroids, dtype=float)).max(axis=1)
@@ -4053,6 +4086,16 @@ if uploads:
                 else:
                     c_embs = None
                 nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
                 st.session_state.hiexpan_report = report
                 wmoves = report.get('width_expansion_moves', 0)
                 dexp   = report.get('depth_expansion_nodes', 0)

 USE_NOUN_PHRASES  = False
 # USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
 #   boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
+USE_CTFIDF        = True
 # KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
 #   (pick the single most relevant phrase); 0.5 = standard MMR diversification.
 KEYBERT_DIVERSITY = 0
 # scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing).
 _ACTIVE_DOMAIN: str = 'general'
+# Label boilerplate: web/URL artifacts and Likert response-scale tokens that leak from
+# data-dictionary descriptions (e.g. HCP FreeSurfer rows embed Neurolex URLs; survey rows
+# embed "strongly agree" scales). These are stripped from KeyBERT candidates AND from the
+# embedding text so they can neither name a node nor distort clustering. Domain-agnostic
+# documentation/scale tokens only — not concept vocabulary.
+_LABEL_BOILERPLATE = {
+    'http', 'https', 'href', 'www', 'org', 'com', 'net', 'wiki', 'url', 'link',
+    'neurolex', 'connectomedb', 'humanconnectome', 'definition', 'category',
+    'sa', 'sd', 'strongly', 'agree', 'disagree', 'neither', 'somewhat',
+}
+# Inline URLs in free text (http://…, www.…/…) — removed from the embedding text.
+_URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
+                     re.IGNORECASE)
 # ─────────────────────────────────────────────────────────────────────────────
 # FILE LOADING
 # ─────────────────────────────────────────────────────────────────────────────
         if not sem_parts:
             sem_parts = list(leaf_parts) if leaf_parts else []
         semantic_text = ' '.join(sem_parts) if sem_parts else text
+        # Strip inline URLs (HCP FreeSurfer rows embed Neurolex links) so web tokens
+        # cannot dominate either the embedding (clustering) or the KeyBERT label.
+        semantic_text = _URL_RE.sub(' ', semantic_text)
         rows.append({
             '_source_file':    source,
             '_row_index':      int(i),
     """
     ancestor_words = ancestor_words or set()
     used = {str(u).lower() for u in (used_labels or [])}
+    block = _STOP | ancestor_words | _LABEL_BOILERPLATE   # boilerplate/web/Likert tokens out
     cand = set()
     for t in member_texts:
+        raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
         nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
         if nps:
             for p in nps:
+                toks = [w for w in p.lower().split() if w not in block]
                 if toks:
                     cand.add(' '.join(toks))
         else:
+            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower()) if w not in block]
             for nlen in range(1, max_words + 1):
                 for i in range(len(toks) - nlen + 1):
                     cand.add(' '.join(toks[i:i + nlen]))
+    def _ok(c):
+        words = c.split()
+        if len(c) < 4 or c.lower() in used or c.replace(' ', '').isdigit():
+            return False
+        if re.search(r'\b(\w+)\s+\1\b', c.lower()):        # adjacent word repeat
+            return False
+        if len(words) == 4 and words[:2] == words[2:]:     # phrase repeat "x y x y"
+            return False
+        if len(words) == 1 and (len(c) <= 3 or _is_acronym(c)):  # bare fragment/acronym
+            return False
+        return True
+    return [c for c in cand if _ok(c)][:cap]
 def _concept_title(text):
     used_labels    = {str(u).lower() for u in (used_labels or [])}
     def _phrases(title):
+        t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', title.lower()))   # drop parens + URLs
         toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
+                if w not in _STOP and w not in ancestor_words
+                and w not in _LABEL_BOILERPLATE]                          # web/Likert out
         out = set()
         for nlen in range(1, max_words + 1):
             for i in range(len(toks) - nlen + 1):
 def _label_from_own_title(title, ancestor_words, max_words=4):
     """[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
     words and parentheticals). Returns '' for sentence-like / empty titles."""
+    t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(title).lower()))
     toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
+            if w not in _STOP and w not in ancestor_words
+            and w not in _LABEL_BOILERPLATE]
     if not toks or len(toks) > 7:          # >7 words ⇒ prose, not a concept title
         return ''
     return ' '.join(toks[:max_words]).title()
         if pool and cluster_emb is not None:
             cand_embs = np.asarray(embedder.encode(pool), dtype=float)
             relevance = cosine_similarity([cluster_emb], cand_embs)[0]
+            # c-TF-IDF: down-weight dataset-wide boilerplate (low corpus IDF) so generic
+            # phrases ("test", "description", "measure", "scores") lose to distinctive ones.
+            if USE_CTFIDF and _CORPUS_IDF:
+                _mx  = max(_CORPUS_IDF.values()) or 1.0
+                _idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), _mx) / _mx) for c in pool])
+                relevance = relevance * (0.5 + 0.5 * _idf)
             if sibling_centroids:
                 sib_sim  = cosine_similarity(cand_embs,
                                              np.asarray(sibling_centroids, dtype=float)).max(axis=1)
                 else:
                     c_embs = None
                 nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
+                # HiExpan's width/global passes MOVE leaves between concepts; a concept
+                # that loses all its leaves becomes empty. build_concept_hierarchy prunes
+                # internally, but that runs BEFORE HiExpan — so re-prune here, else empty
+                # nodes break the Plotly branchvalues='total' sunburst/treemap (parent
+                # value < sum(children) → blank render; node-link is unaffected).
+                _prune_empty_aggregations(nodes)
+                _alive = {int(n['id']) for n in nodes}
+                for _n in nodes:
+                    _n['related'] = [x for x in dict.fromkeys(int(c) for c in _n.get('related', []))
+                                     if x in _alive]
                 st.session_state.hiexpan_report = report
                 wmoves = report.get('width_expansion_moves', 0)
                 dexp   = report.get('depth_expansion_nodes', 0)