RoophaSharon committed on
Commit
2b56f2e
·
1 Parent(s): 51c62ea

Sync demo (downloads, build summary, HCP depth fix) + latest approach_1; clean canonical outputs

Browse files
approach_1.py CHANGED
@@ -111,6 +111,57 @@ _STOP = {
111
  'using','use','based','given','defined','number','value','values','score',
112
  }
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  # ─────────────────────────────────────────────────────────────────────────────
115
  # FILE LOADING
116
  # ─────────────────────────────────────────────────────────────────────────────
@@ -308,6 +359,9 @@ def build_canonical(df, cfg, source):
308
  if not sem_parts:
309
  sem_parts = list(leaf_parts) if leaf_parts else []
310
  semantic_text = ' '.join(sem_parts) if sem_parts else text
 
 
 
311
  rows.append({
312
  '_source_file': source,
313
  '_row_index': int(i),
@@ -342,34 +396,22 @@ def build_canonical(df, cfg, source):
342
  # ─────────────────────────────────────────────────────────────────────────────
343
  def precompute_stat_cond_facets(can):
344
  """
345
- Pre-compute _facet_stat and _facet_cond on can.
346
- Called before build_concept_hierarchy so that _cluster_and_label can use
347
- these columns to insert Statistic and Condition sub-tiers.
348
- No hardcoding: all patterns are learned from the data descriptions.
 
 
 
 
 
 
 
349
  [CAS] Castanet parallel facets Β· [HIE] HiExpan sub-set discovery
350
  """
351
  can = can.copy()
352
  sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'
353
 
354
- # ── Statistic type: detected from description text ─────────────────────────
355
- _stat_re = re.compile(
356
- r'\b(mean|average|median|standard deviation|std|percent|proportion|'
357
- r'probability|total|sum|count|maximum|minimum|range|variance|'
358
- r'coefficient|ratio|rate|frequency)\b', re.IGNORECASE
359
- )
360
- _stat_norm = {
361
- 'average': 'Mean', 'std': 'Standard Deviation', 'proportion': 'Percent',
362
- 'sum': 'Total', 'count': 'Total', 'frequency': 'Rate',
363
- }
364
- def _extract_stat(row):
365
- hits = _stat_re.findall(str(row.get(sem_col, row.get('_text', ''))).lower())
366
- if not hits:
367
- return ''
368
- h = hits[0].lower()
369
- return _stat_norm.get(h, h.title())
370
- stat_col = can.apply(_extract_stat, axis=1)
371
- can['_facet_stat'] = stat_col.where(stat_col != '', '')
372
-
373
  # ── Condition: digit in variable code VALIDATED by description text ──────────
374
  # [FIX2][GON] GonΓ§alves et al. (ESWC 2019): structural code alignment must be
375
  # validated against description text β€” the description is the authoritative source.
@@ -1756,8 +1798,11 @@ _MIN_FACET_GROUP = 2 # minimum variables per facet sub-group
1756
  def _do_facet_subsplit(sub_can, parent_id, current_path,
1757
  nodes, leaf_to_id, ensure_path_fn):
1758
  """
1759
- [F4][CAS] Split by _facet_stat first, then delegate to _do_cond_subsplit.
1760
- If fewer than 2 valid stat groups, skip stat and go straight to cond.
 
 
 
1761
  """
1762
  # A facet tier that merely repeats the parent concept label (e.g. a "Total"
1763
  # statistic under a "Total" concept) is redundant β€” skip it.
@@ -1858,6 +1903,133 @@ def _do_cond_subsplit(sub_can, parent_id, current_path,
1858
  # 4. [F4] For each concept cluster: facet sub-split by Statistic β†’ Condition
1859
  # 5. Store concept assignment back on each variable in can
1860
  # ─────────────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1861
  def _concept_title(text):
1862
  """
1863
  Extract the human-written concept TITLE from a metadata description.
@@ -1900,9 +2072,10 @@ def _title_cluster_label(member_titles, sibling_title_lists, ancestor_words=None
1900
  used_labels = {str(u).lower() for u in (used_labels or [])}
1901
 
1902
  def _phrases(title):
1903
- t = re.sub(r'\([^)]*\)', ' ', title.lower()) # drop parenthetical conditions
1904
  toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
1905
- if w not in _STOP and w not in ancestor_words]
 
1906
  out = set()
1907
  for nlen in range(1, max_words + 1):
1908
  for i in range(len(toks) - nlen + 1):
@@ -1951,9 +2124,10 @@ def _raw_title(text):
1951
  def _label_from_own_title(title, ancestor_words, max_words=4):
1952
  """[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
1953
  words and parentheticals). Returns '' for sentence-like / empty titles."""
1954
- t = re.sub(r'\([^)]*\)', ' ', str(title).lower())
1955
  toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
1956
- if w not in _STOP and w not in ancestor_words]
 
1957
  if not toks or len(toks) > 7: # >7 words β‡’ prose, not a concept title
1958
  return ''
1959
  return ' '.join(toks[:max_words]).title()
@@ -2092,11 +2266,18 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
2092
  _aw_base = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks
2093
 
2094
  if n < 3 or concept_embs is None or len(concept_table) == 0:
2095
- # Too few variables to cluster β€” label each from its own title [Fix5];
2096
- # ensure_path merges it into an existing concept of the same name.
 
2097
  pid = ensure_path_fn(path_prefix)
 
2098
  for i, (_, row) in enumerate(tdf.iterrows()):
2099
  lbl = _label_from_own_title(titles[i], _aw_base)
 
 
 
 
 
2100
  tgt = ensure_path_fn(path_prefix + [lbl]) if lbl and lbl.lower() not in \
2101
  {a.lower() for a in ancestor_names} else pid
2102
  add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
@@ -2174,6 +2355,14 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
2174
  if len(cluster_idxs) == 1:
2175
  _, row = rows_list[cluster_idxs[0]]
2176
  lbl = _label_from_own_title(titles[cluster_idxs[0]], _aw_base)
 
 
 
 
 
 
 
 
2177
  if lbl and lbl.lower() not in {a.lower() for a in ancestor_names}:
2178
  tgt = ensure_path_fn(path_prefix + [lbl], relation='belongs_to')
2179
  can.at[row.name, '_concept_label'] = lbl
@@ -2182,7 +2371,7 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
2182
  can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root'
2183
  add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
2184
  can.at[row.name, '_concept_score'] = 0.0
2185
- can.at[row.name, '_concept_source'] = 'singleton_title'
2186
  continue
2187
 
2188
  if cluster_emb is not None:
@@ -2201,32 +2390,100 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
2201
  else:
2202
  scores = []
2203
 
2204
- # PRIMARY LABEL = the concept shared by the cluster's member TITLES, chosen
2205
- # contrastively against siblings (tree-based local-IDF). Reads the data's own
2206
- # human-written names β€” never the boilerplate definition text β€” so
2207
- # "Calculated Assessed Trials" can no longer be a label. No hardcoding.
 
 
 
 
 
 
 
 
2208
  ancestor_words = set(re.findall(r'[a-z]{3,}',
2209
  ' '.join(ancestor_names).lower())) | _top_level_tasks
2210
  member_titles_k = [titles[i] for i in cluster_idxs]
2211
  sibling_title_lists = [all_cluster_titles[j] for j in range(n_clust) if j != k]
 
 
 
 
2212
  title_label = _title_cluster_label(member_titles_k, sibling_title_lists,
2213
- ancestor_words=ancestor_words,
2214
- used_labels=used_sibling_labels)
2215
-
2216
- # The TITLE wins whenever it exists. External enrichment only attaches a
2217
- # definition to a metadata candidate β€” it does NOT give it a cleaner NAME,
2218
- # so a 'cognitive_atlas'-sourced candidate can still be boilerplate like
2219
- # "Calculated Assessed Trials". Scored candidates are therefore only a
2220
- # FALLBACK used when the cluster has no shared title concept at all.
2221
- sibling_texts = [all_cluster_texts[j] for j in range(n_clust) if j != k]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2222
  fallback_label = (title_label
 
2223
  or get_discriminative_tfidf_label(cluster_texts_k, sibling_texts)
2224
  or f'Group {k+1}')
2225
- candidate_scores = [] if title_label else scores
2226
 
2227
  label, provenance = assign_concept_label(
2228
  candidate_scores,
2229
  fallback=fallback_label,
 
2230
  ancestor_names=ancestor_names,
2231
  used_sibling_labels=used_sibling_labels,
2232
  top_level_tasks=_top_level_tasks,
@@ -2282,24 +2539,24 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
2282
  pid = ensure_path_fn(path_prefix + [label],
2283
  relation='belongs_to', provenance=provenance)
2284
 
2285
- # Store concept assignment on can (needed by Castanet facets later)
 
 
2286
  for ci in cluster_idxs:
2287
  _, row = rows_list[ci]
2288
  can.at[row.name, '_concept_label'] = label
2289
- can.at[row.name, '_concept_score'] = round(scores[0]['score'], 3) if scores else 0.0
2290
- can.at[row.name, '_concept_source'] = scores[0]['source'] if scores else 'fallback'
2291
-
2292
- # [F4][CAS][HIE] Facet-guided sub-splitting: Statistic β†’ Condition tiers.
2293
- # NOTE: this uses a small hardcoded statistic/condition word list
2294
- # (precompute_stat_cond_facets). Removing it measurably degraded the
2295
- # structure (it is what separates Mean/Median/SD), so it is kept. The
2296
- # parent-duplicate guard inside prevents redundant "Total > Total" tiers.
2297
- cluster_idx_list = [rows_list[ci][0] for ci in cluster_idxs]
2298
- cluster_can = can.loc[cluster_idx_list]
2299
- _do_facet_subsplit(
2300
- cluster_can, pid, path_prefix + [label],
2301
- nodes, leaf_to_id, ensure_path_fn
2302
- )
2303
 
2304
 
2305
  def _remove_phrase(tokens, phrase_tokens):
@@ -2511,6 +2768,43 @@ def _prune_empty_aggregations(nodes):
2511
  return nodes
2512
 
2513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2514
  def build_concept_hierarchy(can, embedder, concept_table, project='metadata_project',
2515
  n_clusters_per_group=8):
2516
  """
@@ -2550,6 +2844,27 @@ def build_concept_hierarchy(can, embedder, concept_table, project='metadata_proj
2550
  # is discriminative; one close to ALL of them is boilerplate. corpus_centroid
2551
  # is the global mean (generic = central). Both are derived purely from data.
2552
  sem_col_all = '_semantic_text' if '_semantic_text' in can.columns else '_text'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2553
  ref_centroids = corpus_centroid = None
2554
  try:
2555
  all_var_embs = embedder.encode(can[sem_col_all].fillna('').astype(str).tolist())
@@ -2648,6 +2963,9 @@ def build_concept_hierarchy(can, embedder, concept_table, project='metadata_proj
2648
  # Remove empty concept nodes (no variables) β€” meaningless and they break the
2649
  # branchvalues='total' sunburst (parent value < sum of children β†’ blank render).
2650
  _prune_empty_aggregations(nodes)
 
 
 
2651
  # NOTE: a head-noun "category" tier (Errors/Correct) was tried and reverted β€”
2652
  # it regressed setOverlap (0.914β†’0.836: mis-grouping) and added depth beyond gold.
2653
  # _nest_by_category() is kept defined but intentionally NOT called.
@@ -3745,13 +4063,13 @@ if uploads:
3745
  # [F3][F5][CAS] These columns are needed inside _cluster_and_label
3746
  # for facet sub-splitting. They must be computed BEFORE Step G.
3747
  # detect_facets / build_castanet_facets runs AFTER hierarchy build
3748
- # (Step I), so we pre-compute only _facet_stat and _facet_cond here.
3749
- with st.spinner('Pre-computing Statistic and Condition facets [CAS]...'):
 
3750
  can = precompute_stat_cond_facets(can)
3751
- n_stat = can['_facet_stat'].ne('').sum()
3752
  n_cond = can['_facet_cond'].ne('').sum()
3753
- st.info(f'Facet pre-computation: {n_stat} variables with Statistic, '
3754
- f'{n_cond} with Condition.')
3755
 
3756
  # ── Step G: Build concept hierarchy (NΓ—M alignment) ──────────────
3757
  with st.spinner('Building concept hierarchy via NΓ—M alignment [GON][TAX]...'):
@@ -3768,6 +4086,16 @@ if uploads:
3768
  else:
3769
  c_embs = None
3770
  nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
 
 
 
 
 
 
 
 
 
 
3771
  st.session_state.hiexpan_report = report
3772
  wmoves = report.get('width_expansion_moves', 0)
3773
  dexp = report.get('depth_expansion_nodes', 0)
 
111
  'using','use','based','given','defined','number','value','values','score',
112
  }
113
 
114
+ # ─── KeyBERT / labelling configuration ───────────────────────────────────────
115
+ # These tune the KeyBERT label synthesizer used in the hybrid scorer.
116
+ #
117
+ # USE_NOUN_PHRASES — True: candidate phrases are NLTK POS-tagged noun phrases
118
+ # (needs the 'averaged_perceptron_tagger' corpus); False: plain n-gram candidates
119
+ # from tokens. False is robust for short CANTAB/AI-MIND descriptions and avoids the
120
+ # extra NLTK dependency.
121
+ USE_NOUN_PHRASES = False
122
+ # USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
123
+ # boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
124
+ USE_CTFIDF = True
125
+ # KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
126
+ # (pick the single most relevant phrase); 0.5 = standard MMR diversification.
127
+ KEYBERT_DIVERSITY = 0
128
+
129
+ # ─── Title-SEEDED KeyBERT label-scorer weights ───────────────────────────────
130
+ # Concept labels are FORMED FROM THE DESCRIPTIONS (KeyBERT candidate phrases over the
131
+ # cluster's member descriptions). The pre-colon title is a ranking SEED/anchor, not the
132
+ # label itself: LABEL_W_TITLE controls how strongly it biases the choice toward the
133
+ # human-canonical phrasing (this is "Guided/Seeded KeyBERT"). Set LABEL_W_TITLE=0 for a
134
+ # pure-description ablation. Magnitudes are relative (need not sum to 1).
135
+ LABEL_W_RELEVANCE = 0.45 # cosine(candidate, cluster centroid) β€” description fit (Ξ±)
136
+ LABEL_W_TITLE = 0.35 # cosine(candidate, pre-colon title) β€” title influence (Ξ²)
137
+ LABEL_W_CONTRAST = 0.15 # discriminativeness vs sibling clusters (Ξ³)
138
+ # NOTE: node labels are formed from DESCRIPTIONS + pre-colon TITLE only. External
139
+ # ontology sources (Cognitive Atlas / Wikidata / WordNet / PubMed) inform the embedding
140
+ # space / semantic understanding but are never used to name a node β€” so there is no
141
+ # external-grounding term in the label score.
142
+
143
+ # Corpus IDF over description n-grams; populated in build_concept_hierarchy() and
144
+ # consumed by _keybert_label when USE_CTFIDF=True.
145
+ _CORPUS_IDF: dict = {}
146
+
147
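+ # Active dataset domain; set in build_concept_hierarchy(), read by the hybrid label
148
+ # scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing).
+ # NOTE(review): the LABEL_W_* note states the label score has no external-grounding
+ # term — confirm whether anything still reads this value.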
149
+ _ACTIVE_DOMAIN: str = 'general'
150
+
151
+ # Label boilerplate: web/URL artifacts and Likert response-scale tokens that leak from
152
+ # data-dictionary descriptions (e.g. HCP FreeSurfer rows embed Neurolex URLs; survey rows
153
+ # embed "strongly agree" scales). These are stripped from KeyBERT candidates AND from the
154
+ # embedding text so they can neither name a node nor distort clustering. Domain-agnostic
155
+ # documentation/scale tokens only — not concept vocabulary.
156
+ _LABEL_BOILERPLATE = {
157
+ 'http', 'https', 'href', 'www', 'org', 'com', 'net', 'wiki', 'url', 'link',
158
+ 'neurolex', 'connectomedb', 'humanconnectome', 'definition', 'category',
159
+ 'sa', 'sd', 'strongly', 'agree', 'disagree', 'neither', 'somewhat',
160
+ }
161
+ # Inline URLs in free text (http://…, www.…/…) — removed from the embedding text.
162
+ _URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
163
+ re.IGNORECASE)
164
+
165
  # ─────────────────────────────────────────────────────────────────────────────
166
  # FILE LOADING
167
  # ─────────────────────────────────────────────────────────────────────────────
 
359
  if not sem_parts:
360
  sem_parts = list(leaf_parts) if leaf_parts else []
361
  semantic_text = ' '.join(sem_parts) if sem_parts else text
362
+ # Strip inline URLs (HCP FreeSurfer rows embed Neurolex links) so web tokens
363
+ # cannot dominate either the embedding (clustering) or the KeyBERT label.
364
+ semantic_text = _URL_RE.sub(' ', semantic_text)
365
  rows.append({
366
  '_source_file': source,
367
  '_row_index': int(i),
 
396
  # ─────────────────────────────────────────────────────────────────────────────
397
  def precompute_stat_cond_facets(can):
398
  """
399
+ Pre-compute _facet_cond on can (numeric experimental conditions only).
400
+ Called before build_concept_hierarchy so that _cluster_and_label can use it to
401
+ insert Condition sub-tiers.
402
+
403
+ NOTE: the statistic tier (Mean / Median / SD / …) is NO LONGER computed here.
404
+ It used to come from a hardcoded statistic vocabulary regex, which (a) is domain
405
+ hardcoding and (b) is not derived from the data's own concept titles. Statistic
406
+ depth is now produced data-drivenly by _nest_by_measure(), which discovers the
407
+ shared measure phrase and keeps the residual (Mean/Median/SD) as children β€” no
408
+ word list. Condition detection below stays: it is structural (a digit in the
409
+ code validated against the description text), not a hardcoded vocabulary.
410
  [CAS] Castanet parallel facets Β· [HIE] HiExpan sub-set discovery
411
  """
412
  can = can.copy()
413
  sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'
414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
  # ── Condition: digit in variable code VALIDATED by description text ──────────
416
  # [FIX2][GON] Gonçalves et al. (ESWC 2019): structural code alignment must be
417
  # validated against description text — the description is the authoritative source.
 
1798
  def _do_facet_subsplit(sub_can, parent_id, current_path,
1799
  nodes, leaf_to_id, ensure_path_fn):
1800
  """
1801
+ [F4][CAS] Facet sub-split by _facet_cond (numeric condition) only.
1802
+ The statistic tier is no longer inserted here β€” it came from a hardcoded
1803
+ statistic vocabulary and is now produced data-drivenly by _nest_by_measure().
1804
+ Kept defensive: if a legacy _facet_stat column is present it is still honoured,
1805
+ but precompute_stat_cond_facets() no longer produces one.
1806
  """
1807
  # A facet tier that merely repeats the parent concept label (e.g. a "Total"
1808
  # statistic under a "Total" concept) is redundant β€” skip it.
 
1903
  # 4. [F4] For each concept cluster: facet sub-split by Statistic β†’ Condition
1904
  # 5. Store concept assignment back on each variable in can
1905
  # ─────────────────────────────────────────────────────────────────────────────
1906
def _noun_phrases(text, max_words=4):
    """
    Grammatical noun phrases via NLTK POS tagging (used when USE_NOUN_PHRASES=True).

    A phrase is a contiguous run of tokens whose POS tag starts with 'NN' (noun)
    or 'JJ' (adjective). A run longer than ``max_words`` keeps only its last
    ``max_words`` tokens. Phrases shorter than 3 characters are dropped.

    Args:
        text: Free text to tag; coerced with ``str()``.
        max_words: Maximum number of tokens kept per phrase.

    Returns:
        Phrase strings in order of appearance, or [] if NLTK, its tokenizer or
        its tagger cannot be used, so the caller falls back to n-grams.
    """
    # Newer NLTK releases look up 'punkt_tab' / 'averaged_perceptron_tagger_eng'
    # instead of the legacy names, so fetch both generations when data is missing.
    resources = ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng')
    raw = str(text)
    try:
        import nltk

        def _tag(s):
            return nltk.pos_tag(nltk.word_tokenize(s))

        try:
            # EAFP: the common case (data already installed) costs no lookups.
            tags = _tag(raw)
        except LookupError:
            # NLTK raises LookupError for a missing resource: download, retry once.
            for pkg in resources:
                nltk.download(pkg, quiet=True)
            tags = _tag(raw)
    except Exception:
        # Deliberate best-effort: any NLTK problem (not installed, offline, ...)
        # degrades to "no noun phrases" rather than failing the labelling run.
        return []

    phrases, run = [], []
    for word, tag in tags:
        if tag.startswith(('NN', 'JJ')):
            run.append(word)
            if len(run) > max_words:
                run = run[-max_words:]      # keep the tail of an over-long run
        else:
            if run:
                phrases.append(' '.join(run))
            run = []
    if run:
        phrases.append(' '.join(run))
    return [p for p in phrases if len(p) >= 3]
1936
+
1937
+
1938
def _keybert_label(member_texts, cluster_centroid, embedder, ancestor_words=None,
                   corpus_centroid=None, used_labels=None, max_words=4,
                   gen_weight=0.0, diversity=KEYBERT_DIVERSITY, cap=500):
    """
    KeyBERT-style extractive labeller.

    Candidate phrases are extracted from the cluster's member DESCRIPTIONS,
    embedded with ``embedder.encode`` and ranked by

        score = (1 − diversity)·cos(phrase, cluster_centroid)
                − diversity·cos(phrase, mean candidate embedding)   # MMR redundancy

    With ``diversity=0`` this is plain cosine-to-centroid (argmax relevance).
    When USE_CTFIDF is true and _CORPUS_IDF is populated, relevance is scaled by
    ``0.5 + 0.5·idf`` (idf normalised by the corpus maximum; unseen phrases get
    the maximum) so low-IDF boilerplate is suppressed. When ``gen_weight`` is
    non-zero and ``corpus_centroid`` is given, ``gen_weight·cos(phrase,
    corpus_centroid)`` is subtracted as a genericness penalty.

    Candidates come from noun phrases (USE_NOUN_PHRASES=True) or 1..max_words
    n-grams, after removing parentheticals, inline URLs, stop words, ancestor
    words and _LABEL_BOILERPLATE tokens — the same block-list that
    _keybert_candidates applies. Extractive: the result is always a phrase taken
    from the input text.

    Args:
        member_texts: Iterable of description strings for the cluster.
        cluster_centroid: Embedding vector the candidates are compared against.
        embedder: Object exposing ``encode(list_of_str)``.
        ancestor_words: Words already used by ancestor nodes; excluded.
        corpus_centroid: Optional global embedding used with ``gen_weight``.
        used_labels: Labels already taken; compared case-insensitively.
        max_words: Longest candidate phrase, in tokens.
        gen_weight: Weight of the genericness penalty.
        diversity: MMR redundancy weight.
        cap: Maximum number of candidates embedded.

    Returns:
        The winning phrase, title-cased, or '' when no candidate survives.
    """
    used = {str(u).lower() for u in (used_labels or [])}
    block = _STOP | set(ancestor_words or ()) | _LABEL_BOILERPLATE

    # A dict is used as an insertion-ordered set: candidate order (and therefore
    # the [:cap] truncation and argmax tie-breaking) is reproducible across runs,
    # which a plain set of strings is not under hash randomisation.
    seen = {}
    for t in member_texts:
        # Drop parentheticals, then inline URLs, before tokenising.
        raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
        nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
        if nps:
            for p in nps:
                toks = [w for w in p.lower().split() if w not in block]
                if toks:
                    seen.setdefault(' '.join(toks))
        else:
            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
                    if w not in block]
            for nlen in range(1, max_words + 1):
                for i in range(len(toks) - nlen + 1):
                    seen.setdefault(' '.join(toks[i:i + nlen]))

    # Junk filter: drop used labels, pure-number phrases, immediately-repeated words.
    adjacent_repeat = re.compile(r'\b(\w+)\s+\1\b')
    cand = [c for c in seen
            if len(c) >= 4 and c.lower() not in used
            and not c.replace(' ', '').isdigit()
            and not adjacent_repeat.search(c.lower())]
    if not cand:
        return ''
    cand = cand[:cap]

    embs = np.asarray(embedder.encode(cand), dtype=float)
    sims = cosine_similarity([cluster_centroid], embs)[0]          # relevance
    if USE_CTFIDF and _CORPUS_IDF:
        mx = max(_CORPUS_IDF.values()) or 1.0
        idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), mx) / mx) for c in cand])
        sims = sims * (0.5 + 0.5 * idf)
    if gen_weight and corpus_centroid is not None:
        sims = sims - gen_weight * cosine_similarity([corpus_centroid], embs)[0]
    if diversity > 0 and len(embs) > 1:                            # MMR penalty
        generic = cosine_similarity(embs.mean(axis=0, keepdims=True), embs)[0]
        score = (1.0 - diversity) * sims - diversity * generic
    else:
        score = sims
    return cand[int(np.argmax(score))].title()
1990
+
1991
+
1992
def _keybert_candidates(member_texts, ancestor_words=None, used_labels=None,
                        max_words=3, cap=500):
    """
    Extract the KeyBERT CANDIDATE phrases from a cluster's member descriptions.

    Uses the same generation logic as _keybert_label but returns the un-ranked
    candidate list so the caller can score it with the title-seeded scorer.
    Phrases are noun phrases (USE_NOUN_PHRASES=True) or 1..max_words n-grams.
    Parentheticals and inline URLs are removed first; stop words, ancestor/task
    words and _LABEL_BOILERPLATE tokens are excluded.

    Args:
        member_texts: Iterable of description strings for the cluster.
        ancestor_words: Words already used by ancestor nodes; excluded.
        used_labels: Labels already taken; compared case-insensitively.
        max_words: Longest candidate phrase, in tokens.
        cap: Maximum number of candidates returned.

    Returns:
        At most ``cap`` candidate phrases (lower-case) in first-seen order, so
        the list — and any caller that takes its first element as a fallback —
        is reproducible across runs.
    """
    used = {str(u).lower() for u in (used_labels or [])}
    # Boilerplate / web / Likert tokens are blocked along with stop and ancestor words.
    block = _STOP | set(ancestor_words or ()) | _LABEL_BOILERPLATE

    # dict as an insertion-ordered set (a set of strings iterates in hash order,
    # which changes between interpreter runs).
    seen = {}
    for t in member_texts:
        raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
        nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
        if nps:
            for p in nps:
                toks = [w for w in p.lower().split() if w not in block]
                if toks:
                    seen.setdefault(' '.join(toks))
        else:
            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
                    if w not in block]
            for nlen in range(1, max_words + 1):
                for i in range(len(toks) - nlen + 1):
                    seen.setdefault(' '.join(toks[i:i + nlen]))

    adjacent_repeat = re.compile(r'\b(\w+)\s+\1\b')

    def _ok(c):
        words = c.split()
        if len(c) < 4 or c.lower() in used or c.replace(' ', '').isdigit():
            return False
        if adjacent_repeat.search(c.lower()):               # adjacent word repeat
            return False
        if len(words) == 4 and words[:2] == words[2:]:      # phrase repeat "x y x y"
            return False
        if len(words) == 1 and _is_acronym(c):              # bare acronym
            return False
        return True

    return [c for c in seen if _ok(c)][:cap]
2031
+
2032
+
2033
  def _concept_title(text):
2034
  """
2035
  Extract the human-written concept TITLE from a metadata description.
 
2072
  used_labels = {str(u).lower() for u in (used_labels or [])}
2073
 
2074
  def _phrases(title):
2075
+ t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', title.lower())) # drop parens + URLs
2076
  toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
2077
+ if w not in _STOP and w not in ancestor_words
2078
+ and w not in _LABEL_BOILERPLATE] # web/Likert out
2079
  out = set()
2080
  for nlen in range(1, max_words + 1):
2081
  for i in range(len(toks) - nlen + 1):
 
2124
  def _label_from_own_title(title, ancestor_words, max_words=4):
2125
  """[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
2126
  words and parentheticals). Returns '' for sentence-like / empty titles."""
2127
+ t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(title).lower()))
2128
  toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
2129
+ if w not in _STOP and w not in ancestor_words
2130
+ and w not in _LABEL_BOILERPLATE]
2131
  if not toks or len(toks) > 7: # >7 words β‡’ prose, not a concept title
2132
  return ''
2133
  return ' '.join(toks[:max_words]).title()
 
2266
  _aw_base = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks
2267
 
2268
  if n < 3 or concept_embs is None or len(concept_table) == 0:
2269
+ # Too few variables to cluster β€” label each from its own title [Fix5], or
2270
+ # KeyBERT over its description when no title exists. ensure_path merges it
2271
+ # into an existing concept of the same name.
2272
  pid = ensure_path_fn(path_prefix)
2273
+ _small = embedder.encode(texts) if texts else None
2274
  for i, (_, row) in enumerate(tdf.iterrows()):
2275
  lbl = _label_from_own_title(titles[i], _aw_base)
2276
+ if not lbl and _small is not None:
2277
+ lbl = _keybert_label([texts[i]], _small[i], embedder,
2278
+ ancestor_words=_aw_base, used_labels=set(),
2279
+ max_words=2, gen_weight=0.3,
2280
+ diversity=KEYBERT_DIVERSITY)
2281
  tgt = ensure_path_fn(path_prefix + [lbl]) if lbl and lbl.lower() not in \
2282
  {a.lower() for a in ancestor_names} else pid
2283
  add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
 
2355
  if len(cluster_idxs) == 1:
2356
  _, row = rows_list[cluster_idxs[0]]
2357
  lbl = _label_from_own_title(titles[cluster_idxs[0]], _aw_base)
2358
+ src = 'singleton_title'
2359
+ if not lbl and cluster_emb is not None:
2360
+ lbl = _keybert_label([cluster_texts_k[0]], cluster_emb, embedder,
2361
+ ancestor_words=_aw_base,
2362
+ used_labels=used_sibling_labels,
2363
+ max_words=2, gen_weight=0.3,
2364
+ diversity=KEYBERT_DIVERSITY)
2365
+ src = 'singleton_keybert'
2366
  if lbl and lbl.lower() not in {a.lower() for a in ancestor_names}:
2367
  tgt = ensure_path_fn(path_prefix + [lbl], relation='belongs_to')
2368
  can.at[row.name, '_concept_label'] = lbl
 
2371
  can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root'
2372
  add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
2373
  can.at[row.name, '_concept_score'] = 0.0
2374
+ can.at[row.name, '_concept_source'] = src
2375
  continue
2376
 
2377
  if cluster_emb is not None:
 
2390
  else:
2391
  scores = []
2392
 
2393
+ # ── TITLE-SEEDED LABEL SELECTION (Guided KeyBERT) ─────────────────────
2394
+ # The label is FORMED FROM THE DESCRIPTIONS: candidates are KeyBERT phrases
2395
+ # extracted from the cluster's member descriptions (+ scored concept-table
2396
+ # entries). The pre-colon TITLE does NOT override β€” it is a ranking SEED:
2397
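+ # score = α·cos(cand, cluster centroid) # description fit (LABEL_W_RELEVANCE)
2398
+ # + β·cos(cand, title embedding) # title INFLUENCE (LABEL_W_TITLE)
2399
+ # + γ·contrast(vs siblings) # discriminativeness (LABEL_W_CONTRAST)
2400
+ # (no external-grounding term: external ontology sources never name a node)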
2401
+ # So the displayed label is always a description-derived phrase, pulled toward
2402
+ # the human-canonical title phrasing. Set LABEL_W_TITLE=0 for a pure-description
2403
+ # ablation. The title phrase is also added as ONE candidate so a clean title can
2404
+ # still win on merit (it is usually present verbatim in the descriptions anyway).
2405
  ancestor_words = set(re.findall(r'[a-z]{3,}',
2406
  ' '.join(ancestor_names).lower())) | _top_level_tasks
2407
  member_titles_k = [titles[i] for i in cluster_idxs]
2408
  sibling_title_lists = [all_cluster_titles[j] for j in range(n_clust) if j != k]
2409
+ sibling_texts = [all_cluster_texts[j] for j in range(n_clust) if j != k]
2410
+
2411
+ # Pre-colon title β†’ used only as the SEED ANCHOR (and one candidate), never a
2412
+ # direct override.
2413
  title_label = _title_cluster_label(member_titles_k, sibling_title_lists,
2414
+ ancestor_words=ancestor_words,
2415
+ used_labels=used_sibling_labels)
2416
+ title_emb = (embedder.encode([title_label])[0]
2417
+ if title_label else None)
2418
+
2419
+ # Candidate phrases drawn ONLY from the cluster's DESCRIPTIONS (KeyBERT) plus
2420
+ # the pre-colon title. External ontology sources (Cognitive Atlas / Wikidata /
2421
+ # WordNet / PubMed) are deliberately NOT candidates β€” per design they inform the
2422
+ # embedding space / semantic understanding only, and must never name a node.
2423
+ kb_cands = _keybert_candidates(cluster_texts_k, ancestor_words=ancestor_words,
2424
+ used_labels=used_sibling_labels, max_words=3)
2425
+ pool_src = [(c, 'keybert') for c in kb_cands]
2426
+ if title_label:
2427
+ pool_src.append((title_label, 'description_title'))
2428
+ # Dedup; title's source tag takes priority over keybert when the phrase matches.
2429
+ seen_pool = {}
2430
+ for lbl, src in pool_src:
2431
+ key = lbl.lower()
2432
+ if key not in seen_pool or src == 'description_title':
2433
+ seen_pool[key] = (lbl, src)
2434
+ pool = [v[0] for v in seen_pool.values()]
2435
+ pool_srcs = [v[1] for v in seen_pool.values()]
2436
+
2437
+ keybert_label = kb_cands[0] if kb_cands else '' # for fallback only
2438
+
2439
+ candidate_scores = []
2440
+ if pool and cluster_emb is not None:
2441
+ cand_embs = np.asarray(embedder.encode(pool), dtype=float)
2442
+ relevance = cosine_similarity([cluster_emb], cand_embs)[0]
2443
+ # c-TF-IDF: down-weight dataset-wide boilerplate (low corpus IDF) so generic
2444
+ # phrases ("test", "description", "measure", "scores") lose to distinctive ones.
2445
+ if USE_CTFIDF and _CORPUS_IDF:
2446
+ _mx = max(_CORPUS_IDF.values()) or 1.0
2447
+ _idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), _mx) / _mx) for c in pool])
2448
+ relevance = relevance * (0.5 + 0.5 * _idf)
2449
+ if sibling_centroids:
2450
+ sib_sim = cosine_similarity(cand_embs,
2451
+ np.asarray(sibling_centroids, dtype=float)).max(axis=1)
2452
+ contrast = np.clip(relevance - sib_sim, 0.0, 1.0)
2453
+ else:
2454
+ contrast = np.zeros(len(pool))
2455
+ # Title SEED: cosine of each description-derived candidate to the title.
2456
+ if title_emb is not None:
2457
+ title_sim = cosine_similarity(cand_embs, [title_emb])[:, 0]
2458
+ else:
2459
+ title_sim = np.zeros(len(pool))
2460
+ for i, cand in enumerate(pool):
2461
+ hyb = (LABEL_W_RELEVANCE * float(relevance[i])
2462
+ + LABEL_W_TITLE * float(title_sim[i])
2463
+ + LABEL_W_CONTRAST * float(contrast[i]))
2464
+ candidate_scores.append({
2465
+ 'label': cand,
2466
+ 'score': hyb,
2467
+ 'embedding_sim': float(relevance[i]),
2468
+ 'coverage': float(relevance[i]),
2469
+ 'contrast': float(contrast[i]),
2470
+ 'specificity': 0.0,
2471
+ 'string_sim': float(title_sim[i]), # title seed alignment
2472
+ 'source': pool_srcs[i],
2473
+ 'broader_relations': [],
2474
+ '_emb': cand_embs[i],
2475
+ })
2476
+ candidate_scores.sort(key=lambda x: -x['score'])
2477
+
2478
  fallback_label = (title_label
2479
+ or keybert_label
2480
  or get_discriminative_tfidf_label(cluster_texts_k, sibling_texts)
2481
  or f'Group {k+1}')
 
2482
 
2483
  label, provenance = assign_concept_label(
2484
  candidate_scores,
2485
  fallback=fallback_label,
2486
+ min_score=0.0,
2487
  ancestor_names=ancestor_names,
2488
  used_sibling_labels=used_sibling_labels,
2489
  top_level_tasks=_top_level_tasks,
 
2539
  pid = ensure_path_fn(path_prefix + [label],
2540
  relation='belongs_to', provenance=provenance)
2541
 
2542
+ # Store concept assignment on can (needed by Castanet facets later).
2543
+ # Provenance reflects the HYBRID winner (title / keybert / concept_table),
2544
+ # not the old semantic-only scorer — so the exported labels CSV is accurate.
2545
  for ci in cluster_idxs:
2546
  _, row = rows_list[ci]
2547
  can.at[row.name, '_concept_label'] = label
2548
+ can.at[row.name, '_concept_score'] = provenance.get('confidence', 0.0)
2549
+ can.at[row.name, '_concept_source'] = (provenance.get('source_evidence') or ['fallback'])[0]
2550
+
2551
+ # Attach the cluster's variables directly under the concept node. The former
2552
+ # Statistic/Condition facet sub-split is removed: the statistic tier came from
2553
+ # a hardcoded vocabulary (now produced data-drivenly by _nest_by_measure), and
2554
+ # the numeric Condition tier produced bare-digit nodes (0/4/12) that inflated
2555
+ # singleton%/n_agg and moved the tree away from gold. Castanet's Condition facet
2556
+ # still exists as a separate parallel view via detect_facets() — not a tier.
2557
+ for ci in cluster_idxs:
2558
+ _, row = rows_list[ci]
2559
+ add_child(nodes, pid, leaf_to_id[row['_leaf_id']])
 
 
2560
 
2561
 
2562
  def _remove_phrase(tokens, phrase_tokens):
 
2768
  return nodes
2769
 
2770
 
2771
def _dissolve_facet_singletons(nodes):
    """
    Remove facet-tier wrappers (Statistic / Condition) that hold one variable.

    A node is dissolved only when ALL of the following hold:
      * its ``type`` is ``'aggregation'``;
      * its ``info['relation_type']`` is ``'has_condition'`` or
        ``'is_statistic_of'``;
      * its ``related`` list has exactly one entry, and that entry is a node
        of type ``'attribute'`` (a leaf variable);
      * it has a parent according to ``build_parent_map``.

    The lone child is handed to the parent via ``add_child``, the wrapper is
    detached via ``remove_child`` and then dropped from ``nodes``. Concept
    nodes carrying any other relation type are never touched, so a distinctive
    single-member concept survives.

    The list is edited in place and also returned.
    """
    facet_relations = {'has_condition', 'is_statistic_of'}

    dissolved_one = True
    while dissolved_one:
        dissolved_one = False
        # Both lookups are rebuilt on every sweep because the previous sweep
        # (if any) re-parented a leaf and deleted a node.
        parent_of = build_parent_map(nodes)
        by_id = nmap(nodes)

        # Iterate over a snapshot: `nodes` is rewritten inside the loop.
        for node in list(nodes):
            if node.get('type') != 'aggregation':
                continue
            if node['info'].get('relation_type') not in facet_relations:
                continue

            node_id = int(node['id'])
            child_ids = [int(c) for c in node.get('related', [])]
            if len(child_ids) != 1:
                continue

            only_child = child_ids[0]
            # "Single variable" means the one child is itself a leaf attribute.
            if by_id.get(only_child, {}).get('type') != 'attribute':
                continue

            parent_id = parent_of.get(node_id)
            if parent_id is None:
                continue

            # Attach the leaf to the grandparent first, then unlink and delete
            # the now-redundant wrapper.
            add_child(nodes, parent_id, only_child)
            remove_child(nodes, parent_id, node_id)
            nodes[:] = [other for other in nodes if int(other['id']) != node_id]

            # The lookups are stale now; restart the sweep with fresh ones.
            dissolved_one = True
            break
    return nodes
2806
+
2807
+
2808
  def build_concept_hierarchy(can, embedder, concept_table, project='metadata_project',
2809
  n_clusters_per_group=8):
2810
  """
 
2844
  # is discriminative; one close to ALL of them is boilerplate. corpus_centroid
2845
  # is the global mean (generic = central). Both are derived purely from data.
2846
  sem_col_all = '_semantic_text' if '_semantic_text' in can.columns else '_text'
2847
+
2848
+ # Active domain β€” used by the hybrid label scorer's external-grounding signal.
2849
+ global _ACTIVE_DOMAIN
2850
+ _ACTIVE_DOMAIN = detect_domain(can)
2851
+
2852
+ # Corpus IDF over description n-grams β€” KeyBERT c-TF-IDF distinctiveness weight
2853
+ # (only consulted when USE_CTFIDF=True). Data-derived, dataset-agnostic.
2854
+ global _CORPUS_IDF
2855
+ _CORPUS_IDF = {}
2856
+ try:
2857
+ from sklearn.feature_extraction.text import CountVectorizer as _CV
2858
+ _docs = can[sem_col_all].fillna('').astype(str).tolist()
2859
+ _cv = _CV(ngram_range=(1, 3), binary=True, lowercase=True,
2860
+ token_pattern=r'[a-z][a-z\-]+')
2861
+ _dt = _cv.fit_transform(_docs)
2862
+ _dfa = np.asarray(_dt.sum(axis=0)).ravel(); _N = _dt.shape[0]
2863
+ _CORPUS_IDF = {p: float(np.log((_N + 1) / (_dfa[i] + 1)) + 1.0)
2864
+ for p, i in _cv.vocabulary_.items()}
2865
+ except Exception:
2866
+ _CORPUS_IDF = {}
2867
+
2868
  ref_centroids = corpus_centroid = None
2869
  try:
2870
  all_var_embs = embedder.encode(can[sem_col_all].fillna('').astype(str).tolist())
 
2963
  # Remove empty concept nodes (no variables) — meaningless and they break the
2964
  # branchvalues='total' sunburst (parent value < sum of children -> blank render).
2965
  _prune_empty_aggregations(nodes)
2966
+ # Dissolve 1-variable Statistic/Condition facet nodes (no grouping value).
2967
+ _dissolve_facet_singletons(nodes)
2968
+ _prune_empty_aggregations(nodes)
2969
  # NOTE: a head-noun "category" tier (Errors/Correct) was tried and reverted —
2970
  # it regressed setOverlap (0.914 -> 0.836: mis-grouping) and added depth beyond gold.
2971
  # _nest_by_category() is kept defined but intentionally NOT called.
 
4063
  # [F3][F5][CAS] These columns are needed inside _cluster_and_label
4064
  # for facet sub-splitting. They must be computed BEFORE Step G.
4065
  # detect_facets / build_castanet_facets runs AFTER hierarchy build
4066
+ # (Step I), so we pre-compute only _facet_cond here. The statistic tier
4067
+ # is produced data-drivenly later by _nest_by_measure (no hardcoded vocab).
4068
+ with st.spinner('Pre-computing Condition facets [CAS]...'):
4069
  can = precompute_stat_cond_facets(can)
 
4070
  n_cond = can['_facet_cond'].ne('').sum()
4071
+ st.info(f'Facet pre-computation: {n_cond} variables with Condition. '
4072
+ f'Statistic depth is derived from concept titles (_nest_by_measure).')
4073
 
4074
  # ── Step G: Build concept hierarchy (NΓ—M alignment) ──────────────
4075
  with st.spinner('Building concept hierarchy via NΓ—M alignment [GON][TAX]...'):
 
4086
  else:
4087
  c_embs = None
4088
  nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
4089
+ # HiExpan's width/global passes MOVE leaves between concepts; a concept
4090
+ # that loses all its leaves becomes empty. build_concept_hierarchy prunes
4091
+ # internally, but that runs BEFORE HiExpan β€” so re-prune here, else empty
4092
+ # nodes break the Plotly branchvalues='total' sunburst/treemap (parent
4093
+ # value < sum(children) → blank render; node-link is unaffected).
4094
+ _prune_empty_aggregations(nodes)
4095
+ _alive = {int(n['id']) for n in nodes}
4096
+ for _n in nodes:
4097
+ _n['related'] = [x for x in dict.fromkeys(int(c) for c in _n.get('related', []))
4098
+ if x in _alive]
4099
  st.session_state.hiexpan_report = report
4100
  wmoves = report.get('width_expansion_moves', 0)
4101
  dexp = report.get('depth_expansion_nodes', 0)
demo.py CHANGED
@@ -46,8 +46,8 @@ PREBUILT = {
46
  "facets": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
47
  },
48
  "HCP": {
49
- "hierarchy": ROOT / "approach_1" / "keybert" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
50
- "facets": ROOT / "approach_1" / "keybert" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
51
  },
52
  },
53
  "Approach 2": {
@@ -211,7 +211,7 @@ def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
211
  font=dict(size=13), x=0.5))
212
  return fig
213
 
214
- def plot_treemap(nodes: list, color: str):
215
  nodes = _filter_dissolved(nodes)
216
  pm = _parent_map(nodes)
217
  vm = _tree_value_map(nodes, pm)
@@ -228,7 +228,7 @@ def plot_treemap(nodes: list, color: str):
228
  fig = go.Figure(go.Treemap(
229
  ids=ids, labels=labels, parents=parents, values=values,
230
  branchvalues="total", hovertext=hover, hoverinfo="text",
231
- textinfo="label+value",
232
  marker=dict(colorscale=color, line=dict(width=1, color="white"))))
233
  fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
234
  return fig
@@ -359,6 +359,33 @@ def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_lab
359
  )
360
  return fig
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  # ─────────────────────────────────────────────────────────────────────────────
363
  # IO
364
  # ─────────────────────────────────────────────────────────────────────────────
@@ -367,12 +394,37 @@ def _load_json(path_str: str):
367
  with open(path_str, encoding="utf-8") as f:
368
  return json.load(f)
369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  def count_nodes(nodes: list) -> tuple[int, int]:
371
  nodes = _filter_dissolved(nodes)
372
  leaves = sum(1 for n in nodes if n.get("type") == "attribute")
373
  aggs = sum(1 for n in nodes if n.get("type") == "aggregation")
374
  return leaves, aggs
375
 
 
 
 
 
 
 
 
 
 
376
  # ─────────────────────────────────────────────────────────────────────────────
377
  # SIDEBAR
378
  # ─────────────────────────────────────────────────────────────────────────────
@@ -413,6 +465,51 @@ c1, c2, c3 = st.columns(3)
413
  c1.metric("Leaf Variables", leaves)
414
  c2.metric("Aggregation Nodes", aggs)
415
  c3.metric("Total Nodes", leaves + aggs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  st.markdown("---")
417
 
418
  # ── Level-of-Detail controls (above chart β€” matches the apps) ────────────────
@@ -450,15 +547,22 @@ st.divider()
450
  display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
451
 
452
  if viz_mode == "Sunburst (drill-down)":
453
- st.plotly_chart(plot_sunburst(display_nodes, color, depth), use_container_width=True)
 
 
 
 
454
  elif viz_mode == "Treemap":
455
- st.plotly_chart(plot_treemap(display_nodes, color), use_container_width=True)
 
 
 
 
456
  else:
457
  st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
458
  use_container_width=True)
459
 
460
  # ── Facets (Approach 1 only) ─────────────────────────────────────────────────
461
- facet_path = paths.get("facets")
462
  if facet_path is not None and facet_path.exists():
463
  st.markdown("---")
464
  st.subheader("πŸ”€ Parallel facets")
 
46
  "facets": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
47
  },
48
  "HCP": {
49
+ "hierarchy": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
50
+ "facets": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
51
  },
52
  },
53
  "Approach 2": {
 
211
  font=dict(size=13), x=0.5))
212
  return fig
213
 
214
+ def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
215
  nodes = _filter_dissolved(nodes)
216
  pm = _parent_map(nodes)
217
  vm = _tree_value_map(nodes, pm)
 
228
  fig = go.Figure(go.Treemap(
229
  ids=ids, labels=labels, parents=parents, values=values,
230
  branchvalues="total", hovertext=hover, hoverinfo="text",
231
+ textinfo="label+value", maxdepth=max_depth,
232
  marker=dict(colorscale=color, line=dict(width=1, color="white"))))
233
  fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
234
  return fig
 
359
  )
360
  return fig
361
 
362
+ # ─────────────────────────────────────────────────────────────────────────────
363
+ # STATS / SAFE RENDERING
364
+ # ─────────────────────────────────────────────────────────────────────────────
365
def _tree_depth(nodes: list) -> int:
    """Return the maximum depth of the rendered tree, with the root at depth 0.

    Depth is counted in ``related`` hops starting from the node whose id is 0.
    Dissolved nodes are removed first via ``_filter_dissolved``; child ids that
    do not survive that filter are ignored. If no node with id 0 exists the
    result is 0.

    The walk is iterative and skips any child that is already on the current
    root-to-node path, so a malformed hierarchy containing a cycle (or an
    unusually deep chain) cannot raise ``RecursionError`` and take down the
    page that displays this metric.
    """
    nodes = _filter_dissolved(nodes)
    by_id = {int(n["id"]): n for n in nodes}

    deepest = 0
    on_path = set()  # ids between the root and the node being expanded
    # Stack entries: (node id, depth, leaving?). A "leaving" entry is pushed
    # beneath a node's children so the node is removed from `on_path` only
    # after its whole subtree has been explored.
    stack = [(0, 0, False)]
    while stack:
        node_id, depth, leaving = stack.pop()
        if leaving:
            on_path.discard(node_id)
            continue
        if node_id in on_path:
            # Back-edge to an ancestor: following it would loop forever.
            continue

        deepest = max(deepest, depth)
        on_path.add(node_id)
        stack.append((node_id, depth, True))
        for child in by_id.get(node_id, {}).get("related") or []:
            child_id = int(child)
            if child_id in by_id:
                stack.append((child_id, depth + 1, False))
    return deepest
377
+
378
def safe_render_depth(nodes: list, requested: int) -> int:
    """Cap the initial sunburst/treemap depth for large hierarchies.

    The cap depends on how many nodes remain after ``_filter_dissolved``:

    * more than 400 nodes: at most 3 levels;
    * more than 150 nodes: at most 4 levels;
    * otherwise: ``requested`` is returned unchanged.

    Per the original author's note, Plotly can render a blank chart when asked
    to draw too many sectors at once; limiting only the *initial* depth keeps
    the chart drillable by clicking, so no data is hidden permanently.
    """
    node_count = len(_filter_dissolved(nodes))
    if node_count <= 150:
        return requested
    depth_cap = 3 if node_count > 400 else 4
    return min(requested, depth_cap)
388
+
389
  # ─────────────────────────────────────────────────────────────────────────────
390
  # IO
391
  # ─────────────────────────────────────────────────────────────────────────────
 
394
  with open(path_str, encoding="utf-8") as f:
395
  return json.load(f)
396
 
397
+ def _read_bytes(path_str: str) -> bytes:
398
+ with open(path_str, "rb") as f:
399
+ return f.read()
400
+
401
@st.cache_data(show_spinner=False)
def _outputs_zip(root_str: str) -> bytes:
    """Build an in-memory ZIP of every file under ``root_str``.

    Files are discovered recursively and added in sorted path order. Archive
    member names are POSIX-style paths relative to the *parent* of
    ``root_str``, so the root folder's own name is the top-level entry inside
    the archive. Directories are not stored as separate members.

    Returns the complete archive as bytes (DEFLATE-compressed).
    """
    import io
    import zipfile

    base = Path(root_str)
    archive_root = base.parent
    files = [entry for entry in sorted(base.rglob("*")) if entry.is_file()]

    payload = io.BytesIO()
    with zipfile.ZipFile(payload, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in files:
            member_name = entry.relative_to(archive_root).as_posix()
            archive.write(entry, arcname=member_name)
    return payload.getvalue()
412
+
413
def count_nodes(nodes: list) -> tuple[int, int]:
    """Return ``(leaf_count, aggregation_count)`` for the visible hierarchy.

    Nodes are first passed through ``_filter_dissolved``; a node counts as a
    leaf when its ``type`` is ``"attribute"`` and as an aggregation when its
    ``type`` is ``"aggregation"``. Any other type is counted in neither total.
    """
    kinds = [node.get("type") for node in _filter_dissolved(nodes)]
    return kinds.count("attribute"), kinds.count("aggregation")
418
 
419
def concept_aligned_pct(nodes: list) -> float | None:
    """Percentage of aggregation nodes that carry concept/provenance evidence.

    An aggregation node counts as aligned when any of its ``provenance``,
    ``concept`` or ``source_evidence`` fields is truthy. Dissolved nodes are
    excluded via ``_filter_dissolved``.

    Returns ``None`` both when there are no aggregation nodes and when none of
    them is aligned; the caller uses ``None`` to hide the metric entirely
    rather than show 0%.
    """
    aggregations = [
        node for node in _filter_dissolved(nodes)
        if node.get("type") == "aggregation"
    ]
    if not aggregations:
        return None

    evidence_keys = ("provenance", "concept", "source_evidence")
    aligned_count = sum(
        1 for node in aggregations
        if any(node.get(key) for key in evidence_keys)
    )
    if not aligned_count:
        return None
    return 100.0 * aligned_count / len(aggregations)
427
+
428
  # ─────────────────────────────────────────────────────────────────────────────
429
  # SIDEBAR
430
  # ─────────────────────────────────────────────────────────────────────────────
 
465
  c1.metric("Leaf Variables", leaves)
466
  c2.metric("Aggregation Nodes", aggs)
467
  c3.metric("Total Nodes", leaves + aggs)
468
+
469
+ # ── Build summary (collapsed) ────────────────────────────────────────────────
470
+ facet_path = paths.get("facets")
471
+ n_facets = None
472
+ if facet_path is not None and facet_path.exists():
473
+ try:
474
+ n_facets = len(_load_json(str(facet_path)))
475
+ except Exception:
476
+ n_facets = None
477
+
478
+ with st.expander("ℹ️ Build summary", expanded=False):
479
+ bs1, bs2, bs3, bs4 = st.columns(4)
480
+ bs1.metric("Variables", leaves)
481
+ bs2.metric("Internal nodes", aggs)
482
+ bs3.metric("Tree depth", _tree_depth(raw_nodes))
483
+ bs4.metric("Facets", n_facets if n_facets is not None else "β€”")
484
+ pct = concept_aligned_pct(raw_nodes)
485
+ if pct is not None:
486
+ st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
487
+ st.caption(
488
+ f"Source file: `{hier_path.name}` Β· "
489
+ f"Approach: **{approach}** Β· Dataset: **{dataset}**. "
490
+ "Tree topology and labels are reproduced exactly from the pre-built "
491
+ "thesis output (the algorithms are not re-run in this viewer)."
492
+ )
493
+
494
+ # ── Downloads ────────────────────────────────────────────────────────────────
495
+ d1, d2, d3 = st.columns(3)
496
+ with d1:
497
+ st.download_button("⬇️ Hierarchy JSON", data=_read_bytes(str(hier_path)),
498
+ file_name=hier_path.name, mime="application/json",
499
+ use_container_width=True)
500
+ with d2:
501
+ if facet_path is not None and facet_path.exists():
502
+ st.download_button("⬇️ Facets JSON", data=_read_bytes(str(facet_path)),
503
+ file_name=facet_path.name, mime="application/json",
504
+ use_container_width=True)
505
+ else:
506
+ st.button("⬇️ Facets JSON", disabled=True, use_container_width=True,
507
+ help="This approach/dataset has no facet tree.")
508
+ with d3:
509
+ st.download_button("⬇️ All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
510
+ file_name="metadata_hierarchy_outputs.zip",
511
+ mime="application/zip", use_container_width=True)
512
+
513
  st.markdown("---")
514
 
515
  # ── Level-of-Detail controls (above chart β€” matches the apps) ────────────────
 
547
  display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
548
 
549
  if viz_mode == "Sunburst (drill-down)":
550
+ eff = safe_render_depth(display_nodes, depth)
551
+ if eff < depth:
552
+ st.caption(f"Large hierarchy β€” showing {eff} levels initially to render "
553
+ "reliably. **Click any sector to drill deeper.**")
554
+ st.plotly_chart(plot_sunburst(display_nodes, color, eff), use_container_width=True)
555
  elif viz_mode == "Treemap":
556
+ eff = safe_render_depth(display_nodes, depth)
557
+ if eff < depth:
558
+ st.caption(f"Large hierarchy β€” showing {eff} levels initially to render "
559
+ "reliably. **Click a tile to drill deeper.**")
560
+ st.plotly_chart(plot_treemap(display_nodes, color, eff), use_container_width=True)
561
  else:
562
  st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
563
  use_container_width=True)
564
 
565
  # ── Facets (Approach 1 only) ─────────────────────────────────────────────────
 
566
  if facet_path is not None and facet_path.exists():
567
  st.markdown("---")
568
  st.subheader("πŸ”€ Parallel facets")
outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json β†’ HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json} RENAMED
The diff for this file is too large to render. See raw diff
 
outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json β†’ HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json} RENAMED
The diff for this file is too large to render. See raw diff
 
outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json CHANGED
@@ -3527,16 +3527,18 @@
3527
  62,
3528
  69,
3529
  76,
 
 
3530
  91,
3531
- 93,
3532
- 95,
3533
- 99,
3534
- 105,
3535
- 111,
3536
  114,
3537
- 118,
3538
- 120,
3539
- 129,
3540
  132
3541
  ],
3542
  "desc": "Facet: Measure Type"
@@ -3686,7 +3688,7 @@
3686
  },
3687
  {
3688
  "id": 10,
3689
- "name": "Correct Latency",
3690
  "related": [
3691
  11,
3692
  12,
@@ -3699,13 +3701,7 @@
3699
  19,
3700
  20,
3701
  21,
3702
- 22,
3703
- 83,
3704
- 84,
3705
- 85,
3706
- 86,
3707
- 87,
3708
- 88
3709
  ],
3710
  "type": "aggregation",
3711
  "info": {
@@ -3717,7 +3713,7 @@
3717
  "relation_label": "semantically related to"
3718
  },
3719
  "isShown": true,
3720
- "desc": "Measure Type: Correct Latency",
3721
  "dtype": "determine",
3722
  "recover": true
3723
  },
@@ -3891,16 +3887,14 @@
3891
  },
3892
  {
3893
  "id": 23,
3894
- "name": "Percent Correct",
3895
  "related": [
3896
  24,
3897
  25,
3898
  26,
3899
  27,
3900
  28,
3901
- 29,
3902
- 89,
3903
- 90
3904
  ],
3905
  "type": "aggregation",
3906
  "info": {
@@ -3912,7 +3906,7 @@
3912
  "relation_label": "semantically related to"
3913
  },
3914
  "isShown": true,
3915
- "desc": "Measure Type: Percent Correct",
3916
  "dtype": "determine",
3917
  "recover": true
3918
  },
@@ -4002,7 +3996,7 @@
4002
  },
4003
  {
4004
  "id": 30,
4005
- "name": "Probability Error",
4006
  "related": [
4007
  31,
4008
  32
@@ -4017,7 +4011,7 @@
4017
  "relation_label": "semantically related to"
4018
  },
4019
  "isShown": true,
4020
- "desc": "Measure Type: Probability Error",
4021
  "dtype": "determine",
4022
  "recover": true
4023
  },
@@ -4160,16 +4154,15 @@
4160
  },
4161
  {
4162
  "id": 40,
4163
- "name": "Total Errors",
4164
  "related": [
4165
  41,
4166
  42,
4167
- 79,
4168
- 124,
4169
- 125,
4170
- 126,
4171
  127,
4172
- 128
 
 
 
4173
  ],
4174
  "type": "aggregation",
4175
  "info": {
@@ -4181,7 +4174,7 @@
4181
  "relation_label": "semantically related to"
4182
  },
4183
  "isShown": true,
4184
- "desc": "Measure Type: Total Errors",
4185
  "dtype": "determine",
4186
  "recover": true
4187
  },
@@ -4524,7 +4517,7 @@
4524
  },
4525
  {
4526
  "id": 62,
4527
- "name": "Total Attempts Patterns",
4528
  "related": [
4529
  63,
4530
  64,
@@ -4543,7 +4536,7 @@
4543
  "relation_label": "semantically related to"
4544
  },
4545
  "isShown": true,
4546
- "desc": "Measure Type: Total Attempts Patterns",
4547
  "dtype": "determine",
4548
  "recover": true
4549
  },
@@ -4746,9 +4739,9 @@
4746
  "related": [
4747
  77,
4748
  78,
4749
- 80,
4750
  81,
4751
- 82
 
4752
  ],
4753
  "type": "aggregation",
4754
  "info": {
@@ -4794,6 +4787,26 @@
4794
  },
4795
  {
4796
  "id": 79,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4797
  "name": "PALTEA28",
4798
  "dtype": "determine",
4799
  "related": [],
@@ -4807,7 +4820,7 @@
4807
  }
4808
  },
4809
  {
4810
- "id": 80,
4811
  "name": "PALTEA4",
4812
  "dtype": "determine",
4813
  "related": [],
@@ -4821,7 +4834,7 @@
4821
  }
4822
  },
4823
  {
4824
- "id": 81,
4825
  "name": "PALTEA6",
4826
  "dtype": "determine",
4827
  "related": [],
@@ -4835,7 +4848,7 @@
4835
  }
4836
  },
4837
  {
4838
- "id": 82,
4839
  "name": "PALTEA8",
4840
  "dtype": "determine",
4841
  "related": [],
@@ -4849,7 +4862,32 @@
4849
  }
4850
  },
4851
  {
4852
- "id": 83,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4853
  "name": "PRMCLSDD",
4854
  "dtype": "determine",
4855
  "related": [],
@@ -4863,7 +4901,7 @@
4863
  }
4864
  },
4865
  {
4866
- "id": 84,
4867
  "name": "PRMCLSDI",
4868
  "dtype": "determine",
4869
  "related": [],
@@ -4877,7 +4915,7 @@
4877
  }
4878
  },
4879
  {
4880
- "id": 85,
4881
  "name": "PRMMCLD",
4882
  "dtype": "determine",
4883
  "related": [],
@@ -4891,7 +4929,7 @@
4891
  }
4892
  },
4893
  {
4894
- "id": 86,
4895
  "name": "PRMMCLI",
4896
  "dtype": "determine",
4897
  "related": [],
@@ -4905,7 +4943,7 @@
4905
  }
4906
  },
4907
  {
4908
- "id": 87,
4909
  "name": "PRMMDCLD",
4910
  "dtype": "determine",
4911
  "related": [],
@@ -4919,7 +4957,7 @@
4919
  }
4920
  },
4921
  {
4922
- "id": 88,
4923
  "name": "PRMMDCLI",
4924
  "dtype": "determine",
4925
  "related": [],
@@ -4933,7 +4971,28 @@
4933
  }
4934
  },
4935
  {
4936
- "id": 89,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4937
  "name": "PRMPCD",
4938
  "dtype": "determine",
4939
  "related": [],
@@ -4947,7 +5006,7 @@
4947
  }
4948
  },
4949
  {
4950
- "id": 90,
4951
  "name": "PRMPCI",
4952
  "dtype": "determine",
4953
  "related": [],
@@ -4961,10 +5020,10 @@
4961
  }
4962
  },
4963
  {
4964
- "id": 91,
4965
  "name": "Time Since Delayed Stimuli",
4966
  "related": [
4967
- 92
4968
  ],
4969
  "type": "aggregation",
4970
  "info": {
@@ -4981,7 +5040,7 @@
4981
  "recover": true
4982
  },
4983
  {
4984
- "id": 92,
4985
  "name": "PRMTSDSP",
4986
  "dtype": "determine",
4987
  "related": [],
@@ -4995,10 +5054,10 @@
4995
  }
4996
  },
4997
  {
4998
- "id": 93,
4999
  "name": "Detection Measure",
5000
  "related": [
5001
- 94
5002
  ],
5003
  "type": "aggregation",
5004
  "info": {
@@ -5015,7 +5074,7 @@
5015
  "recover": true
5016
  },
5017
  {
5018
- "id": 94,
5019
  "name": "RVPA",
5020
  "dtype": "determine",
5021
  "related": [],
@@ -5029,12 +5088,12 @@
5029
  }
5030
  },
5031
  {
5032
- "id": 95,
5033
- "name": "Response Latency",
5034
  "related": [
5035
- 96,
5036
- 97,
5037
- 98
5038
  ],
5039
  "type": "aggregation",
5040
  "info": {
@@ -5046,12 +5105,12 @@
5046
  "relation_label": "semantically related to"
5047
  },
5048
  "isShown": true,
5049
- "desc": "Measure Type: Response Latency",
5050
  "dtype": "determine",
5051
  "recover": true
5052
  },
5053
  {
5054
- "id": 96,
5055
  "name": "RVPLSD",
5056
  "dtype": "determine",
5057
  "related": [],
@@ -5065,7 +5124,7 @@
5065
  }
5066
  },
5067
  {
5068
- "id": 97,
5069
  "name": "RVPMDL",
5070
  "dtype": "determine",
5071
  "related": [],
@@ -5079,7 +5138,7 @@
5079
  }
5080
  },
5081
  {
5082
- "id": 98,
5083
  "name": "RVPML",
5084
  "dtype": "determine",
5085
  "related": [],
@@ -5093,14 +5152,14 @@
5093
  }
5094
  },
5095
  {
5096
- "id": 99,
5097
  "name": "Total",
5098
  "related": [
5099
- 100,
5100
- 101,
5101
- 102,
5102
  103,
5103
- 104
 
 
 
5104
  ],
5105
  "type": "aggregation",
5106
  "info": {
@@ -5117,7 +5176,7 @@
5117
  "recover": true
5118
  },
5119
  {
5120
- "id": 100,
5121
  "name": "RVPPFA",
5122
  "dtype": "determine",
5123
  "related": [],
@@ -5131,7 +5190,7 @@
5131
  }
5132
  },
5133
  {
5134
- "id": 101,
5135
  "name": "RVPPH",
5136
  "dtype": "determine",
5137
  "related": [],
@@ -5145,7 +5204,7 @@
5145
  }
5146
  },
5147
  {
5148
- "id": 102,
5149
  "name": "RVPTFA",
5150
  "dtype": "determine",
5151
  "related": [],
@@ -5159,7 +5218,7 @@
5159
  }
5160
  },
5161
  {
5162
- "id": 103,
5163
  "name": "RVPTH",
5164
  "dtype": "determine",
5165
  "related": [],
@@ -5173,7 +5232,7 @@
5173
  }
5174
  },
5175
  {
5176
- "id": 104,
5177
  "name": "RVPTM",
5178
  "dtype": "determine",
5179
  "related": [],
@@ -5187,14 +5246,14 @@
5187
  }
5188
  },
5189
  {
5190
- "id": 105,
5191
- "name": "Errors Boxes",
5192
  "related": [
5193
- 106,
5194
- 107,
5195
- 108,
5196
  109,
5197
- 110
 
 
 
5198
  ],
5199
  "type": "aggregation",
5200
  "info": {
@@ -5206,12 +5265,12 @@
5206
  "relation_label": "semantically related to"
5207
  },
5208
  "isShown": true,
5209
- "desc": "Measure Type: Errors Boxes",
5210
  "dtype": "determine",
5211
  "recover": true
5212
  },
5213
  {
5214
- "id": 106,
5215
  "name": "SWMBE12",
5216
  "dtype": "determine",
5217
  "related": [],
@@ -5225,7 +5284,7 @@
5225
  }
5226
  },
5227
  {
5228
- "id": 107,
5229
  "name": "SWMBE4",
5230
  "dtype": "determine",
5231
  "related": [],
@@ -5239,7 +5298,7 @@
5239
  }
5240
  },
5241
  {
5242
- "id": 108,
5243
  "name": "SWMBE468",
5244
  "dtype": "determine",
5245
  "related": [],
@@ -5253,7 +5312,7 @@
5253
  }
5254
  },
5255
  {
5256
- "id": 109,
5257
  "name": "SWMBE6",
5258
  "dtype": "determine",
5259
  "related": [],
@@ -5267,7 +5326,7 @@
5267
  }
5268
  },
5269
  {
5270
- "id": 110,
5271
  "name": "SWMBE8",
5272
  "dtype": "determine",
5273
  "related": [],
@@ -5281,13 +5340,13 @@
5281
  }
5282
  },
5283
  {
5284
- "id": 111,
5285
  "name": "Double Errors Boxes",
5286
  "related": [
5287
- 112,
5288
- 113,
5289
  116,
5290
- 117
 
5291
  ],
5292
  "type": "aggregation",
5293
  "info": {
@@ -5304,7 +5363,7 @@
5304
  "recover": true
5305
  },
5306
  {
5307
- "id": 112,
5308
  "name": "SWMDE12",
5309
  "dtype": "determine",
5310
  "related": [],
@@ -5318,7 +5377,7 @@
5318
  }
5319
  },
5320
  {
5321
- "id": 113,
5322
  "name": "SWMDE4",
5323
  "dtype": "determine",
5324
  "related": [],
@@ -5332,10 +5391,10 @@
5332
  }
5333
  },
5334
  {
5335
- "id": 114,
5336
  "name": "Double Errors",
5337
  "related": [
5338
- 115
5339
  ],
5340
  "type": "aggregation",
5341
  "info": {
@@ -5352,7 +5411,7 @@
5352
  "recover": true
5353
  },
5354
  {
5355
- "id": 115,
5356
  "name": "SWMDE468",
5357
  "dtype": "determine",
5358
  "related": [],
@@ -5366,7 +5425,7 @@
5366
  }
5367
  },
5368
  {
5369
- "id": 116,
5370
  "name": "SWMDE6",
5371
  "dtype": "determine",
5372
  "related": [],
@@ -5380,7 +5439,7 @@
5380
  }
5381
  },
5382
  {
5383
- "id": 117,
5384
  "name": "SWMDE8",
5385
  "dtype": "determine",
5386
  "related": [],
@@ -5394,10 +5453,10 @@
5394
  }
5395
  },
5396
  {
5397
- "id": 118,
5398
  "name": "Problem Reached",
5399
  "related": [
5400
- 119
5401
  ],
5402
  "type": "aggregation",
5403
  "info": {
@@ -5414,7 +5473,7 @@
5414
  "recover": true
5415
  },
5416
  {
5417
- "id": 119,
5418
  "name": "SWMPR",
5419
  "dtype": "determine",
5420
  "related": [],
@@ -5428,12 +5487,12 @@
5428
  }
5429
  },
5430
  {
5431
- "id": 120,
5432
- "name": "Strategy",
5433
  "related": [
5434
- 121,
5435
- 122,
5436
- 123
5437
  ],
5438
  "type": "aggregation",
5439
  "info": {
@@ -5445,12 +5504,12 @@
5445
  "relation_label": "semantically related to"
5446
  },
5447
  "isShown": true,
5448
- "desc": "Measure Type: Strategy",
5449
  "dtype": "determine",
5450
  "recover": true
5451
  },
5452
  {
5453
- "id": 121,
5454
  "name": "SWMS",
5455
  "dtype": "determine",
5456
  "related": [],
@@ -5464,7 +5523,7 @@
5464
  }
5465
  },
5466
  {
5467
- "id": 122,
5468
  "name": "SWMS6",
5469
  "dtype": "determine",
5470
  "related": [],
@@ -5478,7 +5537,7 @@
5478
  }
5479
  },
5480
  {
5481
- "id": 123,
5482
  "name": "SWMSX",
5483
  "dtype": "determine",
5484
  "related": [],
@@ -5492,7 +5551,7 @@
5492
  }
5493
  },
5494
  {
5495
- "id": 124,
5496
  "name": "SWMTE12",
5497
  "dtype": "determine",
5498
  "related": [],
@@ -5506,7 +5565,7 @@
5506
  }
5507
  },
5508
  {
5509
- "id": 125,
5510
  "name": "SWMTE4",
5511
  "dtype": "determine",
5512
  "related": [],
@@ -5520,7 +5579,7 @@
5520
  }
5521
  },
5522
  {
5523
- "id": 126,
5524
  "name": "SWMTE468",
5525
  "dtype": "determine",
5526
  "related": [],
@@ -5534,7 +5593,7 @@
5534
  }
5535
  },
5536
  {
5537
- "id": 127,
5538
  "name": "SWMTE6",
5539
  "dtype": "determine",
5540
  "related": [],
@@ -5548,7 +5607,7 @@
5548
  }
5549
  },
5550
  {
5551
- "id": 128,
5552
  "name": "SWMTE8",
5553
  "dtype": "determine",
5554
  "related": [],
@@ -5562,13 +5621,14 @@
5562
  }
5563
  },
5564
  {
5565
- "id": 129,
5566
- "name": "Within Errors Boxes",
5567
  "related": [
5568
- 130,
5569
- 131,
5570
  134,
5571
- 135
 
 
5572
  ],
5573
  "type": "aggregation",
5574
  "info": {
@@ -5580,12 +5640,12 @@
5580
  "relation_label": "semantically related to"
5581
  },
5582
  "isShown": true,
5583
- "desc": "Measure Type: Within Errors Boxes",
5584
  "dtype": "determine",
5585
  "recover": true
5586
  },
5587
  {
5588
- "id": 130,
5589
  "name": "SWMWE12",
5590
  "dtype": "determine",
5591
  "related": [],
@@ -5599,7 +5659,7 @@
5599
  }
5600
  },
5601
  {
5602
- "id": 131,
5603
  "name": "SWMWE4",
5604
  "dtype": "determine",
5605
  "related": [],
@@ -5613,27 +5673,7 @@
5613
  }
5614
  },
5615
  {
5616
- "id": 132,
5617
- "name": "Within Errors",
5618
- "related": [
5619
- 133
5620
- ],
5621
- "type": "aggregation",
5622
- "info": {
5623
- "operation": "concat",
5624
- "usedAttributes": [],
5625
- "formula": "",
5626
- "exec": "",
5627
- "relation_type": "related_to",
5628
- "relation_label": "semantically related to"
5629
- },
5630
- "isShown": true,
5631
- "desc": "Measure Type: Within Errors",
5632
- "dtype": "determine",
5633
- "recover": true
5634
- },
5635
- {
5636
- "id": 133,
5637
  "name": "SWMWE468",
5638
  "dtype": "determine",
5639
  "related": [],
@@ -5647,7 +5687,7 @@
5647
  }
5648
  },
5649
  {
5650
- "id": 134,
5651
  "name": "SWMWE6",
5652
  "dtype": "determine",
5653
  "related": [],
@@ -5661,7 +5701,7 @@
5661
  }
5662
  },
5663
  {
5664
- "id": 135,
5665
  "name": "SWMWE8",
5666
  "dtype": "determine",
5667
  "related": [],
 
3527
  62,
3528
  69,
3529
  76,
3530
+ 79,
3531
+ 84,
3532
  91,
3533
+ 94,
3534
+ 96,
3535
+ 98,
3536
+ 102,
3537
+ 108,
3538
  114,
3539
+ 117,
3540
+ 121,
3541
+ 123,
3542
  132
3543
  ],
3544
  "desc": "Facet: Measure Type"
 
3688
  },
3689
  {
3690
  "id": 10,
3691
+ "name": "Correct Latency Mean",
3692
  "related": [
3693
  11,
3694
  12,
 
3701
  19,
3702
  20,
3703
  21,
3704
+ 22
 
 
 
 
 
 
3705
  ],
3706
  "type": "aggregation",
3707
  "info": {
 
3713
  "relation_label": "semantically related to"
3714
  },
3715
  "isShown": true,
3716
+ "desc": "Measure Type: Correct Latency Mean",
3717
  "dtype": "determine",
3718
  "recover": true
3719
  },
 
3887
  },
3888
  {
3889
  "id": 23,
3890
+ "name": "Percent Correct Percentage",
3891
  "related": [
3892
  24,
3893
  25,
3894
  26,
3895
  27,
3896
  28,
3897
+ 29
 
 
3898
  ],
3899
  "type": "aggregation",
3900
  "info": {
 
3906
  "relation_label": "semantically related to"
3907
  },
3908
  "isShown": true,
3909
+ "desc": "Measure Type: Percent Correct Percentage",
3910
  "dtype": "determine",
3911
  "recover": true
3912
  },
 
3996
  },
3997
  {
3998
  "id": 30,
3999
+ "name": "Probability Error Occurring",
4000
  "related": [
4001
  31,
4002
  32
 
4011
  "relation_label": "semantically related to"
4012
  },
4013
  "isShown": true,
4014
+ "desc": "Measure Type: Probability Error Occurring",
4015
  "dtype": "determine",
4016
  "recover": true
4017
  },
 
4154
  },
4155
  {
4156
  "id": 40,
4157
+ "name": "Errors Total",
4158
  "related": [
4159
  41,
4160
  42,
 
 
 
 
4161
  127,
4162
+ 128,
4163
+ 129,
4164
+ 130,
4165
+ 131
4166
  ],
4167
  "type": "aggregation",
4168
  "info": {
 
4174
  "relation_label": "semantically related to"
4175
  },
4176
  "isShown": true,
4177
+ "desc": "Measure Type: Errors Total",
4178
  "dtype": "determine",
4179
  "recover": true
4180
  },
 
4517
  },
4518
  {
4519
  "id": 62,
4520
+ "name": "Attempts Patterns Total",
4521
  "related": [
4522
  63,
4523
  64,
 
4536
  "relation_label": "semantically related to"
4537
  },
4538
  "isShown": true,
4539
+ "desc": "Measure Type: Attempts Patterns Total",
4540
  "dtype": "determine",
4541
  "recover": true
4542
  },
 
4739
  "related": [
4740
  77,
4741
  78,
 
4742
  81,
4743
+ 82,
4744
+ 83
4745
  ],
4746
  "type": "aggregation",
4747
  "info": {
 
4787
  },
4788
  {
4789
  "id": 79,
4790
+ "name": "Total Errors",
4791
+ "related": [
4792
+ 80
4793
+ ],
4794
+ "type": "aggregation",
4795
+ "info": {
4796
+ "operation": "concat",
4797
+ "usedAttributes": [],
4798
+ "formula": "",
4799
+ "exec": "",
4800
+ "relation_type": "related_to",
4801
+ "relation_label": "semantically related to"
4802
+ },
4803
+ "isShown": true,
4804
+ "desc": "Measure Type: Total Errors",
4805
+ "dtype": "determine",
4806
+ "recover": true
4807
+ },
4808
+ {
4809
+ "id": 80,
4810
  "name": "PALTEA28",
4811
  "dtype": "determine",
4812
  "related": [],
 
4820
  }
4821
  },
4822
  {
4823
+ "id": 81,
4824
  "name": "PALTEA4",
4825
  "dtype": "determine",
4826
  "related": [],
 
4834
  }
4835
  },
4836
  {
4837
+ "id": 82,
4838
  "name": "PALTEA6",
4839
  "dtype": "determine",
4840
  "related": [],
 
4848
  }
4849
  },
4850
  {
4851
+ "id": 83,
4852
  "name": "PALTEA8",
4853
  "dtype": "determine",
4854
  "related": [],
 
4862
  }
4863
  },
4864
  {
4865
+ "id": 84,
4866
+ "name": "Latency Immediate Standard",
4867
+ "related": [
4868
+ 85,
4869
+ 86,
4870
+ 87,
4871
+ 88,
4872
+ 89,
4873
+ 90
4874
+ ],
4875
+ "type": "aggregation",
4876
+ "info": {
4877
+ "operation": "concat",
4878
+ "usedAttributes": [],
4879
+ "formula": "",
4880
+ "exec": "",
4881
+ "relation_type": "related_to",
4882
+ "relation_label": "semantically related to"
4883
+ },
4884
+ "isShown": true,
4885
+ "desc": "Measure Type: Latency Immediate Standard",
4886
+ "dtype": "determine",
4887
+ "recover": true
4888
+ },
4889
+ {
4890
+ "id": 85,
4891
  "name": "PRMCLSDD",
4892
  "dtype": "determine",
4893
  "related": [],
 
4901
  }
4902
  },
4903
  {
4904
+ "id": 86,
4905
  "name": "PRMCLSDI",
4906
  "dtype": "determine",
4907
  "related": [],
 
4915
  }
4916
  },
4917
  {
4918
+ "id": 87,
4919
  "name": "PRMMCLD",
4920
  "dtype": "determine",
4921
  "related": [],
 
4929
  }
4930
  },
4931
  {
4932
+ "id": 88,
4933
  "name": "PRMMCLI",
4934
  "dtype": "determine",
4935
  "related": [],
 
4943
  }
4944
  },
4945
  {
4946
+ "id": 89,
4947
  "name": "PRMMDCLD",
4948
  "dtype": "determine",
4949
  "related": [],
 
4957
  }
4958
  },
4959
  {
4960
+ "id": 90,
4961
  "name": "PRMMDCLI",
4962
  "dtype": "determine",
4963
  "related": [],
 
4971
  }
4972
  },
4973
  {
4974
+ "id": 91,
4975
+ "name": "Percent Correct Immediate",
4976
+ "related": [
4977
+ 92,
4978
+ 93
4979
+ ],
4980
+ "type": "aggregation",
4981
+ "info": {
4982
+ "operation": "concat",
4983
+ "usedAttributes": [],
4984
+ "formula": "",
4985
+ "exec": "",
4986
+ "relation_type": "related_to",
4987
+ "relation_label": "semantically related to"
4988
+ },
4989
+ "isShown": true,
4990
+ "desc": "Measure Type: Percent Correct Immediate",
4991
+ "dtype": "determine",
4992
+ "recover": true
4993
+ },
4994
+ {
4995
+ "id": 92,
4996
  "name": "PRMPCD",
4997
  "dtype": "determine",
4998
  "related": [],
 
5006
  }
5007
  },
5008
  {
5009
+ "id": 93,
5010
  "name": "PRMPCI",
5011
  "dtype": "determine",
5012
  "related": [],
 
5020
  }
5021
  },
5022
  {
5023
+ "id": 94,
5024
  "name": "Time Since Delayed Stimuli",
5025
  "related": [
5026
+ 95
5027
  ],
5028
  "type": "aggregation",
5029
  "info": {
 
5040
  "recover": true
5041
  },
5042
  {
5043
+ "id": 95,
5044
  "name": "PRMTSDSP",
5045
  "dtype": "determine",
5046
  "related": [],
 
5054
  }
5055
  },
5056
  {
5057
+ "id": 96,
5058
  "name": "Detection Measure",
5059
  "related": [
5060
+ 97
5061
  ],
5062
  "type": "aggregation",
5063
  "info": {
 
5074
  "recover": true
5075
  },
5076
  {
5077
+ "id": 97,
5078
  "name": "RVPA",
5079
  "dtype": "determine",
5080
  "related": [],
 
5088
  }
5089
  },
5090
  {
5091
+ "id": 98,
5092
+ "name": "Response Latency Mean",
5093
  "related": [
5094
+ 99,
5095
+ 100,
5096
+ 101
5097
  ],
5098
  "type": "aggregation",
5099
  "info": {
 
5105
  "relation_label": "semantically related to"
5106
  },
5107
  "isShown": true,
5108
+ "desc": "Measure Type: Response Latency Mean",
5109
  "dtype": "determine",
5110
  "recover": true
5111
  },
5112
  {
5113
+ "id": 99,
5114
  "name": "RVPLSD",
5115
  "dtype": "determine",
5116
  "related": [],
 
5124
  }
5125
  },
5126
  {
5127
+ "id": 100,
5128
  "name": "RVPMDL",
5129
  "dtype": "determine",
5130
  "related": [],
 
5138
  }
5139
  },
5140
  {
5141
+ "id": 101,
5142
  "name": "RVPML",
5143
  "dtype": "determine",
5144
  "related": [],
 
5152
  }
5153
  },
5154
  {
5155
+ "id": 102,
5156
  "name": "Total",
5157
  "related": [
 
 
 
5158
  103,
5159
+ 104,
5160
+ 105,
5161
+ 106,
5162
+ 107
5163
  ],
5164
  "type": "aggregation",
5165
  "info": {
 
5176
  "recover": true
5177
  },
5178
  {
5179
+ "id": 103,
5180
  "name": "RVPPFA",
5181
  "dtype": "determine",
5182
  "related": [],
 
5190
  }
5191
  },
5192
  {
5193
+ "id": 104,
5194
  "name": "RVPPH",
5195
  "dtype": "determine",
5196
  "related": [],
 
5204
  }
5205
  },
5206
  {
5207
+ "id": 105,
5208
  "name": "RVPTFA",
5209
  "dtype": "determine",
5210
  "related": [],
 
5218
  }
5219
  },
5220
  {
5221
+ "id": 106,
5222
  "name": "RVPTH",
5223
  "dtype": "determine",
5224
  "related": [],
 
5232
  }
5233
  },
5234
  {
5235
+ "id": 107,
5236
  "name": "RVPTM",
5237
  "dtype": "determine",
5238
  "related": [],
 
5246
  }
5247
  },
5248
  {
5249
+ "id": 108,
5250
+ "name": "Errors Boxes Times",
5251
  "related": [
 
 
 
5252
  109,
5253
+ 110,
5254
+ 111,
5255
+ 112,
5256
+ 113
5257
  ],
5258
  "type": "aggregation",
5259
  "info": {
 
5265
  "relation_label": "semantically related to"
5266
  },
5267
  "isShown": true,
5268
+ "desc": "Measure Type: Errors Boxes Times",
5269
  "dtype": "determine",
5270
  "recover": true
5271
  },
5272
  {
5273
+ "id": 109,
5274
  "name": "SWMBE12",
5275
  "dtype": "determine",
5276
  "related": [],
 
5284
  }
5285
  },
5286
  {
5287
+ "id": 110,
5288
  "name": "SWMBE4",
5289
  "dtype": "determine",
5290
  "related": [],
 
5298
  }
5299
  },
5300
  {
5301
+ "id": 111,
5302
  "name": "SWMBE468",
5303
  "dtype": "determine",
5304
  "related": [],
 
5312
  }
5313
  },
5314
  {
5315
+ "id": 112,
5316
  "name": "SWMBE6",
5317
  "dtype": "determine",
5318
  "related": [],
 
5326
  }
5327
  },
5328
  {
5329
+ "id": 113,
5330
  "name": "SWMBE8",
5331
  "dtype": "determine",
5332
  "related": [],
 
5340
  }
5341
  },
5342
  {
5343
+ "id": 114,
5344
  "name": "Double Errors Boxes",
5345
  "related": [
5346
+ 115,
 
5347
  116,
5348
+ 119,
5349
+ 120
5350
  ],
5351
  "type": "aggregation",
5352
  "info": {
 
5363
  "recover": true
5364
  },
5365
  {
5366
+ "id": 115,
5367
  "name": "SWMDE12",
5368
  "dtype": "determine",
5369
  "related": [],
 
5377
  }
5378
  },
5379
  {
5380
+ "id": 116,
5381
  "name": "SWMDE4",
5382
  "dtype": "determine",
5383
  "related": [],
 
5391
  }
5392
  },
5393
  {
5394
+ "id": 117,
5395
  "name": "Double Errors",
5396
  "related": [
5397
+ 118
5398
  ],
5399
  "type": "aggregation",
5400
  "info": {
 
5411
  "recover": true
5412
  },
5413
  {
5414
+ "id": 118,
5415
  "name": "SWMDE468",
5416
  "dtype": "determine",
5417
  "related": [],
 
5425
  }
5426
  },
5427
  {
5428
+ "id": 119,
5429
  "name": "SWMDE6",
5430
  "dtype": "determine",
5431
  "related": [],
 
5439
  }
5440
  },
5441
  {
5442
+ "id": 120,
5443
  "name": "SWMDE8",
5444
  "dtype": "determine",
5445
  "related": [],
 
5453
  }
5454
  },
5455
  {
5456
+ "id": 121,
5457
  "name": "Problem Reached",
5458
  "related": [
5459
+ 122
5460
  ],
5461
  "type": "aggregation",
5462
  "info": {
 
5473
  "recover": true
5474
  },
5475
  {
5476
+ "id": 122,
5477
  "name": "SWMPR",
5478
  "dtype": "determine",
5479
  "related": [],
 
5487
  }
5488
  },
5489
  {
5490
+ "id": 123,
5491
+ "name": "Strategy High",
5492
  "related": [
5493
+ 124,
5494
+ 125,
5495
+ 126
5496
  ],
5497
  "type": "aggregation",
5498
  "info": {
 
5504
  "relation_label": "semantically related to"
5505
  },
5506
  "isShown": true,
5507
+ "desc": "Measure Type: Strategy High",
5508
  "dtype": "determine",
5509
  "recover": true
5510
  },
5511
  {
5512
+ "id": 124,
5513
  "name": "SWMS",
5514
  "dtype": "determine",
5515
  "related": [],
 
5523
  }
5524
  },
5525
  {
5526
+ "id": 125,
5527
  "name": "SWMS6",
5528
  "dtype": "determine",
5529
  "related": [],
 
5537
  }
5538
  },
5539
  {
5540
+ "id": 126,
5541
  "name": "SWMSX",
5542
  "dtype": "determine",
5543
  "related": [],
 
5551
  }
5552
  },
5553
  {
5554
+ "id": 127,
5555
  "name": "SWMTE12",
5556
  "dtype": "determine",
5557
  "related": [],
 
5565
  }
5566
  },
5567
  {
5568
+ "id": 128,
5569
  "name": "SWMTE4",
5570
  "dtype": "determine",
5571
  "related": [],
 
5579
  }
5580
  },
5581
  {
5582
+ "id": 129,
5583
  "name": "SWMTE468",
5584
  "dtype": "determine",
5585
  "related": [],
 
5593
  }
5594
  },
5595
  {
5596
+ "id": 130,
5597
  "name": "SWMTE6",
5598
  "dtype": "determine",
5599
  "related": [],
 
5607
  }
5608
  },
5609
  {
5610
+ "id": 131,
5611
  "name": "SWMTE8",
5612
  "dtype": "determine",
5613
  "related": [],
 
5621
  }
5622
  },
5623
  {
5624
+ "id": 132,
5625
+ "name": "Within Errors",
5626
  "related": [
5627
+ 133,
 
5628
  134,
5629
+ 135,
5630
+ 136,
5631
+ 137
5632
  ],
5633
  "type": "aggregation",
5634
  "info": {
 
5640
  "relation_label": "semantically related to"
5641
  },
5642
  "isShown": true,
5643
+ "desc": "Measure Type: Within Errors",
5644
  "dtype": "determine",
5645
  "recover": true
5646
  },
5647
  {
5648
+ "id": 133,
5649
  "name": "SWMWE12",
5650
  "dtype": "determine",
5651
  "related": [],
 
5659
  }
5660
  },
5661
  {
5662
+ "id": 134,
5663
  "name": "SWMWE4",
5664
  "dtype": "determine",
5665
  "related": [],
 
5673
  }
5674
  },
5675
  {
5676
+ "id": 135,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5677
  "name": "SWMWE468",
5678
  "dtype": "determine",
5679
  "related": [],
 
5687
  }
5688
  },
5689
  {
5690
+ "id": 136,
5691
  "name": "SWMWE6",
5692
  "dtype": "determine",
5693
  "related": [],
 
5701
  }
5702
  },
5703
  {
5704
+ "id": 137,
5705
  "name": "SWMWE8",
5706
  "dtype": "determine",
5707
  "related": [],
outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json CHANGED
@@ -1645,7 +1645,7 @@
1645
  115,
1646
  116,
1647
  117,
1648
- 146
1649
  ],
1650
  "type": "aggregation",
1651
  "info": {
@@ -1687,16 +1687,19 @@
1687
  "recover": true,
1688
  "concept_provenance": {
1689
  "node_label": "Total Correct",
1690
- "confidence": 0.0,
1691
- "alternatives": [],
 
 
 
1692
  "source_evidence": [
1693
- "tfidf_fallback"
1694
  ],
1695
- "embedding_sim": 0.0,
1696
- "string_sim": 0.0,
1697
- "coverage": 0.0,
1698
- "contrast": 0.0,
1699
- "specificity": 0.0
1700
  }
1701
  },
1702
  {
@@ -1722,21 +1725,65 @@
1722
  "recover": true,
1723
  "concept_provenance": {
1724
  "node_label": "Error",
1725
- "confidence": 0.0,
1726
- "alternatives": [],
 
 
 
 
1727
  "source_evidence": [
1728
- "tfidf_fallback"
1729
  ],
1730
- "embedding_sim": 0.0,
1731
- "string_sim": 0.0,
1732
- "coverage": 0.0,
1733
  "contrast": 0.0,
1734
- "specificity": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1735
  }
1736
  },
1737
  {
1738
  "id": 113,
1739
- "name": "Total Errors",
1740
  "related": [
1741
  34,
1742
  35
@@ -1751,21 +1798,24 @@
1751
  "relation_label": "belongs to"
1752
  },
1753
  "isShown": true,
1754
- "desc": "Concept group: DMS > Total Errors",
1755
  "dtype": "determine",
1756
  "recover": true,
1757
  "concept_provenance": {
1758
- "node_label": "Total Errors",
1759
- "confidence": 0.0,
1760
- "alternatives": [],
 
 
 
1761
  "source_evidence": [
1762
- "tfidf_fallback"
1763
  ],
1764
- "embedding_sim": 0.0,
1765
- "string_sim": 0.0,
1766
- "coverage": 0.0,
1767
- "contrast": 0.0,
1768
- "specificity": 0.0
1769
  }
1770
  },
1771
  {
@@ -1794,21 +1844,24 @@
1794
  "recover": true,
1795
  "concept_provenance": {
1796
  "node_label": "Correct Latency Standard Deviation",
1797
- "confidence": 0.0,
1798
- "alternatives": [],
 
 
 
1799
  "source_evidence": [
1800
- "tfidf_fallback"
1801
  ],
1802
- "embedding_sim": 0.0,
1803
- "string_sim": 0.0,
1804
- "coverage": 0.0,
1805
- "contrast": 0.0,
1806
- "specificity": 0.0
1807
  }
1808
  },
1809
  {
1810
  "id": 115,
1811
- "name": "Probability Error",
1812
  "related": [
1813
  26,
1814
  27
@@ -1823,26 +1876,30 @@
1823
  "relation_label": "belongs to"
1824
  },
1825
  "isShown": true,
1826
- "desc": "Concept group: DMS > Probability Error",
1827
  "dtype": "determine",
1828
  "recover": true,
1829
  "concept_provenance": {
1830
- "node_label": "Probability Error",
1831
- "confidence": 0.0,
1832
- "alternatives": [],
 
 
 
 
1833
  "source_evidence": [
1834
- "tfidf_fallback"
1835
  ],
1836
- "embedding_sim": 0.0,
1837
- "string_sim": 0.0,
1838
- "coverage": 0.0,
1839
- "contrast": 0.0,
1840
- "specificity": 0.0
1841
  }
1842
  },
1843
  {
1844
  "id": 116,
1845
- "name": "Percent Correct",
1846
  "related": [
1847
  20,
1848
  21,
@@ -1861,21 +1918,25 @@
1861
  "relation_label": "belongs to"
1862
  },
1863
  "isShown": true,
1864
- "desc": "Concept group: DMS > Percent Correct",
1865
  "dtype": "determine",
1866
  "recover": true,
1867
  "concept_provenance": {
1868
- "node_label": "Percent Correct",
1869
- "confidence": 0.0,
1870
- "alternatives": [],
 
 
 
 
1871
  "source_evidence": [
1872
- "tfidf_fallback"
1873
  ],
1874
- "embedding_sim": 0.0,
1875
- "string_sim": 0.0,
1876
- "coverage": 0.0,
1877
- "contrast": 0.0,
1878
- "specificity": 0.0
1879
  }
1880
  },
1881
  {
@@ -1941,11 +2002,11 @@
1941
  "recover": true,
1942
  "concept_provenance": {
1943
  "node_label": "Latency Display Stimulus",
1944
- "confidence": 0.678,
1945
  "alternatives": [
1946
- "Mean Latency",
1947
- "Latency",
1948
- "Response Latency"
1949
  ],
1950
  "source_evidence": [
1951
  "keybert"
@@ -1953,7 +2014,7 @@
1953
  "embedding_sim": 0.732,
1954
  "coverage": 0.732,
1955
  "contrast": 0.595,
1956
- "specificity": 1.0,
1957
  "string_sim": 0.0
1958
  }
1959
  },
@@ -1979,10 +2040,11 @@
1979
  "recover": true,
1980
  "concept_provenance": {
1981
  "node_label": "Total Assessment Trials",
1982
- "confidence": 0.534,
1983
  "alternatives": [
1984
- "Assessment Trials",
1985
- "Assessed Trials Simultaneous"
 
1986
  ],
1987
  "source_evidence": [
1988
  "keybert"
@@ -1990,7 +2052,7 @@
1990
  "embedding_sim": 0.629,
1991
  "coverage": 0.629,
1992
  "contrast": 0.204,
1993
- "specificity": 1.0,
1994
  "string_sim": 0.0
1995
  }
1996
  },
@@ -1998,11 +2060,10 @@
1998
  "id": 121,
1999
  "name": "PAL",
2000
  "related": [
2001
- 122,
2002
  123,
2003
- 124,
2004
  125,
2005
  126,
 
2006
  147
2007
  ],
2008
  "type": "aggregation",
@@ -2021,7 +2082,7 @@
2021
  },
2022
  {
2023
  "id": 122,
2024
- "name": "Total Attempts Patterns",
2025
  "related": [
2026
  49,
2027
  50,
@@ -2040,21 +2101,24 @@
2040
  "relation_label": "belongs to"
2041
  },
2042
  "isShown": true,
2043
- "desc": "Concept group: PAL > Total Attempts Patterns",
2044
  "dtype": "determine",
2045
  "recover": true,
2046
  "concept_provenance": {
2047
- "node_label": "Total Attempts Patterns",
2048
- "confidence": 0.0,
2049
- "alternatives": [],
 
 
 
2050
  "source_evidence": [
2051
- "tfidf_fallback"
2052
  ],
2053
- "embedding_sim": 0.0,
2054
- "string_sim": 0.0,
2055
- "coverage": 0.0,
2056
- "contrast": 0.0,
2057
- "specificity": 0.0
2058
  }
2059
  },
2060
  {
@@ -2079,7 +2143,7 @@
2079
  },
2080
  {
2081
  "id": 124,
2082
- "name": "Errors Patterns Total",
2083
  "related": [
2084
  55,
2085
  56,
@@ -2103,11 +2167,11 @@
2103
  "recover": true,
2104
  "concept_provenance": {
2105
  "node_label": "Errors Patterns Total",
2106
- "confidence": 0.507,
2107
  "alternatives": [
2108
- "Correct Box Attempt",
2109
- "Subject Revisits Box",
2110
- "Box Attempt"
2111
  ],
2112
  "source_evidence": [
2113
  "keybert"
@@ -2115,7 +2179,7 @@
2115
  "embedding_sim": 0.619,
2116
  "coverage": 0.619,
2117
  "contrast": 0.115,
2118
- "specificity": 1.0,
2119
  "string_sim": 0.0
2120
  }
2121
  },
@@ -2184,16 +2248,20 @@
2184
  "recover": true,
2185
  "concept_provenance": {
2186
  "node_label": "Include Total Errors Shapes",
2187
- "confidence": 0.0,
2188
- "alternatives": [],
 
 
 
 
2189
  "source_evidence": [
2190
- "tfidf_fallback"
2191
  ],
2192
- "embedding_sim": 0.0,
2193
- "string_sim": 0.0,
2194
- "coverage": 0.0,
2195
- "contrast": 0.0,
2196
- "specificity": 0.0
2197
  }
2198
  },
2199
  {
@@ -2220,10 +2288,14 @@
2220
  },
2221
  {
2222
  "id": 130,
2223
- "name": "Correct Latency",
2224
  "related": [
2225
- 152,
2226
- 153
 
 
 
 
2227
  ],
2228
  "type": "aggregation",
2229
  "info": {
@@ -2235,21 +2307,25 @@
2235
  "relation_label": "belongs to"
2236
  },
2237
  "isShown": true,
2238
- "desc": "Concept group: PRM > Correct Latency",
2239
  "dtype": "determine",
2240
  "recover": true,
2241
  "concept_provenance": {
2242
- "node_label": "Correct Latency",
2243
- "confidence": 0.0,
2244
- "alternatives": [],
 
 
 
 
2245
  "source_evidence": [
2246
- "tfidf_fallback"
2247
  ],
2248
- "embedding_sim": 0.0,
2249
- "string_sim": 0.0,
2250
- "coverage": 0.0,
2251
- "contrast": 0.0,
2252
- "specificity": 0.0
2253
  }
2254
  },
2255
  {
@@ -2274,7 +2350,7 @@
2274
  },
2275
  {
2276
  "id": 132,
2277
- "name": "Percent Correct",
2278
  "related": [
2279
  73,
2280
  74
@@ -2289,21 +2365,25 @@
2289
  "relation_label": "belongs to"
2290
  },
2291
  "isShown": true,
2292
- "desc": "Concept group: PRM > Percent Correct",
2293
  "dtype": "determine",
2294
  "recover": true,
2295
  "concept_provenance": {
2296
- "node_label": "Percent Correct",
2297
- "confidence": 0.0,
2298
- "alternatives": [],
 
 
 
 
2299
  "source_evidence": [
2300
- "tfidf_fallback"
2301
  ],
2302
- "embedding_sim": 0.0,
2303
- "string_sim": 0.0,
2304
- "coverage": 0.0,
2305
- "contrast": 0.0,
2306
- "specificity": 0.0
2307
  }
2308
  },
2309
  {
@@ -2353,21 +2433,24 @@
2353
  "recover": true,
2354
  "concept_provenance": {
2355
  "node_label": "Total",
2356
- "confidence": 0.0,
2357
- "alternatives": [],
 
 
 
2358
  "source_evidence": [
2359
- "tfidf_fallback"
2360
  ],
2361
- "embedding_sim": 0.0,
2362
- "string_sim": 0.0,
2363
- "coverage": 0.0,
2364
- "contrast": 0.0,
2365
- "specificity": 0.0
2366
  }
2367
  },
2368
  {
2369
  "id": 135,
2370
- "name": "Response Latency",
2371
  "related": [
2372
  77,
2373
  78,
@@ -2383,21 +2466,25 @@
2383
  "relation_label": "belongs to"
2384
  },
2385
  "isShown": true,
2386
- "desc": "Concept group: RVP > Response Latency",
2387
  "dtype": "determine",
2388
  "recover": true,
2389
  "concept_provenance": {
2390
- "node_label": "Response Latency",
2391
- "confidence": 0.0,
2392
- "alternatives": [],
 
 
 
 
2393
  "source_evidence": [
2394
- "tfidf_fallback"
2395
  ],
2396
- "embedding_sim": 0.0,
2397
- "string_sim": 0.0,
2398
- "coverage": 0.0,
2399
- "contrast": 0.0,
2400
- "specificity": 0.0
2401
  }
2402
  },
2403
  {
@@ -2426,9 +2513,9 @@
2426
  "related": [
2427
  140,
2428
  141,
 
2429
  143,
2430
  144,
2431
- 145,
2432
  148
2433
  ],
2434
  "type": "aggregation",
@@ -2445,9 +2532,50 @@
2445
  "dtype": "determine",
2446
  "recover": true
2447
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2448
  {
2449
  "id": 140,
2450
- "name": "Strategy",
2451
  "related": [
2452
  96,
2453
  97,
@@ -2463,21 +2591,25 @@
2463
  "relation_label": "belongs to"
2464
  },
2465
  "isShown": true,
2466
- "desc": "Concept group: SWM > Strategy",
2467
  "dtype": "determine",
2468
  "recover": true,
2469
  "concept_provenance": {
2470
- "node_label": "Strategy",
2471
- "confidence": 0.0,
2472
- "alternatives": [],
 
 
 
 
2473
  "source_evidence": [
2474
- "tfidf_fallback"
2475
  ],
2476
- "embedding_sim": 0.0,
2477
- "string_sim": 0.0,
2478
- "coverage": 0.0,
2479
- "contrast": 0.0,
2480
- "specificity": 0.0
2481
  }
2482
  },
2483
  {
@@ -2501,14 +2633,14 @@
2501
  "recover": true
2502
  },
2503
  {
2504
- "id": 143,
2505
- "name": "Total Errors",
2506
  "related": [
2507
- 99,
2508
- 100,
2509
- 101,
2510
- 102,
2511
- 103
2512
  ],
2513
  "type": "aggregation",
2514
  "info": {
@@ -2520,32 +2652,34 @@
2520
  "relation_label": "belongs to"
2521
  },
2522
  "isShown": true,
2523
- "desc": "Concept group: SWM > Total Errors",
2524
  "dtype": "determine",
2525
  "recover": true,
2526
  "concept_provenance": {
2527
- "node_label": "Total Errors",
2528
- "confidence": 0.0,
2529
- "alternatives": [],
 
 
2530
  "source_evidence": [
2531
- "tfidf_fallback"
2532
  ],
2533
- "embedding_sim": 0.0,
2534
- "string_sim": 0.0,
2535
- "coverage": 0.0,
2536
  "contrast": 0.0,
2537
- "specificity": 0.0
 
2538
  }
2539
  },
2540
  {
2541
- "id": 144,
2542
- "name": "Within Errors",
2543
  "related": [
2544
- 106,
2545
- 104,
2546
- 105,
2547
- 107,
2548
- 108
2549
  ],
2550
  "type": "aggregation",
2551
  "info": {
@@ -2557,12 +2691,28 @@
2557
  "relation_label": "belongs to"
2558
  },
2559
  "isShown": true,
2560
- "desc": "Concept group: SWM > Within Errors",
2561
  "dtype": "determine",
2562
- "recover": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2563
  },
2564
  {
2565
- "id": 145,
2566
  "name": "Double Errors",
2567
  "related": [
2568
  92,
@@ -2586,14 +2736,11 @@
2586
  "recover": true
2587
  },
2588
  {
2589
- "id": 146,
2590
  "name": "Correct Latency",
2591
  "related": [
2592
- 114,
2593
- 154,
2594
- 155,
2595
- 156,
2596
- 157
2597
  ],
2598
  "type": "aggregation",
2599
  "info": {
@@ -2609,6 +2756,27 @@
2609
  "dtype": "determine",
2610
  "recover": true
2611
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2612
  {
2613
  "id": 147,
2614
  "name": "Total Errors",
@@ -2634,11 +2802,7 @@
2634
  "id": 148,
2635
  "name": "Errors Boxes",
2636
  "related": [
2637
- 85,
2638
- 86,
2639
- 87,
2640
- 88,
2641
- 89
2642
  ],
2643
  "type": "aggregation",
2644
  "info": {
@@ -2719,50 +2883,6 @@
2719
  },
2720
  {
2721
  "id": 152,
2722
- "name": "Delayed",
2723
- "related": [
2724
- 67,
2725
- 69,
2726
- 71
2727
- ],
2728
- "type": "aggregation",
2729
- "info": {
2730
- "operation": "concat",
2731
- "usedAttributes": [],
2732
- "formula": "",
2733
- "exec": "",
2734
- "relation_type": "belongs_to",
2735
- "relation_label": "belongs to"
2736
- },
2737
- "isShown": true,
2738
- "desc": "Sub-group: Delayed",
2739
- "dtype": "determine",
2740
- "recover": true
2741
- },
2742
- {
2743
- "id": 153,
2744
- "name": "Immediate",
2745
- "related": [
2746
- 72,
2747
- 68,
2748
- 70
2749
- ],
2750
- "type": "aggregation",
2751
- "info": {
2752
- "operation": "concat",
2753
- "usedAttributes": [],
2754
- "formula": "",
2755
- "exec": "",
2756
- "relation_type": "belongs_to",
2757
- "relation_label": "belongs to"
2758
- },
2759
- "isShown": true,
2760
- "desc": "Sub-group: Immediate",
2761
- "dtype": "determine",
2762
- "recover": true
2763
- },
2764
- {
2765
- "id": 154,
2766
  "name": "Median Seconds Delay",
2767
  "related": [
2768
  9,
@@ -2784,8 +2904,8 @@
2784
  "recover": true
2785
  },
2786
  {
2787
- "id": 155,
2788
- "name": "Mean Seconds Delay",
2789
  "related": [
2790
  16,
2791
  17,
@@ -2801,12 +2921,12 @@
2801
  "relation_label": "belongs to"
2802
  },
2803
  "isShown": true,
2804
- "desc": "Sub-group: Mean Seconds Delay",
2805
  "dtype": "determine",
2806
  "recover": true
2807
  },
2808
  {
2809
- "id": 156,
2810
  "name": "Median",
2811
  "related": [
2812
  8,
@@ -2826,27 +2946,5 @@
2826
  "desc": "Sub-group: Median",
2827
  "dtype": "determine",
2828
  "recover": true
2829
- },
2830
- {
2831
- "id": 157,
2832
- "name": "Mean",
2833
- "related": [
2834
- 18,
2835
- 19,
2836
- 14
2837
- ],
2838
- "type": "aggregation",
2839
- "info": {
2840
- "operation": "concat",
2841
- "usedAttributes": [],
2842
- "formula": "",
2843
- "exec": "",
2844
- "relation_type": "belongs_to",
2845
- "relation_label": "belongs to"
2846
- },
2847
- "isShown": true,
2848
- "desc": "Sub-group: Mean",
2849
- "dtype": "determine",
2850
- "recover": true
2851
  }
2852
  ]
 
1645
  115,
1646
  116,
1647
  117,
1648
+ 145
1649
  ],
1650
  "type": "aggregation",
1651
  "info": {
 
1687
  "recover": true,
1688
  "concept_provenance": {
1689
  "node_label": "Total Correct",
1690
+ "confidence": 0.507,
1691
+ "alternatives": [
1692
+ "correct total",
1693
+ "correct total times"
1694
+ ],
1695
  "source_evidence": [
1696
+ "description_title"
1697
  ],
1698
+ "embedding_sim": 0.319,
1699
+ "coverage": 0.319,
1700
+ "contrast": 0.086,
1701
+ "specificity": 0.0,
1702
+ "string_sim": 1.0
1703
  }
1704
  },
1705
  {
 
1725
  "recover": true,
1726
  "concept_provenance": {
1727
  "node_label": "Error",
1728
+ "confidence": 0.447,
1729
+ "alternatives": [
1730
+ "error times subject",
1731
+ "error times",
1732
+ "failed"
1733
+ ],
1734
  "source_evidence": [
1735
+ "description_title"
1736
  ],
1737
+ "embedding_sim": 0.216,
1738
+ "coverage": 0.216,
 
1739
  "contrast": 0.0,
1740
+ "specificity": 0.0,
1741
+ "string_sim": 1.0
1742
+ }
1743
+ },
1744
+ {
1745
+ "id": 112,
1746
+ "name": "Mean Latency",
1747
+ "related": [
1748
+ 14,
1749
+ 18,
1750
+ 19,
1751
+ 152,
1752
+ 153,
1753
+ 154
1754
+ ],
1755
+ "type": "aggregation",
1756
+ "info": {
1757
+ "operation": "concat",
1758
+ "usedAttributes": [],
1759
+ "formula": "",
1760
+ "exec": "",
1761
+ "relation_type": "belongs_to",
1762
+ "relation_label": "belongs to"
1763
+ },
1764
+ "isShown": true,
1765
+ "desc": "Concept group: DMS > Correct Latency Mean",
1766
+ "dtype": "determine",
1767
+ "recover": true,
1768
+ "concept_provenance": {
1769
+ "node_label": "Correct Latency Mean",
1770
+ "confidence": 0.625,
1771
+ "alternatives": [
1772
+ "latency mean"
1773
+ ],
1774
+ "source_evidence": [
1775
+ "keybert"
1776
+ ],
1777
+ "embedding_sim": 0.676,
1778
+ "coverage": 0.676,
1779
+ "contrast": 0.076,
1780
+ "specificity": 0.0,
1781
+ "string_sim": 0.884
1782
  }
1783
  },
1784
  {
1785
  "id": 113,
1786
+ "name": "Errors Total",
1787
  "related": [
1788
  34,
1789
  35
 
1798
  "relation_label": "belongs to"
1799
  },
1800
  "isShown": true,
1801
+ "desc": "Concept group: DMS > Errors Total",
1802
  "dtype": "determine",
1803
  "recover": true,
1804
  "concept_provenance": {
1805
+ "node_label": "Errors Total",
1806
+ "confidence": 0.604,
1807
+ "alternatives": [
1808
+ "errors total times",
1809
+ "Total Errors"
1810
+ ],
1811
  "source_evidence": [
1812
+ "keybert"
1813
  ],
1814
+ "embedding_sim": 0.543,
1815
+ "coverage": 0.543,
1816
+ "contrast": 0.125,
1817
+ "specificity": 0.0,
1818
+ "string_sim": 0.974
1819
  }
1820
  },
1821
  {
 
1844
  "recover": true,
1845
  "concept_provenance": {
1846
  "node_label": "Correct Latency Standard Deviation",
1847
+ "confidence": 0.687,
1848
+ "alternatives": [
1849
+ "latency standard deviation",
1850
+ "deviation response latencies"
1851
+ ],
1852
  "source_evidence": [
1853
+ "description_title"
1854
  ],
1855
+ "embedding_sim": 0.684,
1856
+ "coverage": 0.684,
1857
+ "contrast": 0.193,
1858
+ "specificity": 0.0,
1859
+ "string_sim": 1.0
1860
  }
1861
  },
1862
  {
1863
  "id": 115,
1864
+ "name": "Probability Error Occurring",
1865
  "related": [
1866
  26,
1867
  27
 
1876
  "relation_label": "belongs to"
1877
  },
1878
  "isShown": true,
1879
+ "desc": "Concept group: DMS > Probability Error Occurring",
1880
  "dtype": "determine",
1881
  "recover": true,
1882
  "concept_provenance": {
1883
+ "node_label": "Probability Error Occurring",
1884
+ "confidence": 0.619,
1885
+ "alternatives": [
1886
+ "Probability Error",
1887
+ "probability error made",
1888
+ "reports probability error"
1889
+ ],
1890
  "source_evidence": [
1891
+ "keybert"
1892
  ],
1893
+ "embedding_sim": 0.578,
1894
+ "coverage": 0.578,
1895
+ "contrast": 0.142,
1896
+ "specificity": 0.0,
1897
+ "string_sim": 0.966
1898
  }
1899
  },
1900
  {
1901
  "id": 116,
1902
+ "name": "Percent Correct Percentage",
1903
  "related": [
1904
  20,
1905
  21,
 
1918
  "relation_label": "belongs to"
1919
  },
1920
  "isShown": true,
1921
+ "desc": "Concept group: DMS > Percent Correct Percentage",
1922
  "dtype": "determine",
1923
  "recover": true,
1924
  "concept_provenance": {
1925
+ "node_label": "Percent Correct Percentage",
1926
+ "confidence": 0.54,
1927
+ "alternatives": [
1928
+ "correct percentage assessment",
1929
+ "correct percentage",
1930
+ "Percent Correct"
1931
+ ],
1932
  "source_evidence": [
1933
+ "keybert"
1934
  ],
1935
+ "embedding_sim": 0.473,
1936
+ "coverage": 0.473,
1937
+ "contrast": 0.156,
1938
+ "specificity": 0.0,
1939
+ "string_sim": 0.868
1940
  }
1941
  },
1942
  {
 
2002
  "recover": true,
2003
  "concept_provenance": {
2004
  "node_label": "Latency Display Stimulus",
2005
+ "confidence": 0.418,
2006
  "alternatives": [
2007
+ "mean latency display",
2008
+ "standard deviation latency",
2009
+ "deviation latency calculated"
2010
  ],
2011
  "source_evidence": [
2012
  "keybert"
 
2014
  "embedding_sim": 0.732,
2015
  "coverage": 0.732,
2016
  "contrast": 0.595,
2017
+ "specificity": 0.0,
2018
  "string_sim": 0.0
2019
  }
2020
  },
 
2040
  "recover": true,
2041
  "concept_provenance": {
2042
  "node_label": "Total Assessment Trials",
2043
+ "confidence": 0.313,
2044
  "alternatives": [
2045
+ "assessment trials subject",
2046
+ "trials subject failed",
2047
+ "trials subject"
2048
  ],
2049
  "source_evidence": [
2050
  "keybert"
 
2052
  "embedding_sim": 0.629,
2053
  "coverage": 0.629,
2054
  "contrast": 0.204,
2055
+ "specificity": 0.0,
2056
  "string_sim": 0.0
2057
  }
2058
  },
 
2060
  "id": 121,
2061
  "name": "PAL",
2062
  "related": [
 
2063
  123,
 
2064
  125,
2065
  126,
2066
+ 146,
2067
  147
2068
  ],
2069
  "type": "aggregation",
 
2082
  },
2083
  {
2084
  "id": 122,
2085
+ "name": "Attempts Patterns",
2086
  "related": [
2087
  49,
2088
  50,
 
2101
  "relation_label": "belongs to"
2102
  },
2103
  "isShown": true,
2104
+ "desc": "Concept group: PAL > Attempts Patterns Total",
2105
  "dtype": "determine",
2106
  "recover": true,
2107
  "concept_provenance": {
2108
+ "node_label": "Attempts Patterns Total",
2109
+ "confidence": 0.633,
2110
+ "alternatives": [
2111
+ "patterns total attempts",
2112
+ "Total Attempts Patterns"
2113
+ ],
2114
  "source_evidence": [
2115
+ "keybert"
2116
  ],
2117
+ "embedding_sim": 0.598,
2118
+ "coverage": 0.598,
2119
+ "contrast": 0.151,
2120
+ "specificity": 0.0,
2121
+ "string_sim": 0.975
2122
  }
2123
  },
2124
  {
 
2143
  },
2144
  {
2145
  "id": 124,
2146
+ "name": "Errors Patterns",
2147
  "related": [
2148
  55,
2149
  56,
 
2167
  "recover": true,
2168
  "concept_provenance": {
2169
  "node_label": "Errors Patterns Total",
2170
+ "confidence": 0.296,
2171
  "alternatives": [
2172
+ "box stimulus assessment",
2173
+ "stimulus assessment problems",
2174
+ "incorrect box stimulus"
2175
  ],
2176
  "source_evidence": [
2177
  "keybert"
 
2179
  "embedding_sim": 0.619,
2180
  "coverage": 0.619,
2181
  "contrast": 0.115,
2182
+ "specificity": 0.0,
2183
  "string_sim": 0.0
2184
  }
2185
  },
 
2248
  "recover": true,
2249
  "concept_provenance": {
2250
  "node_label": "Include Total Errors Shapes",
2251
+ "confidence": 0.609,
2252
+ "alternatives": [
2253
+ "total errors shapes",
2254
+ "errors shapes times",
2255
+ "errors shapes"
2256
+ ],
2257
  "source_evidence": [
2258
+ "description_title"
2259
  ],
2260
+ "embedding_sim": 0.549,
2261
+ "coverage": 0.549,
2262
+ "contrast": 0.08,
2263
+ "specificity": 0.0,
2264
+ "string_sim": 1.0
2265
  }
2266
  },
2267
  {
 
2288
  },
2289
  {
2290
  "id": 130,
2291
+ "name": "Latency Immediate Standard",
2292
  "related": [
2293
+ 67,
2294
+ 68,
2295
+ 69,
2296
+ 70,
2297
+ 71,
2298
+ 72
2299
  ],
2300
  "type": "aggregation",
2301
  "info": {
 
2307
  "relation_label": "belongs to"
2308
  },
2309
  "isShown": true,
2310
+ "desc": "Concept group: PRM > Latency Immediate Standard",
2311
  "dtype": "determine",
2312
  "recover": true,
2313
  "concept_provenance": {
2314
+ "node_label": "Latency Immediate Standard",
2315
+ "confidence": 0.653,
2316
+ "alternatives": [
2317
+ "correct latency immediate",
2318
+ "latency immediate",
2319
+ "correct latency delayed"
2320
+ ],
2321
  "source_evidence": [
2322
+ "keybert"
2323
  ],
2324
+ "embedding_sim": 0.715,
2325
+ "coverage": 0.715,
2326
+ "contrast": 0.34,
2327
+ "specificity": 0.0,
2328
+ "string_sim": 0.801
2329
  }
2330
  },
2331
  {
 
2350
  },
2351
  {
2352
  "id": 132,
2353
+ "name": "Percent Correct Immediate",
2354
  "related": [
2355
  73,
2356
  74
 
2365
  "relation_label": "belongs to"
2366
  },
2367
  "isShown": true,
2368
+ "desc": "Concept group: PRM > Percent Correct Immediate",
2369
  "dtype": "determine",
2370
  "recover": true,
2371
  "concept_provenance": {
2372
+ "node_label": "Percent Correct Immediate",
2373
+ "confidence": 0.596,
2374
+ "alternatives": [
2375
+ "Percent Correct",
2376
+ "key percent correct",
2377
+ "percent correct delayed"
2378
+ ],
2379
  "source_evidence": [
2380
+ "keybert"
2381
  ],
2382
+ "embedding_sim": 0.671,
2383
+ "coverage": 0.671,
2384
+ "contrast": 0.245,
2385
+ "specificity": 0.0,
2386
+ "string_sim": 0.735
2387
  }
2388
  },
2389
  {
 
2433
  "recover": true,
2434
  "concept_provenance": {
2435
  "node_label": "Total",
2436
+ "confidence": 0.407,
2437
+ "alternatives": [
2438
+ "total hits",
2439
+ "hits total"
2440
+ ],
2441
  "source_evidence": [
2442
+ "description_title"
2443
  ],
2444
+ "embedding_sim": 0.111,
2445
+ "coverage": 0.111,
2446
+ "contrast": 0.05,
2447
+ "specificity": 0.0,
2448
+ "string_sim": 1.0
2449
  }
2450
  },
2451
  {
2452
  "id": 135,
2453
+ "name": "Response Latency Mean",
2454
  "related": [
2455
  77,
2456
  78,
 
2466
  "relation_label": "belongs to"
2467
  },
2468
  "isShown": true,
2469
+ "desc": "Concept group: RVP > Response Latency Mean",
2470
  "dtype": "determine",
2471
  "recover": true,
2472
  "concept_provenance": {
2473
+ "node_label": "Response Latency Mean",
2474
+ "confidence": 0.676,
2475
+ "alternatives": [
2476
+ "Response Latency",
2477
+ "response latency trials",
2478
+ "latency mean response"
2479
+ ],
2480
  "source_evidence": [
2481
+ "keybert"
2482
  ],
2483
+ "embedding_sim": 0.683,
2484
+ "coverage": 0.683,
2485
+ "contrast": 0.311,
2486
+ "specificity": 0.0,
2487
+ "string_sim": 0.92
2488
  }
2489
  },
2490
  {
 
2513
  "related": [
2514
  140,
2515
  141,
2516
+ 142,
2517
  143,
2518
  144,
 
2519
  148
2520
  ],
2521
  "type": "aggregation",
 
2532
  "dtype": "determine",
2533
  "recover": true
2534
  },
2535
+ {
2536
+ "id": 138,
2537
+ "name": "Times Errors",
2538
+ "related": [
2539
+ 85,
2540
+ 86,
2541
+ 87,
2542
+ 88,
2543
+ 89
2544
+ ],
2545
+ "type": "aggregation",
2546
+ "info": {
2547
+ "operation": "concat",
2548
+ "usedAttributes": [],
2549
+ "formula": "",
2550
+ "exec": "",
2551
+ "relation_type": "belongs_to",
2552
+ "relation_label": "belongs to"
2553
+ },
2554
+ "isShown": true,
2555
+ "desc": "Concept group: SWM > Errors Boxes Times",
2556
+ "dtype": "determine",
2557
+ "recover": true,
2558
+ "concept_provenance": {
2559
+ "node_label": "Errors Boxes Times",
2560
+ "confidence": 0.515,
2561
+ "alternatives": [
2562
+ "Errors Boxes",
2563
+ "key errors boxes",
2564
+ "errors times"
2565
+ ],
2566
+ "source_evidence": [
2567
+ "keybert"
2568
+ ],
2569
+ "embedding_sim": 0.447,
2570
+ "coverage": 0.447,
2571
+ "contrast": 0.0,
2572
+ "specificity": 0.0,
2573
+ "string_sim": 0.896
2574
+ }
2575
+ },
2576
  {
2577
  "id": 140,
2578
+ "name": "Strategy High",
2579
  "related": [
2580
  96,
2581
  97,
 
2591
  "relation_label": "belongs to"
2592
  },
2593
  "isShown": true,
2594
+ "desc": "Concept group: SWM > Strategy High",
2595
  "dtype": "determine",
2596
  "recover": true,
2597
  "concept_provenance": {
2598
+ "node_label": "Strategy High",
2599
+ "confidence": 0.569,
2600
+ "alternatives": [
2601
+ "Strategy",
2602
+ "strategy finding",
2603
+ "high strategy"
2604
+ ],
2605
  "source_evidence": [
2606
+ "keybert"
2607
  ],
2608
+ "embedding_sim": 0.509,
2609
+ "coverage": 0.509,
2610
+ "contrast": 0.362,
2611
+ "specificity": 0.0,
2612
+ "string_sim": 0.814
2613
  }
2614
  },
2615
  {
 
2633
  "recover": true
2634
  },
2635
  {
2636
+ "id": 142,
2637
+ "name": "Within Errors",
2638
  "related": [
2639
+ 104,
2640
+ 105,
2641
+ 107,
2642
+ 108,
2643
+ 106
2644
  ],
2645
  "type": "aggregation",
2646
  "info": {
 
2652
  "relation_label": "belongs to"
2653
  },
2654
  "isShown": true,
2655
+ "desc": "Concept group: SWM > Within Errors",
2656
  "dtype": "determine",
2657
  "recover": true,
2658
  "concept_provenance": {
2659
+ "node_label": "Within Errors",
2660
+ "confidence": 0.412,
2661
+ "alternatives": [
2662
+ "boxes times subject"
2663
+ ],
2664
  "source_evidence": [
2665
+ "keybert"
2666
  ],
2667
+ "embedding_sim": 0.303,
2668
+ "coverage": 0.303,
 
2669
  "contrast": 0.0,
2670
+ "specificity": 0.0,
2671
+ "string_sim": 0.787
2672
  }
2673
  },
2674
  {
2675
+ "id": 143,
2676
+ "name": "Errors Total",
2677
  "related": [
2678
+ 99,
2679
+ 100,
2680
+ 101,
2681
+ 102,
2682
+ 103
2683
  ],
2684
  "type": "aggregation",
2685
  "info": {
 
2691
  "relation_label": "belongs to"
2692
  },
2693
  "isShown": true,
2694
+ "desc": "Concept group: SWM > Errors Total",
2695
  "dtype": "determine",
2696
+ "recover": true,
2697
+ "concept_provenance": {
2698
+ "node_label": "Errors Total",
2699
+ "confidence": 0.593,
2700
+ "alternatives": [
2701
+ "errors total times",
2702
+ "Total Errors"
2703
+ ],
2704
+ "source_evidence": [
2705
+ "keybert"
2706
+ ],
2707
+ "embedding_sim": 0.537,
2708
+ "coverage": 0.537,
2709
+ "contrast": 0.07,
2710
+ "specificity": 0.0,
2711
+ "string_sim": 0.974
2712
+ }
2713
  },
2714
  {
2715
+ "id": 144,
2716
  "name": "Double Errors",
2717
  "related": [
2718
  92,
 
2736
  "recover": true
2737
  },
2738
  {
2739
+ "id": 145,
2740
  "name": "Correct Latency",
2741
  "related": [
2742
+ 112,
2743
+ 114
 
 
 
2744
  ],
2745
  "type": "aggregation",
2746
  "info": {
 
2756
  "dtype": "determine",
2757
  "recover": true
2758
  },
2759
+ {
2760
+ "id": 146,
2761
+ "name": "Patterns Total",
2762
+ "related": [
2763
+ 122,
2764
+ 124
2765
+ ],
2766
+ "type": "aggregation",
2767
+ "info": {
2768
+ "operation": "concat",
2769
+ "usedAttributes": [],
2770
+ "formula": "",
2771
+ "exec": "",
2772
+ "relation_type": "belongs_to",
2773
+ "relation_label": "belongs to"
2774
+ },
2775
+ "isShown": true,
2776
+ "desc": "Measure: Patterns Total",
2777
+ "dtype": "determine",
2778
+ "recover": true
2779
+ },
2780
  {
2781
  "id": 147,
2782
  "name": "Total Errors",
 
2802
  "id": 148,
2803
  "name": "Errors Boxes",
2804
  "related": [
2805
+ 138
 
 
 
 
2806
  ],
2807
  "type": "aggregation",
2808
  "info": {
 
2883
  },
2884
  {
2885
  "id": 152,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2886
  "name": "Median Seconds Delay",
2887
  "related": [
2888
  9,
 
2904
  "recover": true
2905
  },
2906
  {
2907
+ "id": 153,
2908
+ "name": "Seconds Delay",
2909
  "related": [
2910
  16,
2911
  17,
 
2921
  "relation_label": "belongs to"
2922
  },
2923
  "isShown": true,
2924
+ "desc": "Sub-group: Seconds Delay",
2925
  "dtype": "determine",
2926
  "recover": true
2927
  },
2928
  {
2929
+ "id": 154,
2930
  "name": "Median",
2931
  "related": [
2932
  8,
 
2946
  "desc": "Sub-group: Median",
2947
  "dtype": "determine",
2948
  "recover": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2949
  }
2950
  ]
pages/2_Approach_1.py CHANGED
@@ -121,7 +121,7 @@ _STOP = {
121
  USE_NOUN_PHRASES = False
122
  # USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
123
  # boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
124
- USE_CTFIDF = False
125
  # KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
126
  # (pick the single most relevant phrase); 0.5 = standard MMR diversification.
127
  KEYBERT_DIVERSITY = 0
@@ -148,6 +148,20 @@ _CORPUS_IDF: dict = {}
148
  # scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing).
149
  _ACTIVE_DOMAIN: str = 'general'
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  # ─────────────────────────────────────────────────────────────────────────────
152
  # FILE LOADING
153
  # ─────────────────────────────────────────────────────────────────────────────
@@ -345,6 +359,9 @@ def build_canonical(df, cfg, source):
345
  if not sem_parts:
346
  sem_parts = list(leaf_parts) if leaf_parts else []
347
  semantic_text = ' '.join(sem_parts) if sem_parts else text
 
 
 
348
  rows.append({
349
  '_source_file': source,
350
  '_row_index': int(i),
@@ -1983,26 +2000,34 @@ def _keybert_candidates(member_texts, ancestor_words=None, used_labels=None,
1983
  """
1984
  ancestor_words = ancestor_words or set()
1985
  used = {str(u).lower() for u in (used_labels or [])}
 
1986
  cand = set()
1987
  for t in member_texts:
1988
- raw = re.sub(r'\([^)]*\)', ' ', str(t))
1989
  nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
1990
  if nps:
1991
  for p in nps:
1992
- toks = [w for w in p.lower().split()
1993
- if w not in _STOP and w not in ancestor_words]
1994
  if toks:
1995
  cand.add(' '.join(toks))
1996
  else:
1997
- toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
1998
- if w not in _STOP and w not in ancestor_words]
1999
  for nlen in range(1, max_words + 1):
2000
  for i in range(len(toks) - nlen + 1):
2001
  cand.add(' '.join(toks[i:i + nlen]))
2002
- cand = [c for c in cand if len(c) >= 4 and c.lower() not in used
2003
- and not c.replace(' ', '').isdigit()
2004
- and not re.search(r'\b(\w+)\s+\1\b', c.lower())]
2005
- return cand[:cap]
 
 
 
 
 
 
 
 
 
2006
 
2007
 
2008
  def _concept_title(text):
@@ -2047,9 +2072,10 @@ def _title_cluster_label(member_titles, sibling_title_lists, ancestor_words=None
2047
  used_labels = {str(u).lower() for u in (used_labels or [])}
2048
 
2049
  def _phrases(title):
2050
- t = re.sub(r'\([^)]*\)', ' ', title.lower()) # drop parenthetical conditions
2051
  toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
2052
- if w not in _STOP and w not in ancestor_words]
 
2053
  out = set()
2054
  for nlen in range(1, max_words + 1):
2055
  for i in range(len(toks) - nlen + 1):
@@ -2098,9 +2124,10 @@ def _raw_title(text):
2098
  def _label_from_own_title(title, ancestor_words, max_words=4):
2099
  """[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
2100
  words and parentheticals). Returns '' for sentence-like / empty titles."""
2101
- t = re.sub(r'\([^)]*\)', ' ', str(title).lower())
2102
  toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
2103
- if w not in _STOP and w not in ancestor_words]
 
2104
  if not toks or len(toks) > 7: # >7 words ⇒ prose, not a concept title
2105
  return ''
2106
  return ' '.join(toks[:max_words]).title()
@@ -2413,6 +2440,12 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
2413
  if pool and cluster_emb is not None:
2414
  cand_embs = np.asarray(embedder.encode(pool), dtype=float)
2415
  relevance = cosine_similarity([cluster_emb], cand_embs)[0]
 
 
 
 
 
 
2416
  if sibling_centroids:
2417
  sib_sim = cosine_similarity(cand_embs,
2418
  np.asarray(sibling_centroids, dtype=float)).max(axis=1)
@@ -4053,6 +4086,16 @@ if uploads:
4053
  else:
4054
  c_embs = None
4055
  nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
 
 
 
 
 
 
 
 
 
 
4056
  st.session_state.hiexpan_report = report
4057
  wmoves = report.get('width_expansion_moves', 0)
4058
  dexp = report.get('depth_expansion_nodes', 0)
 
121
  USE_NOUN_PHRASES = False
122
  # USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
123
  # boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
124
+ USE_CTFIDF = True
125
  # KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
126
  # (pick the single most relevant phrase); 0.5 = standard MMR diversification.
127
  KEYBERT_DIVERSITY = 0
 
148
  # scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing).
149
  _ACTIVE_DOMAIN: str = 'general'
150
 
151
+ # Label boilerplate: web/URL artifacts and Likert response-scale tokens that leak from
152
+ # data-dictionary descriptions (e.g. HCP FreeSurfer rows embed Neurolex URLs; survey rows
153
+ # embed "strongly agree" scales). These are stripped from KeyBERT candidates AND from the
154
+ # embedding text so they can neither name a node nor distort clustering. Domain-agnostic
155
+ # documentation/scale tokens only — not concept vocabulary.
156
+ _LABEL_BOILERPLATE = {
157
+ 'http', 'https', 'href', 'www', 'org', 'com', 'net', 'wiki', 'url', 'link',
158
+ 'neurolex', 'connectomedb', 'humanconnectome', 'definition', 'category',
159
+ 'sa', 'sd', 'strongly', 'agree', 'disagree', 'neither', 'somewhat',
160
+ }
161
+ # Inline URLs in free text (http://…, www.…/…) — removed from the embedding text.
162
+ _URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
163
+ re.IGNORECASE)
164
+
165
  # ─────────────────────────────────────────────────────────────────────────────
166
  # FILE LOADING
167
  # ─────────────────────────────────────────────────────────────────────────────
 
359
  if not sem_parts:
360
  sem_parts = list(leaf_parts) if leaf_parts else []
361
  semantic_text = ' '.join(sem_parts) if sem_parts else text
362
+ # Strip inline URLs (HCP FreeSurfer rows embed Neurolex links) so web tokens
363
+ # cannot dominate either the embedding (clustering) or the KeyBERT label.
364
+ semantic_text = _URL_RE.sub(' ', semantic_text)
365
  rows.append({
366
  '_source_file': source,
367
  '_row_index': int(i),
 
2000
  """
2001
  ancestor_words = ancestor_words or set()
2002
  used = {str(u).lower() for u in (used_labels or [])}
2003
+ block = _STOP | ancestor_words | _LABEL_BOILERPLATE # boilerplate/web/Likert tokens out
2004
  cand = set()
2005
  for t in member_texts:
2006
+ raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
2007
  nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
2008
  if nps:
2009
  for p in nps:
2010
+ toks = [w for w in p.lower().split() if w not in block]
 
2011
  if toks:
2012
  cand.add(' '.join(toks))
2013
  else:
2014
+ toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower()) if w not in block]
 
2015
  for nlen in range(1, max_words + 1):
2016
  for i in range(len(toks) - nlen + 1):
2017
  cand.add(' '.join(toks[i:i + nlen]))
2018
+
2019
+ def _ok(c):
2020
+ words = c.split()
2021
+ if len(c) < 4 or c.lower() in used or c.replace(' ', '').isdigit():
2022
+ return False
2023
+ if re.search(r'\b(\w+)\s+\1\b', c.lower()): # adjacent word repeat
2024
+ return False
2025
+ if len(words) == 4 and words[:2] == words[2:]: # phrase repeat "x y x y"
2026
+ return False
2027
+ if len(words) == 1 and (len(c) <= 3 or _is_acronym(c)): # bare fragment/acronym
2028
+ return False
2029
+ return True
2030
+ return [c for c in cand if _ok(c)][:cap]
2031
 
2032
 
2033
  def _concept_title(text):
 
2072
  used_labels = {str(u).lower() for u in (used_labels or [])}
2073
 
2074
  def _phrases(title):
2075
+ t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', title.lower())) # drop parens + URLs
2076
  toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
2077
+ if w not in _STOP and w not in ancestor_words
2078
+ and w not in _LABEL_BOILERPLATE] # web/Likert out
2079
  out = set()
2080
  for nlen in range(1, max_words + 1):
2081
  for i in range(len(toks) - nlen + 1):
 
2124
  def _label_from_own_title(title, ancestor_words, max_words=4):
2125
  """[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
2126
  words and parentheticals). Returns '' for sentence-like / empty titles."""
2127
+ t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(title).lower()))
2128
  toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
2129
+ if w not in _STOP and w not in ancestor_words
2130
+ and w not in _LABEL_BOILERPLATE]
2131
  if not toks or len(toks) > 7: # >7 words ⇒ prose, not a concept title
2132
  return ''
2133
  return ' '.join(toks[:max_words]).title()
 
2440
  if pool and cluster_emb is not None:
2441
  cand_embs = np.asarray(embedder.encode(pool), dtype=float)
2442
  relevance = cosine_similarity([cluster_emb], cand_embs)[0]
2443
+ # c-TF-IDF: down-weight dataset-wide boilerplate (low corpus IDF) so generic
2444
+ # phrases ("test", "description", "measure", "scores") lose to distinctive ones.
2445
+ if USE_CTFIDF and _CORPUS_IDF:
2446
+ _mx = max(_CORPUS_IDF.values()) or 1.0
2447
+ _idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), _mx) / _mx) for c in pool])
2448
+ relevance = relevance * (0.5 + 0.5 * _idf)
2449
  if sibling_centroids:
2450
  sib_sim = cosine_similarity(cand_embs,
2451
  np.asarray(sibling_centroids, dtype=float)).max(axis=1)
 
4086
  else:
4087
  c_embs = None
4088
  nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
4089
+ # HiExpan's width/global passes MOVE leaves between concepts; a concept
4090
+ # that loses all its leaves becomes empty. build_concept_hierarchy prunes
4091
+ # internally, but that runs BEFORE HiExpan — so re-prune here, else empty
4092
+ # nodes break the Plotly branchvalues='total' sunburst/treemap (parent
4093
+ # value < sum(children) → blank render; node-link is unaffected).
4094
+ _prune_empty_aggregations(nodes)
4095
+ _alive = {int(n['id']) for n in nodes}
4096
+ for _n in nodes:
4097
+ _n['related'] = [x for x in dict.fromkeys(int(c) for c in _n.get('related', []))
4098
+ if x in _alive]
4099
  st.session_state.hiexpan_report = report
4100
  wmoves = report.get('width_expansion_moves', 0)
4101
  dexp = report.get('depth_expansion_nodes', 0)