RoophaSharon committed on
Commit ·
2b56f2e
1
Parent(s): 51c62ea
Sync demo (downloads, build summary, HCP depth fix) + latest approach_1; clean canonical outputs
Browse files- approach_1.py +394 -66
- demo.py +111 -7
- outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json → HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json} +0 -0
- outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json → HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json} +0 -0
- outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json +182 -142
- outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json +353 -255
- pages/2_Approach_1.py +57 -14
approach_1.py
CHANGED
|
@@ -111,6 +111,57 @@ _STOP = {
|
|
| 111 |
'using','use','based','given','defined','number','value','values','score',
|
| 112 |
}
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 115 |
# FILE LOADING
|
| 116 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -308,6 +359,9 @@ def build_canonical(df, cfg, source):
|
|
| 308 |
if not sem_parts:
|
| 309 |
sem_parts = list(leaf_parts) if leaf_parts else []
|
| 310 |
semantic_text = ' '.join(sem_parts) if sem_parts else text
|
|
|
|
|
|
|
|
|
|
| 311 |
rows.append({
|
| 312 |
'_source_file': source,
|
| 313 |
'_row_index': int(i),
|
|
@@ -342,34 +396,22 @@ def build_canonical(df, cfg, source):
|
|
| 342 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 343 |
def precompute_stat_cond_facets(can):
|
| 344 |
"""
|
| 345 |
-
Pre-compute
|
| 346 |
-
Called before build_concept_hierarchy so that _cluster_and_label can use
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
[CAS] Castanet parallel facets Β· [HIE] HiExpan sub-set discovery
|
| 350 |
"""
|
| 351 |
can = can.copy()
|
| 352 |
sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'
|
| 353 |
|
| 354 |
-
# ββ Statistic type: detected from description text βββββββββββββββββββββββββ
|
| 355 |
-
_stat_re = re.compile(
|
| 356 |
-
r'\b(mean|average|median|standard deviation|std|percent|proportion|'
|
| 357 |
-
r'probability|total|sum|count|maximum|minimum|range|variance|'
|
| 358 |
-
r'coefficient|ratio|rate|frequency)\b', re.IGNORECASE
|
| 359 |
-
)
|
| 360 |
-
_stat_norm = {
|
| 361 |
-
'average': 'Mean', 'std': 'Standard Deviation', 'proportion': 'Percent',
|
| 362 |
-
'sum': 'Total', 'count': 'Total', 'frequency': 'Rate',
|
| 363 |
-
}
|
| 364 |
-
def _extract_stat(row):
|
| 365 |
-
hits = _stat_re.findall(str(row.get(sem_col, row.get('_text', ''))).lower())
|
| 366 |
-
if not hits:
|
| 367 |
-
return ''
|
| 368 |
-
h = hits[0].lower()
|
| 369 |
-
return _stat_norm.get(h, h.title())
|
| 370 |
-
stat_col = can.apply(_extract_stat, axis=1)
|
| 371 |
-
can['_facet_stat'] = stat_col.where(stat_col != '', '')
|
| 372 |
-
|
| 373 |
# ββ Condition: digit in variable code VALIDATED by description text ββββββββββ
|
| 374 |
# [FIX2][GON] Gonçalves et al. (ESWC 2019): structural code alignment must be
|
| 375 |
# validated against description text — the description is the authoritative source.
|
|
@@ -1756,8 +1798,11 @@ _MIN_FACET_GROUP = 2 # minimum variables per facet sub-group
|
|
| 1756 |
def _do_facet_subsplit(sub_can, parent_id, current_path,
|
| 1757 |
nodes, leaf_to_id, ensure_path_fn):
|
| 1758 |
"""
|
| 1759 |
-
[F4][CAS]
|
| 1760 |
-
|
|
|
|
|
|
|
|
|
|
| 1761 |
"""
|
| 1762 |
# A facet tier that merely repeats the parent concept label (e.g. a "Total"
|
| 1763 |
# statistic under a "Total" concept) is redundant β skip it.
|
|
@@ -1858,6 +1903,133 @@ def _do_cond_subsplit(sub_can, parent_id, current_path,
|
|
| 1858 |
# 4. [F4] For each concept cluster: facet sub-split by Statistic β Condition
|
| 1859 |
# 5. Store concept assignment back on each variable in can
|
| 1860 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1861 |
def _concept_title(text):
|
| 1862 |
"""
|
| 1863 |
Extract the human-written concept TITLE from a metadata description.
|
|
@@ -1900,9 +2072,10 @@ def _title_cluster_label(member_titles, sibling_title_lists, ancestor_words=None
|
|
| 1900 |
used_labels = {str(u).lower() for u in (used_labels or [])}
|
| 1901 |
|
| 1902 |
def _phrases(title):
|
| 1903 |
-
t = re.sub(r'\([^)]*\)', ' ', title.lower())
|
| 1904 |
toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
|
| 1905 |
-
if w not in _STOP and w not in ancestor_words
|
|
|
|
| 1906 |
out = set()
|
| 1907 |
for nlen in range(1, max_words + 1):
|
| 1908 |
for i in range(len(toks) - nlen + 1):
|
|
@@ -1951,9 +2124,10 @@ def _raw_title(text):
|
|
| 1951 |
def _label_from_own_title(title, ancestor_words, max_words=4):
|
| 1952 |
"""[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
|
| 1953 |
words and parentheticals). Returns '' for sentence-like / empty titles."""
|
| 1954 |
-
t = re.sub(r'\([^)]*\)', ' ', str(title).lower())
|
| 1955 |
toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
|
| 1956 |
-
if w not in _STOP and w not in ancestor_words
|
|
|
|
| 1957 |
if not toks or len(toks) > 7: # >7 words β prose, not a concept title
|
| 1958 |
return ''
|
| 1959 |
return ' '.join(toks[:max_words]).title()
|
|
@@ -2092,11 +2266,18 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
|
|
| 2092 |
_aw_base = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks
|
| 2093 |
|
| 2094 |
if n < 3 or concept_embs is None or len(concept_table) == 0:
|
| 2095 |
-
# Too few variables to cluster β label each from its own title [Fix5]
|
| 2096 |
-
#
|
|
|
|
| 2097 |
pid = ensure_path_fn(path_prefix)
|
|
|
|
| 2098 |
for i, (_, row) in enumerate(tdf.iterrows()):
|
| 2099 |
lbl = _label_from_own_title(titles[i], _aw_base)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2100 |
tgt = ensure_path_fn(path_prefix + [lbl]) if lbl and lbl.lower() not in \
|
| 2101 |
{a.lower() for a in ancestor_names} else pid
|
| 2102 |
add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
|
|
@@ -2174,6 +2355,14 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
|
|
| 2174 |
if len(cluster_idxs) == 1:
|
| 2175 |
_, row = rows_list[cluster_idxs[0]]
|
| 2176 |
lbl = _label_from_own_title(titles[cluster_idxs[0]], _aw_base)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2177 |
if lbl and lbl.lower() not in {a.lower() for a in ancestor_names}:
|
| 2178 |
tgt = ensure_path_fn(path_prefix + [lbl], relation='belongs_to')
|
| 2179 |
can.at[row.name, '_concept_label'] = lbl
|
|
@@ -2182,7 +2371,7 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
|
|
| 2182 |
can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root'
|
| 2183 |
add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
|
| 2184 |
can.at[row.name, '_concept_score'] = 0.0
|
| 2185 |
-
can.at[row.name, '_concept_source'] =
|
| 2186 |
continue
|
| 2187 |
|
| 2188 |
if cluster_emb is not None:
|
|
@@ -2201,32 +2390,100 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
|
|
| 2201 |
else:
|
| 2202 |
scores = []
|
| 2203 |
|
| 2204 |
-
#
|
| 2205 |
-
#
|
| 2206 |
-
#
|
| 2207 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2208 |
ancestor_words = set(re.findall(r'[a-z]{3,}',
|
| 2209 |
' '.join(ancestor_names).lower())) | _top_level_tasks
|
| 2210 |
member_titles_k = [titles[i] for i in cluster_idxs]
|
| 2211 |
sibling_title_lists = [all_cluster_titles[j] for j in range(n_clust) if j != k]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2212 |
title_label = _title_cluster_label(member_titles_k, sibling_title_lists,
|
| 2213 |
-
|
| 2214 |
-
|
| 2215 |
-
|
| 2216 |
-
|
| 2217 |
-
|
| 2218 |
-
#
|
| 2219 |
-
#
|
| 2220 |
-
#
|
| 2221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2222 |
fallback_label = (title_label
|
|
|
|
| 2223 |
or get_discriminative_tfidf_label(cluster_texts_k, sibling_texts)
|
| 2224 |
or f'Group {k+1}')
|
| 2225 |
-
candidate_scores = [] if title_label else scores
|
| 2226 |
|
| 2227 |
label, provenance = assign_concept_label(
|
| 2228 |
candidate_scores,
|
| 2229 |
fallback=fallback_label,
|
|
|
|
| 2230 |
ancestor_names=ancestor_names,
|
| 2231 |
used_sibling_labels=used_sibling_labels,
|
| 2232 |
top_level_tasks=_top_level_tasks,
|
|
@@ -2282,24 +2539,24 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
|
|
| 2282 |
pid = ensure_path_fn(path_prefix + [label],
|
| 2283 |
relation='belongs_to', provenance=provenance)
|
| 2284 |
|
| 2285 |
-
# Store concept assignment on can (needed by Castanet facets later)
|
|
|
|
|
|
|
| 2286 |
for ci in cluster_idxs:
|
| 2287 |
_, row = rows_list[ci]
|
| 2288 |
can.at[row.name, '_concept_label'] = label
|
| 2289 |
-
can.at[row.name, '_concept_score'] =
|
| 2290 |
-
can.at[row.name, '_concept_source'] =
|
| 2291 |
-
|
| 2292 |
-
#
|
| 2293 |
-
#
|
| 2294 |
-
# (
|
| 2295 |
-
#
|
| 2296 |
-
#
|
| 2297 |
-
|
| 2298 |
-
|
| 2299 |
-
|
| 2300 |
-
|
| 2301 |
-
nodes, leaf_to_id, ensure_path_fn
|
| 2302 |
-
)
|
| 2303 |
|
| 2304 |
|
| 2305 |
def _remove_phrase(tokens, phrase_tokens):
|
|
@@ -2511,6 +2768,43 @@ def _prune_empty_aggregations(nodes):
|
|
| 2511 |
return nodes
|
| 2512 |
|
| 2513 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2514 |
def build_concept_hierarchy(can, embedder, concept_table, project='metadata_project',
|
| 2515 |
n_clusters_per_group=8):
|
| 2516 |
"""
|
|
@@ -2550,6 +2844,27 @@ def build_concept_hierarchy(can, embedder, concept_table, project='metadata_proj
|
|
| 2550 |
# is discriminative; one close to ALL of them is boilerplate. corpus_centroid
|
| 2551 |
# is the global mean (generic = central). Both are derived purely from data.
|
| 2552 |
sem_col_all = '_semantic_text' if '_semantic_text' in can.columns else '_text'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2553 |
ref_centroids = corpus_centroid = None
|
| 2554 |
try:
|
| 2555 |
all_var_embs = embedder.encode(can[sem_col_all].fillna('').astype(str).tolist())
|
|
@@ -2648,6 +2963,9 @@ def build_concept_hierarchy(can, embedder, concept_table, project='metadata_proj
|
|
| 2648 |
# Remove empty concept nodes (no variables) β meaningless and they break the
|
| 2649 |
# branchvalues='total' sunburst (parent value < sum of children β blank render).
|
| 2650 |
_prune_empty_aggregations(nodes)
|
|
|
|
|
|
|
|
|
|
| 2651 |
# NOTE: a head-noun "category" tier (Errors/Correct) was tried and reverted —
|
| 2652 |
# it regressed setOverlap (0.914→0.836: mis-grouping) and added depth beyond gold.
|
| 2653 |
# _nest_by_category() is kept defined but intentionally NOT called.
|
|
@@ -3745,13 +4063,13 @@ if uploads:
|
|
| 3745 |
# [F3][F5][CAS] These columns are needed inside _cluster_and_label
|
| 3746 |
# for facet sub-splitting. They must be computed BEFORE Step G.
|
| 3747 |
# detect_facets / build_castanet_facets runs AFTER hierarchy build
|
| 3748 |
-
# (Step I), so we pre-compute only
|
| 3749 |
-
|
|
|
|
| 3750 |
can = precompute_stat_cond_facets(can)
|
| 3751 |
-
n_stat = can['_facet_stat'].ne('').sum()
|
| 3752 |
n_cond = can['_facet_cond'].ne('').sum()
|
| 3753 |
-
st.info(f'Facet pre-computation: {
|
| 3754 |
-
f'
|
| 3755 |
|
| 3756 |
# ── Step G: Build concept hierarchy (N×M alignment) ──────────────
|
| 3757 |
with st.spinner('Building concept hierarchy via NΓM alignment [GON][TAX]...'):
|
|
@@ -3768,6 +4086,16 @@ if uploads:
|
|
| 3768 |
else:
|
| 3769 |
c_embs = None
|
| 3770 |
nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3771 |
st.session_state.hiexpan_report = report
|
| 3772 |
wmoves = report.get('width_expansion_moves', 0)
|
| 3773 |
dexp = report.get('depth_expansion_nodes', 0)
|
|
|
|
| 111 |
'using','use','based','given','defined','number','value','values','score',
|
| 112 |
}
|
| 113 |
|
| 114 |
+
# βββ KeyBERT / labelling configuration βββββββββββββββββββββββββββββββββββββββ
|
| 115 |
+
# These tune the KeyBERT label synthesizer used in the hybrid scorer.
|
| 116 |
+
#
|
| 117 |
+
# USE_NOUN_PHRASES — True: candidate phrases are NLTK POS-tagged noun phrases
|
| 118 |
+
# (needs the 'averaged_perceptron_tagger' corpus); False: plain n-gram candidates
|
| 119 |
+
# from tokens. False is robust for short CANTAB/AI-MIND descriptions and avoids the
|
| 120 |
+
# extra NLTK dependency.
|
| 121 |
+
USE_NOUN_PHRASES = False
|
| 122 |
+
# USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
|
| 123 |
+
# boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
|
| 124 |
+
USE_CTFIDF = True
|
| 125 |
+
# KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
|
| 126 |
+
# (pick the single most relevant phrase); 0.5 = standard MMR diversification.
|
| 127 |
+
KEYBERT_DIVERSITY = 0
|
| 128 |
+
|
| 129 |
+
# βββ Title-SEEDED KeyBERT label-scorer weights βββββββββββββββββββββββββββββββ
|
| 130 |
+
# Concept labels are FORMED FROM THE DESCRIPTIONS (KeyBERT candidate phrases over the
|
| 131 |
+
# cluster's member descriptions). The pre-colon title is a ranking SEED/anchor, not the
|
| 132 |
+
# label itself: LABEL_W_TITLE controls how strongly it biases the choice toward the
|
| 133 |
+
# human-canonical phrasing (this is "Guided/Seeded KeyBERT"). Set LABEL_W_TITLE=0 for a
|
| 134 |
+
# pure-description ablation. Magnitudes are relative (need not sum to 1).
|
| 135 |
+
LABEL_W_RELEVANCE = 0.45 # cosine(candidate, cluster centroid) — description fit (α)
|
| 136 |
+
LABEL_W_TITLE = 0.35 # cosine(candidate, pre-colon title) — title influence (β)
|
| 137 |
+
LABEL_W_CONTRAST = 0.15 # discriminativeness vs sibling clusters (γ)
|
| 138 |
+
# NOTE: node labels are formed from DESCRIPTIONS + pre-colon TITLE only. External
|
| 139 |
+
# ontology sources (Cognitive Atlas / Wikidata / WordNet / PubMed) inform the embedding
|
| 140 |
+
# space / semantic understanding but are never used to name a node — so there is no
|
| 141 |
+
# external-grounding term in the label score.
|
| 142 |
+
|
| 143 |
+
# Corpus IDF over description n-grams; populated in build_concept_hierarchy() and
|
| 144 |
+
# consumed by _keybert_label when USE_CTFIDF=True.
|
| 145 |
+
_CORPUS_IDF: dict = {}
|
| 146 |
+
|
| 147 |
+
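# Active dataset domain; set in build_concept_hierarchy(). Originally read by the hybrid
|
| 148 |
+
# label scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing). NOTE(review): the NOTE above says the label score has no external-grounding term, so confirm whether this is still read anywhere.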
|
| 149 |
+
_ACTIVE_DOMAIN: str = 'general'
|
| 150 |
+
|
| 151 |
+
# Label boilerplate: web/URL artifacts and Likert response-scale tokens that leak from
|
| 152 |
+
# data-dictionary descriptions (e.g. HCP FreeSurfer rows embed Neurolex URLs; survey rows
|
| 153 |
+
# embed "strongly agree" scales). These are stripped from KeyBERT candidates AND from the
|
| 154 |
+
# embedding text so they can neither name a node nor distort clustering. Domain-agnostic
|
| 155 |
+
# documentation/scale tokens only — not concept vocabulary.
|
| 156 |
+
_LABEL_BOILERPLATE = {
|
| 157 |
+
'http', 'https', 'href', 'www', 'org', 'com', 'net', 'wiki', 'url', 'link',
|
| 158 |
+
'neurolex', 'connectomedb', 'humanconnectome', 'definition', 'category',
|
| 159 |
+
'sa', 'sd', 'strongly', 'agree', 'disagree', 'neither', 'somewhat',
|
| 160 |
+
}
|
| 161 |
+
# Inline URLs in free text (http://…, www.…/…) — removed from the embedding text.
|
| 162 |
+
_URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
|
| 163 |
+
re.IGNORECASE)
|
| 164 |
+
|
| 165 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
# FILE LOADING
|
| 167 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 359 |
if not sem_parts:
|
| 360 |
sem_parts = list(leaf_parts) if leaf_parts else []
|
| 361 |
semantic_text = ' '.join(sem_parts) if sem_parts else text
|
| 362 |
+
# Strip inline URLs (HCP FreeSurfer rows embed Neurolex links) so web tokens
|
| 363 |
+
# cannot dominate either the embedding (clustering) or the KeyBERT label.
|
| 364 |
+
semantic_text = _URL_RE.sub(' ', semantic_text)
|
| 365 |
rows.append({
|
| 366 |
'_source_file': source,
|
| 367 |
'_row_index': int(i),
|
|
|
|
| 396 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 397 |
def precompute_stat_cond_facets(can):
|
| 398 |
"""
|
| 399 |
+
Pre-compute _facet_cond on can (numeric experimental conditions only).
|
| 400 |
+
Called before build_concept_hierarchy so that _cluster_and_label can use it to
|
| 401 |
+
insert Condition sub-tiers.
|
| 402 |
+
|
| 403 |
+
NOTE: the statistic tier (Mean / Median / SD / …) is NO LONGER computed here.
|
| 404 |
+
It used to come from a hardcoded statistic vocabulary regex, which (a) is domain
|
| 405 |
+
hardcoding and (b) is not derived from the data's own concept titles. Statistic
|
| 406 |
+
depth is now produced data-drivenly by _nest_by_measure(), which discovers the
|
| 407 |
+
shared measure phrase and keeps the residual (Mean/Median/SD) as children — no
|
| 408 |
+
word list. Condition detection below stays: it is structural (a digit in the
|
| 409 |
+
code validated against the description text), not a hardcoded vocabulary.
|
| 410 |
[CAS] Castanet parallel facets · [HIE] HiExpan sub-set discovery
|
| 411 |
"""
|
| 412 |
can = can.copy()
|
| 413 |
sem_col = '_semantic_text' if '_semantic_text' in can.columns else '_text'
|
| 414 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
# ββ Condition: digit in variable code VALIDATED by description text ββββββββββ
|
| 416 |
# [FIX2][GON] Gonçalves et al. (ESWC 2019): structural code alignment must be
|
| 417 |
# validated against description text — the description is the authoritative source.
|
|
|
|
| 1798 |
def _do_facet_subsplit(sub_can, parent_id, current_path,
|
| 1799 |
nodes, leaf_to_id, ensure_path_fn):
|
| 1800 |
"""
|
| 1801 |
+
[F4][CAS] Facet sub-split by _facet_cond (numeric condition) only.
|
| 1802 |
+
The statistic tier is no longer inserted here β it came from a hardcoded
|
| 1803 |
+
statistic vocabulary and is now produced data-drivenly by _nest_by_measure().
|
| 1804 |
+
Kept defensive: if a legacy _facet_stat column is present it is still honoured,
|
| 1805 |
+
but precompute_stat_cond_facets() no longer produces one.
|
| 1806 |
"""
|
| 1807 |
# A facet tier that merely repeats the parent concept label (e.g. a "Total"
|
| 1808 |
# statistic under a "Total" concept) is redundant β skip it.
|
|
|
|
| 1903 |
# 4. [F4] For each concept cluster: facet sub-split by Statistic → Condition
|
| 1904 |
# 5. Store concept assignment back on each variable in can
|
| 1905 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1906 |
+
def _noun_phrases(text, max_words=4):
    """
    Grammatical noun phrases via NLTK POS tagging (used when USE_NOUN_PHRASES=True).

    Phrases are contiguous runs of tokens whose POS tag starts with 'NN' or 'JJ'.
    A run longer than max_words keeps only its last max_words tokens, and phrases
    shorter than 3 characters are dropped.

    Args:
        text: description text; coerced with str().
        max_words: maximum number of tokens kept per phrase.

    Returns:
        List of phrase strings in order of appearance, or [] if NLTK, its
        tokenizer or its tagger is unavailable, so the caller falls back to
        n-grams.
    """
    try:
        import nltk

        def _tag():
            return nltk.pos_tag(nltk.word_tokenize(str(text)))

        try:
            tags = _tag()
        except LookupError:
            # Tokenizer/tagger data is missing. Try to fetch it at most ONCE per
            # process: this function runs once per description, and repeating a
            # failing download (e.g. offline) for every row is pure overhead.
            if getattr(_noun_phrases, '_download_attempted', False):
                return []
            _noun_phrases._download_attempted = True
            # Legacy resource names plus the names newer NLTK releases look up
            # (NOTE(review): 'punkt_tab' / 'averaged_perceptron_tagger_eng' are
            # the NLTK >= 3.9 names — confirm against the pinned NLTK version).
            for _pkg in ('averaged_perceptron_tagger',
                         'averaged_perceptron_tagger_eng',
                         'punkt', 'punkt_tab'):
                nltk.download(_pkg, quiet=True)
            tags = _tag()
    except Exception:
        # Deliberate best effort: any NLTK problem means "no noun phrases".
        return []

    phrases, cur = [], []
    for word, tag in tags:
        if tag.startswith(('NN', 'JJ')):
            cur.append(word)
            if len(cur) > max_words:
                cur = cur[-max_words:]      # keep the trailing window of the run
        else:
            if cur:
                phrases.append(' '.join(cur))
            cur = []
    if cur:                                  # flush a run that ends the text
        phrases.append(' '.join(cur))
    return [p for p in phrases if len(p) >= 3]
|
| 1936 |
+
|
| 1937 |
+
|
| 1938 |
+
def _keybert_label(member_texts, cluster_centroid, embedder, ancestor_words=None,
                   corpus_centroid=None, used_labels=None, max_words=4,
                   gen_weight=0.0, diversity=KEYBERT_DIVERSITY, cap=500):
    """
    KeyBERT-style extractive labeller. Extract candidate phrases from the cluster's
    DESCRIPTIONS, embed them, and pick by:
        score = (1 − diversity)·cos(phrase, cluster_centroid)
                − diversity·cos(phrase, mean candidate phrase)   # MMR redundancy
    With diversity=0 this is plain cosine-to-centroid (argmax relevance). When
    USE_CTFIDF=True and _CORPUS_IDF is populated, relevance is scaled by
    (0.5 + 0.5·normalised IDF) so boilerplate (low IDF) is suppressed. When
    gen_weight is non-zero and corpus_centroid is given, gen_weight·cos(phrase,
    corpus_centroid) is subtracted. Candidates come from noun phrases
    (USE_NOUN_PHRASES=True) or n-grams. Extractive — never hallucinates a label.

    Args:
        member_texts: iterable of description texts of the cluster members.
        cluster_centroid: embedding vector the candidates are compared against.
        embedder: object whose encode(list_of_str) returns one vector per string
            (assumed to match cluster_centroid's dimensionality — TODO confirm).
        ancestor_words: words excluded from candidates (defaults to none).
        corpus_centroid: optional global centroid for the generality penalty.
        used_labels: labels (case-insensitive) that must not be returned.
        max_words: longest candidate phrase, in words.
        gen_weight: weight of the generality penalty (0 disables it).
        diversity: MMR redundancy weight (0 disables it).
        cap: maximum number of candidates that are embedded.

    Returns:
        The best candidate, title-cased, or '' when no candidate survives the
        filters.
    """
    ancestor_words = ancestor_words or set()
    used = {str(u).lower() for u in (used_labels or [])}
    # Same exclusions as _keybert_candidates: stop words, ancestor/task words and
    # web/Likert boilerplate, so such tokens can never name a node from here either.
    block = _STOP | set(ancestor_words) | _LABEL_BOILERPLATE

    # A dict is used as an insertion-ordered set. A plain set made the candidate
    # order (and therefore the cap truncation and argmax tie-breaking) depend on
    # string hashing, i.e. vary from run to run.
    cand = {}
    for t in member_texts:
        # Drop parentheticals and inline URLs before extracting phrases.
        raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
        nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
        if nps:
            for p in nps:
                toks = [w for w in p.lower().split() if w not in block]
                if toks:
                    cand.setdefault(' '.join(toks))
        else:
            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
                    if w not in block]
            for nlen in range(1, max_words + 1):
                for i in range(len(toks) - nlen + 1):
                    cand.setdefault(' '.join(toks[i:i + nlen]))

    # Junk filter: drop used labels, pure-number phrases, immediately-repeated words.
    repeated_word = re.compile(r'\b(\w+)\s+\1\b')
    cand = [c for c in cand
            if len(c) >= 4 and c not in used
            and not c.replace(' ', '').isdigit()
            and not repeated_word.search(c)]
    if not cand:
        return ''
    cand = cand[:cap]                 # keeps the earliest-seen candidates

    embs = np.asarray(embedder.encode(cand), dtype=float)
    sims = cosine_similarity([cluster_centroid], embs)[0]          # relevance
    if USE_CTFIDF and _CORPUS_IDF:
        mx = max(_CORPUS_IDF.values()) or 1.0
        # Phrases missing from the IDF table get the maximum (treated as rare).
        idf = np.array([min(1.0, _CORPUS_IDF.get(c, mx) / mx) for c in cand])
        sims = sims * (0.5 + 0.5 * idf)
    if gen_weight and corpus_centroid is not None:
        sims = sims - gen_weight * cosine_similarity([corpus_centroid], embs)[0]
    if diversity > 0 and len(embs) > 1:                            # MMR penalty
        generic = cosine_similarity(embs.mean(axis=0, keepdims=True), embs)[0]
        score = (1.0 - diversity) * sims - diversity * generic
    else:
        score = sims
    return cand[int(np.argmax(score))].title()
|
| 1990 |
+
|
| 1991 |
+
|
| 1992 |
+
def _keybert_candidates(member_texts, ancestor_words=None, used_labels=None,
                        max_words=3, cap=500):
    """
    Extract the KeyBERT CANDIDATE phrases from a cluster's member descriptions —
    the same generation logic as _keybert_label but returns the full candidate list
    (un-ranked) so the caller can score them with the title-seeded scorer.

    Phrases are noun phrases (USE_NOUN_PHRASES=True) or n-grams of up to max_words
    words. Stop words, ancestor/task words and _LABEL_BOILERPLATE tokens are
    removed before phrases are formed; pure numbers, used labels, adjacent word
    repeats, "x y x y" phrase repeats and bare single-word acronyms are dropped.

    Args:
        member_texts: iterable of description texts of the cluster members.
        ancestor_words: words excluded from candidates (defaults to none).
        used_labels: labels (case-insensitive) that must not be proposed.
        max_words: longest candidate phrase, in words.
        cap: maximum number of candidates returned.

    Returns:
        Up to cap lower-case candidate phrases, in order of first appearance.
    """
    ancestor_words = ancestor_words or set()
    used = {str(u).lower() for u in (used_labels or [])}
    block = _STOP | set(ancestor_words) | _LABEL_BOILERPLATE  # boilerplate/web/Likert tokens out

    # Dict as an insertion-ordered set: with a plain set the order, and hence which
    # candidates survive the [:cap] cut, depended on string hashing and changed
    # between runs.
    cand = {}
    for t in member_texts:
        # Drop parentheticals and inline URLs before extracting phrases.
        raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
        nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
        if nps:
            for p in nps:
                toks = [w for w in p.lower().split() if w not in block]
                if toks:
                    cand.setdefault(' '.join(toks))
        else:
            toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower()) if w not in block]
            for nlen in range(1, max_words + 1):
                for i in range(len(toks) - nlen + 1):
                    cand.setdefault(' '.join(toks[i:i + nlen]))

    repeated_word = re.compile(r'\b(\w+)\s+\1\b')

    def _ok(c):
        words = c.split()
        if len(c) < 4 or c.lower() in used or c.replace(' ', '').isdigit():
            return False
        if repeated_word.search(c.lower()):                # adjacent word repeat
            return False
        if len(words) == 4 and words[:2] == words[2:]:     # phrase repeat "x y x y"
            return False
        # Short fragments (< 4 chars) were already rejected above, so only the
        # acronym test remains for single words.
        if len(words) == 1 and _is_acronym(c):             # bare acronym
            return False
        return True

    return [c for c in cand if _ok(c)][:cap]
|
| 2031 |
+
|
| 2032 |
+
|
| 2033 |
def _concept_title(text):
|
| 2034 |
"""
|
| 2035 |
Extract the human-written concept TITLE from a metadata description.
|
|
|
|
| 2072 |
used_labels = {str(u).lower() for u in (used_labels or [])}
|
| 2073 |
|
| 2074 |
def _phrases(title):
|
| 2075 |
+
t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', title.lower())) # drop parens + URLs
|
| 2076 |
toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
|
| 2077 |
+
if w not in _STOP and w not in ancestor_words
|
| 2078 |
+
and w not in _LABEL_BOILERPLATE] # web/Likert out
|
| 2079 |
out = set()
|
| 2080 |
for nlen in range(1, max_words + 1):
|
| 2081 |
for i in range(len(toks) - nlen + 1):
|
|
|
|
| 2124 |
def _label_from_own_title(title, ancestor_words, max_words=4):
|
| 2125 |
"""[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
|
| 2126 |
words and parentheticals). Returns '' for sentence-like / empty titles."""
|
| 2127 |
+
t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(title).lower()))
|
| 2128 |
toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
|
| 2129 |
+
if w not in _STOP and w not in ancestor_words
|
| 2130 |
+
and w not in _LABEL_BOILERPLATE]
|
| 2131 |
if not toks or len(toks) > 7: # >7 words β prose, not a concept title
|
| 2132 |
return ''
|
| 2133 |
return ' '.join(toks[:max_words]).title()
|
|
|
|
| 2266 |
_aw_base = set(re.findall(r'[a-z]{3,}', ' '.join(ancestor_names).lower())) | _top_level_tasks
|
| 2267 |
|
| 2268 |
if n < 3 or concept_embs is None or len(concept_table) == 0:
|
| 2269 |
+
# Too few variables to cluster — label each from its own title [Fix5], or
|
| 2270 |
+
# KeyBERT over its description when no title exists. ensure_path merges it
|
| 2271 |
+
# into an existing concept of the same name.
|
| 2272 |
pid = ensure_path_fn(path_prefix)
|
| 2273 |
+
_small = embedder.encode(texts) if texts else None
|
| 2274 |
for i, (_, row) in enumerate(tdf.iterrows()):
|
| 2275 |
lbl = _label_from_own_title(titles[i], _aw_base)
|
| 2276 |
+
if not lbl and _small is not None:
|
| 2277 |
+
lbl = _keybert_label([texts[i]], _small[i], embedder,
|
| 2278 |
+
ancestor_words=_aw_base, used_labels=set(),
|
| 2279 |
+
max_words=2, gen_weight=0.3,
|
| 2280 |
+
diversity=KEYBERT_DIVERSITY)
|
| 2281 |
tgt = ensure_path_fn(path_prefix + [lbl]) if lbl and lbl.lower() not in \
|
| 2282 |
{a.lower() for a in ancestor_names} else pid
|
| 2283 |
add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
|
|
|
|
| 2355 |
if len(cluster_idxs) == 1:
|
| 2356 |
_, row = rows_list[cluster_idxs[0]]
|
| 2357 |
lbl = _label_from_own_title(titles[cluster_idxs[0]], _aw_base)
|
| 2358 |
+
src = 'singleton_title'
|
| 2359 |
+
if not lbl and cluster_emb is not None:
|
| 2360 |
+
lbl = _keybert_label([cluster_texts_k[0]], cluster_emb, embedder,
|
| 2361 |
+
ancestor_words=_aw_base,
|
| 2362 |
+
used_labels=used_sibling_labels,
|
| 2363 |
+
max_words=2, gen_weight=0.3,
|
| 2364 |
+
diversity=KEYBERT_DIVERSITY)
|
| 2365 |
+
src = 'singleton_keybert'
|
| 2366 |
if lbl and lbl.lower() not in {a.lower() for a in ancestor_names}:
|
| 2367 |
tgt = ensure_path_fn(path_prefix + [lbl], relation='belongs_to')
|
| 2368 |
can.at[row.name, '_concept_label'] = lbl
|
|
|
|
| 2371 |
can.at[row.name, '_concept_label'] = path_prefix[-1] if path_prefix else 'root'
|
| 2372 |
add_child(nodes, tgt, leaf_to_id[row['_leaf_id']])
|
| 2373 |
can.at[row.name, '_concept_score'] = 0.0
|
| 2374 |
+
can.at[row.name, '_concept_source'] = src
|
| 2375 |
continue
|
| 2376 |
|
| 2377 |
if cluster_emb is not None:
|
|
|
|
| 2390 |
else:
|
| 2391 |
scores = []
|
| 2392 |
|
| 2393 |
+
# ββ TITLE-SEEDED LABEL SELECTION (Guided KeyBERT) βββββββββββββββββββββ
|
| 2394 |
+
# The label is FORMED FROM THE DESCRIPTIONS: candidates are KeyBERT phrases
|
| 2395 |
+
# extracted from the cluster's member descriptions (+ scored concept-table
|
| 2396 |
+
# entries). The pre-colon TITLE does NOT override β it is a ranking SEED:
|
| 2397 |
+
# score = α·cos(cand, cluster centroid) # description fit
|
| 2398 |
+
# + β·cos(cand, title embedding) # title INFLUENCE (LABEL_W_TITLE)
|
| 2399 |
+
# + γ·contrast(vs siblings)
|
| 2400 |
+
# + δ·external grounding
|
| 2401 |
+
# So the displayed label is always a description-derived phrase, pulled toward
|
| 2402 |
+
# the human-canonical title phrasing. Set LABEL_W_TITLE=0 for a pure-description
|
| 2403 |
+
# ablation. The title phrase is also added as ONE candidate so a clean title can
|
| 2404 |
+
# still win on merit (it is usually present verbatim in the descriptions anyway).
|
| 2405 |
ancestor_words = set(re.findall(r'[a-z]{3,}',
|
| 2406 |
' '.join(ancestor_names).lower())) | _top_level_tasks
|
| 2407 |
member_titles_k = [titles[i] for i in cluster_idxs]
|
| 2408 |
sibling_title_lists = [all_cluster_titles[j] for j in range(n_clust) if j != k]
|
| 2409 |
+
sibling_texts = [all_cluster_texts[j] for j in range(n_clust) if j != k]
|
| 2410 |
+
|
| 2411 |
+
# Pre-colon title β used only as the SEED ANCHOR (and one candidate), never a
|
| 2412 |
+
# direct override.
|
| 2413 |
title_label = _title_cluster_label(member_titles_k, sibling_title_lists,
|
| 2414 |
+
ancestor_words=ancestor_words,
|
| 2415 |
+
used_labels=used_sibling_labels)
|
| 2416 |
+
title_emb = (embedder.encode([title_label])[0]
|
| 2417 |
+
if title_label else None)
|
| 2418 |
+
|
| 2419 |
+
# Candidate phrases drawn ONLY from the cluster's DESCRIPTIONS (KeyBERT) plus
|
| 2420 |
+
# the pre-colon title. External ontology sources (Cognitive Atlas / Wikidata /
|
| 2421 |
+
# WordNet / PubMed) are deliberately NOT candidates β per design they inform the
|
| 2422 |
+
# embedding space / semantic understanding only, and must never name a node.
|
| 2423 |
+
kb_cands = _keybert_candidates(cluster_texts_k, ancestor_words=ancestor_words,
|
| 2424 |
+
used_labels=used_sibling_labels, max_words=3)
|
| 2425 |
+
pool_src = [(c, 'keybert') for c in kb_cands]
|
| 2426 |
+
if title_label:
|
| 2427 |
+
pool_src.append((title_label, 'description_title'))
|
| 2428 |
+
# Dedup; title's source tag takes priority over keybert when the phrase matches.
|
| 2429 |
+
seen_pool = {}
|
| 2430 |
+
for lbl, src in pool_src:
|
| 2431 |
+
key = lbl.lower()
|
| 2432 |
+
if key not in seen_pool or src == 'description_title':
|
| 2433 |
+
seen_pool[key] = (lbl, src)
|
| 2434 |
+
pool = [v[0] for v in seen_pool.values()]
|
| 2435 |
+
pool_srcs = [v[1] for v in seen_pool.values()]
|
| 2436 |
+
|
| 2437 |
+
keybert_label = kb_cands[0] if kb_cands else '' # for fallback only
|
| 2438 |
+
|
| 2439 |
+
candidate_scores = []
|
| 2440 |
+
if pool and cluster_emb is not None:
|
| 2441 |
+
cand_embs = np.asarray(embedder.encode(pool), dtype=float)
|
| 2442 |
+
relevance = cosine_similarity([cluster_emb], cand_embs)[0]
|
| 2443 |
+
# c-TF-IDF: down-weight dataset-wide boilerplate (low corpus IDF) so generic
|
| 2444 |
+
# phrases ("test", "description", "measure", "scores") lose to distinctive ones.
|
| 2445 |
+
if USE_CTFIDF and _CORPUS_IDF:
|
| 2446 |
+
_mx = max(_CORPUS_IDF.values()) or 1.0
|
| 2447 |
+
_idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), _mx) / _mx) for c in pool])
|
| 2448 |
+
relevance = relevance * (0.5 + 0.5 * _idf)
|
| 2449 |
+
if sibling_centroids:
|
| 2450 |
+
sib_sim = cosine_similarity(cand_embs,
|
| 2451 |
+
np.asarray(sibling_centroids, dtype=float)).max(axis=1)
|
| 2452 |
+
contrast = np.clip(relevance - sib_sim, 0.0, 1.0)
|
| 2453 |
+
else:
|
| 2454 |
+
contrast = np.zeros(len(pool))
|
| 2455 |
+
# Title SEED: cosine of each description-derived candidate to the title.
|
| 2456 |
+
if title_emb is not None:
|
| 2457 |
+
title_sim = cosine_similarity(cand_embs, [title_emb])[:, 0]
|
| 2458 |
+
else:
|
| 2459 |
+
title_sim = np.zeros(len(pool))
|
| 2460 |
+
for i, cand in enumerate(pool):
|
| 2461 |
+
hyb = (LABEL_W_RELEVANCE * float(relevance[i])
|
| 2462 |
+
+ LABEL_W_TITLE * float(title_sim[i])
|
| 2463 |
+
+ LABEL_W_CONTRAST * float(contrast[i]))
|
| 2464 |
+
candidate_scores.append({
|
| 2465 |
+
'label': cand,
|
| 2466 |
+
'score': hyb,
|
| 2467 |
+
'embedding_sim': float(relevance[i]),
|
| 2468 |
+
'coverage': float(relevance[i]),
|
| 2469 |
+
'contrast': float(contrast[i]),
|
| 2470 |
+
'specificity': 0.0,
|
| 2471 |
+
'string_sim': float(title_sim[i]), # title seed alignment
|
| 2472 |
+
'source': pool_srcs[i],
|
| 2473 |
+
'broader_relations': [],
|
| 2474 |
+
'_emb': cand_embs[i],
|
| 2475 |
+
})
|
| 2476 |
+
candidate_scores.sort(key=lambda x: -x['score'])
|
| 2477 |
+
|
| 2478 |
fallback_label = (title_label
|
| 2479 |
+
or keybert_label
|
| 2480 |
or get_discriminative_tfidf_label(cluster_texts_k, sibling_texts)
|
| 2481 |
or f'Group {k+1}')
|
|
|
|
| 2482 |
|
| 2483 |
label, provenance = assign_concept_label(
|
| 2484 |
candidate_scores,
|
| 2485 |
fallback=fallback_label,
|
| 2486 |
+
min_score=0.0,
|
| 2487 |
ancestor_names=ancestor_names,
|
| 2488 |
used_sibling_labels=used_sibling_labels,
|
| 2489 |
top_level_tasks=_top_level_tasks,
|
|
|
|
| 2539 |
pid = ensure_path_fn(path_prefix + [label],
|
| 2540 |
relation='belongs_to', provenance=provenance)
|
| 2541 |
|
| 2542 |
+
# Store concept assignment on can (needed by Castanet facets later).
|
| 2543 |
+
# Provenance reflects the HYBRID winner (title / keybert / concept_table),
|
| 2544 |
+
# not the old semantic-only scorer β so the exported labels CSV is accurate.
|
| 2545 |
for ci in cluster_idxs:
|
| 2546 |
_, row = rows_list[ci]
|
| 2547 |
can.at[row.name, '_concept_label'] = label
|
| 2548 |
+
can.at[row.name, '_concept_score'] = provenance.get('confidence', 0.0)
|
| 2549 |
+
can.at[row.name, '_concept_source'] = (provenance.get('source_evidence') or ['fallback'])[0]
|
| 2550 |
+
|
| 2551 |
+
# Attach the cluster's variables directly under the concept node. The former
|
| 2552 |
+
# Statistic/Condition facet sub-split is removed: the statistic tier came from
|
| 2553 |
+
# a hardcoded vocabulary (now produced data-drivenly by _nest_by_measure), and
|
| 2554 |
+
# the numeric Condition tier produced bare-digit nodes (0/4/12) that inflated
|
| 2555 |
+
# singleton%/n_agg and moved the tree away from gold. Castanet's Condition facet
|
| 2556 |
+
# still exists as a separate parallel view via detect_facets() β not a tier.
|
| 2557 |
+
for ci in cluster_idxs:
|
| 2558 |
+
_, row = rows_list[ci]
|
| 2559 |
+
add_child(nodes, pid, leaf_to_id[row['_leaf_id']])
|
|
|
|
|
|
|
| 2560 |
|
| 2561 |
|
| 2562 |
def _remove_phrase(tokens, phrase_tokens):
|
|
|
|
| 2768 |
return nodes
|
| 2769 |
|
| 2770 |
|
| 2771 |
+
def _dissolve_facet_singletons(nodes):
    """
    Dissolve FACET tier nodes (Statistic / Condition) that wrap a single variable.

    A condition or statistic node with exactly one leaf child carries no grouping
    value — e.g. `Standard Deviation > 0 > DMSL0SD`. We remove such nodes and
    reattach their single child to the node's parent, keeping siblings together.

    Scope is deliberately narrow: only nodes whose relation_type is 'has_condition'
    or 'is_statistic_of' are touched, so genuine single-member CONCEPT nodes that
    carry a distinctive name are preserved (per the chosen policy).

    Args:
        nodes: List of node dicts; modified in place.

    Returns:
        The same ``nodes`` list, after all qualifying facet nodes are removed.
    """
    _FACET_RELS = {'has_condition', 'is_statistic_of'}
    changed = True
    while changed:
        changed = False
        # Rebuilt on every pass because the previous pass mutated the tree.
        pm = build_parent_map(nodes)
        m = nmap(nodes)
        for n in list(nodes):
            if n.get('type') != 'aggregation':
                continue
            # Tolerate nodes without an 'info' dict instead of raising KeyError.
            if (n.get('info') or {}).get('relation_type') not in _FACET_RELS:
                continue
            nid = int(n['id'])
            children = [int(c) for c in n.get('related', [])]
            # "Single variable" = exactly one child and that child is a leaf attribute.
            if len(children) == 1 and m.get(children[0], {}).get('type') == 'attribute':
                parent = pm.get(nid)
                if parent is None:
                    continue
                add_child(nodes, parent, children[0])
                remove_child(nodes, parent, nid)
                nodes[:] = [x for x in nodes if int(x['id']) != nid]
                changed = True
                # Restart the scan: pm / m are stale, and the parent may now
                # itself qualify as a single-leaf facet node.
                break
    return nodes
|
| 2806 |
+
|
| 2807 |
+
|
| 2808 |
def build_concept_hierarchy(can, embedder, concept_table, project='metadata_project',
|
| 2809 |
n_clusters_per_group=8):
|
| 2810 |
"""
|
|
|
|
| 2844 |
# is discriminative; one close to ALL of them is boilerplate. corpus_centroid
|
| 2845 |
# is the global mean (generic = central). Both are derived purely from data.
|
| 2846 |
sem_col_all = '_semantic_text' if '_semantic_text' in can.columns else '_text'
|
| 2847 |
+
|
| 2848 |
+
# Active domain β used by the hybrid label scorer's external-grounding signal.
|
| 2849 |
+
global _ACTIVE_DOMAIN
|
| 2850 |
+
_ACTIVE_DOMAIN = detect_domain(can)
|
| 2851 |
+
|
| 2852 |
+
# Corpus IDF over description n-grams β KeyBERT c-TF-IDF distinctiveness weight
|
| 2853 |
+
# (only consulted when USE_CTFIDF=True). Data-derived, dataset-agnostic.
|
| 2854 |
+
global _CORPUS_IDF
|
| 2855 |
+
_CORPUS_IDF = {}
|
| 2856 |
+
try:
|
| 2857 |
+
from sklearn.feature_extraction.text import CountVectorizer as _CV
|
| 2858 |
+
_docs = can[sem_col_all].fillna('').astype(str).tolist()
|
| 2859 |
+
_cv = _CV(ngram_range=(1, 3), binary=True, lowercase=True,
|
| 2860 |
+
token_pattern=r'[a-z][a-z\-]+')
|
| 2861 |
+
_dt = _cv.fit_transform(_docs)
|
| 2862 |
+
_dfa = np.asarray(_dt.sum(axis=0)).ravel(); _N = _dt.shape[0]
|
| 2863 |
+
_CORPUS_IDF = {p: float(np.log((_N + 1) / (_dfa[i] + 1)) + 1.0)
|
| 2864 |
+
for p, i in _cv.vocabulary_.items()}
|
| 2865 |
+
except Exception:
|
| 2866 |
+
_CORPUS_IDF = {}
|
| 2867 |
+
|
| 2868 |
ref_centroids = corpus_centroid = None
|
| 2869 |
try:
|
| 2870 |
all_var_embs = embedder.encode(can[sem_col_all].fillna('').astype(str).tolist())
|
|
|
|
| 2963 |
# Remove empty concept nodes (no variables) — meaningless and they break the
|
| 2964 |
# branchvalues='total' sunburst (parent value < sum of children → blank render).
|
| 2965 |
_prune_empty_aggregations(nodes)
|
| 2966 |
+
# Dissolve 1-variable Statistic/Condition facet nodes (no grouping value).
|
| 2967 |
+
_dissolve_facet_singletons(nodes)
|
| 2968 |
+
_prune_empty_aggregations(nodes)
|
| 2969 |
# NOTE: a head-noun "category" tier (Errors/Correct) was tried and reverted —
|
| 2970 |
# it regressed setOverlap (0.914→0.836: mis-grouping) and added depth beyond gold.
|
| 2971 |
# _nest_by_category() is kept defined but intentionally NOT called.
|
|
|
|
| 4063 |
# [F3][F5][CAS] These columns are needed inside _cluster_and_label
|
| 4064 |
# for facet sub-splitting. They must be computed BEFORE Step G.
|
| 4065 |
# detect_facets / build_castanet_facets runs AFTER hierarchy build
|
| 4066 |
+
# (Step I), so we pre-compute only _facet_cond here. The statistic tier
|
| 4067 |
+
# is produced data-drivenly later by _nest_by_measure (no hardcoded vocab).
|
| 4068 |
+
with st.spinner('Pre-computing Condition facets [CAS]...'):
|
| 4069 |
can = precompute_stat_cond_facets(can)
|
|
|
|
| 4070 |
n_cond = can['_facet_cond'].ne('').sum()
|
| 4071 |
+
st.info(f'Facet pre-computation: {n_cond} variables with Condition. '
|
| 4072 |
+
f'Statistic depth is derived from concept titles (_nest_by_measure).')
|
| 4073 |
|
| 4074 |
# ── Step G: Build concept hierarchy (N×M alignment) ──────────────
|
| 4075 |
with st.spinner('Building concept hierarchy via NΓM alignment [GON][TAX]...'):
|
|
|
|
| 4086 |
else:
|
| 4087 |
c_embs = None
|
| 4088 |
nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
|
| 4089 |
+
# HiExpan's width/global passes MOVE leaves between concepts; a concept
|
| 4090 |
+
# that loses all its leaves becomes empty. build_concept_hierarchy prunes
|
| 4091 |
+
# internally, but that runs BEFORE HiExpan β so re-prune here, else empty
|
| 4092 |
+
# nodes break the Plotly branchvalues='total' sunburst/treemap (parent
|
| 4093 |
+
# value < sum(children) β blank render; node-link is unaffected).
|
| 4094 |
+
_prune_empty_aggregations(nodes)
|
| 4095 |
+
_alive = {int(n['id']) for n in nodes}
|
| 4096 |
+
for _n in nodes:
|
| 4097 |
+
_n['related'] = [x for x in dict.fromkeys(int(c) for c in _n.get('related', []))
|
| 4098 |
+
if x in _alive]
|
| 4099 |
st.session_state.hiexpan_report = report
|
| 4100 |
wmoves = report.get('width_expansion_moves', 0)
|
| 4101 |
dexp = report.get('depth_expansion_nodes', 0)
|
demo.py
CHANGED
|
@@ -46,8 +46,8 @@ PREBUILT = {
|
|
| 46 |
"facets": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
|
| 47 |
},
|
| 48 |
"HCP": {
|
| 49 |
-
"hierarchy": ROOT / "approach_1" / "
|
| 50 |
-
"facets": ROOT / "approach_1" / "
|
| 51 |
},
|
| 52 |
},
|
| 53 |
"Approach 2": {
|
|
@@ -211,7 +211,7 @@ def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
|
|
| 211 |
font=dict(size=13), x=0.5))
|
| 212 |
return fig
|
| 213 |
|
| 214 |
-
def plot_treemap(nodes: list, color: str):
|
| 215 |
nodes = _filter_dissolved(nodes)
|
| 216 |
pm = _parent_map(nodes)
|
| 217 |
vm = _tree_value_map(nodes, pm)
|
|
@@ -228,7 +228,7 @@ def plot_treemap(nodes: list, color: str):
|
|
| 228 |
fig = go.Figure(go.Treemap(
|
| 229 |
ids=ids, labels=labels, parents=parents, values=values,
|
| 230 |
branchvalues="total", hovertext=hover, hoverinfo="text",
|
| 231 |
-
textinfo="label+value",
|
| 232 |
marker=dict(colorscale=color, line=dict(width=1, color="white"))))
|
| 233 |
fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
|
| 234 |
return fig
|
|
@@ -359,6 +359,33 @@ def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_lab
|
|
| 359 |
)
|
| 360 |
return fig
|
| 361 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 363 |
# IO
|
| 364 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -367,12 +394,37 @@ def _load_json(path_str: str):
|
|
| 367 |
with open(path_str, encoding="utf-8") as f:
|
| 368 |
return json.load(f)
|
| 369 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
def count_nodes(nodes: list) -> tuple[int, int]:
    """Return ``(leaves, aggregations)`` for the nodes kept by ``_filter_dissolved``.

    A node counts as a leaf when its ``type`` is ``"attribute"`` and as an
    aggregation when its ``type`` is ``"aggregation"``; other types are ignored.
    """
    nodes = _filter_dissolved(nodes)
    leaves = sum(1 for n in nodes if n.get("type") == "attribute")
    aggs = sum(1 for n in nodes if n.get("type") == "aggregation")
    return leaves, aggs
|
| 375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββββ
|
| 377 |
# SIDEBAR
|
| 378 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -413,6 +465,51 @@ c1, c2, c3 = st.columns(3)
|
|
| 413 |
c1.metric("Leaf Variables", leaves)
|
| 414 |
c2.metric("Aggregation Nodes", aggs)
|
| 415 |
c3.metric("Total Nodes", leaves + aggs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
st.markdown("---")
|
| 417 |
|
| 418 |
# ββ Level-of-Detail controls (above chart β matches the apps) ββββββββββββββββ
|
|
@@ -450,15 +547,22 @@ st.divider()
|
|
| 450 |
display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
|
| 451 |
|
| 452 |
if viz_mode == "Sunburst (drill-down)":
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
elif viz_mode == "Treemap":
|
| 455 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
else:
|
| 457 |
st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
|
| 458 |
use_container_width=True)
|
| 459 |
|
| 460 |
# ββ Facets (Approach 1 only) βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 461 |
-
facet_path = paths.get("facets")
|
| 462 |
if facet_path is not None and facet_path.exists():
|
| 463 |
st.markdown("---")
|
| 464 |
st.subheader("π Parallel facets")
|
|
|
|
| 46 |
"facets": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
|
| 47 |
},
|
| 48 |
"HCP": {
|
| 49 |
+
"hierarchy": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
|
| 50 |
+
"facets": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
|
| 51 |
},
|
| 52 |
},
|
| 53 |
"Approach 2": {
|
|
|
|
| 211 |
font=dict(size=13), x=0.5))
|
| 212 |
return fig
|
| 213 |
|
| 214 |
+
def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
|
| 215 |
nodes = _filter_dissolved(nodes)
|
| 216 |
pm = _parent_map(nodes)
|
| 217 |
vm = _tree_value_map(nodes, pm)
|
|
|
|
| 228 |
fig = go.Figure(go.Treemap(
|
| 229 |
ids=ids, labels=labels, parents=parents, values=values,
|
| 230 |
branchvalues="total", hovertext=hover, hoverinfo="text",
|
| 231 |
+
textinfo="label+value", maxdepth=max_depth,
|
| 232 |
marker=dict(colorscale=color, line=dict(width=1, color="white"))))
|
| 233 |
fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
|
| 234 |
return fig
|
|
|
|
| 359 |
)
|
| 360 |
return fig
|
| 361 |
|
| 362 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 363 |
+
# STATS / SAFE RENDERING
|
| 364 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 365 |
+
def _tree_depth(nodes: list) -> int:
    """Max depth of the rendered single-parent tree (root = depth 0).

    The walk starts at node id 0 and follows ``related`` links, ignoring ids
    that are not present after ``_filter_dissolved``. It is iterative and
    bounded, so a malformed file whose ``related`` links form a cycle cannot
    blow the recursion limit or loop forever.

    Args:
        nodes: List of node dicts with ``id`` and optional ``related`` keys.

    Returns:
        The largest number of edges on any path from node 0; 0 when node 0 is
        absent or has no children.
    """
    nodes = _filter_dissolved(nodes)
    by_id = {int(n["id"]): n for n in nodes}
    # A simple path through N nodes has at most N - 1 edges; anything deeper
    # can only come from a cycle, so it is never expanded.
    limit = len(by_id)
    deepest = 0
    reached = {}  # node id -> greatest depth at which it has been expanded
    stack = [(0, 0)]
    while stack:
        nid, depth = stack.pop()
        if reached.get(nid, -1) >= depth:
            continue  # already expanded at this depth or deeper
        reached[nid] = depth
        deepest = max(deepest, depth)
        if depth + 1 >= limit:
            continue
        for c in by_id.get(nid, {}).get("related", []):
            if int(c) in by_id:
                stack.append((int(c), depth + 1))
    return deepest
|
| 377 |
+
|
| 378 |
+
def safe_render_depth(nodes: list, requested: int) -> int:
    """Cap the initial render depth for large hierarchies.

    Plotly sunburst/treemap silently blank when asked to draw too many sectors
    at once (large hierarchies like HCP). Cap the *initial* render depth — the
    chart stays fully drillable by clicking, so no data is lost.

    Args:
        nodes: Node list; its size is measured after ``_filter_dissolved``.
        requested: Depth the user asked for.

    Returns:
        ``min(requested, 3)`` above 400 nodes, ``min(requested, 4)`` above 150
        nodes, otherwise ``requested`` unchanged.
    """
    n = len(_filter_dissolved(nodes))
    if n > 400:
        return min(requested, 3)
    if n > 150:
        return min(requested, 4)
    return requested
|
| 388 |
+
|
| 389 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 390 |
# IO
|
| 391 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 394 |
with open(path_str, encoding="utf-8") as f:
|
| 395 |
return json.load(f)
|
| 396 |
|
| 397 |
+
def _read_bytes(path_str: str) -> bytes:
|
| 398 |
+
with open(path_str, "rb") as f:
|
| 399 |
+
return f.read()
|
| 400 |
+
|
| 401 |
+
@st.cache_data(show_spinner=False)
def _outputs_zip(root_str: str) -> bytes:
    """Zip the entire bundled outputs/ folder for one-click download.

    Args:
        root_str: Path of the folder to archive, as a string.

    Returns:
        The bytes of a DEFLATE-compressed zip archive. Entry names are POSIX
        paths relative to the parent of ``root_str``, so the folder's own name
        is the top-level directory inside the archive.
    """
    import io
    import zipfile

    root = Path(root_str)
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        # Sorted so entries are written in a stable order.
        for p in sorted(root.rglob("*")):
            if p.is_file():
                zf.write(p, arcname=p.relative_to(root.parent).as_posix())
    return buf.getvalue()
|
| 412 |
+
|
| 413 |
def count_nodes(nodes: list) -> tuple[int, int]:
    """Return ``(leaves, aggregations)`` for the nodes kept by ``_filter_dissolved``.

    A node counts as a leaf when its ``type`` is ``"attribute"`` and as an
    aggregation when its ``type`` is ``"aggregation"``; other types are ignored.
    """
    nodes = _filter_dissolved(nodes)
    leaves = sum(1 for n in nodes if n.get("type") == "attribute")
    aggs = sum(1 for n in nodes if n.get("type") == "aggregation")
    return leaves, aggs
|
| 418 |
|
| 419 |
+
def concept_aligned_pct(nodes: list) -> float | None:
    """% of aggregation nodes that carry a concept/provenance label (Approach 1).

    A node counts as aligned when any of its ``provenance``, ``concept`` or
    ``source_evidence`` keys holds a truthy value.

    Returns:
        The percentage in (0, 100], or ``None`` when there are no aggregation
        nodes or when none of them is aligned. Zero is reported as ``None``,
        not ``0.0``, so callers can test ``is not None`` to decide whether to
        show the figure.
    """
    aggs = [n for n in _filter_dissolved(nodes) if n.get("type") == "aggregation"]
    if not aggs:
        return None
    aligned = sum(1 for n in aggs
                  if n.get("provenance") or n.get("concept") or n.get("source_evidence"))
    if not aligned:
        return None
    return 100.0 * aligned / len(aggs)
|
| 427 |
+
|
| 428 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββββ
|
| 429 |
# SIDEBAR
|
| 430 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 465 |
c1.metric("Leaf Variables", leaves)
|
| 466 |
c2.metric("Aggregation Nodes", aggs)
|
| 467 |
c3.metric("Total Nodes", leaves + aggs)
|
| 468 |
+
|
| 469 |
+
# ββ Build summary (collapsed) ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 470 |
+
facet_path = paths.get("facets")
|
| 471 |
+
n_facets = None
|
| 472 |
+
if facet_path is not None and facet_path.exists():
|
| 473 |
+
try:
|
| 474 |
+
n_facets = len(_load_json(str(facet_path)))
|
| 475 |
+
except Exception:
|
| 476 |
+
n_facets = None
|
| 477 |
+
|
| 478 |
+
with st.expander("βΉοΈ Build summary", expanded=False):
|
| 479 |
+
bs1, bs2, bs3, bs4 = st.columns(4)
|
| 480 |
+
bs1.metric("Variables", leaves)
|
| 481 |
+
bs2.metric("Internal nodes", aggs)
|
| 482 |
+
bs3.metric("Tree depth", _tree_depth(raw_nodes))
|
| 483 |
+
bs4.metric("Facets", n_facets if n_facets is not None else "β")
|
| 484 |
+
pct = concept_aligned_pct(raw_nodes)
|
| 485 |
+
if pct is not None:
|
| 486 |
+
st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
|
| 487 |
+
st.caption(
|
| 488 |
+
f"Source file: `{hier_path.name}` Β· "
|
| 489 |
+
f"Approach: **{approach}** Β· Dataset: **{dataset}**. "
|
| 490 |
+
"Tree topology and labels are reproduced exactly from the pre-built "
|
| 491 |
+
"thesis output (the algorithms are not re-run in this viewer)."
|
| 492 |
+
)
|
| 493 |
+
|
| 494 |
+
# ββ Downloads ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 495 |
+
d1, d2, d3 = st.columns(3)
|
| 496 |
+
with d1:
|
| 497 |
+
st.download_button("β¬οΈ Hierarchy JSON", data=_read_bytes(str(hier_path)),
|
| 498 |
+
file_name=hier_path.name, mime="application/json",
|
| 499 |
+
use_container_width=True)
|
| 500 |
+
with d2:
|
| 501 |
+
if facet_path is not None and facet_path.exists():
|
| 502 |
+
st.download_button("β¬οΈ Facets JSON", data=_read_bytes(str(facet_path)),
|
| 503 |
+
file_name=facet_path.name, mime="application/json",
|
| 504 |
+
use_container_width=True)
|
| 505 |
+
else:
|
| 506 |
+
st.button("β¬οΈ Facets JSON", disabled=True, use_container_width=True,
|
| 507 |
+
help="This approach/dataset has no facet tree.")
|
| 508 |
+
with d3:
|
| 509 |
+
st.download_button("β¬οΈ All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
|
| 510 |
+
file_name="metadata_hierarchy_outputs.zip",
|
| 511 |
+
mime="application/zip", use_container_width=True)
|
| 512 |
+
|
| 513 |
st.markdown("---")
|
| 514 |
|
| 515 |
# ββ Level-of-Detail controls (above chart β matches the apps) ββββββββββββββββ
|
|
|
|
| 547 |
display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
|
| 548 |
|
| 549 |
if viz_mode == "Sunburst (drill-down)":
|
| 550 |
+
eff = safe_render_depth(display_nodes, depth)
|
| 551 |
+
if eff < depth:
|
| 552 |
+
st.caption(f"Large hierarchy β showing {eff} levels initially to render "
|
| 553 |
+
"reliably. **Click any sector to drill deeper.**")
|
| 554 |
+
st.plotly_chart(plot_sunburst(display_nodes, color, eff), use_container_width=True)
|
| 555 |
elif viz_mode == "Treemap":
|
| 556 |
+
eff = safe_render_depth(display_nodes, depth)
|
| 557 |
+
if eff < depth:
|
| 558 |
+
st.caption(f"Large hierarchy β showing {eff} levels initially to render "
|
| 559 |
+
"reliably. **Click a tile to drill deeper.**")
|
| 560 |
+
st.plotly_chart(plot_treemap(display_nodes, color, eff), use_container_width=True)
|
| 561 |
else:
|
| 562 |
st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
|
| 563 |
use_container_width=True)
|
| 564 |
|
| 565 |
# ββ Facets (Approach 1 only) βββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 566 |
if facet_path is not None and facet_path.exists():
|
| 567 |
st.markdown("---")
|
| 568 |
st.subheader("π Parallel facets")
|
outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json β HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json}
RENAMED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/approach_1/{keybert/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json β HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json}
RENAMED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json
CHANGED
|
@@ -3527,16 +3527,18 @@
|
|
| 3527 |
62,
|
| 3528 |
69,
|
| 3529 |
76,
|
|
|
|
|
|
|
| 3530 |
91,
|
| 3531 |
-
|
| 3532 |
-
|
| 3533 |
-
|
| 3534 |
-
|
| 3535 |
-
|
| 3536 |
114,
|
| 3537 |
-
|
| 3538 |
-
|
| 3539 |
-
|
| 3540 |
132
|
| 3541 |
],
|
| 3542 |
"desc": "Facet: Measure Type"
|
|
@@ -3686,7 +3688,7 @@
|
|
| 3686 |
},
|
| 3687 |
{
|
| 3688 |
"id": 10,
|
| 3689 |
-
"name": "Correct Latency",
|
| 3690 |
"related": [
|
| 3691 |
11,
|
| 3692 |
12,
|
|
@@ -3699,13 +3701,7 @@
|
|
| 3699 |
19,
|
| 3700 |
20,
|
| 3701 |
21,
|
| 3702 |
-
22
|
| 3703 |
-
83,
|
| 3704 |
-
84,
|
| 3705 |
-
85,
|
| 3706 |
-
86,
|
| 3707 |
-
87,
|
| 3708 |
-
88
|
| 3709 |
],
|
| 3710 |
"type": "aggregation",
|
| 3711 |
"info": {
|
|
@@ -3717,7 +3713,7 @@
|
|
| 3717 |
"relation_label": "semantically related to"
|
| 3718 |
},
|
| 3719 |
"isShown": true,
|
| 3720 |
-
"desc": "Measure Type: Correct Latency",
|
| 3721 |
"dtype": "determine",
|
| 3722 |
"recover": true
|
| 3723 |
},
|
|
@@ -3891,16 +3887,14 @@
|
|
| 3891 |
},
|
| 3892 |
{
|
| 3893 |
"id": 23,
|
| 3894 |
-
"name": "Percent Correct",
|
| 3895 |
"related": [
|
| 3896 |
24,
|
| 3897 |
25,
|
| 3898 |
26,
|
| 3899 |
27,
|
| 3900 |
28,
|
| 3901 |
-
29
|
| 3902 |
-
89,
|
| 3903 |
-
90
|
| 3904 |
],
|
| 3905 |
"type": "aggregation",
|
| 3906 |
"info": {
|
|
@@ -3912,7 +3906,7 @@
|
|
| 3912 |
"relation_label": "semantically related to"
|
| 3913 |
},
|
| 3914 |
"isShown": true,
|
| 3915 |
-
"desc": "Measure Type: Percent Correct",
|
| 3916 |
"dtype": "determine",
|
| 3917 |
"recover": true
|
| 3918 |
},
|
|
@@ -4002,7 +3996,7 @@
|
|
| 4002 |
},
|
| 4003 |
{
|
| 4004 |
"id": 30,
|
| 4005 |
-
"name": "Probability Error",
|
| 4006 |
"related": [
|
| 4007 |
31,
|
| 4008 |
32
|
|
@@ -4017,7 +4011,7 @@
|
|
| 4017 |
"relation_label": "semantically related to"
|
| 4018 |
},
|
| 4019 |
"isShown": true,
|
| 4020 |
-
"desc": "Measure Type: Probability Error",
|
| 4021 |
"dtype": "determine",
|
| 4022 |
"recover": true
|
| 4023 |
},
|
|
@@ -4160,16 +4154,15 @@
|
|
| 4160 |
},
|
| 4161 |
{
|
| 4162 |
"id": 40,
|
| 4163 |
-
"name": "
|
| 4164 |
"related": [
|
| 4165 |
41,
|
| 4166 |
42,
|
| 4167 |
-
79,
|
| 4168 |
-
124,
|
| 4169 |
-
125,
|
| 4170 |
-
126,
|
| 4171 |
127,
|
| 4172 |
-
128
|
|
|
|
|
|
|
|
|
|
| 4173 |
],
|
| 4174 |
"type": "aggregation",
|
| 4175 |
"info": {
|
|
@@ -4181,7 +4174,7 @@
|
|
| 4181 |
"relation_label": "semantically related to"
|
| 4182 |
},
|
| 4183 |
"isShown": true,
|
| 4184 |
-
"desc": "Measure Type:
|
| 4185 |
"dtype": "determine",
|
| 4186 |
"recover": true
|
| 4187 |
},
|
|
@@ -4524,7 +4517,7 @@
|
|
| 4524 |
},
|
| 4525 |
{
|
| 4526 |
"id": 62,
|
| 4527 |
-
"name": "
|
| 4528 |
"related": [
|
| 4529 |
63,
|
| 4530 |
64,
|
|
@@ -4543,7 +4536,7 @@
|
|
| 4543 |
"relation_label": "semantically related to"
|
| 4544 |
},
|
| 4545 |
"isShown": true,
|
| 4546 |
-
"desc": "Measure Type:
|
| 4547 |
"dtype": "determine",
|
| 4548 |
"recover": true
|
| 4549 |
},
|
|
@@ -4746,9 +4739,9 @@
|
|
| 4746 |
"related": [
|
| 4747 |
77,
|
| 4748 |
78,
|
| 4749 |
-
80,
|
| 4750 |
81,
|
| 4751 |
-
82
|
|
|
|
| 4752 |
],
|
| 4753 |
"type": "aggregation",
|
| 4754 |
"info": {
|
|
@@ -4794,6 +4787,26 @@
|
|
| 4794 |
},
|
| 4795 |
{
|
| 4796 |
"id": 79,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4797 |
"name": "PALTEA28",
|
| 4798 |
"dtype": "determine",
|
| 4799 |
"related": [],
|
|
@@ -4807,7 +4820,7 @@
|
|
| 4807 |
}
|
| 4808 |
},
|
| 4809 |
{
|
| 4810 |
-
"id":
|
| 4811 |
"name": "PALTEA4",
|
| 4812 |
"dtype": "determine",
|
| 4813 |
"related": [],
|
|
@@ -4821,7 +4834,7 @@
|
|
| 4821 |
}
|
| 4822 |
},
|
| 4823 |
{
|
| 4824 |
-
"id":
|
| 4825 |
"name": "PALTEA6",
|
| 4826 |
"dtype": "determine",
|
| 4827 |
"related": [],
|
|
@@ -4835,7 +4848,7 @@
|
|
| 4835 |
}
|
| 4836 |
},
|
| 4837 |
{
|
| 4838 |
-
"id":
|
| 4839 |
"name": "PALTEA8",
|
| 4840 |
"dtype": "determine",
|
| 4841 |
"related": [],
|
|
@@ -4849,7 +4862,32 @@
|
|
| 4849 |
}
|
| 4850 |
},
|
| 4851 |
{
|
| 4852 |
-
"id":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4853 |
"name": "PRMCLSDD",
|
| 4854 |
"dtype": "determine",
|
| 4855 |
"related": [],
|
|
@@ -4863,7 +4901,7 @@
|
|
| 4863 |
}
|
| 4864 |
},
|
| 4865 |
{
|
| 4866 |
-
"id":
|
| 4867 |
"name": "PRMCLSDI",
|
| 4868 |
"dtype": "determine",
|
| 4869 |
"related": [],
|
|
@@ -4877,7 +4915,7 @@
|
|
| 4877 |
}
|
| 4878 |
},
|
| 4879 |
{
|
| 4880 |
-
"id":
|
| 4881 |
"name": "PRMMCLD",
|
| 4882 |
"dtype": "determine",
|
| 4883 |
"related": [],
|
|
@@ -4891,7 +4929,7 @@
|
|
| 4891 |
}
|
| 4892 |
},
|
| 4893 |
{
|
| 4894 |
-
"id":
|
| 4895 |
"name": "PRMMCLI",
|
| 4896 |
"dtype": "determine",
|
| 4897 |
"related": [],
|
|
@@ -4905,7 +4943,7 @@
|
|
| 4905 |
}
|
| 4906 |
},
|
| 4907 |
{
|
| 4908 |
-
"id":
|
| 4909 |
"name": "PRMMDCLD",
|
| 4910 |
"dtype": "determine",
|
| 4911 |
"related": [],
|
|
@@ -4919,7 +4957,7 @@
|
|
| 4919 |
}
|
| 4920 |
},
|
| 4921 |
{
|
| 4922 |
-
"id":
|
| 4923 |
"name": "PRMMDCLI",
|
| 4924 |
"dtype": "determine",
|
| 4925 |
"related": [],
|
|
@@ -4933,7 +4971,28 @@
|
|
| 4933 |
}
|
| 4934 |
},
|
| 4935 |
{
|
| 4936 |
-
"id":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4937 |
"name": "PRMPCD",
|
| 4938 |
"dtype": "determine",
|
| 4939 |
"related": [],
|
|
@@ -4947,7 +5006,7 @@
|
|
| 4947 |
}
|
| 4948 |
},
|
| 4949 |
{
|
| 4950 |
-
"id":
|
| 4951 |
"name": "PRMPCI",
|
| 4952 |
"dtype": "determine",
|
| 4953 |
"related": [],
|
|
@@ -4961,10 +5020,10 @@
|
|
| 4961 |
}
|
| 4962 |
},
|
| 4963 |
{
|
| 4964 |
-
"id":
|
| 4965 |
"name": "Time Since Delayed Stimuli",
|
| 4966 |
"related": [
|
| 4967 |
-
|
| 4968 |
],
|
| 4969 |
"type": "aggregation",
|
| 4970 |
"info": {
|
|
@@ -4981,7 +5040,7 @@
|
|
| 4981 |
"recover": true
|
| 4982 |
},
|
| 4983 |
{
|
| 4984 |
-
"id":
|
| 4985 |
"name": "PRMTSDSP",
|
| 4986 |
"dtype": "determine",
|
| 4987 |
"related": [],
|
|
@@ -4995,10 +5054,10 @@
|
|
| 4995 |
}
|
| 4996 |
},
|
| 4997 |
{
|
| 4998 |
-
"id":
|
| 4999 |
"name": "Detection Measure",
|
| 5000 |
"related": [
|
| 5001 |
-
|
| 5002 |
],
|
| 5003 |
"type": "aggregation",
|
| 5004 |
"info": {
|
|
@@ -5015,7 +5074,7 @@
|
|
| 5015 |
"recover": true
|
| 5016 |
},
|
| 5017 |
{
|
| 5018 |
-
"id":
|
| 5019 |
"name": "RVPA",
|
| 5020 |
"dtype": "determine",
|
| 5021 |
"related": [],
|
|
@@ -5029,12 +5088,12 @@
|
|
| 5029 |
}
|
| 5030 |
},
|
| 5031 |
{
|
| 5032 |
-
"id":
|
| 5033 |
-
"name": "Response Latency",
|
| 5034 |
"related": [
|
| 5035 |
-
|
| 5036 |
-
|
| 5037 |
-
|
| 5038 |
],
|
| 5039 |
"type": "aggregation",
|
| 5040 |
"info": {
|
|
@@ -5046,12 +5105,12 @@
|
|
| 5046 |
"relation_label": "semantically related to"
|
| 5047 |
},
|
| 5048 |
"isShown": true,
|
| 5049 |
-
"desc": "Measure Type: Response Latency",
|
| 5050 |
"dtype": "determine",
|
| 5051 |
"recover": true
|
| 5052 |
},
|
| 5053 |
{
|
| 5054 |
-
"id":
|
| 5055 |
"name": "RVPLSD",
|
| 5056 |
"dtype": "determine",
|
| 5057 |
"related": [],
|
|
@@ -5065,7 +5124,7 @@
|
|
| 5065 |
}
|
| 5066 |
},
|
| 5067 |
{
|
| 5068 |
-
"id":
|
| 5069 |
"name": "RVPMDL",
|
| 5070 |
"dtype": "determine",
|
| 5071 |
"related": [],
|
|
@@ -5079,7 +5138,7 @@
|
|
| 5079 |
}
|
| 5080 |
},
|
| 5081 |
{
|
| 5082 |
-
"id":
|
| 5083 |
"name": "RVPML",
|
| 5084 |
"dtype": "determine",
|
| 5085 |
"related": [],
|
|
@@ -5093,14 +5152,14 @@
|
|
| 5093 |
}
|
| 5094 |
},
|
| 5095 |
{
|
| 5096 |
-
"id":
|
| 5097 |
"name": "Total",
|
| 5098 |
"related": [
|
| 5099 |
-
100,
|
| 5100 |
-
101,
|
| 5101 |
-
102,
|
| 5102 |
103,
|
| 5103 |
-
104
|
|
|
|
|
|
|
|
|
|
| 5104 |
],
|
| 5105 |
"type": "aggregation",
|
| 5106 |
"info": {
|
|
@@ -5117,7 +5176,7 @@
|
|
| 5117 |
"recover": true
|
| 5118 |
},
|
| 5119 |
{
|
| 5120 |
-
"id":
|
| 5121 |
"name": "RVPPFA",
|
| 5122 |
"dtype": "determine",
|
| 5123 |
"related": [],
|
|
@@ -5131,7 +5190,7 @@
|
|
| 5131 |
}
|
| 5132 |
},
|
| 5133 |
{
|
| 5134 |
-
"id":
|
| 5135 |
"name": "RVPPH",
|
| 5136 |
"dtype": "determine",
|
| 5137 |
"related": [],
|
|
@@ -5145,7 +5204,7 @@
|
|
| 5145 |
}
|
| 5146 |
},
|
| 5147 |
{
|
| 5148 |
-
"id":
|
| 5149 |
"name": "RVPTFA",
|
| 5150 |
"dtype": "determine",
|
| 5151 |
"related": [],
|
|
@@ -5159,7 +5218,7 @@
|
|
| 5159 |
}
|
| 5160 |
},
|
| 5161 |
{
|
| 5162 |
-
"id":
|
| 5163 |
"name": "RVPTH",
|
| 5164 |
"dtype": "determine",
|
| 5165 |
"related": [],
|
|
@@ -5173,7 +5232,7 @@
|
|
| 5173 |
}
|
| 5174 |
},
|
| 5175 |
{
|
| 5176 |
-
"id":
|
| 5177 |
"name": "RVPTM",
|
| 5178 |
"dtype": "determine",
|
| 5179 |
"related": [],
|
|
@@ -5187,14 +5246,14 @@
|
|
| 5187 |
}
|
| 5188 |
},
|
| 5189 |
{
|
| 5190 |
-
"id":
|
| 5191 |
-
"name": "Errors Boxes",
|
| 5192 |
"related": [
|
| 5193 |
-
106,
|
| 5194 |
-
107,
|
| 5195 |
-
108,
|
| 5196 |
109,
|
| 5197 |
-
110
|
|
|
|
|
|
|
|
|
|
| 5198 |
],
|
| 5199 |
"type": "aggregation",
|
| 5200 |
"info": {
|
|
@@ -5206,12 +5265,12 @@
|
|
| 5206 |
"relation_label": "semantically related to"
|
| 5207 |
},
|
| 5208 |
"isShown": true,
|
| 5209 |
-
"desc": "Measure Type: Errors Boxes",
|
| 5210 |
"dtype": "determine",
|
| 5211 |
"recover": true
|
| 5212 |
},
|
| 5213 |
{
|
| 5214 |
-
"id":
|
| 5215 |
"name": "SWMBE12",
|
| 5216 |
"dtype": "determine",
|
| 5217 |
"related": [],
|
|
@@ -5225,7 +5284,7 @@
|
|
| 5225 |
}
|
| 5226 |
},
|
| 5227 |
{
|
| 5228 |
-
"id":
|
| 5229 |
"name": "SWMBE4",
|
| 5230 |
"dtype": "determine",
|
| 5231 |
"related": [],
|
|
@@ -5239,7 +5298,7 @@
|
|
| 5239 |
}
|
| 5240 |
},
|
| 5241 |
{
|
| 5242 |
-
"id":
|
| 5243 |
"name": "SWMBE468",
|
| 5244 |
"dtype": "determine",
|
| 5245 |
"related": [],
|
|
@@ -5253,7 +5312,7 @@
|
|
| 5253 |
}
|
| 5254 |
},
|
| 5255 |
{
|
| 5256 |
-
"id":
|
| 5257 |
"name": "SWMBE6",
|
| 5258 |
"dtype": "determine",
|
| 5259 |
"related": [],
|
|
@@ -5267,7 +5326,7 @@
|
|
| 5267 |
}
|
| 5268 |
},
|
| 5269 |
{
|
| 5270 |
-
"id":
|
| 5271 |
"name": "SWMBE8",
|
| 5272 |
"dtype": "determine",
|
| 5273 |
"related": [],
|
|
@@ -5281,13 +5340,13 @@
|
|
| 5281 |
}
|
| 5282 |
},
|
| 5283 |
{
|
| 5284 |
-
"id":
|
| 5285 |
"name": "Double Errors Boxes",
|
| 5286 |
"related": [
|
| 5287 |
-
|
| 5288 |
-
113,
|
| 5289 |
116,
|
| 5290 |
-
|
|
|
|
| 5291 |
],
|
| 5292 |
"type": "aggregation",
|
| 5293 |
"info": {
|
|
@@ -5304,7 +5363,7 @@
|
|
| 5304 |
"recover": true
|
| 5305 |
},
|
| 5306 |
{
|
| 5307 |
-
"id":
|
| 5308 |
"name": "SWMDE12",
|
| 5309 |
"dtype": "determine",
|
| 5310 |
"related": [],
|
|
@@ -5318,7 +5377,7 @@
|
|
| 5318 |
}
|
| 5319 |
},
|
| 5320 |
{
|
| 5321 |
-
"id":
|
| 5322 |
"name": "SWMDE4",
|
| 5323 |
"dtype": "determine",
|
| 5324 |
"related": [],
|
|
@@ -5332,10 +5391,10 @@
|
|
| 5332 |
}
|
| 5333 |
},
|
| 5334 |
{
|
| 5335 |
-
"id":
|
| 5336 |
"name": "Double Errors",
|
| 5337 |
"related": [
|
| 5338 |
-
|
| 5339 |
],
|
| 5340 |
"type": "aggregation",
|
| 5341 |
"info": {
|
|
@@ -5352,7 +5411,7 @@
|
|
| 5352 |
"recover": true
|
| 5353 |
},
|
| 5354 |
{
|
| 5355 |
-
"id":
|
| 5356 |
"name": "SWMDE468",
|
| 5357 |
"dtype": "determine",
|
| 5358 |
"related": [],
|
|
@@ -5366,7 +5425,7 @@
|
|
| 5366 |
}
|
| 5367 |
},
|
| 5368 |
{
|
| 5369 |
-
"id":
|
| 5370 |
"name": "SWMDE6",
|
| 5371 |
"dtype": "determine",
|
| 5372 |
"related": [],
|
|
@@ -5380,7 +5439,7 @@
|
|
| 5380 |
}
|
| 5381 |
},
|
| 5382 |
{
|
| 5383 |
-
"id":
|
| 5384 |
"name": "SWMDE8",
|
| 5385 |
"dtype": "determine",
|
| 5386 |
"related": [],
|
|
@@ -5394,10 +5453,10 @@
|
|
| 5394 |
}
|
| 5395 |
},
|
| 5396 |
{
|
| 5397 |
-
"id":
|
| 5398 |
"name": "Problem Reached",
|
| 5399 |
"related": [
|
| 5400 |
-
|
| 5401 |
],
|
| 5402 |
"type": "aggregation",
|
| 5403 |
"info": {
|
|
@@ -5414,7 +5473,7 @@
|
|
| 5414 |
"recover": true
|
| 5415 |
},
|
| 5416 |
{
|
| 5417 |
-
"id":
|
| 5418 |
"name": "SWMPR",
|
| 5419 |
"dtype": "determine",
|
| 5420 |
"related": [],
|
|
@@ -5428,12 +5487,12 @@
|
|
| 5428 |
}
|
| 5429 |
},
|
| 5430 |
{
|
| 5431 |
-
"id":
|
| 5432 |
-
"name": "Strategy",
|
| 5433 |
"related": [
|
| 5434 |
-
|
| 5435 |
-
|
| 5436 |
-
|
| 5437 |
],
|
| 5438 |
"type": "aggregation",
|
| 5439 |
"info": {
|
|
@@ -5445,12 +5504,12 @@
|
|
| 5445 |
"relation_label": "semantically related to"
|
| 5446 |
},
|
| 5447 |
"isShown": true,
|
| 5448 |
-
"desc": "Measure Type: Strategy",
|
| 5449 |
"dtype": "determine",
|
| 5450 |
"recover": true
|
| 5451 |
},
|
| 5452 |
{
|
| 5453 |
-
"id":
|
| 5454 |
"name": "SWMS",
|
| 5455 |
"dtype": "determine",
|
| 5456 |
"related": [],
|
|
@@ -5464,7 +5523,7 @@
|
|
| 5464 |
}
|
| 5465 |
},
|
| 5466 |
{
|
| 5467 |
-
"id":
|
| 5468 |
"name": "SWMS6",
|
| 5469 |
"dtype": "determine",
|
| 5470 |
"related": [],
|
|
@@ -5478,7 +5537,7 @@
|
|
| 5478 |
}
|
| 5479 |
},
|
| 5480 |
{
|
| 5481 |
-
"id":
|
| 5482 |
"name": "SWMSX",
|
| 5483 |
"dtype": "determine",
|
| 5484 |
"related": [],
|
|
@@ -5492,7 +5551,7 @@
|
|
| 5492 |
}
|
| 5493 |
},
|
| 5494 |
{
|
| 5495 |
-
"id":
|
| 5496 |
"name": "SWMTE12",
|
| 5497 |
"dtype": "determine",
|
| 5498 |
"related": [],
|
|
@@ -5506,7 +5565,7 @@
|
|
| 5506 |
}
|
| 5507 |
},
|
| 5508 |
{
|
| 5509 |
-
"id":
|
| 5510 |
"name": "SWMTE4",
|
| 5511 |
"dtype": "determine",
|
| 5512 |
"related": [],
|
|
@@ -5520,7 +5579,7 @@
|
|
| 5520 |
}
|
| 5521 |
},
|
| 5522 |
{
|
| 5523 |
-
"id":
|
| 5524 |
"name": "SWMTE468",
|
| 5525 |
"dtype": "determine",
|
| 5526 |
"related": [],
|
|
@@ -5534,7 +5593,7 @@
|
|
| 5534 |
}
|
| 5535 |
},
|
| 5536 |
{
|
| 5537 |
-
"id":
|
| 5538 |
"name": "SWMTE6",
|
| 5539 |
"dtype": "determine",
|
| 5540 |
"related": [],
|
|
@@ -5548,7 +5607,7 @@
|
|
| 5548 |
}
|
| 5549 |
},
|
| 5550 |
{
|
| 5551 |
-
"id":
|
| 5552 |
"name": "SWMTE8",
|
| 5553 |
"dtype": "determine",
|
| 5554 |
"related": [],
|
|
@@ -5562,13 +5621,14 @@
|
|
| 5562 |
}
|
| 5563 |
},
|
| 5564 |
{
|
| 5565 |
-
"id":
|
| 5566 |
-
"name": "Within Errors
|
| 5567 |
"related": [
|
| 5568 |
-
|
| 5569 |
-
131,
|
| 5570 |
134,
|
| 5571 |
-
135
|
|
|
|
|
|
|
| 5572 |
],
|
| 5573 |
"type": "aggregation",
|
| 5574 |
"info": {
|
|
@@ -5580,12 +5640,12 @@
|
|
| 5580 |
"relation_label": "semantically related to"
|
| 5581 |
},
|
| 5582 |
"isShown": true,
|
| 5583 |
-
"desc": "Measure Type: Within Errors
|
| 5584 |
"dtype": "determine",
|
| 5585 |
"recover": true
|
| 5586 |
},
|
| 5587 |
{
|
| 5588 |
-
"id":
|
| 5589 |
"name": "SWMWE12",
|
| 5590 |
"dtype": "determine",
|
| 5591 |
"related": [],
|
|
@@ -5599,7 +5659,7 @@
|
|
| 5599 |
}
|
| 5600 |
},
|
| 5601 |
{
|
| 5602 |
-
"id":
|
| 5603 |
"name": "SWMWE4",
|
| 5604 |
"dtype": "determine",
|
| 5605 |
"related": [],
|
|
@@ -5613,27 +5673,7 @@
|
|
| 5613 |
}
|
| 5614 |
},
|
| 5615 |
{
|
| 5616 |
-
"id":
|
| 5617 |
-
"name": "Within Errors",
|
| 5618 |
-
"related": [
|
| 5619 |
-
133
|
| 5620 |
-
],
|
| 5621 |
-
"type": "aggregation",
|
| 5622 |
-
"info": {
|
| 5623 |
-
"operation": "concat",
|
| 5624 |
-
"usedAttributes": [],
|
| 5625 |
-
"formula": "",
|
| 5626 |
-
"exec": "",
|
| 5627 |
-
"relation_type": "related_to",
|
| 5628 |
-
"relation_label": "semantically related to"
|
| 5629 |
-
},
|
| 5630 |
-
"isShown": true,
|
| 5631 |
-
"desc": "Measure Type: Within Errors",
|
| 5632 |
-
"dtype": "determine",
|
| 5633 |
-
"recover": true
|
| 5634 |
-
},
|
| 5635 |
-
{
|
| 5636 |
-
"id": 133,
|
| 5637 |
"name": "SWMWE468",
|
| 5638 |
"dtype": "determine",
|
| 5639 |
"related": [],
|
|
@@ -5647,7 +5687,7 @@
|
|
| 5647 |
}
|
| 5648 |
},
|
| 5649 |
{
|
| 5650 |
-
"id":
|
| 5651 |
"name": "SWMWE6",
|
| 5652 |
"dtype": "determine",
|
| 5653 |
"related": [],
|
|
@@ -5661,7 +5701,7 @@
|
|
| 5661 |
}
|
| 5662 |
},
|
| 5663 |
{
|
| 5664 |
-
"id":
|
| 5665 |
"name": "SWMWE8",
|
| 5666 |
"dtype": "determine",
|
| 5667 |
"related": [],
|
|
|
|
| 3527 |
62,
|
| 3528 |
69,
|
| 3529 |
76,
|
| 3530 |
+
79,
|
| 3531 |
+
84,
|
| 3532 |
91,
|
| 3533 |
+
94,
|
| 3534 |
+
96,
|
| 3535 |
+
98,
|
| 3536 |
+
102,
|
| 3537 |
+
108,
|
| 3538 |
114,
|
| 3539 |
+
117,
|
| 3540 |
+
121,
|
| 3541 |
+
123,
|
| 3542 |
132
|
| 3543 |
],
|
| 3544 |
"desc": "Facet: Measure Type"
|
|
|
|
| 3688 |
},
|
| 3689 |
{
|
| 3690 |
"id": 10,
|
| 3691 |
+
"name": "Correct Latency Mean",
|
| 3692 |
"related": [
|
| 3693 |
11,
|
| 3694 |
12,
|
|
|
|
| 3701 |
19,
|
| 3702 |
20,
|
| 3703 |
21,
|
| 3704 |
+
22
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3705 |
],
|
| 3706 |
"type": "aggregation",
|
| 3707 |
"info": {
|
|
|
|
| 3713 |
"relation_label": "semantically related to"
|
| 3714 |
},
|
| 3715 |
"isShown": true,
|
| 3716 |
+
"desc": "Measure Type: Correct Latency Mean",
|
| 3717 |
"dtype": "determine",
|
| 3718 |
"recover": true
|
| 3719 |
},
|
|
|
|
| 3887 |
},
|
| 3888 |
{
|
| 3889 |
"id": 23,
|
| 3890 |
+
"name": "Percent Correct Percentage",
|
| 3891 |
"related": [
|
| 3892 |
24,
|
| 3893 |
25,
|
| 3894 |
26,
|
| 3895 |
27,
|
| 3896 |
28,
|
| 3897 |
+
29
|
|
|
|
|
|
|
| 3898 |
],
|
| 3899 |
"type": "aggregation",
|
| 3900 |
"info": {
|
|
|
|
| 3906 |
"relation_label": "semantically related to"
|
| 3907 |
},
|
| 3908 |
"isShown": true,
|
| 3909 |
+
"desc": "Measure Type: Percent Correct Percentage",
|
| 3910 |
"dtype": "determine",
|
| 3911 |
"recover": true
|
| 3912 |
},
|
|
|
|
| 3996 |
},
|
| 3997 |
{
|
| 3998 |
"id": 30,
|
| 3999 |
+
"name": "Probability Error Occurring",
|
| 4000 |
"related": [
|
| 4001 |
31,
|
| 4002 |
32
|
|
|
|
| 4011 |
"relation_label": "semantically related to"
|
| 4012 |
},
|
| 4013 |
"isShown": true,
|
| 4014 |
+
"desc": "Measure Type: Probability Error Occurring",
|
| 4015 |
"dtype": "determine",
|
| 4016 |
"recover": true
|
| 4017 |
},
|
|
|
|
| 4154 |
},
|
| 4155 |
{
|
| 4156 |
"id": 40,
|
| 4157 |
+
"name": "Errors Total",
|
| 4158 |
"related": [
|
| 4159 |
41,
|
| 4160 |
42,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4161 |
127,
|
| 4162 |
+
128,
|
| 4163 |
+
129,
|
| 4164 |
+
130,
|
| 4165 |
+
131
|
| 4166 |
],
|
| 4167 |
"type": "aggregation",
|
| 4168 |
"info": {
|
|
|
|
| 4174 |
"relation_label": "semantically related to"
|
| 4175 |
},
|
| 4176 |
"isShown": true,
|
| 4177 |
+
"desc": "Measure Type: Errors Total",
|
| 4178 |
"dtype": "determine",
|
| 4179 |
"recover": true
|
| 4180 |
},
|
|
|
|
| 4517 |
},
|
| 4518 |
{
|
| 4519 |
"id": 62,
|
| 4520 |
+
"name": "Attempts Patterns Total",
|
| 4521 |
"related": [
|
| 4522 |
63,
|
| 4523 |
64,
|
|
|
|
| 4536 |
"relation_label": "semantically related to"
|
| 4537 |
},
|
| 4538 |
"isShown": true,
|
| 4539 |
+
"desc": "Measure Type: Attempts Patterns Total",
|
| 4540 |
"dtype": "determine",
|
| 4541 |
"recover": true
|
| 4542 |
},
|
|
|
|
| 4739 |
"related": [
|
| 4740 |
77,
|
| 4741 |
78,
|
|
|
|
| 4742 |
81,
|
| 4743 |
+
82,
|
| 4744 |
+
83
|
| 4745 |
],
|
| 4746 |
"type": "aggregation",
|
| 4747 |
"info": {
|
|
|
|
| 4787 |
},
|
| 4788 |
{
|
| 4789 |
"id": 79,
|
| 4790 |
+
"name": "Total Errors",
|
| 4791 |
+
"related": [
|
| 4792 |
+
80
|
| 4793 |
+
],
|
| 4794 |
+
"type": "aggregation",
|
| 4795 |
+
"info": {
|
| 4796 |
+
"operation": "concat",
|
| 4797 |
+
"usedAttributes": [],
|
| 4798 |
+
"formula": "",
|
| 4799 |
+
"exec": "",
|
| 4800 |
+
"relation_type": "related_to",
|
| 4801 |
+
"relation_label": "semantically related to"
|
| 4802 |
+
},
|
| 4803 |
+
"isShown": true,
|
| 4804 |
+
"desc": "Measure Type: Total Errors",
|
| 4805 |
+
"dtype": "determine",
|
| 4806 |
+
"recover": true
|
| 4807 |
+
},
|
| 4808 |
+
{
|
| 4809 |
+
"id": 80,
|
| 4810 |
"name": "PALTEA28",
|
| 4811 |
"dtype": "determine",
|
| 4812 |
"related": [],
|
|
|
|
| 4820 |
}
|
| 4821 |
},
|
| 4822 |
{
|
| 4823 |
+
"id": 81,
|
| 4824 |
"name": "PALTEA4",
|
| 4825 |
"dtype": "determine",
|
| 4826 |
"related": [],
|
|
|
|
| 4834 |
}
|
| 4835 |
},
|
| 4836 |
{
|
| 4837 |
+
"id": 82,
|
| 4838 |
"name": "PALTEA6",
|
| 4839 |
"dtype": "determine",
|
| 4840 |
"related": [],
|
|
|
|
| 4848 |
}
|
| 4849 |
},
|
| 4850 |
{
|
| 4851 |
+
"id": 83,
|
| 4852 |
"name": "PALTEA8",
|
| 4853 |
"dtype": "determine",
|
| 4854 |
"related": [],
|
|
|
|
| 4862 |
}
|
| 4863 |
},
|
| 4864 |
{
|
| 4865 |
+
"id": 84,
|
| 4866 |
+
"name": "Latency Immediate Standard",
|
| 4867 |
+
"related": [
|
| 4868 |
+
85,
|
| 4869 |
+
86,
|
| 4870 |
+
87,
|
| 4871 |
+
88,
|
| 4872 |
+
89,
|
| 4873 |
+
90
|
| 4874 |
+
],
|
| 4875 |
+
"type": "aggregation",
|
| 4876 |
+
"info": {
|
| 4877 |
+
"operation": "concat",
|
| 4878 |
+
"usedAttributes": [],
|
| 4879 |
+
"formula": "",
|
| 4880 |
+
"exec": "",
|
| 4881 |
+
"relation_type": "related_to",
|
| 4882 |
+
"relation_label": "semantically related to"
|
| 4883 |
+
},
|
| 4884 |
+
"isShown": true,
|
| 4885 |
+
"desc": "Measure Type: Latency Immediate Standard",
|
| 4886 |
+
"dtype": "determine",
|
| 4887 |
+
"recover": true
|
| 4888 |
+
},
|
| 4889 |
+
{
|
| 4890 |
+
"id": 85,
|
| 4891 |
"name": "PRMCLSDD",
|
| 4892 |
"dtype": "determine",
|
| 4893 |
"related": [],
|
|
|
|
| 4901 |
}
|
| 4902 |
},
|
| 4903 |
{
|
| 4904 |
+
"id": 86,
|
| 4905 |
"name": "PRMCLSDI",
|
| 4906 |
"dtype": "determine",
|
| 4907 |
"related": [],
|
|
|
|
| 4915 |
}
|
| 4916 |
},
|
| 4917 |
{
|
| 4918 |
+
"id": 87,
|
| 4919 |
"name": "PRMMCLD",
|
| 4920 |
"dtype": "determine",
|
| 4921 |
"related": [],
|
|
|
|
| 4929 |
}
|
| 4930 |
},
|
| 4931 |
{
|
| 4932 |
+
"id": 88,
|
| 4933 |
"name": "PRMMCLI",
|
| 4934 |
"dtype": "determine",
|
| 4935 |
"related": [],
|
|
|
|
| 4943 |
}
|
| 4944 |
},
|
| 4945 |
{
|
| 4946 |
+
"id": 89,
|
| 4947 |
"name": "PRMMDCLD",
|
| 4948 |
"dtype": "determine",
|
| 4949 |
"related": [],
|
|
|
|
| 4957 |
}
|
| 4958 |
},
|
| 4959 |
{
|
| 4960 |
+
"id": 90,
|
| 4961 |
"name": "PRMMDCLI",
|
| 4962 |
"dtype": "determine",
|
| 4963 |
"related": [],
|
|
|
|
| 4971 |
}
|
| 4972 |
},
|
| 4973 |
{
|
| 4974 |
+
"id": 91,
|
| 4975 |
+
"name": "Percent Correct Immediate",
|
| 4976 |
+
"related": [
|
| 4977 |
+
92,
|
| 4978 |
+
93
|
| 4979 |
+
],
|
| 4980 |
+
"type": "aggregation",
|
| 4981 |
+
"info": {
|
| 4982 |
+
"operation": "concat",
|
| 4983 |
+
"usedAttributes": [],
|
| 4984 |
+
"formula": "",
|
| 4985 |
+
"exec": "",
|
| 4986 |
+
"relation_type": "related_to",
|
| 4987 |
+
"relation_label": "semantically related to"
|
| 4988 |
+
},
|
| 4989 |
+
"isShown": true,
|
| 4990 |
+
"desc": "Measure Type: Percent Correct Immediate",
|
| 4991 |
+
"dtype": "determine",
|
| 4992 |
+
"recover": true
|
| 4993 |
+
},
|
| 4994 |
+
{
|
| 4995 |
+
"id": 92,
|
| 4996 |
"name": "PRMPCD",
|
| 4997 |
"dtype": "determine",
|
| 4998 |
"related": [],
|
|
|
|
| 5006 |
}
|
| 5007 |
},
|
| 5008 |
{
|
| 5009 |
+
"id": 93,
|
| 5010 |
"name": "PRMPCI",
|
| 5011 |
"dtype": "determine",
|
| 5012 |
"related": [],
|
|
|
|
| 5020 |
}
|
| 5021 |
},
|
| 5022 |
{
|
| 5023 |
+
"id": 94,
|
| 5024 |
"name": "Time Since Delayed Stimuli",
|
| 5025 |
"related": [
|
| 5026 |
+
95
|
| 5027 |
],
|
| 5028 |
"type": "aggregation",
|
| 5029 |
"info": {
|
|
|
|
| 5040 |
"recover": true
|
| 5041 |
},
|
| 5042 |
{
|
| 5043 |
+
"id": 95,
|
| 5044 |
"name": "PRMTSDSP",
|
| 5045 |
"dtype": "determine",
|
| 5046 |
"related": [],
|
|
|
|
| 5054 |
}
|
| 5055 |
},
|
| 5056 |
{
|
| 5057 |
+
"id": 96,
|
| 5058 |
"name": "Detection Measure",
|
| 5059 |
"related": [
|
| 5060 |
+
97
|
| 5061 |
],
|
| 5062 |
"type": "aggregation",
|
| 5063 |
"info": {
|
|
|
|
| 5074 |
"recover": true
|
| 5075 |
},
|
| 5076 |
{
|
| 5077 |
+
"id": 97,
|
| 5078 |
"name": "RVPA",
|
| 5079 |
"dtype": "determine",
|
| 5080 |
"related": [],
|
|
|
|
| 5088 |
}
|
| 5089 |
},
|
| 5090 |
{
|
| 5091 |
+
"id": 98,
|
| 5092 |
+
"name": "Response Latency Mean",
|
| 5093 |
"related": [
|
| 5094 |
+
99,
|
| 5095 |
+
100,
|
| 5096 |
+
101
|
| 5097 |
],
|
| 5098 |
"type": "aggregation",
|
| 5099 |
"info": {
|
|
|
|
| 5105 |
"relation_label": "semantically related to"
|
| 5106 |
},
|
| 5107 |
"isShown": true,
|
| 5108 |
+
"desc": "Measure Type: Response Latency Mean",
|
| 5109 |
"dtype": "determine",
|
| 5110 |
"recover": true
|
| 5111 |
},
|
| 5112 |
{
|
| 5113 |
+
"id": 99,
|
| 5114 |
"name": "RVPLSD",
|
| 5115 |
"dtype": "determine",
|
| 5116 |
"related": [],
|
|
|
|
| 5124 |
}
|
| 5125 |
},
|
| 5126 |
{
|
| 5127 |
+
"id": 100,
|
| 5128 |
"name": "RVPMDL",
|
| 5129 |
"dtype": "determine",
|
| 5130 |
"related": [],
|
|
|
|
| 5138 |
}
|
| 5139 |
},
|
| 5140 |
{
|
| 5141 |
+
"id": 101,
|
| 5142 |
"name": "RVPML",
|
| 5143 |
"dtype": "determine",
|
| 5144 |
"related": [],
|
|
|
|
| 5152 |
}
|
| 5153 |
},
|
| 5154 |
{
|
| 5155 |
+
"id": 102,
|
| 5156 |
"name": "Total",
|
| 5157 |
"related": [
|
|
|
|
|
|
|
|
|
|
| 5158 |
103,
|
| 5159 |
+
104,
|
| 5160 |
+
105,
|
| 5161 |
+
106,
|
| 5162 |
+
107
|
| 5163 |
],
|
| 5164 |
"type": "aggregation",
|
| 5165 |
"info": {
|
|
|
|
| 5176 |
"recover": true
|
| 5177 |
},
|
| 5178 |
{
|
| 5179 |
+
"id": 103,
|
| 5180 |
"name": "RVPPFA",
|
| 5181 |
"dtype": "determine",
|
| 5182 |
"related": [],
|
|
|
|
| 5190 |
}
|
| 5191 |
},
|
| 5192 |
{
|
| 5193 |
+
"id": 104,
|
| 5194 |
"name": "RVPPH",
|
| 5195 |
"dtype": "determine",
|
| 5196 |
"related": [],
|
|
|
|
| 5204 |
}
|
| 5205 |
},
|
| 5206 |
{
|
| 5207 |
+
"id": 105,
|
| 5208 |
"name": "RVPTFA",
|
| 5209 |
"dtype": "determine",
|
| 5210 |
"related": [],
|
|
|
|
| 5218 |
}
|
| 5219 |
},
|
| 5220 |
{
|
| 5221 |
+
"id": 106,
|
| 5222 |
"name": "RVPTH",
|
| 5223 |
"dtype": "determine",
|
| 5224 |
"related": [],
|
|
|
|
| 5232 |
}
|
| 5233 |
},
|
| 5234 |
{
|
| 5235 |
+
"id": 107,
|
| 5236 |
"name": "RVPTM",
|
| 5237 |
"dtype": "determine",
|
| 5238 |
"related": [],
|
|
|
|
| 5246 |
}
|
| 5247 |
},
|
| 5248 |
{
|
| 5249 |
+
"id": 108,
|
| 5250 |
+
"name": "Errors Boxes Times",
|
| 5251 |
"related": [
|
|
|
|
|
|
|
|
|
|
| 5252 |
109,
|
| 5253 |
+
110,
|
| 5254 |
+
111,
|
| 5255 |
+
112,
|
| 5256 |
+
113
|
| 5257 |
],
|
| 5258 |
"type": "aggregation",
|
| 5259 |
"info": {
|
|
|
|
| 5265 |
"relation_label": "semantically related to"
|
| 5266 |
},
|
| 5267 |
"isShown": true,
|
| 5268 |
+
"desc": "Measure Type: Errors Boxes Times",
|
| 5269 |
"dtype": "determine",
|
| 5270 |
"recover": true
|
| 5271 |
},
|
| 5272 |
{
|
| 5273 |
+
"id": 109,
|
| 5274 |
"name": "SWMBE12",
|
| 5275 |
"dtype": "determine",
|
| 5276 |
"related": [],
|
|
|
|
| 5284 |
}
|
| 5285 |
},
|
| 5286 |
{
|
| 5287 |
+
"id": 110,
|
| 5288 |
"name": "SWMBE4",
|
| 5289 |
"dtype": "determine",
|
| 5290 |
"related": [],
|
|
|
|
| 5298 |
}
|
| 5299 |
},
|
| 5300 |
{
|
| 5301 |
+
"id": 111,
|
| 5302 |
"name": "SWMBE468",
|
| 5303 |
"dtype": "determine",
|
| 5304 |
"related": [],
|
|
|
|
| 5312 |
}
|
| 5313 |
},
|
| 5314 |
{
|
| 5315 |
+
"id": 112,
|
| 5316 |
"name": "SWMBE6",
|
| 5317 |
"dtype": "determine",
|
| 5318 |
"related": [],
|
|
|
|
| 5326 |
}
|
| 5327 |
},
|
| 5328 |
{
|
| 5329 |
+
"id": 113,
|
| 5330 |
"name": "SWMBE8",
|
| 5331 |
"dtype": "determine",
|
| 5332 |
"related": [],
|
|
|
|
| 5340 |
}
|
| 5341 |
},
|
| 5342 |
{
|
| 5343 |
+
"id": 114,
|
| 5344 |
"name": "Double Errors Boxes",
|
| 5345 |
"related": [
|
| 5346 |
+
115,
|
|
|
|
| 5347 |
116,
|
| 5348 |
+
119,
|
| 5349 |
+
120
|
| 5350 |
],
|
| 5351 |
"type": "aggregation",
|
| 5352 |
"info": {
|
|
|
|
| 5363 |
"recover": true
|
| 5364 |
},
|
| 5365 |
{
|
| 5366 |
+
"id": 115,
|
| 5367 |
"name": "SWMDE12",
|
| 5368 |
"dtype": "determine",
|
| 5369 |
"related": [],
|
|
|
|
| 5377 |
}
|
| 5378 |
},
|
| 5379 |
{
|
| 5380 |
+
"id": 116,
|
| 5381 |
"name": "SWMDE4",
|
| 5382 |
"dtype": "determine",
|
| 5383 |
"related": [],
|
|
|
|
| 5391 |
}
|
| 5392 |
},
|
| 5393 |
{
|
| 5394 |
+
"id": 117,
|
| 5395 |
"name": "Double Errors",
|
| 5396 |
"related": [
|
| 5397 |
+
118
|
| 5398 |
],
|
| 5399 |
"type": "aggregation",
|
| 5400 |
"info": {
|
|
|
|
| 5411 |
"recover": true
|
| 5412 |
},
|
| 5413 |
{
|
| 5414 |
+
"id": 118,
|
| 5415 |
"name": "SWMDE468",
|
| 5416 |
"dtype": "determine",
|
| 5417 |
"related": [],
|
|
|
|
| 5425 |
}
|
| 5426 |
},
|
| 5427 |
{
|
| 5428 |
+
"id": 119,
|
| 5429 |
"name": "SWMDE6",
|
| 5430 |
"dtype": "determine",
|
| 5431 |
"related": [],
|
|
|
|
| 5439 |
}
|
| 5440 |
},
|
| 5441 |
{
|
| 5442 |
+
"id": 120,
|
| 5443 |
"name": "SWMDE8",
|
| 5444 |
"dtype": "determine",
|
| 5445 |
"related": [],
|
|
|
|
| 5453 |
}
|
| 5454 |
},
|
| 5455 |
{
|
| 5456 |
+
"id": 121,
|
| 5457 |
"name": "Problem Reached",
|
| 5458 |
"related": [
|
| 5459 |
+
122
|
| 5460 |
],
|
| 5461 |
"type": "aggregation",
|
| 5462 |
"info": {
|
|
|
|
| 5473 |
"recover": true
|
| 5474 |
},
|
| 5475 |
{
|
| 5476 |
+
"id": 122,
|
| 5477 |
"name": "SWMPR",
|
| 5478 |
"dtype": "determine",
|
| 5479 |
"related": [],
|
|
|
|
| 5487 |
}
|
| 5488 |
},
|
| 5489 |
{
|
| 5490 |
+
"id": 123,
|
| 5491 |
+
"name": "Strategy High",
|
| 5492 |
"related": [
|
| 5493 |
+
124,
|
| 5494 |
+
125,
|
| 5495 |
+
126
|
| 5496 |
],
|
| 5497 |
"type": "aggregation",
|
| 5498 |
"info": {
|
|
|
|
| 5504 |
"relation_label": "semantically related to"
|
| 5505 |
},
|
| 5506 |
"isShown": true,
|
| 5507 |
+
"desc": "Measure Type: Strategy High",
|
| 5508 |
"dtype": "determine",
|
| 5509 |
"recover": true
|
| 5510 |
},
|
| 5511 |
{
|
| 5512 |
+
"id": 124,
|
| 5513 |
"name": "SWMS",
|
| 5514 |
"dtype": "determine",
|
| 5515 |
"related": [],
|
|
|
|
| 5523 |
}
|
| 5524 |
},
|
| 5525 |
{
|
| 5526 |
+
"id": 125,
|
| 5527 |
"name": "SWMS6",
|
| 5528 |
"dtype": "determine",
|
| 5529 |
"related": [],
|
|
|
|
| 5537 |
}
|
| 5538 |
},
|
| 5539 |
{
|
| 5540 |
+
"id": 126,
|
| 5541 |
"name": "SWMSX",
|
| 5542 |
"dtype": "determine",
|
| 5543 |
"related": [],
|
|
|
|
| 5551 |
}
|
| 5552 |
},
|
| 5553 |
{
|
| 5554 |
+
"id": 127,
|
| 5555 |
"name": "SWMTE12",
|
| 5556 |
"dtype": "determine",
|
| 5557 |
"related": [],
|
|
|
|
| 5565 |
}
|
| 5566 |
},
|
| 5567 |
{
|
| 5568 |
+
"id": 128,
|
| 5569 |
"name": "SWMTE4",
|
| 5570 |
"dtype": "determine",
|
| 5571 |
"related": [],
|
|
|
|
| 5579 |
}
|
| 5580 |
},
|
| 5581 |
{
|
| 5582 |
+
"id": 129,
|
| 5583 |
"name": "SWMTE468",
|
| 5584 |
"dtype": "determine",
|
| 5585 |
"related": [],
|
|
|
|
| 5593 |
}
|
| 5594 |
},
|
| 5595 |
{
|
| 5596 |
+
"id": 130,
|
| 5597 |
"name": "SWMTE6",
|
| 5598 |
"dtype": "determine",
|
| 5599 |
"related": [],
|
|
|
|
| 5607 |
}
|
| 5608 |
},
|
| 5609 |
{
|
| 5610 |
+
"id": 131,
|
| 5611 |
"name": "SWMTE8",
|
| 5612 |
"dtype": "determine",
|
| 5613 |
"related": [],
|
|
|
|
| 5621 |
}
|
| 5622 |
},
|
| 5623 |
{
|
| 5624 |
+
"id": 132,
|
| 5625 |
+
"name": "Within Errors",
|
| 5626 |
"related": [
|
| 5627 |
+
133,
|
|
|
|
| 5628 |
134,
|
| 5629 |
+
135,
|
| 5630 |
+
136,
|
| 5631 |
+
137
|
| 5632 |
],
|
| 5633 |
"type": "aggregation",
|
| 5634 |
"info": {
|
|
|
|
| 5640 |
"relation_label": "semantically related to"
|
| 5641 |
},
|
| 5642 |
"isShown": true,
|
| 5643 |
+
"desc": "Measure Type: Within Errors",
|
| 5644 |
"dtype": "determine",
|
| 5645 |
"recover": true
|
| 5646 |
},
|
| 5647 |
{
|
| 5648 |
+
"id": 133,
|
| 5649 |
"name": "SWMWE12",
|
| 5650 |
"dtype": "determine",
|
| 5651 |
"related": [],
|
|
|
|
| 5659 |
}
|
| 5660 |
},
|
| 5661 |
{
|
| 5662 |
+
"id": 134,
|
| 5663 |
"name": "SWMWE4",
|
| 5664 |
"dtype": "determine",
|
| 5665 |
"related": [],
|
|
|
|
| 5673 |
}
|
| 5674 |
},
|
| 5675 |
{
|
| 5676 |
+
"id": 135,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5677 |
"name": "SWMWE468",
|
| 5678 |
"dtype": "determine",
|
| 5679 |
"related": [],
|
|
|
|
| 5687 |
}
|
| 5688 |
},
|
| 5689 |
{
|
| 5690 |
+
"id": 136,
|
| 5691 |
"name": "SWMWE6",
|
| 5692 |
"dtype": "determine",
|
| 5693 |
"related": [],
|
|
|
|
| 5701 |
}
|
| 5702 |
},
|
| 5703 |
{
|
| 5704 |
+
"id": 137,
|
| 5705 |
"name": "SWMWE8",
|
| 5706 |
"dtype": "determine",
|
| 5707 |
"related": [],
|
outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json
CHANGED
|
@@ -1645,7 +1645,7 @@
|
|
| 1645 |
115,
|
| 1646 |
116,
|
| 1647 |
117,
|
| 1648 |
-
|
| 1649 |
],
|
| 1650 |
"type": "aggregation",
|
| 1651 |
"info": {
|
|
@@ -1687,16 +1687,19 @@
|
|
| 1687 |
"recover": true,
|
| 1688 |
"concept_provenance": {
|
| 1689 |
"node_label": "Total Correct",
|
| 1690 |
-
"confidence": 0.
|
| 1691 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
| 1692 |
"source_evidence": [
|
| 1693 |
-
"
|
| 1694 |
],
|
| 1695 |
-
"embedding_sim": 0.
|
| 1696 |
-
"
|
| 1697 |
-
"
|
| 1698 |
-
"
|
| 1699 |
-
"
|
| 1700 |
}
|
| 1701 |
},
|
| 1702 |
{
|
|
@@ -1722,21 +1725,65 @@
|
|
| 1722 |
"recover": true,
|
| 1723 |
"concept_provenance": {
|
| 1724 |
"node_label": "Error",
|
| 1725 |
-
"confidence": 0.
|
| 1726 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1727 |
"source_evidence": [
|
| 1728 |
-
"
|
| 1729 |
],
|
| 1730 |
-
"embedding_sim": 0.
|
| 1731 |
-
"
|
| 1732 |
-
"coverage": 0.0,
|
| 1733 |
"contrast": 0.0,
|
| 1734 |
-
"specificity": 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1735 |
}
|
| 1736 |
},
|
| 1737 |
{
|
| 1738 |
"id": 113,
|
| 1739 |
-
"name": "
|
| 1740 |
"related": [
|
| 1741 |
34,
|
| 1742 |
35
|
|
@@ -1751,21 +1798,24 @@
|
|
| 1751 |
"relation_label": "belongs to"
|
| 1752 |
},
|
| 1753 |
"isShown": true,
|
| 1754 |
-
"desc": "Concept group: DMS >
|
| 1755 |
"dtype": "determine",
|
| 1756 |
"recover": true,
|
| 1757 |
"concept_provenance": {
|
| 1758 |
-
"node_label": "
|
| 1759 |
-
"confidence": 0.
|
| 1760 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
| 1761 |
"source_evidence": [
|
| 1762 |
-
"
|
| 1763 |
],
|
| 1764 |
-
"embedding_sim": 0.
|
| 1765 |
-
"
|
| 1766 |
-
"
|
| 1767 |
-
"
|
| 1768 |
-
"
|
| 1769 |
}
|
| 1770 |
},
|
| 1771 |
{
|
|
@@ -1794,21 +1844,24 @@
|
|
| 1794 |
"recover": true,
|
| 1795 |
"concept_provenance": {
|
| 1796 |
"node_label": "Correct Latency Standard Deviation",
|
| 1797 |
-
"confidence": 0.
|
| 1798 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
| 1799 |
"source_evidence": [
|
| 1800 |
-
"
|
| 1801 |
],
|
| 1802 |
-
"embedding_sim": 0.
|
| 1803 |
-
"
|
| 1804 |
-
"
|
| 1805 |
-
"
|
| 1806 |
-
"
|
| 1807 |
}
|
| 1808 |
},
|
| 1809 |
{
|
| 1810 |
"id": 115,
|
| 1811 |
-
"name": "Probability Error",
|
| 1812 |
"related": [
|
| 1813 |
26,
|
| 1814 |
27
|
|
@@ -1823,26 +1876,30 @@
|
|
| 1823 |
"relation_label": "belongs to"
|
| 1824 |
},
|
| 1825 |
"isShown": true,
|
| 1826 |
-
"desc": "Concept group: DMS > Probability Error",
|
| 1827 |
"dtype": "determine",
|
| 1828 |
"recover": true,
|
| 1829 |
"concept_provenance": {
|
| 1830 |
-
"node_label": "Probability Error",
|
| 1831 |
-
"confidence": 0.
|
| 1832 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1833 |
"source_evidence": [
|
| 1834 |
-
"
|
| 1835 |
],
|
| 1836 |
-
"embedding_sim": 0.
|
| 1837 |
-
"
|
| 1838 |
-
"
|
| 1839 |
-
"
|
| 1840 |
-
"
|
| 1841 |
}
|
| 1842 |
},
|
| 1843 |
{
|
| 1844 |
"id": 116,
|
| 1845 |
-
"name": "Percent Correct",
|
| 1846 |
"related": [
|
| 1847 |
20,
|
| 1848 |
21,
|
|
@@ -1861,21 +1918,25 @@
|
|
| 1861 |
"relation_label": "belongs to"
|
| 1862 |
},
|
| 1863 |
"isShown": true,
|
| 1864 |
-
"desc": "Concept group: DMS > Percent Correct",
|
| 1865 |
"dtype": "determine",
|
| 1866 |
"recover": true,
|
| 1867 |
"concept_provenance": {
|
| 1868 |
-
"node_label": "Percent Correct",
|
| 1869 |
-
"confidence": 0.
|
| 1870 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1871 |
"source_evidence": [
|
| 1872 |
-
"
|
| 1873 |
],
|
| 1874 |
-
"embedding_sim": 0.
|
| 1875 |
-
"
|
| 1876 |
-
"
|
| 1877 |
-
"
|
| 1878 |
-
"
|
| 1879 |
}
|
| 1880 |
},
|
| 1881 |
{
|
|
@@ -1941,11 +2002,11 @@
|
|
| 1941 |
"recover": true,
|
| 1942 |
"concept_provenance": {
|
| 1943 |
"node_label": "Latency Display Stimulus",
|
| 1944 |
-
"confidence": 0.
|
| 1945 |
"alternatives": [
|
| 1946 |
-
"
|
| 1947 |
-
"
|
| 1948 |
-
"
|
| 1949 |
],
|
| 1950 |
"source_evidence": [
|
| 1951 |
"keybert"
|
|
@@ -1953,7 +2014,7 @@
|
|
| 1953 |
"embedding_sim": 0.732,
|
| 1954 |
"coverage": 0.732,
|
| 1955 |
"contrast": 0.595,
|
| 1956 |
-
"specificity":
|
| 1957 |
"string_sim": 0.0
|
| 1958 |
}
|
| 1959 |
},
|
|
@@ -1979,10 +2040,11 @@
|
|
| 1979 |
"recover": true,
|
| 1980 |
"concept_provenance": {
|
| 1981 |
"node_label": "Total Assessment Trials",
|
| 1982 |
-
"confidence": 0.
|
| 1983 |
"alternatives": [
|
| 1984 |
-
"
|
| 1985 |
-
"
|
|
|
|
| 1986 |
],
|
| 1987 |
"source_evidence": [
|
| 1988 |
"keybert"
|
|
@@ -1990,7 +2052,7 @@
|
|
| 1990 |
"embedding_sim": 0.629,
|
| 1991 |
"coverage": 0.629,
|
| 1992 |
"contrast": 0.204,
|
| 1993 |
-
"specificity":
|
| 1994 |
"string_sim": 0.0
|
| 1995 |
}
|
| 1996 |
},
|
|
@@ -1998,11 +2060,10 @@
|
|
| 1998 |
"id": 121,
|
| 1999 |
"name": "PAL",
|
| 2000 |
"related": [
|
| 2001 |
-
122,
|
| 2002 |
123,
|
| 2003 |
-
124,
|
| 2004 |
125,
|
| 2005 |
126,
|
|
|
|
| 2006 |
147
|
| 2007 |
],
|
| 2008 |
"type": "aggregation",
|
|
@@ -2021,7 +2082,7 @@
|
|
| 2021 |
},
|
| 2022 |
{
|
| 2023 |
"id": 122,
|
| 2024 |
-
"name": "
|
| 2025 |
"related": [
|
| 2026 |
49,
|
| 2027 |
50,
|
|
@@ -2040,21 +2101,24 @@
|
|
| 2040 |
"relation_label": "belongs to"
|
| 2041 |
},
|
| 2042 |
"isShown": true,
|
| 2043 |
-
"desc": "Concept group: PAL >
|
| 2044 |
"dtype": "determine",
|
| 2045 |
"recover": true,
|
| 2046 |
"concept_provenance": {
|
| 2047 |
-
"node_label": "
|
| 2048 |
-
"confidence": 0.
|
| 2049 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
| 2050 |
"source_evidence": [
|
| 2051 |
-
"
|
| 2052 |
],
|
| 2053 |
-
"embedding_sim": 0.
|
| 2054 |
-
"
|
| 2055 |
-
"
|
| 2056 |
-
"
|
| 2057 |
-
"
|
| 2058 |
}
|
| 2059 |
},
|
| 2060 |
{
|
|
@@ -2079,7 +2143,7 @@
|
|
| 2079 |
},
|
| 2080 |
{
|
| 2081 |
"id": 124,
|
| 2082 |
-
"name": "Errors Patterns
|
| 2083 |
"related": [
|
| 2084 |
55,
|
| 2085 |
56,
|
|
@@ -2103,11 +2167,11 @@
|
|
| 2103 |
"recover": true,
|
| 2104 |
"concept_provenance": {
|
| 2105 |
"node_label": "Errors Patterns Total",
|
| 2106 |
-
"confidence": 0.
|
| 2107 |
"alternatives": [
|
| 2108 |
-
"
|
| 2109 |
-
"
|
| 2110 |
-
"
|
| 2111 |
],
|
| 2112 |
"source_evidence": [
|
| 2113 |
"keybert"
|
|
@@ -2115,7 +2179,7 @@
|
|
| 2115 |
"embedding_sim": 0.619,
|
| 2116 |
"coverage": 0.619,
|
| 2117 |
"contrast": 0.115,
|
| 2118 |
-
"specificity":
|
| 2119 |
"string_sim": 0.0
|
| 2120 |
}
|
| 2121 |
},
|
|
@@ -2184,16 +2248,20 @@
|
|
| 2184 |
"recover": true,
|
| 2185 |
"concept_provenance": {
|
| 2186 |
"node_label": "Include Total Errors Shapes",
|
| 2187 |
-
"confidence": 0.
|
| 2188 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2189 |
"source_evidence": [
|
| 2190 |
-
"
|
| 2191 |
],
|
| 2192 |
-
"embedding_sim": 0.
|
| 2193 |
-
"
|
| 2194 |
-
"
|
| 2195 |
-
"
|
| 2196 |
-
"
|
| 2197 |
}
|
| 2198 |
},
|
| 2199 |
{
|
|
@@ -2220,10 +2288,14 @@
|
|
| 2220 |
},
|
| 2221 |
{
|
| 2222 |
"id": 130,
|
| 2223 |
-
"name": "
|
| 2224 |
"related": [
|
| 2225 |
-
|
| 2226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2227 |
],
|
| 2228 |
"type": "aggregation",
|
| 2229 |
"info": {
|
|
@@ -2235,21 +2307,25 @@
|
|
| 2235 |
"relation_label": "belongs to"
|
| 2236 |
},
|
| 2237 |
"isShown": true,
|
| 2238 |
-
"desc": "Concept group: PRM >
|
| 2239 |
"dtype": "determine",
|
| 2240 |
"recover": true,
|
| 2241 |
"concept_provenance": {
|
| 2242 |
-
"node_label": "
|
| 2243 |
-
"confidence": 0.
|
| 2244 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2245 |
"source_evidence": [
|
| 2246 |
-
"
|
| 2247 |
],
|
| 2248 |
-
"embedding_sim": 0.
|
| 2249 |
-
"
|
| 2250 |
-
"
|
| 2251 |
-
"
|
| 2252 |
-
"
|
| 2253 |
}
|
| 2254 |
},
|
| 2255 |
{
|
|
@@ -2274,7 +2350,7 @@
|
|
| 2274 |
},
|
| 2275 |
{
|
| 2276 |
"id": 132,
|
| 2277 |
-
"name": "Percent Correct",
|
| 2278 |
"related": [
|
| 2279 |
73,
|
| 2280 |
74
|
|
@@ -2289,21 +2365,25 @@
|
|
| 2289 |
"relation_label": "belongs to"
|
| 2290 |
},
|
| 2291 |
"isShown": true,
|
| 2292 |
-
"desc": "Concept group: PRM > Percent Correct",
|
| 2293 |
"dtype": "determine",
|
| 2294 |
"recover": true,
|
| 2295 |
"concept_provenance": {
|
| 2296 |
-
"node_label": "Percent Correct",
|
| 2297 |
-
"confidence": 0.
|
| 2298 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2299 |
"source_evidence": [
|
| 2300 |
-
"
|
| 2301 |
],
|
| 2302 |
-
"embedding_sim": 0.
|
| 2303 |
-
"
|
| 2304 |
-
"
|
| 2305 |
-
"
|
| 2306 |
-
"
|
| 2307 |
}
|
| 2308 |
},
|
| 2309 |
{
|
|
@@ -2353,21 +2433,24 @@
|
|
| 2353 |
"recover": true,
|
| 2354 |
"concept_provenance": {
|
| 2355 |
"node_label": "Total",
|
| 2356 |
-
"confidence": 0.
|
| 2357 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
| 2358 |
"source_evidence": [
|
| 2359 |
-
"
|
| 2360 |
],
|
| 2361 |
-
"embedding_sim": 0.
|
| 2362 |
-
"
|
| 2363 |
-
"
|
| 2364 |
-
"
|
| 2365 |
-
"
|
| 2366 |
}
|
| 2367 |
},
|
| 2368 |
{
|
| 2369 |
"id": 135,
|
| 2370 |
-
"name": "Response Latency",
|
| 2371 |
"related": [
|
| 2372 |
77,
|
| 2373 |
78,
|
|
@@ -2383,21 +2466,25 @@
|
|
| 2383 |
"relation_label": "belongs to"
|
| 2384 |
},
|
| 2385 |
"isShown": true,
|
| 2386 |
-
"desc": "Concept group: RVP > Response Latency",
|
| 2387 |
"dtype": "determine",
|
| 2388 |
"recover": true,
|
| 2389 |
"concept_provenance": {
|
| 2390 |
-
"node_label": "Response Latency",
|
| 2391 |
-
"confidence": 0.
|
| 2392 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2393 |
"source_evidence": [
|
| 2394 |
-
"
|
| 2395 |
],
|
| 2396 |
-
"embedding_sim": 0.
|
| 2397 |
-
"
|
| 2398 |
-
"
|
| 2399 |
-
"
|
| 2400 |
-
"
|
| 2401 |
}
|
| 2402 |
},
|
| 2403 |
{
|
|
@@ -2426,9 +2513,9 @@
|
|
| 2426 |
"related": [
|
| 2427 |
140,
|
| 2428 |
141,
|
|
|
|
| 2429 |
143,
|
| 2430 |
144,
|
| 2431 |
-
145,
|
| 2432 |
148
|
| 2433 |
],
|
| 2434 |
"type": "aggregation",
|
|
@@ -2445,9 +2532,50 @@
|
|
| 2445 |
"dtype": "determine",
|
| 2446 |
"recover": true
|
| 2447 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2448 |
{
|
| 2449 |
"id": 140,
|
| 2450 |
-
"name": "Strategy",
|
| 2451 |
"related": [
|
| 2452 |
96,
|
| 2453 |
97,
|
|
@@ -2463,21 +2591,25 @@
|
|
| 2463 |
"relation_label": "belongs to"
|
| 2464 |
},
|
| 2465 |
"isShown": true,
|
| 2466 |
-
"desc": "Concept group: SWM > Strategy",
|
| 2467 |
"dtype": "determine",
|
| 2468 |
"recover": true,
|
| 2469 |
"concept_provenance": {
|
| 2470 |
-
"node_label": "Strategy",
|
| 2471 |
-
"confidence": 0.
|
| 2472 |
-
"alternatives": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2473 |
"source_evidence": [
|
| 2474 |
-
"
|
| 2475 |
],
|
| 2476 |
-
"embedding_sim": 0.
|
| 2477 |
-
"
|
| 2478 |
-
"
|
| 2479 |
-
"
|
| 2480 |
-
"
|
| 2481 |
}
|
| 2482 |
},
|
| 2483 |
{
|
|
@@ -2501,14 +2633,14 @@
|
|
| 2501 |
"recover": true
|
| 2502 |
},
|
| 2503 |
{
|
| 2504 |
-
"id":
|
| 2505 |
-
"name": "
|
| 2506 |
"related": [
|
| 2507 |
-
|
| 2508 |
-
|
| 2509 |
-
|
| 2510 |
-
|
| 2511 |
-
|
| 2512 |
],
|
| 2513 |
"type": "aggregation",
|
| 2514 |
"info": {
|
|
@@ -2520,32 +2652,34 @@
|
|
| 2520 |
"relation_label": "belongs to"
|
| 2521 |
},
|
| 2522 |
"isShown": true,
|
| 2523 |
-
"desc": "Concept group: SWM >
|
| 2524 |
"dtype": "determine",
|
| 2525 |
"recover": true,
|
| 2526 |
"concept_provenance": {
|
| 2527 |
-
"node_label": "
|
| 2528 |
-
"confidence": 0.
|
| 2529 |
-
"alternatives": [
|
|
|
|
|
|
|
| 2530 |
"source_evidence": [
|
| 2531 |
-
"
|
| 2532 |
],
|
| 2533 |
-
"embedding_sim": 0.
|
| 2534 |
-
"
|
| 2535 |
-
"coverage": 0.0,
|
| 2536 |
"contrast": 0.0,
|
| 2537 |
-
"specificity": 0.0
|
|
|
|
| 2538 |
}
|
| 2539 |
},
|
| 2540 |
{
|
| 2541 |
-
"id":
|
| 2542 |
-
"name": "
|
| 2543 |
"related": [
|
| 2544 |
-
|
| 2545 |
-
|
| 2546 |
-
|
| 2547 |
-
|
| 2548 |
-
|
| 2549 |
],
|
| 2550 |
"type": "aggregation",
|
| 2551 |
"info": {
|
|
@@ -2557,12 +2691,28 @@
|
|
| 2557 |
"relation_label": "belongs to"
|
| 2558 |
},
|
| 2559 |
"isShown": true,
|
| 2560 |
-
"desc": "Concept group: SWM >
|
| 2561 |
"dtype": "determine",
|
| 2562 |
-
"recover": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2563 |
},
|
| 2564 |
{
|
| 2565 |
-
"id":
|
| 2566 |
"name": "Double Errors",
|
| 2567 |
"related": [
|
| 2568 |
92,
|
|
@@ -2586,14 +2736,11 @@
|
|
| 2586 |
"recover": true
|
| 2587 |
},
|
| 2588 |
{
|
| 2589 |
-
"id":
|
| 2590 |
"name": "Correct Latency",
|
| 2591 |
"related": [
|
| 2592 |
-
|
| 2593 |
-
|
| 2594 |
-
155,
|
| 2595 |
-
156,
|
| 2596 |
-
157
|
| 2597 |
],
|
| 2598 |
"type": "aggregation",
|
| 2599 |
"info": {
|
|
@@ -2609,6 +2756,27 @@
|
|
| 2609 |
"dtype": "determine",
|
| 2610 |
"recover": true
|
| 2611 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2612 |
{
|
| 2613 |
"id": 147,
|
| 2614 |
"name": "Total Errors",
|
|
@@ -2634,11 +2802,7 @@
|
|
| 2634 |
"id": 148,
|
| 2635 |
"name": "Errors Boxes",
|
| 2636 |
"related": [
|
| 2637 |
-
|
| 2638 |
-
86,
|
| 2639 |
-
87,
|
| 2640 |
-
88,
|
| 2641 |
-
89
|
| 2642 |
],
|
| 2643 |
"type": "aggregation",
|
| 2644 |
"info": {
|
|
@@ -2719,50 +2883,6 @@
|
|
| 2719 |
},
|
| 2720 |
{
|
| 2721 |
"id": 152,
|
| 2722 |
-
"name": "Delayed",
|
| 2723 |
-
"related": [
|
| 2724 |
-
67,
|
| 2725 |
-
69,
|
| 2726 |
-
71
|
| 2727 |
-
],
|
| 2728 |
-
"type": "aggregation",
|
| 2729 |
-
"info": {
|
| 2730 |
-
"operation": "concat",
|
| 2731 |
-
"usedAttributes": [],
|
| 2732 |
-
"formula": "",
|
| 2733 |
-
"exec": "",
|
| 2734 |
-
"relation_type": "belongs_to",
|
| 2735 |
-
"relation_label": "belongs to"
|
| 2736 |
-
},
|
| 2737 |
-
"isShown": true,
|
| 2738 |
-
"desc": "Sub-group: Delayed",
|
| 2739 |
-
"dtype": "determine",
|
| 2740 |
-
"recover": true
|
| 2741 |
-
},
|
| 2742 |
-
{
|
| 2743 |
-
"id": 153,
|
| 2744 |
-
"name": "Immediate",
|
| 2745 |
-
"related": [
|
| 2746 |
-
72,
|
| 2747 |
-
68,
|
| 2748 |
-
70
|
| 2749 |
-
],
|
| 2750 |
-
"type": "aggregation",
|
| 2751 |
-
"info": {
|
| 2752 |
-
"operation": "concat",
|
| 2753 |
-
"usedAttributes": [],
|
| 2754 |
-
"formula": "",
|
| 2755 |
-
"exec": "",
|
| 2756 |
-
"relation_type": "belongs_to",
|
| 2757 |
-
"relation_label": "belongs to"
|
| 2758 |
-
},
|
| 2759 |
-
"isShown": true,
|
| 2760 |
-
"desc": "Sub-group: Immediate",
|
| 2761 |
-
"dtype": "determine",
|
| 2762 |
-
"recover": true
|
| 2763 |
-
},
|
| 2764 |
-
{
|
| 2765 |
-
"id": 154,
|
| 2766 |
"name": "Median Seconds Delay",
|
| 2767 |
"related": [
|
| 2768 |
9,
|
|
@@ -2784,8 +2904,8 @@
|
|
| 2784 |
"recover": true
|
| 2785 |
},
|
| 2786 |
{
|
| 2787 |
-
"id":
|
| 2788 |
-
"name": "
|
| 2789 |
"related": [
|
| 2790 |
16,
|
| 2791 |
17,
|
|
@@ -2801,12 +2921,12 @@
|
|
| 2801 |
"relation_label": "belongs to"
|
| 2802 |
},
|
| 2803 |
"isShown": true,
|
| 2804 |
-
"desc": "Sub-group:
|
| 2805 |
"dtype": "determine",
|
| 2806 |
"recover": true
|
| 2807 |
},
|
| 2808 |
{
|
| 2809 |
-
"id":
|
| 2810 |
"name": "Median",
|
| 2811 |
"related": [
|
| 2812 |
8,
|
|
@@ -2826,27 +2946,5 @@
|
|
| 2826 |
"desc": "Sub-group: Median",
|
| 2827 |
"dtype": "determine",
|
| 2828 |
"recover": true
|
| 2829 |
-
},
|
| 2830 |
-
{
|
| 2831 |
-
"id": 157,
|
| 2832 |
-
"name": "Mean",
|
| 2833 |
-
"related": [
|
| 2834 |
-
18,
|
| 2835 |
-
19,
|
| 2836 |
-
14
|
| 2837 |
-
],
|
| 2838 |
-
"type": "aggregation",
|
| 2839 |
-
"info": {
|
| 2840 |
-
"operation": "concat",
|
| 2841 |
-
"usedAttributes": [],
|
| 2842 |
-
"formula": "",
|
| 2843 |
-
"exec": "",
|
| 2844 |
-
"relation_type": "belongs_to",
|
| 2845 |
-
"relation_label": "belongs to"
|
| 2846 |
-
},
|
| 2847 |
-
"isShown": true,
|
| 2848 |
-
"desc": "Sub-group: Mean",
|
| 2849 |
-
"dtype": "determine",
|
| 2850 |
-
"recover": true
|
| 2851 |
}
|
| 2852 |
]
|
|
|
|
| 1645 |
115,
|
| 1646 |
116,
|
| 1647 |
117,
|
| 1648 |
+
145
|
| 1649 |
],
|
| 1650 |
"type": "aggregation",
|
| 1651 |
"info": {
|
|
|
|
| 1687 |
"recover": true,
|
| 1688 |
"concept_provenance": {
|
| 1689 |
"node_label": "Total Correct",
|
| 1690 |
+
"confidence": 0.507,
|
| 1691 |
+
"alternatives": [
|
| 1692 |
+
"correct total",
|
| 1693 |
+
"correct total times"
|
| 1694 |
+
],
|
| 1695 |
"source_evidence": [
|
| 1696 |
+
"description_title"
|
| 1697 |
],
|
| 1698 |
+
"embedding_sim": 0.319,
|
| 1699 |
+
"coverage": 0.319,
|
| 1700 |
+
"contrast": 0.086,
|
| 1701 |
+
"specificity": 0.0,
|
| 1702 |
+
"string_sim": 1.0
|
| 1703 |
}
|
| 1704 |
},
|
| 1705 |
{
|
|
|
|
| 1725 |
"recover": true,
|
| 1726 |
"concept_provenance": {
|
| 1727 |
"node_label": "Error",
|
| 1728 |
+
"confidence": 0.447,
|
| 1729 |
+
"alternatives": [
|
| 1730 |
+
"error times subject",
|
| 1731 |
+
"error times",
|
| 1732 |
+
"failed"
|
| 1733 |
+
],
|
| 1734 |
"source_evidence": [
|
| 1735 |
+
"description_title"
|
| 1736 |
],
|
| 1737 |
+
"embedding_sim": 0.216,
|
| 1738 |
+
"coverage": 0.216,
|
|
|
|
| 1739 |
"contrast": 0.0,
|
| 1740 |
+
"specificity": 0.0,
|
| 1741 |
+
"string_sim": 1.0
|
| 1742 |
+
}
|
| 1743 |
+
},
|
| 1744 |
+
{
|
| 1745 |
+
"id": 112,
|
| 1746 |
+
"name": "Mean Latency",
|
| 1747 |
+
"related": [
|
| 1748 |
+
14,
|
| 1749 |
+
18,
|
| 1750 |
+
19,
|
| 1751 |
+
152,
|
| 1752 |
+
153,
|
| 1753 |
+
154
|
| 1754 |
+
],
|
| 1755 |
+
"type": "aggregation",
|
| 1756 |
+
"info": {
|
| 1757 |
+
"operation": "concat",
|
| 1758 |
+
"usedAttributes": [],
|
| 1759 |
+
"formula": "",
|
| 1760 |
+
"exec": "",
|
| 1761 |
+
"relation_type": "belongs_to",
|
| 1762 |
+
"relation_label": "belongs to"
|
| 1763 |
+
},
|
| 1764 |
+
"isShown": true,
|
| 1765 |
+
"desc": "Concept group: DMS > Correct Latency Mean",
|
| 1766 |
+
"dtype": "determine",
|
| 1767 |
+
"recover": true,
|
| 1768 |
+
"concept_provenance": {
|
| 1769 |
+
"node_label": "Correct Latency Mean",
|
| 1770 |
+
"confidence": 0.625,
|
| 1771 |
+
"alternatives": [
|
| 1772 |
+
"latency mean"
|
| 1773 |
+
],
|
| 1774 |
+
"source_evidence": [
|
| 1775 |
+
"keybert"
|
| 1776 |
+
],
|
| 1777 |
+
"embedding_sim": 0.676,
|
| 1778 |
+
"coverage": 0.676,
|
| 1779 |
+
"contrast": 0.076,
|
| 1780 |
+
"specificity": 0.0,
|
| 1781 |
+
"string_sim": 0.884
|
| 1782 |
}
|
| 1783 |
},
|
| 1784 |
{
|
| 1785 |
"id": 113,
|
| 1786 |
+
"name": "Errors Total",
|
| 1787 |
"related": [
|
| 1788 |
34,
|
| 1789 |
35
|
|
|
|
| 1798 |
"relation_label": "belongs to"
|
| 1799 |
},
|
| 1800 |
"isShown": true,
|
| 1801 |
+
"desc": "Concept group: DMS > Errors Total",
|
| 1802 |
"dtype": "determine",
|
| 1803 |
"recover": true,
|
| 1804 |
"concept_provenance": {
|
| 1805 |
+
"node_label": "Errors Total",
|
| 1806 |
+
"confidence": 0.604,
|
| 1807 |
+
"alternatives": [
|
| 1808 |
+
"errors total times",
|
| 1809 |
+
"Total Errors"
|
| 1810 |
+
],
|
| 1811 |
"source_evidence": [
|
| 1812 |
+
"keybert"
|
| 1813 |
],
|
| 1814 |
+
"embedding_sim": 0.543,
|
| 1815 |
+
"coverage": 0.543,
|
| 1816 |
+
"contrast": 0.125,
|
| 1817 |
+
"specificity": 0.0,
|
| 1818 |
+
"string_sim": 0.974
|
| 1819 |
}
|
| 1820 |
},
|
| 1821 |
{
|
|
|
|
| 1844 |
"recover": true,
|
| 1845 |
"concept_provenance": {
|
| 1846 |
"node_label": "Correct Latency Standard Deviation",
|
| 1847 |
+
"confidence": 0.687,
|
| 1848 |
+
"alternatives": [
|
| 1849 |
+
"latency standard deviation",
|
| 1850 |
+
"deviation response latencies"
|
| 1851 |
+
],
|
| 1852 |
"source_evidence": [
|
| 1853 |
+
"description_title"
|
| 1854 |
],
|
| 1855 |
+
"embedding_sim": 0.684,
|
| 1856 |
+
"coverage": 0.684,
|
| 1857 |
+
"contrast": 0.193,
|
| 1858 |
+
"specificity": 0.0,
|
| 1859 |
+
"string_sim": 1.0
|
| 1860 |
}
|
| 1861 |
},
|
| 1862 |
{
|
| 1863 |
"id": 115,
|
| 1864 |
+
"name": "Probability Error Occurring",
|
| 1865 |
"related": [
|
| 1866 |
26,
|
| 1867 |
27
|
|
|
|
| 1876 |
"relation_label": "belongs to"
|
| 1877 |
},
|
| 1878 |
"isShown": true,
|
| 1879 |
+
"desc": "Concept group: DMS > Probability Error Occurring",
|
| 1880 |
"dtype": "determine",
|
| 1881 |
"recover": true,
|
| 1882 |
"concept_provenance": {
|
| 1883 |
+
"node_label": "Probability Error Occurring",
|
| 1884 |
+
"confidence": 0.619,
|
| 1885 |
+
"alternatives": [
|
| 1886 |
+
"Probability Error",
|
| 1887 |
+
"probability error made",
|
| 1888 |
+
"reports probability error"
|
| 1889 |
+
],
|
| 1890 |
"source_evidence": [
|
| 1891 |
+
"keybert"
|
| 1892 |
],
|
| 1893 |
+
"embedding_sim": 0.578,
|
| 1894 |
+
"coverage": 0.578,
|
| 1895 |
+
"contrast": 0.142,
|
| 1896 |
+
"specificity": 0.0,
|
| 1897 |
+
"string_sim": 0.966
|
| 1898 |
}
|
| 1899 |
},
|
| 1900 |
{
|
| 1901 |
"id": 116,
|
| 1902 |
+
"name": "Percent Correct Percentage",
|
| 1903 |
"related": [
|
| 1904 |
20,
|
| 1905 |
21,
|
|
|
|
| 1918 |
"relation_label": "belongs to"
|
| 1919 |
},
|
| 1920 |
"isShown": true,
|
| 1921 |
+
"desc": "Concept group: DMS > Percent Correct Percentage",
|
| 1922 |
"dtype": "determine",
|
| 1923 |
"recover": true,
|
| 1924 |
"concept_provenance": {
|
| 1925 |
+
"node_label": "Percent Correct Percentage",
|
| 1926 |
+
"confidence": 0.54,
|
| 1927 |
+
"alternatives": [
|
| 1928 |
+
"correct percentage assessment",
|
| 1929 |
+
"correct percentage",
|
| 1930 |
+
"Percent Correct"
|
| 1931 |
+
],
|
| 1932 |
"source_evidence": [
|
| 1933 |
+
"keybert"
|
| 1934 |
],
|
| 1935 |
+
"embedding_sim": 0.473,
|
| 1936 |
+
"coverage": 0.473,
|
| 1937 |
+
"contrast": 0.156,
|
| 1938 |
+
"specificity": 0.0,
|
| 1939 |
+
"string_sim": 0.868
|
| 1940 |
}
|
| 1941 |
},
|
| 1942 |
{
|
|
|
|
| 2002 |
"recover": true,
|
| 2003 |
"concept_provenance": {
|
| 2004 |
"node_label": "Latency Display Stimulus",
|
| 2005 |
+
"confidence": 0.418,
|
| 2006 |
"alternatives": [
|
| 2007 |
+
"mean latency display",
|
| 2008 |
+
"standard deviation latency",
|
| 2009 |
+
"deviation latency calculated"
|
| 2010 |
],
|
| 2011 |
"source_evidence": [
|
| 2012 |
"keybert"
|
|
|
|
| 2014 |
"embedding_sim": 0.732,
|
| 2015 |
"coverage": 0.732,
|
| 2016 |
"contrast": 0.595,
|
| 2017 |
+
"specificity": 0.0,
|
| 2018 |
"string_sim": 0.0
|
| 2019 |
}
|
| 2020 |
},
|
|
|
|
| 2040 |
"recover": true,
|
| 2041 |
"concept_provenance": {
|
| 2042 |
"node_label": "Total Assessment Trials",
|
| 2043 |
+
"confidence": 0.313,
|
| 2044 |
"alternatives": [
|
| 2045 |
+
"assessment trials subject",
|
| 2046 |
+
"trials subject failed",
|
| 2047 |
+
"trials subject"
|
| 2048 |
],
|
| 2049 |
"source_evidence": [
|
| 2050 |
"keybert"
|
|
|
|
| 2052 |
"embedding_sim": 0.629,
|
| 2053 |
"coverage": 0.629,
|
| 2054 |
"contrast": 0.204,
|
| 2055 |
+
"specificity": 0.0,
|
| 2056 |
"string_sim": 0.0
|
| 2057 |
}
|
| 2058 |
},
|
|
|
|
| 2060 |
"id": 121,
|
| 2061 |
"name": "PAL",
|
| 2062 |
"related": [
|
|
|
|
| 2063 |
123,
|
|
|
|
| 2064 |
125,
|
| 2065 |
126,
|
| 2066 |
+
146,
|
| 2067 |
147
|
| 2068 |
],
|
| 2069 |
"type": "aggregation",
|
|
|
|
| 2082 |
},
|
| 2083 |
{
|
| 2084 |
"id": 122,
|
| 2085 |
+
"name": "Attempts Patterns",
|
| 2086 |
"related": [
|
| 2087 |
49,
|
| 2088 |
50,
|
|
|
|
| 2101 |
"relation_label": "belongs to"
|
| 2102 |
},
|
| 2103 |
"isShown": true,
|
| 2104 |
+
"desc": "Concept group: PAL > Attempts Patterns Total",
|
| 2105 |
"dtype": "determine",
|
| 2106 |
"recover": true,
|
| 2107 |
"concept_provenance": {
|
| 2108 |
+
"node_label": "Attempts Patterns Total",
|
| 2109 |
+
"confidence": 0.633,
|
| 2110 |
+
"alternatives": [
|
| 2111 |
+
"patterns total attempts",
|
| 2112 |
+
"Total Attempts Patterns"
|
| 2113 |
+
],
|
| 2114 |
"source_evidence": [
|
| 2115 |
+
"keybert"
|
| 2116 |
],
|
| 2117 |
+
"embedding_sim": 0.598,
|
| 2118 |
+
"coverage": 0.598,
|
| 2119 |
+
"contrast": 0.151,
|
| 2120 |
+
"specificity": 0.0,
|
| 2121 |
+
"string_sim": 0.975
|
| 2122 |
}
|
| 2123 |
},
|
| 2124 |
{
|
|
|
|
| 2143 |
},
|
| 2144 |
{
|
| 2145 |
"id": 124,
|
| 2146 |
+
"name": "Errors Patterns",
|
| 2147 |
"related": [
|
| 2148 |
55,
|
| 2149 |
56,
|
|
|
|
| 2167 |
"recover": true,
|
| 2168 |
"concept_provenance": {
|
| 2169 |
"node_label": "Errors Patterns Total",
|
| 2170 |
+
"confidence": 0.296,
|
| 2171 |
"alternatives": [
|
| 2172 |
+
"box stimulus assessment",
|
| 2173 |
+
"stimulus assessment problems",
|
| 2174 |
+
"incorrect box stimulus"
|
| 2175 |
],
|
| 2176 |
"source_evidence": [
|
| 2177 |
"keybert"
|
|
|
|
| 2179 |
"embedding_sim": 0.619,
|
| 2180 |
"coverage": 0.619,
|
| 2181 |
"contrast": 0.115,
|
| 2182 |
+
"specificity": 0.0,
|
| 2183 |
"string_sim": 0.0
|
| 2184 |
}
|
| 2185 |
},
|
|
|
|
| 2248 |
"recover": true,
|
| 2249 |
"concept_provenance": {
|
| 2250 |
"node_label": "Include Total Errors Shapes",
|
| 2251 |
+
"confidence": 0.609,
|
| 2252 |
+
"alternatives": [
|
| 2253 |
+
"total errors shapes",
|
| 2254 |
+
"errors shapes times",
|
| 2255 |
+
"errors shapes"
|
| 2256 |
+
],
|
| 2257 |
"source_evidence": [
|
| 2258 |
+
"description_title"
|
| 2259 |
],
|
| 2260 |
+
"embedding_sim": 0.549,
|
| 2261 |
+
"coverage": 0.549,
|
| 2262 |
+
"contrast": 0.08,
|
| 2263 |
+
"specificity": 0.0,
|
| 2264 |
+
"string_sim": 1.0
|
| 2265 |
}
|
| 2266 |
},
|
| 2267 |
{
|
|
|
|
| 2288 |
},
|
| 2289 |
{
|
| 2290 |
"id": 130,
|
| 2291 |
+
"name": "Latency Immediate Standard",
|
| 2292 |
"related": [
|
| 2293 |
+
67,
|
| 2294 |
+
68,
|
| 2295 |
+
69,
|
| 2296 |
+
70,
|
| 2297 |
+
71,
|
| 2298 |
+
72
|
| 2299 |
],
|
| 2300 |
"type": "aggregation",
|
| 2301 |
"info": {
|
|
|
|
| 2307 |
"relation_label": "belongs to"
|
| 2308 |
},
|
| 2309 |
"isShown": true,
|
| 2310 |
+
"desc": "Concept group: PRM > Latency Immediate Standard",
|
| 2311 |
"dtype": "determine",
|
| 2312 |
"recover": true,
|
| 2313 |
"concept_provenance": {
|
| 2314 |
+
"node_label": "Latency Immediate Standard",
|
| 2315 |
+
"confidence": 0.653,
|
| 2316 |
+
"alternatives": [
|
| 2317 |
+
"correct latency immediate",
|
| 2318 |
+
"latency immediate",
|
| 2319 |
+
"correct latency delayed"
|
| 2320 |
+
],
|
| 2321 |
"source_evidence": [
|
| 2322 |
+
"keybert"
|
| 2323 |
],
|
| 2324 |
+
"embedding_sim": 0.715,
|
| 2325 |
+
"coverage": 0.715,
|
| 2326 |
+
"contrast": 0.34,
|
| 2327 |
+
"specificity": 0.0,
|
| 2328 |
+
"string_sim": 0.801
|
| 2329 |
}
|
| 2330 |
},
|
| 2331 |
{
|
|
|
|
| 2350 |
},
|
| 2351 |
{
|
| 2352 |
"id": 132,
|
| 2353 |
+
"name": "Percent Correct Immediate",
|
| 2354 |
"related": [
|
| 2355 |
73,
|
| 2356 |
74
|
|
|
|
| 2365 |
"relation_label": "belongs to"
|
| 2366 |
},
|
| 2367 |
"isShown": true,
|
| 2368 |
+
"desc": "Concept group: PRM > Percent Correct Immediate",
|
| 2369 |
"dtype": "determine",
|
| 2370 |
"recover": true,
|
| 2371 |
"concept_provenance": {
|
| 2372 |
+
"node_label": "Percent Correct Immediate",
|
| 2373 |
+
"confidence": 0.596,
|
| 2374 |
+
"alternatives": [
|
| 2375 |
+
"Percent Correct",
|
| 2376 |
+
"key percent correct",
|
| 2377 |
+
"percent correct delayed"
|
| 2378 |
+
],
|
| 2379 |
"source_evidence": [
|
| 2380 |
+
"keybert"
|
| 2381 |
],
|
| 2382 |
+
"embedding_sim": 0.671,
|
| 2383 |
+
"coverage": 0.671,
|
| 2384 |
+
"contrast": 0.245,
|
| 2385 |
+
"specificity": 0.0,
|
| 2386 |
+
"string_sim": 0.735
|
| 2387 |
}
|
| 2388 |
},
|
| 2389 |
{
|
|
|
|
| 2433 |
"recover": true,
|
| 2434 |
"concept_provenance": {
|
| 2435 |
"node_label": "Total",
|
| 2436 |
+
"confidence": 0.407,
|
| 2437 |
+
"alternatives": [
|
| 2438 |
+
"total hits",
|
| 2439 |
+
"hits total"
|
| 2440 |
+
],
|
| 2441 |
"source_evidence": [
|
| 2442 |
+
"description_title"
|
| 2443 |
],
|
| 2444 |
+
"embedding_sim": 0.111,
|
| 2445 |
+
"coverage": 0.111,
|
| 2446 |
+
"contrast": 0.05,
|
| 2447 |
+
"specificity": 0.0,
|
| 2448 |
+
"string_sim": 1.0
|
| 2449 |
}
|
| 2450 |
},
|
| 2451 |
{
|
| 2452 |
"id": 135,
|
| 2453 |
+
"name": "Response Latency Mean",
|
| 2454 |
"related": [
|
| 2455 |
77,
|
| 2456 |
78,
|
|
|
|
| 2466 |
"relation_label": "belongs to"
|
| 2467 |
},
|
| 2468 |
"isShown": true,
|
| 2469 |
+
"desc": "Concept group: RVP > Response Latency Mean",
|
| 2470 |
"dtype": "determine",
|
| 2471 |
"recover": true,
|
| 2472 |
"concept_provenance": {
|
| 2473 |
+
"node_label": "Response Latency Mean",
|
| 2474 |
+
"confidence": 0.676,
|
| 2475 |
+
"alternatives": [
|
| 2476 |
+
"Response Latency",
|
| 2477 |
+
"response latency trials",
|
| 2478 |
+
"latency mean response"
|
| 2479 |
+
],
|
| 2480 |
"source_evidence": [
|
| 2481 |
+
"keybert"
|
| 2482 |
],
|
| 2483 |
+
"embedding_sim": 0.683,
|
| 2484 |
+
"coverage": 0.683,
|
| 2485 |
+
"contrast": 0.311,
|
| 2486 |
+
"specificity": 0.0,
|
| 2487 |
+
"string_sim": 0.92
|
| 2488 |
}
|
| 2489 |
},
|
| 2490 |
{
|
|
|
|
| 2513 |
"related": [
|
| 2514 |
140,
|
| 2515 |
141,
|
| 2516 |
+
142,
|
| 2517 |
143,
|
| 2518 |
144,
|
|
|
|
| 2519 |
148
|
| 2520 |
],
|
| 2521 |
"type": "aggregation",
|
|
|
|
| 2532 |
"dtype": "determine",
|
| 2533 |
"recover": true
|
| 2534 |
},
|
| 2535 |
+
{
|
| 2536 |
+
"id": 138,
|
| 2537 |
+
"name": "Times Errors",
|
| 2538 |
+
"related": [
|
| 2539 |
+
85,
|
| 2540 |
+
86,
|
| 2541 |
+
87,
|
| 2542 |
+
88,
|
| 2543 |
+
89
|
| 2544 |
+
],
|
| 2545 |
+
"type": "aggregation",
|
| 2546 |
+
"info": {
|
| 2547 |
+
"operation": "concat",
|
| 2548 |
+
"usedAttributes": [],
|
| 2549 |
+
"formula": "",
|
| 2550 |
+
"exec": "",
|
| 2551 |
+
"relation_type": "belongs_to",
|
| 2552 |
+
"relation_label": "belongs to"
|
| 2553 |
+
},
|
| 2554 |
+
"isShown": true,
|
| 2555 |
+
"desc": "Concept group: SWM > Errors Boxes Times",
|
| 2556 |
+
"dtype": "determine",
|
| 2557 |
+
"recover": true,
|
| 2558 |
+
"concept_provenance": {
|
| 2559 |
+
"node_label": "Errors Boxes Times",
|
| 2560 |
+
"confidence": 0.515,
|
| 2561 |
+
"alternatives": [
|
| 2562 |
+
"Errors Boxes",
|
| 2563 |
+
"key errors boxes",
|
| 2564 |
+
"errors times"
|
| 2565 |
+
],
|
| 2566 |
+
"source_evidence": [
|
| 2567 |
+
"keybert"
|
| 2568 |
+
],
|
| 2569 |
+
"embedding_sim": 0.447,
|
| 2570 |
+
"coverage": 0.447,
|
| 2571 |
+
"contrast": 0.0,
|
| 2572 |
+
"specificity": 0.0,
|
| 2573 |
+
"string_sim": 0.896
|
| 2574 |
+
}
|
| 2575 |
+
},
|
| 2576 |
{
|
| 2577 |
"id": 140,
|
| 2578 |
+
"name": "Strategy High",
|
| 2579 |
"related": [
|
| 2580 |
96,
|
| 2581 |
97,
|
|
|
|
| 2591 |
"relation_label": "belongs to"
|
| 2592 |
},
|
| 2593 |
"isShown": true,
|
| 2594 |
+
"desc": "Concept group: SWM > Strategy High",
|
| 2595 |
"dtype": "determine",
|
| 2596 |
"recover": true,
|
| 2597 |
"concept_provenance": {
|
| 2598 |
+
"node_label": "Strategy High",
|
| 2599 |
+
"confidence": 0.569,
|
| 2600 |
+
"alternatives": [
|
| 2601 |
+
"Strategy",
|
| 2602 |
+
"strategy finding",
|
| 2603 |
+
"high strategy"
|
| 2604 |
+
],
|
| 2605 |
"source_evidence": [
|
| 2606 |
+
"keybert"
|
| 2607 |
],
|
| 2608 |
+
"embedding_sim": 0.509,
|
| 2609 |
+
"coverage": 0.509,
|
| 2610 |
+
"contrast": 0.362,
|
| 2611 |
+
"specificity": 0.0,
|
| 2612 |
+
"string_sim": 0.814
|
| 2613 |
}
|
| 2614 |
},
|
| 2615 |
{
|
|
|
|
| 2633 |
"recover": true
|
| 2634 |
},
|
| 2635 |
{
|
| 2636 |
+
"id": 142,
|
| 2637 |
+
"name": "Within Errors",
|
| 2638 |
"related": [
|
| 2639 |
+
104,
|
| 2640 |
+
105,
|
| 2641 |
+
107,
|
| 2642 |
+
108,
|
| 2643 |
+
106
|
| 2644 |
],
|
| 2645 |
"type": "aggregation",
|
| 2646 |
"info": {
|
|
|
|
| 2652 |
"relation_label": "belongs to"
|
| 2653 |
},
|
| 2654 |
"isShown": true,
|
| 2655 |
+
"desc": "Concept group: SWM > Within Errors",
|
| 2656 |
"dtype": "determine",
|
| 2657 |
"recover": true,
|
| 2658 |
"concept_provenance": {
|
| 2659 |
+
"node_label": "Within Errors",
|
| 2660 |
+
"confidence": 0.412,
|
| 2661 |
+
"alternatives": [
|
| 2662 |
+
"boxes times subject"
|
| 2663 |
+
],
|
| 2664 |
"source_evidence": [
|
| 2665 |
+
"keybert"
|
| 2666 |
],
|
| 2667 |
+
"embedding_sim": 0.303,
|
| 2668 |
+
"coverage": 0.303,
|
|
|
|
| 2669 |
"contrast": 0.0,
|
| 2670 |
+
"specificity": 0.0,
|
| 2671 |
+
"string_sim": 0.787
|
| 2672 |
}
|
| 2673 |
},
|
| 2674 |
{
|
| 2675 |
+
"id": 143,
|
| 2676 |
+
"name": "Errors Total",
|
| 2677 |
"related": [
|
| 2678 |
+
99,
|
| 2679 |
+
100,
|
| 2680 |
+
101,
|
| 2681 |
+
102,
|
| 2682 |
+
103
|
| 2683 |
],
|
| 2684 |
"type": "aggregation",
|
| 2685 |
"info": {
|
|
|
|
| 2691 |
"relation_label": "belongs to"
|
| 2692 |
},
|
| 2693 |
"isShown": true,
|
| 2694 |
+
"desc": "Concept group: SWM > Errors Total",
|
| 2695 |
"dtype": "determine",
|
| 2696 |
+
"recover": true,
|
| 2697 |
+
"concept_provenance": {
|
| 2698 |
+
"node_label": "Errors Total",
|
| 2699 |
+
"confidence": 0.593,
|
| 2700 |
+
"alternatives": [
|
| 2701 |
+
"errors total times",
|
| 2702 |
+
"Total Errors"
|
| 2703 |
+
],
|
| 2704 |
+
"source_evidence": [
|
| 2705 |
+
"keybert"
|
| 2706 |
+
],
|
| 2707 |
+
"embedding_sim": 0.537,
|
| 2708 |
+
"coverage": 0.537,
|
| 2709 |
+
"contrast": 0.07,
|
| 2710 |
+
"specificity": 0.0,
|
| 2711 |
+
"string_sim": 0.974
|
| 2712 |
+
}
|
| 2713 |
},
|
| 2714 |
{
|
| 2715 |
+
"id": 144,
|
| 2716 |
"name": "Double Errors",
|
| 2717 |
"related": [
|
| 2718 |
92,
|
|
|
|
| 2736 |
"recover": true
|
| 2737 |
},
|
| 2738 |
{
|
| 2739 |
+
"id": 145,
|
| 2740 |
"name": "Correct Latency",
|
| 2741 |
"related": [
|
| 2742 |
+
112,
|
| 2743 |
+
114
|
|
|
|
|
|
|
|
|
|
| 2744 |
],
|
| 2745 |
"type": "aggregation",
|
| 2746 |
"info": {
|
|
|
|
| 2756 |
"dtype": "determine",
|
| 2757 |
"recover": true
|
| 2758 |
},
|
| 2759 |
+
{
|
| 2760 |
+
"id": 146,
|
| 2761 |
+
"name": "Patterns Total",
|
| 2762 |
+
"related": [
|
| 2763 |
+
122,
|
| 2764 |
+
124
|
| 2765 |
+
],
|
| 2766 |
+
"type": "aggregation",
|
| 2767 |
+
"info": {
|
| 2768 |
+
"operation": "concat",
|
| 2769 |
+
"usedAttributes": [],
|
| 2770 |
+
"formula": "",
|
| 2771 |
+
"exec": "",
|
| 2772 |
+
"relation_type": "belongs_to",
|
| 2773 |
+
"relation_label": "belongs to"
|
| 2774 |
+
},
|
| 2775 |
+
"isShown": true,
|
| 2776 |
+
"desc": "Measure: Patterns Total",
|
| 2777 |
+
"dtype": "determine",
|
| 2778 |
+
"recover": true
|
| 2779 |
+
},
|
| 2780 |
{
|
| 2781 |
"id": 147,
|
| 2782 |
"name": "Total Errors",
|
|
|
|
| 2802 |
"id": 148,
|
| 2803 |
"name": "Errors Boxes",
|
| 2804 |
"related": [
|
| 2805 |
+
138
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2806 |
],
|
| 2807 |
"type": "aggregation",
|
| 2808 |
"info": {
|
|
|
|
| 2883 |
},
|
| 2884 |
{
|
| 2885 |
"id": 152,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2886 |
"name": "Median Seconds Delay",
|
| 2887 |
"related": [
|
| 2888 |
9,
|
|
|
|
| 2904 |
"recover": true
|
| 2905 |
},
|
| 2906 |
{
|
| 2907 |
+
"id": 153,
|
| 2908 |
+
"name": "Seconds Delay",
|
| 2909 |
"related": [
|
| 2910 |
16,
|
| 2911 |
17,
|
|
|
|
| 2921 |
"relation_label": "belongs to"
|
| 2922 |
},
|
| 2923 |
"isShown": true,
|
| 2924 |
+
"desc": "Sub-group: Seconds Delay",
|
| 2925 |
"dtype": "determine",
|
| 2926 |
"recover": true
|
| 2927 |
},
|
| 2928 |
{
|
| 2929 |
+
"id": 154,
|
| 2930 |
"name": "Median",
|
| 2931 |
"related": [
|
| 2932 |
8,
|
|
|
|
| 2946 |
"desc": "Sub-group: Median",
|
| 2947 |
"dtype": "determine",
|
| 2948 |
"recover": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2949 |
}
|
| 2950 |
]
|
pages/2_Approach_1.py
CHANGED
|
@@ -121,7 +121,7 @@ _STOP = {
|
|
| 121 |
USE_NOUN_PHRASES = False
|
| 122 |
# USE_CTFIDF β True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
|
| 123 |
# boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
|
| 124 |
-
USE_CTFIDF =
|
| 125 |
# KEYBERT_DIVERSITY β MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
|
| 126 |
# (pick the single most relevant phrase); 0.5 = standard MMR diversification.
|
| 127 |
KEYBERT_DIVERSITY = 0
|
|
@@ -148,6 +148,20 @@ _CORPUS_IDF: dict = {}
|
|
| 148 |
# scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing).
|
| 149 |
_ACTIVE_DOMAIN: str = 'general'
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 152 |
# FILE LOADING
|
| 153 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -345,6 +359,9 @@ def build_canonical(df, cfg, source):
|
|
| 345 |
if not sem_parts:
|
| 346 |
sem_parts = list(leaf_parts) if leaf_parts else []
|
| 347 |
semantic_text = ' '.join(sem_parts) if sem_parts else text
|
|
|
|
|
|
|
|
|
|
| 348 |
rows.append({
|
| 349 |
'_source_file': source,
|
| 350 |
'_row_index': int(i),
|
|
@@ -1983,26 +2000,34 @@ def _keybert_candidates(member_texts, ancestor_words=None, used_labels=None,
|
|
| 1983 |
"""
|
| 1984 |
ancestor_words = ancestor_words or set()
|
| 1985 |
used = {str(u).lower() for u in (used_labels or [])}
|
|
|
|
| 1986 |
cand = set()
|
| 1987 |
for t in member_texts:
|
| 1988 |
-
raw = re.sub(r'\([^)]*\)', ' ', str(t))
|
| 1989 |
nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
|
| 1990 |
if nps:
|
| 1991 |
for p in nps:
|
| 1992 |
-
toks = [w for w in p.lower().split()
|
| 1993 |
-
if w not in _STOP and w not in ancestor_words]
|
| 1994 |
if toks:
|
| 1995 |
cand.add(' '.join(toks))
|
| 1996 |
else:
|
| 1997 |
-
toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower())
|
| 1998 |
-
if w not in _STOP and w not in ancestor_words]
|
| 1999 |
for nlen in range(1, max_words + 1):
|
| 2000 |
for i in range(len(toks) - nlen + 1):
|
| 2001 |
cand.add(' '.join(toks[i:i + nlen]))
|
| 2002 |
-
|
| 2003 |
-
|
| 2004 |
-
|
| 2005 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2006 |
|
| 2007 |
|
| 2008 |
def _concept_title(text):
|
|
@@ -2047,9 +2072,10 @@ def _title_cluster_label(member_titles, sibling_title_lists, ancestor_words=None
|
|
| 2047 |
used_labels = {str(u).lower() for u in (used_labels or [])}
|
| 2048 |
|
| 2049 |
def _phrases(title):
|
| 2050 |
-
t = re.sub(r'\([^)]*\)', ' ', title.lower())
|
| 2051 |
toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
|
| 2052 |
-
if w not in _STOP and w not in ancestor_words
|
|
|
|
| 2053 |
out = set()
|
| 2054 |
for nlen in range(1, max_words + 1):
|
| 2055 |
for i in range(len(toks) - nlen + 1):
|
|
@@ -2098,9 +2124,10 @@ def _raw_title(text):
|
|
| 2098 |
def _label_from_own_title(title, ancestor_words, max_words=4):
|
| 2099 |
"""[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
|
| 2100 |
words and parentheticals). Returns '' for sentence-like / empty titles."""
|
| 2101 |
-
t = re.sub(r'\([^)]*\)', ' ', str(title).lower())
|
| 2102 |
toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
|
| 2103 |
-
if w not in _STOP and w not in ancestor_words
|
|
|
|
| 2104 |
if not toks or len(toks) > 7: # >7 words → prose, not a concept title
|
| 2105 |
return ''
|
| 2106 |
return ' '.join(toks[:max_words]).title()
|
|
@@ -2413,6 +2440,12 @@ def _cluster_and_label(tdf, path_prefix, nodes, leaf_to_id, embedder,
|
|
| 2413 |
if pool and cluster_emb is not None:
|
| 2414 |
cand_embs = np.asarray(embedder.encode(pool), dtype=float)
|
| 2415 |
relevance = cosine_similarity([cluster_emb], cand_embs)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2416 |
if sibling_centroids:
|
| 2417 |
sib_sim = cosine_similarity(cand_embs,
|
| 2418 |
np.asarray(sibling_centroids, dtype=float)).max(axis=1)
|
|
@@ -4053,6 +4086,16 @@ if uploads:
|
|
| 4053 |
else:
|
| 4054 |
c_embs = None
|
| 4055 |
nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4056 |
st.session_state.hiexpan_report = report
|
| 4057 |
wmoves = report.get('width_expansion_moves', 0)
|
| 4058 |
dexp = report.get('depth_expansion_nodes', 0)
|
|
|
|
| 121 |
USE_NOUN_PHRASES = False
|
| 122 |
# USE_CTFIDF — True: multiply KeyBERT cosine relevance by corpus IDF so dataset-wide
|
| 123 |
# boilerplate (low IDF) is down-weighted; False: plain cosine-to-centroid.
|
| 124 |
+
USE_CTFIDF = True
|
| 125 |
# KEYBERT_DIVERSITY — MMR redundancy penalty weight. 0 = pure argmax cosine-to-centroid
|
| 126 |
# (pick the single most relevant phrase); 0.5 = standard MMR diversification.
|
| 127 |
KEYBERT_DIVERSITY = 0
|
|
|
|
| 148 |
# scorer's external-grounding signal (Cognitive Atlas vs Wikidata routing).
|
| 149 |
_ACTIVE_DOMAIN: str = 'general'
|
| 150 |
|
| 151 |
+
# Label boilerplate: web/URL artifacts and Likert response-scale tokens that leak from
|
| 152 |
+
# data-dictionary descriptions (e.g. HCP FreeSurfer rows embed Neurolex URLs; survey rows
|
| 153 |
+
# embed "strongly agree" scales). These are stripped from KeyBERT candidates AND from the
|
| 154 |
+
# embedding text so they can neither name a node nor distort clustering. Domain-agnostic
|
| 155 |
+
# documentation/scale tokens only — not concept vocabulary.
|
| 156 |
+
_LABEL_BOILERPLATE = {
|
| 157 |
+
'http', 'https', 'href', 'www', 'org', 'com', 'net', 'wiki', 'url', 'link',
|
| 158 |
+
'neurolex', 'connectomedb', 'humanconnectome', 'definition', 'category',
|
| 159 |
+
'sa', 'sd', 'strongly', 'agree', 'disagree', 'neither', 'somewhat',
|
| 160 |
+
}
|
| 161 |
+
# Inline URLs in free text (http://…, www.…/…) — removed from the embedding text.
|
| 162 |
+
_URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
|
| 163 |
+
re.IGNORECASE)
|
| 164 |
+
|
| 165 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
# FILE LOADING
|
| 167 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 359 |
if not sem_parts:
|
| 360 |
sem_parts = list(leaf_parts) if leaf_parts else []
|
| 361 |
semantic_text = ' '.join(sem_parts) if sem_parts else text
|
| 362 |
+
# Strip inline URLs (HCP FreeSurfer rows embed Neurolex links) so web tokens
|
| 363 |
+
# cannot dominate either the embedding (clustering) or the KeyBERT label.
|
| 364 |
+
semantic_text = _URL_RE.sub(' ', semantic_text)
|
| 365 |
rows.append({
|
| 366 |
'_source_file': source,
|
| 367 |
'_row_index': int(i),
|
|
|
|
| 2000 |
"""
|
| 2001 |
ancestor_words = ancestor_words or set()
|
| 2002 |
used = {str(u).lower() for u in (used_labels or [])}
|
| 2003 |
+
block = _STOP | ancestor_words | _LABEL_BOILERPLATE # boilerplate/web/Likert tokens out
|
| 2004 |
cand = set()
|
| 2005 |
for t in member_texts:
|
| 2006 |
+
raw = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(t)))
|
| 2007 |
nps = _noun_phrases(raw, max_words) if USE_NOUN_PHRASES else []
|
| 2008 |
if nps:
|
| 2009 |
for p in nps:
|
| 2010 |
+
toks = [w for w in p.lower().split() if w not in block]
|
|
|
|
| 2011 |
if toks:
|
| 2012 |
cand.add(' '.join(toks))
|
| 2013 |
else:
|
| 2014 |
+
toks = [w for w in re.findall(r'[a-z][a-z\-]+', raw.lower()) if w not in block]
|
|
|
|
| 2015 |
for nlen in range(1, max_words + 1):
|
| 2016 |
for i in range(len(toks) - nlen + 1):
|
| 2017 |
cand.add(' '.join(toks[i:i + nlen]))
|
| 2018 |
+
|
| 2019 |
+
def _ok(c):
|
| 2020 |
+
words = c.split()
|
| 2021 |
+
if len(c) < 4 or c.lower() in used or c.replace(' ', '').isdigit():
|
| 2022 |
+
return False
|
| 2023 |
+
if re.search(r'\b(\w+)\s+\1\b', c.lower()): # adjacent word repeat
|
| 2024 |
+
return False
|
| 2025 |
+
if len(words) == 4 and words[:2] == words[2:]: # phrase repeat "x y x y"
|
| 2026 |
+
return False
|
| 2027 |
+
if len(words) == 1 and (len(c) <= 3 or _is_acronym(c)): # bare fragment/acronym
|
| 2028 |
+
return False
|
| 2029 |
+
return True
|
| 2030 |
+
return [c for c in cand if _ok(c)][:cap]
|
| 2031 |
|
| 2032 |
|
| 2033 |
def _concept_title(text):
|
|
|
|
| 2072 |
used_labels = {str(u).lower() for u in (used_labels or [])}
|
| 2073 |
|
| 2074 |
def _phrases(title):
|
| 2075 |
+
t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', title.lower())) # drop parens + URLs
|
| 2076 |
toks = [w for w in re.findall(r'[a-z][a-z\-]{1,}', t)
|
| 2077 |
+
if w not in _STOP and w not in ancestor_words
|
| 2078 |
+
and w not in _LABEL_BOILERPLATE] # web/Likert out
|
| 2079 |
out = set()
|
| 2080 |
for nlen in range(1, max_words + 1):
|
| 2081 |
for i in range(len(toks) - nlen + 1):
|
|
|
|
| 2124 |
def _label_from_own_title(title, ancestor_words, max_words=4):
|
| 2125 |
"""[Fix5] Label a singleton variable from its OWN title (minus ancestor/task
|
| 2126 |
words and parentheticals). Returns '' for sentence-like / empty titles."""
|
| 2127 |
+
t = _URL_RE.sub(' ', re.sub(r'\([^)]*\)', ' ', str(title).lower()))
|
| 2128 |
toks = [w for w in re.findall(r'[a-z][a-z\-]+', t)
|
| 2129 |
+
if w not in _STOP and w not in ancestor_words
|
| 2130 |
+
and w not in _LABEL_BOILERPLATE]
|
| 2131 |
if not toks or len(toks) > 7: # >7 words → prose, not a concept title
|
| 2132 |
return ''
|
| 2133 |
return ' '.join(toks[:max_words]).title()
|
|
|
|
| 2440 |
if pool and cluster_emb is not None:
|
| 2441 |
cand_embs = np.asarray(embedder.encode(pool), dtype=float)
|
| 2442 |
relevance = cosine_similarity([cluster_emb], cand_embs)[0]
|
| 2443 |
+
# c-TF-IDF: down-weight dataset-wide boilerplate (low corpus IDF) so generic
|
| 2444 |
+
# phrases ("test", "description", "measure", "scores") lose to distinctive ones.
|
| 2445 |
+
if USE_CTFIDF and _CORPUS_IDF:
|
| 2446 |
+
_mx = max(_CORPUS_IDF.values()) or 1.0
|
| 2447 |
+
_idf = np.array([min(1.0, _CORPUS_IDF.get(c.lower(), _mx) / _mx) for c in pool])
|
| 2448 |
+
relevance = relevance * (0.5 + 0.5 * _idf)
|
| 2449 |
if sibling_centroids:
|
| 2450 |
sib_sim = cosine_similarity(cand_embs,
|
| 2451 |
np.asarray(sibling_centroids, dtype=float)).max(axis=1)
|
|
|
|
| 4086 |
else:
|
| 4087 |
c_embs = None
|
| 4088 |
nodes, report = run_hiexpan(nodes, can, emb, concept_table, c_embs)
|
| 4089 |
+
# HiExpan's width/global passes MOVE leaves between concepts; a concept
|
| 4090 |
+
# that loses all its leaves becomes empty. build_concept_hierarchy prunes
|
| 4091 |
+
# internally, but that runs BEFORE HiExpan — so re-prune here, else empty
|
| 4092 |
+
# nodes break the Plotly branchvalues='total' sunburst/treemap (parent
|
| 4093 |
+
# value < sum(children) → blank render; node-link is unaffected).
|
| 4094 |
+
_prune_empty_aggregations(nodes)
|
| 4095 |
+
_alive = {int(n['id']) for n in nodes}
|
| 4096 |
+
for _n in nodes:
|
| 4097 |
+
_n['related'] = [x for x in dict.fromkeys(int(c) for c in _n.get('related', []))
|
| 4098 |
+
if x in _alive]
|
| 4099 |
st.session_state.hiexpan_report = report
|
| 4100 |
wmoves = report.get('width_expansion_moves', 0)
|
| 4101 |
dexp = report.get('depth_expansion_nodes', 0)
|