RoophaSharon Claude Opus 4.8 committed on
Commit
dd46f48
·
1 Parent(s): fef0152

UX v2: move configuration to main area, deploy version2 to the Space

Browse files

Professor feedback was that approach configuration belongs in the main area,
not the sidebar. New version2/ app implements this across every page:

- Demo View: method/dataset selectors moved into the main area.
- Build pages: upload + config moved to main area; expert knobs in a
collapsed "Advanced settings" expander; sidebar is navigation-only.
- Descriptive method names via methods.py (single source of truth),
e.g. "Approach 1: External Concept Alignment Hierarchy".
- Removed emoji, numbered "1." / "Step N -" headers; method-aware
provenance panel; Baseline gets all three visualizations.
- Output save paths aligned to version2/outputs so saved builds appear
in the Demo View.
- Robustness: broaden torch import guards (OSError on broken installs).
- Mirror v1 fixes into version2 (Approach 1 max-vars default 3000,
Approach 2 prune_empty_aggregations).

Dockerfile now runs version2/demo.py so the live Space serves version 2.
Root v1 files are left untouched as a fallback.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (37) hide show
  1. Dockerfile +1 -1
  2. approach_1.py +4 -1
  3. approach_2.py +34 -0
  4. baseline.py +21 -30
  5. version2/.gitignore +26 -0
  6. version2/Dockerfile +32 -0
  7. version2/LICENSE +21 -0
  8. version2/README.md +205 -0
  9. version2/approach_1.py +0 -0
  10. version2/approach_2.py +0 -0
  11. version2/baseline.py +1086 -0
  12. version2/data/HCP_S1200_DataDictionary_Oct_30_2023.csv +0 -0
  13. version2/data/ai-mind-variable-descriptions(in).csv +109 -0
  14. version2/data/dictionary_harmonized_categories.csv +571 -0
  15. version2/data/tidytuesday_json_val.json +1911 -0
  16. version2/demo.py +47 -0
  17. version2/hierarchy_eval.py +622 -0
  18. version2/launcher.py +137 -0
  19. version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_canonical.csv +0 -0
  20. version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_concept_labels.csv +159 -0
  21. version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json +0 -0
  22. version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json +0 -0
  23. version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_canonical.csv +109 -0
  24. version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_concept_labels.csv +21 -0
  25. version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json +0 -0
  26. version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json +0 -0
  27. version2/outputs/approach_2/HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json +0 -0
  28. version2/outputs/approach_2/ai-mind-variable-descriptions_in__approach2_lod.json +2716 -0
  29. version2/outputs/baseline/HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json +0 -0
  30. version2/outputs/baseline/ai-mind-variable-descriptions_in__baseline_hierarchy.json +1876 -0
  31. version2/requirements.txt +17 -0
  32. version2/views/methods.py +77 -0
  33. version2/views/run_approach_1.py +0 -0
  34. version2/views/run_approach_2.py +0 -0
  35. version2/views/run_baseline.py +1091 -0
  36. version2/views/viewer.py +661 -0
  37. views/run_baseline.py +21 -30
Dockerfile CHANGED
@@ -26,7 +26,7 @@ EXPOSE 7860
26
 
27
  # XSRF/CORS disabled so file uploads work behind the Hugging Face proxy/iframe
28
  # (otherwise the uploader returns "AxiosError: 403"). Standard for HF Spaces.
29
- CMD ["streamlit", "run", "demo.py", \
30
  "--server.port=7860", "--server.address=0.0.0.0", \
31
  "--server.enableXsrfProtection=false", \
32
  "--server.enableCORS=false"]
 
26
 
27
  # XSRF/CORS disabled so file uploads work behind the Hugging Face proxy/iframe
28
  # (otherwise the uploader returns "AxiosError: 403"). Standard for HF Spaces.
29
+ CMD ["streamlit", "run", "version2/demo.py", \
30
  "--server.port=7860", "--server.address=0.0.0.0", \
31
  "--server.enableXsrfProtection=false", \
32
  "--server.enableCORS=false"]
approach_1.py CHANGED
@@ -3826,7 +3826,10 @@ with st.sidebar:
3826
 
3827
  st.header('2. Generation')
3828
  project = st.text_input('Root / project name', value='metadata_project')
3829
- max_rows = st.slider('Max variables', 10, 3000, 600, 10)
 
 
 
3830
  merge_files = st.checkbox('Merge uploaded files', value=True)
3831
  n_clusters = st.slider('Max clusters per group', 2, 16, 8, 1,
3832
  help='Maximum number of concept sub-groups per top-level group.')
 
3826
 
3827
  st.header('2. Generation')
3828
  project = st.text_input('Root / project name', value='metadata_project')
3829
+ max_rows = st.slider('Max variables', 10, 3000, 3000, 10,
3830
+ help='Variables to build from (uses df.head). Default '
3831
+ 'covers full datasets like HCP (~813); lower it '
3832
+ 'only to sample a subset for speed.')
3833
  merge_files = st.checkbox('Merge uploaded files', value=True)
3834
  n_clusters = st.slider('Max clusters per group', 2, 16, 8, 1,
3835
  help='Maximum number of concept sub-groups per top-level group.')
approach_2.py CHANGED
@@ -1958,6 +1958,33 @@ def enforce_single_parent(nodes: list) -> int:
1958
  removed += 1
1959
  return removed
1960
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1961
  def mine_phrase_slots(texts: list,
1962
  text_col_names: Optional[list] = None,
1963
  min_phrase_count: int = 2,
@@ -3037,6 +3064,12 @@ def build_dynamic_lod_tree(can: pd.DataFrame,
3037
  except Exception:
3038
  n_reparented = 0
3039
 
 
 
 
 
 
 
3040
  # Annotate the root with post-build statistics
3041
  if nodes and nodes[0].get('type') == 'root':
3042
  nodes[0]['post_build_stats'] = {
@@ -3044,6 +3077,7 @@ def build_dynamic_lod_tree(can: pd.DataFrame,
3044
  'low_quality_nodes_dissolved': int(n_dissolved),
3045
  'group_prefix_labels_stripped': int(n_stripped),
3046
  'dag_links_removed': int(n_reparented),
 
3047
  }
3048
 
3049
  # Deduplicate children
 
1958
  removed += 1
1959
  return removed
1960
 
1961
+ def prune_empty_aggregations(nodes: list) -> int:
1962
+ """
1963
+ POST-BUILD PASS 5 β€” drop aggregation nodes that ended up with no children.
1964
+
1965
+ `enforce_single_parent` can empty a shallow aggregation when all of its
1966
+ variables were kept under a deeper/sibling parent (e.g. 'RVP Response
1967
+ Latency' losing every leaf to a more specific group). An empty category
1968
+ node is export noise β€” it renders as a blank sector and has no members.
1969
+
1970
+ Iteratively removes childless aggregation nodes and detaches them from
1971
+ their parents (removal can empty a parent in turn). Root and attribute
1972
+ nodes are never touched. Returns the number of nodes removed.
1973
+ """
1974
+ removed = 0
1975
+ while True:
1976
+ node_map = {int(n['id']): n for n in nodes}
1977
+ empties = {int(n['id']) for n in nodes
1978
+ if n.get('type') == 'aggregation' and not n.get('related')}
1979
+ if not empties:
1980
+ break
1981
+ nodes[:] = [n for n in nodes if int(n['id']) not in empties]
1982
+ for n in nodes:
1983
+ if any(int(c) in empties for c in n.get('related', [])):
1984
+ n['related'] = [int(c) for c in n['related'] if int(c) not in empties]
1985
+ removed += len(empties)
1986
+ return removed
1987
+
1988
  def mine_phrase_slots(texts: list,
1989
  text_col_names: Optional[list] = None,
1990
  min_phrase_count: int = 2,
 
3064
  except Exception:
3065
  n_reparented = 0
3066
 
3067
+ # ── POST-BUILD PASS 5 β€” drop aggregation nodes left childless by PASS 4 ───
3068
+ try:
3069
+ n_empty_pruned = prune_empty_aggregations(nodes)
3070
+ except Exception:
3071
+ n_empty_pruned = 0
3072
+
3073
  # Annotate the root with post-build statistics
3074
  if nodes and nodes[0].get('type') == 'root':
3075
  nodes[0]['post_build_stats'] = {
 
3077
  'low_quality_nodes_dissolved': int(n_dissolved),
3078
  'group_prefix_labels_stripped': int(n_stripped),
3079
  'dag_links_removed': int(n_reparented),
3080
+ 'empty_aggregations_pruned': int(n_empty_pruned),
3081
  }
3082
 
3083
  # Deduplicate children
baseline.py CHANGED
@@ -5,7 +5,7 @@
5
  #
6
  # Pipeline:
7
  # 1. Load metadata file (CSV / TSV / XLSX / JSON)
8
- # 2. Detect column roles (leaf / group / text / meta) — same as Approach 1 / 2
9
  # 3. Build canonical schema (incl. _semantic_text = description values only)
10
  # 4. Embed each variable (code + description) via Word2Vec skip-gram and build
11
  # the cosine-distance semantic space [TAX Β§3.2]
@@ -28,7 +28,7 @@
28
  # the bare code goes out-of-vocabulary (a limitation the paper flags,
29
  # e.g. "BP"). Taxonomizer embeds the NAME ("a few words"), not a
30
  # paragraph; using the short name (not the full description prose) keeps
31
- # task-distinctive words from being diluted by shared explanatory text.
32
  # 3. Fully-automatic labels β€” the paper's labelling is semi-automatic
33
  # (human picks from suggestions); a baseline must be non-interactive, so
34
  # we use data-driven contrastive terms from each cluster's members.
@@ -186,7 +186,7 @@ def detect_roles(df: pd.DataFrame) -> tuple:
186
  meta = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
187
  .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
188
  # Representation columns (decimal/precision/unit/type/format/…) must never
189
- # become structural levels β€” force them out of group and into metadata. [GON][TAX]
190
  _META_SUBSTR_BLOCK = {
191
  'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
192
  'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
@@ -306,9 +306,9 @@ def attribute_name(text: str) -> str:
306
  paragraph. Descriptions here are formatted '<name>: <full sentence>' (some
307
  prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
308
  that is not a pure all-caps marker. Embedding this short name β€” rather than
309
- the full description prose β€” keeps the task-distinctive words from being
310
- diluted by shared explanatory text, so the taxonomy groups far more by theme
311
- (e.g. DMS / PAL / SWM) without ever touching the group column.
312
  """
313
  text = str(text)
314
  for clause in re.split(r'[:\n]', text):
@@ -470,7 +470,7 @@ def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
470
  average) β€” the name clause of the description, as Taxonomizer specifies.
471
  Recursively clusters via balanced Ward linkage β€” the semantic-space
472
  dendrogram. Labels each internal node with the contrastive content terms of
473
- its members (data-driven, fully automatic). No group column, no hardcoding.
474
  """
475
  # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
476
  nodes: list = [{'id': 0, 'name': project, 'type': 'root',
@@ -807,8 +807,8 @@ with st.sidebar:
807
  max_items = st.slider('Maximum variables', 25, 1200, 900, 25,
808
  help='Cap on variables included (lower only to speed up very large files). '
809
  'Default keeps full datasets like HCP (813).')
810
- group_filter = st.text_input('Group filter (optional)', value='',
811
- help='Filter rows whose group path contains this text')
812
 
813
  # ─────────────────────────────────────────────────────────────────────────────
814
  # MAIN
@@ -829,8 +829,9 @@ if not uploaded:
829
  | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette β†’ dendrogram | Taxonomizer Β§4.2 |
830
  | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer Β§4.3 *(adapted)* |
831
 
832
- The group column is **not** used for construction, so the recovered taxonomy
833
- can be fairly evaluated against it (NMI / ARI / Purity in the Evaluation tab).
 
834
 
835
  **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
836
 
@@ -853,7 +854,7 @@ st.subheader('Step 1 β€” File preview')
853
  with st.expander(f'{uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
854
  expanded=False):
855
  st.dataframe(df.head(10), use_container_width=True)
856
- score_cols = [c for c in ['column', 'leaf_score', 'group_score', 'text_score', 'metadata_score']
857
  if c in prof.columns]
858
  st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
859
  use_container_width=True)
@@ -869,8 +870,9 @@ with st.expander('Column configuration', expanded=True):
869
  with left:
870
  leaf_cols = st.multiselect('Leaf variable column(s)', cols,
871
  default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
872
- group_cols = st.multiselect('Group/task column(s)', cols,
873
- default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}')
 
874
  with right:
875
  text_cols = st.multiselect('Text/description column(s)', cols,
876
  default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
@@ -982,11 +984,11 @@ with tabs[1]:
982
  for i in lids if i in nm and 'metadata' in nm[i]}
983
  sub = can[can['_leaf_id'].isin(leaf_ids_set)]
984
  st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
985
- st.dataframe(sub[['_leaf_label', '_group_path', '_text']].reset_index(drop=True),
986
  use_container_width=True)
987
 
988
  with tabs[2]:
989
- st.dataframe(can, use_container_width=True)
990
 
991
  with tabs[3]:
992
  _base = safe_name(project_name)
@@ -1035,9 +1037,9 @@ with tabs[4]:
1035
 
1036
  st.subheader('Hierarchy Quality Evaluation')
1037
  st.caption(
1038
- 'The group column is a *construction input* (GonΓ§alves text object), so it '
1039
- 'cannot serve as ground truth. The primary metrics below are **reference-free** '
1040
- 'β€” they assess the hierarchy itself, with no gold standard.'
1041
  )
1042
 
1043
  with st.spinner('Computing reference-free metrics…'):
@@ -1082,14 +1084,3 @@ with tabs[4]:
1082
  s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%",
1083
  help='Aggregation nodes with a single child (sparse-hierarchy indicator)')
1084
 
1085
- # ── Held-out group recovery (VALID β€” group column not used in construction) ─
1086
- st.markdown('#### Held-out group recovery *(valid β€” group column not used)*')
1087
- st.caption(
1088
- 'The baseline never uses the group column (it embeds only attribute '
1089
- 'names), so this is a **valid held-out** recovery score. ARI and AMI are '
1090
- 'chance-corrected; NMI and Purity are omitted as inflated by over-splitting.'
1091
- )
1092
- gp = he.group_preservation(nodes, can)
1093
- g1, g2 = st.columns(2)
1094
- g1.metric('ARI', gp['ARI'], help='Adjusted Rand Index (chance-corrected).')
1095
- g2.metric('AMI', gp['AMI'], help='Adjusted Mutual Information (chance-corrected).')
 
5
  #
6
  # Pipeline:
7
  # 1. Load metadata file (CSV / TSV / XLSX / JSON)
8
+ # 2. Detect column roles (leaf / context / text / meta) — same as Approach 1 / 2
9
  # 3. Build canonical schema (incl. _semantic_text = description values only)
10
  # 4. Embed each variable (code + description) via Word2Vec skip-gram and build
11
  # the cosine-distance semantic space [TAX Β§3.2]
 
28
  # the bare code goes out-of-vocabulary (a limitation the paper flags,
29
  # e.g. "BP"). Taxonomizer embeds the NAME ("a few words"), not a
30
  # paragraph; using the short name (not the full description prose) keeps
31
+ # domain-specific words from being diluted by shared explanatory text.
32
  # 3. Fully-automatic labels β€” the paper's labelling is semi-automatic
33
  # (human picks from suggestions); a baseline must be non-interactive, so
34
  # we use data-driven contrastive terms from each cluster's members.
 
186
  meta = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
187
  .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
188
  # Representation columns (decimal/precision/unit/type/format/…) must never
189
+ # become structural levels; prefer them as metadata. [GON][TAX]
190
  _META_SUBSTR_BLOCK = {
191
  'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
192
  'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
 
306
  paragraph. Descriptions here are formatted '<name>: <full sentence>' (some
307
  prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
308
  that is not a pure all-caps marker. Embedding this short name β€” rather than
309
+ the full description prose β€” keeps the domain-specific words from being
310
+ diluted by shared explanatory text, so the taxonomy clusters more by theme
311
+ (e.g. DMS / PAL / SWM).
312
  """
313
  text = str(text)
314
  for clause in re.split(r'[:\n]', text):
 
470
  average) β€” the name clause of the description, as Taxonomizer specifies.
471
  Recursively clusters via balanced Ward linkage β€” the semantic-space
472
  dendrogram. Labels each internal node with the contrastive content terms of
473
+ its members (data-driven, fully automatic). No hardcoding.
474
  """
475
  # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
476
  nodes: list = [{'id': 0, 'name': project, 'type': 'root',
 
807
  max_items = st.slider('Maximum variables', 25, 1200, 900, 25,
808
  help='Cap on variables included (lower only to speed up very large files). '
809
  'Default keeps full datasets like HCP (813).')
810
+ group_filter = st.text_input('Row filter (optional)', value='',
811
+ help='Filter rows by contextual path text before building')
812
 
813
  # ─────────────────────────────────────────────────────────────────────────────
814
  # MAIN
 
829
  | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette β†’ dendrogram | Taxonomizer Β§4.2 |
830
  | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer Β§4.3 *(adapted)* |
831
 
832
+ This page is the pure Taxonomizer-style semantic-space reference method:
833
+ variable meanings are embedded and recursively clustered into a hierarchy,
834
+ with node labels generated from contrastive terms.
835
 
836
  **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
837
 
 
854
  with st.expander(f'{uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
855
  expanded=False):
856
  st.dataframe(df.head(10), use_container_width=True)
857
+ score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
858
  if c in prof.columns]
859
  st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
860
  use_container_width=True)
 
870
  with left:
871
  leaf_cols = st.multiselect('Leaf variable column(s)', cols,
872
  default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
873
+ group_cols = st.multiselect('Context column(s) (optional)', cols,
874
+ default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}',
875
+ help='Optional contextual columns for display/filtering.')
876
  with right:
877
  text_cols = st.multiselect('Text/description column(s)', cols,
878
  default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
 
984
  for i in lids if i in nm and 'metadata' in nm[i]}
985
  sub = can[can['_leaf_id'].isin(leaf_ids_set)]
986
  st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
987
+ st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
988
  use_container_width=True)
989
 
990
  with tabs[2]:
991
+ st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), use_container_width=True)
992
 
993
  with tabs[3]:
994
  _base = safe_name(project_name)
 
1037
 
1038
  st.subheader('Hierarchy Quality Evaluation')
1039
  st.caption(
1040
+ 'No manually curated reference taxonomy is available for this experiment. '
1041
+ 'The metrics below are reference-free: they assess hierarchy structure, '
1042
+ 'label coherence and interpretability directly.'
1043
  )
1044
 
1045
  with st.spinner('Computing reference-free metrics…'):
 
1084
  s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%",
1085
  help='Aggregation nodes with a single child (sparse-hierarchy indicator)')
1086
 
 
 
 
 
 
 
 
 
 
 
 
version2/.gitignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ .venv/
6
+ venv/
7
+
8
+ # Streamlit
9
+ .streamlit/secrets.toml
10
+
11
+ # Jupyter
12
+ .ipynb_checkpoints/
13
+
14
+ # OS / editor
15
+ .DS_Store
16
+ Thumbs.db
17
+ .vscode/
18
+ .idea/
19
+
20
+ # Anaconda envs
21
+ *.conda
22
+ *.egg-info/
23
+
24
+ # Temp
25
+ ~WRL*.tmp
26
+ *.tmp
version2/Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # System deps (build tools for some wheels, curl for healthcheck)
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ build-essential curl \
6
+ && rm -rf /var/lib/apt/lists/*
7
+
8
+ # Run as non-root user (Hugging Face Spaces convention: uid 1000)
9
+ RUN useradd -m -u 1000 user
10
+ USER user
11
+ ENV HOME=/home/user \
12
+ PATH=/home/user/.local/bin:$PATH
13
+
14
+ WORKDIR $HOME/app
15
+
16
+ # Install Python dependencies first (better layer caching)
17
+ COPY --chown=user requirements.txt .
18
+ RUN pip install --no-cache-dir --upgrade pip \
19
+ && pip install --no-cache-dir -r requirements.txt
20
+
21
+ # Copy the rest of the app
22
+ COPY --chown=user . .
23
+
24
+ # Hugging Face Spaces expects the app on port 7860
25
+ EXPOSE 7860
26
+
27
+ # XSRF/CORS disabled so file uploads work behind the Hugging Face proxy/iframe
28
+ # (otherwise the uploader returns "AxiosError: 403"). Standard for HF Spaces.
29
+ CMD ["streamlit", "run", "demo.py", \
30
+ "--server.port=7860", "--server.address=0.0.0.0", \
31
+ "--server.enableXsrfProtection=false", \
32
+ "--server.enableCORS=false"]
version2/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 RoophaSharon
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
version2/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Metadata Hierarchy Explorer
3
+ colorFrom: green
4
+ colorTo: blue
5
+ sdk: docker
6
+ app_port: 7860
7
+ pinned: false
8
+ license: mit
9
+ ---
10
+
11
+ # Metadata Hierarchy Construction — TFM
12
+
13
+ Master's thesis prototype: automatic hierarchy construction from data-dictionary metadata.
14
+ Three algorithms are implemented for comparison.
15
+
16
+ ## Live demo
17
+
18
+ The deployed app opens on a **pre-built results viewer** (`demo.py`) showing the
19
+ AI-MIND and HCP hierarchies for all three approaches — no upload needed. Use the
20
+ main-area selectors to switch approach/dataset and the Level-of-Detail controls to adjust depth.
21
+
22
+ To **build a hierarchy from your own CSV**, open the **Baseline**, **Approach 1**, or
23
+ **Approach 2** page from the left sidebar and upload a file. (Approach 2's optional
24
+ local-LLM label refinement runs only on a local machine with Ollama; in the cloud it
25
+ falls back to the deterministic pipeline automatically.)
26
+
27
+ ## Approaches
28
+
29
+ - **Baseline** — Pure clustering baseline. Plain TF-IDF / Word2Vec embeddings + hierarchical
30
+ clustering. Documented in `README_baseline.md`.
31
+
32
+ - **Approach 1** — Global embedding pipeline. Uses SBERT + N×M concept-table alignment
33
+ (Gonçalves 2019) + HiExpan refinement (Shen et al. KDD 2018) + Castanet parallel facets.
34
+ Optionally retrieves concept context from Wikidata / Wikipedia / WordNet / BioPortal.
35
+
36
+ - **Approach 2** — Dataset-constrained multi-aspect hierarchy. Algorithmic pipeline with no
37
+ domain hardcoding:
38
+ 1. Group-anchored L1/L2 (from detected metadata column structure — BISE 2026)
39
+ 2. Phrase-slot mining (IE-style slot induction) for descriptions with regular structure
40
+ 3. **FASTopic** semantic aspect discovery (Wu et al. NeurIPS 2024) — replaces NMF
41
+ 4. NMF lexical fallback for small groups
42
+ 5. GMM + BIC for small clusters, MiniBatchKMeans + silhouette for large ones
43
+ 6. Deterministic 5-stage label generation (description prefix → group anchor → IDF filter
44
+ → bigram-preferred TF-IDF → optional LLM refinement)
45
+ 7. **Optional local-LLM label refinement** via Ollama + Qwen 2.5 (TopicTag pattern, DocEng
46
+ 2024). Strict grounding check rejects labels not derived from CSV evidence. Per-node
47
+ provenance recorded.
48
+ 8. TraCo-inspired hierarchy diagnostics (AAAI 2024)
49
+
50
+ No facet trees — single coherent LoD tree.
51
+
52
+ See each script's "Method" tab in the running app for the full algorithm and paper references.
53
+
54
+ ## Paper stack
55
+
56
+ | Component | Paper |
57
+ |---|---|
58
+ | Multi-aspect taxonomy scaffold | Zhu et al. 2025, EMNLP |
59
+ | Canonical metadata text objects | Gonçalves et al. 2019, ESWC |
60
+ | Semantic aspect discovery | Wu et al. 2024 (FASTopic), NeurIPS, arXiv:2405.17978 |
61
+ | Phrase-slot mining | IE / slot-induction literature (ACM CSUR 2022) |
62
+ | LLM label refinement pattern | Eren et al. 2024 (TopicTag), DocEng, arXiv:2407.19616 |
63
+ | Local LLM (used for refinement) | Qwen Team 2024 (Qwen 2.5), arXiv:2412.15115 |
64
+ | Hierarchy quality diagnostics | Wu et al. 2024 (TraCo), AAAI, arXiv:2401.14113 |
65
+ | Group-anchored entry strategy | Motamedi, Novalija, Rei 2026, Springer BISE |
66
+ | Multidimensional taxonomy motivation | Kargupta et al. 2025 (TaxoAdapt), ACL |
67
+ | Future-work semantic consistency | SC-Taxo 2026, arXiv:2605.00620 |
68
+ | Concept-label evaluation framework | Kejriwal et al. 2022 (TICL), EAAI |
69
+
70
+ ## Project layout
71
+
72
+ ```
73
+ Hierarchy tool/
74
+ ├── baseline.py # Pure clustering baseline (Streamlit app)
75
+ ├── approach_1.py # Approach 1 (Streamlit app)
76
+ ├── approach_2.py # Approach 2 (Streamlit app)
77
+ ├── approach_1.ipynb # Approach 1 reproducible notebook
78
+ ├── approach_2.ipynb # Approach 2 reproducible notebook
79
+ ├── baseline.ipynb # Baseline reproducible notebook
80
+ ├── launcher.py # Run all three apps simultaneously on different ports
81
+ ├── data/ # Sample input CSVs (AI-MIND, HCP, etc.)
82
+ ├── outputs/ # Generated hierarchies (JSON)
83
+ └── requirements.txt
84
+ ```
85
+
86
+ ## Running locally
87
+
88
+ ### 1. Install Python dependencies
89
+
90
+ ```bash
91
+ pip install -r requirements.txt
92
+ ```
93
+
94
+ Python 3.10 or 3.11 recommended.
95
+
96
+ ### 2. (Approach 2 only) Install Ollama for the local-LLM label refinement layer
97
+
98
+ **This is optional — Approach 2 produces deterministic labels without it.** If you want
99
+ the optional TopicTag-style LLM label refinement:
100
+
101
+ 1. Download and install Ollama from https://ollama.com/download
102
+ 2. Open Ollama once so the background service starts (icon in the system tray)
103
+ 3. Pull the recommended model:
104
+ ```bash
105
+ ollama pull qwen2.5:3b-instruct
106
+ ```
107
+ (For higher quality at higher RAM cost: `ollama pull qwen2.5:7b-instruct`.)
108
+ 4. Verify the server is reachable:
109
+ - In a browser open `http://localhost:11434/api/tags`
110
+ - Or run `ollama list`
111
+
112
+ When Approach 2 starts it auto-detects Ollama and the "Refine labels with LLM" checkbox
113
+ defaults to ON. Uncheck any time. The deterministic pipeline is the canonical thesis
114
+ result; the LLM is an optional re-phraser of evidence already in the CSV.
115
+
116
+ To override the default URL or model:
117
+
118
+ ```bash
119
+ # Optional environment variables
120
+ set OLLAMA_URL=http://localhost:11434/v1
121
+ set OLLAMA_MODEL=qwen2.5:3b-instruct
122
+ ```
123
+
124
+ Or change them live in the Approach 2 sidebar.
125
+
126
+ ### 3. Run one app at a time
127
+
128
+ ```bash
129
+ streamlit run baseline.py
130
+ # or
131
+ streamlit run approach_1.py
132
+ # or
133
+ streamlit run approach_2.py
134
+ ```
135
+
136
+ Each opens at http://localhost:8501 by default.
137
+
138
+ ### 4. Run all three apps simultaneously (for side-by-side comparison)
139
+
140
+ ```bash
141
+ python launcher.py
142
+ ```
143
+
144
+ This opens three browser tabs:
145
+
146
+ - http://localhost:8501 — Baseline
147
+ - http://localhost:8502 — Approach 1
148
+ - http://localhost:8503 — Approach 2
149
+
150
+ Press **Enter** in the launcher terminal to stop all servers.
151
+
152
+ ## Using the apps
153
+
154
+ 1. Upload one or more metadata CSV / TSV / XLSX / JSON files in the sidebar.
155
+ 2. Confirm the auto-detected column roles (leaf / group / text / meta).
156
+ 3. Click **Build hierarchy**.
157
+ 4. Inspect the LoD tree, evaluation metrics, label provenance (Approach 2), and export JSON.
158
+
159
+ Sample data is in `data/`:
160
+ - `ai-mind-variable-descriptions(in).csv`
161
+ - `HCP_S1200_DataDictionary_Oct_30_2023.csv`
162
+
163
+ ## Outputs
164
+
165
+ - **Baseline / Approach 1** export two JSON files for visualization:
166
+ - `*_lod.json` — primary LoD tree
167
+ - `*_facets.json` — parallel Castanet facet trees
168
+
169
+ - **Approach 2** exports a single LoD JSON:
170
+ - `*_approach2_lod.json` — primary LoD tree (every aggregation node carries
171
+ `label_provenance` with source stage, confidence, and evidence terms)
172
+
173
+ Filenames are derived from the uploaded CSV file name, so different CSVs export under
174
+ different filenames into `outputs/approach_2/`.
175
+
176
+ Existing output examples are in `outputs/approach_1/` and `outputs/approach_2/`.
177
+
178
+ ## Defensibility highlights for Approach 2
179
+
180
+ - **No domain hardcoding.** Slot names, group anchors, and labels are all derived from the
181
+ detected metadata columns + the uploaded CSV — no hand-curated domain vocabulary.
182
+ - **Deterministic by default.** Tree topology and all five label-generation stages are
183
+ reproducible from the input CSV alone. Local LLM is opt-in.
184
+ - **Grounded LLM refinement.** Every LLM-proposed label must pass a strict grounding
185
+ check — every word in the label must appear in the extracted evidence. Failed proposals
186
+ are rejected and the deterministic label is used instead. Per-node provenance lets
187
+ you answer "did the LLM invent this?" with hard evidence.
188
+ - **Local-only LLM.** Qwen 2.5 runs on the thesis machine via Ollama. No external API
189
+ calls, no third-party data sharing, no key management.
190
+
191
+ ## Troubleshooting
192
+
193
+ | Symptom | Fix |
194
+ |---|---|
195
+ | `FASTopic not installed` warning | `pip install fastopic` (also installs `torch`) |
196
+ | `openai` package missing | `pip install openai` |
197
+ | `Ollama not reachable` in sidebar | Open the Ollama app from Start menu; the service runs in the system tray |
198
+ | Model not found | `ollama pull qwen2.5:3b-instruct` |
199
+ | Build very slow with LLM on | Expected for HCP — ~15–40 min on CPU with a 3B model. Disable LLM for fast iteration. |
200
+ | `LLM-labeled nodes: 0/N` after build | The grounding check rejected every LLM proposal. Check the **Label Provenance** tab — counts under `llm_rejected = True` show what happened. |
201
+ | Hierarchy too shallow | Increase `Max LoD tree depth` slider (top of sidebar in Approach 2) |
202
+
203
+ ## License
204
+
205
+ For thesis evaluation only.
version2/approach_1.py ADDED
The diff for this file is too large to render. See raw diff
 
version2/approach_2.py ADDED
The diff for this file is too large to render. See raw diff
 
version2/baseline.py ADDED
@@ -0,0 +1,1086 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # baseline.py — Metadata Hierarchy Builder — Baseline (Taxonomizer)
2
+ #
3
+ # Baseline = Taxonomizer (Mahmood & Mueller, IEEE TVCG 2019), semantic-space
4
+ # pipeline, adapted to a metadata-only setting. No hardcoded domain patterns.
5
+ #
6
+ # Pipeline:
7
+ # 1. Load metadata file (CSV / TSV / XLSX / JSON)
8
+ # 2. Detect column roles (leaf / context / text / meta) β€” same as Approach 1 / 2
9
+ # 3. Build canonical schema (incl. _semantic_text = description values only)
10
+ # 4. Embed each variable's short attribute name (averaged pre-trained word vectors)
11
+ # and build the cosine-distance semantic space [TAX §3.2]
12
+ # 5. Recursively cluster (agglomerative, cosine) into the dendrogram taxonomy;
13
+ # internal-node labels = data-driven contrastive terms of each cluster
14
+ # 6. Visualise (Sunburst / Treemap / Node-link)
15
+ # 7. Export visualization-ready JSON + canonical CSV
16
+ #
17
+ # Paper & justified adaptations (metadata/schema setting, fully automatic):
18
+ # [TAX] Mahmood & Mueller β€” Taxonomizer, IEEE TVCG 2019.
19
+ # Builds a SEMANTIC space (cosine over word2vec skip-gram embeddings of
20
+ # attribute names; gensim, Wikipedia, window=5, dim=128) merged with a
21
+ # DATA space (correlation over raw values), clustered into a dendrogram;
22
+ # inner nodes labelled semi-automatically by distributional degree-of-
23
+ # entailment + WordNet synonyms.
24
+ # Adaptations (all documented):
25
+ # 1. No DATA space β€” a schema/dictionary has no raw values, so we use the
26
+ # semantic space alone (Taxonomizer with semantic weight = 1.0).
27
+ # 2. Embed the attribute's short NAME (the description's name clause), since
28
+ # the bare code goes out-of-vocabulary (a limitation the paper flags,
29
+ # e.g. "BP"). Taxonomizer embeds the NAME ("a few words"), not a
30
+ # paragraph; using the short name (not the full description prose) keeps
31
+ # domain-specific words from being diluted by shared explanatory text.
32
+ # 3. Fully-automatic labels β€” the paper's labelling is semi-automatic
33
+ # (human picks from suggestions); a baseline must be non-interactive, so
34
+ # we use data-driven contrastive terms from each cluster's members.
35
+ #
36
+ # Dependencies: gensim
37
+ # pip install gensim
38
+
39
+ from __future__ import annotations
40
+ import csv, json, re, warnings
41
+ from collections import Counter, defaultdict
42
+ from pathlib import Path
43
+ import tempfile
44
+
45
+ import numpy as np
46
+ import pandas as pd
47
+ import plotly.graph_objects as go
48
+ import streamlit as st
49
+ from sklearn.cluster import AgglomerativeClustering
50
+ from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, silhouette_score
51
+ from sklearn.preprocessing import LabelEncoder
52
+
53
+ warnings.filterwarnings('ignore')
54
+
55
+ st.set_page_config(page_title='Metadata Hierarchy β€” Baseline', layout='wide')
56
+ st.title('Metadata Hierarchy Builder β€” Baseline (Taxonomizer)')
57
+ st.caption(
58
+ 'Taxonomizer baseline [Mahmood & Mueller, IEEE TVCG 2019]: Word2Vec skip-gram '
59
+ 'semantic space (short attribute names) + balanced Ward agglomerative clustering '
60
+ 'into the dendrogram taxonomy; nodes labelled by data-driven contrastive terms. '
61
+ 'Semantic space only (no raw data values); no hardcoded patterns, no external APIs.'
62
+ )
63
+
64
+ # ─────────────────────────────────────────────────────────────────────────────
65
+ # CONSTANTS
66
+ # ─────────────────────────────────────────────────────────────────────────────
67
+ LEAF_KEYS = 'variable var field column attribute name code id item indicator question measure concept'.split()
68
+ GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split()
69
+ TEXT_KEYS = 'description definition desc label title question meaning note notes text display full details explanation comment'.split()
70
+ META_KEYS = 'type dtype data_type datatype unit units format decimal precision values value coding codebook range min max scale'.split()
71
+
72
+ # URL pattern β€” strip embedded links (e.g. HCP FreeSurfer NeuroLex URLs) so web
73
+ # tokens cannot dominate the embedding or the cluster label. [shared with A1]
74
+ _URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
75
+ re.IGNORECASE)
76
+
77
+ # ─────────────────────────────────────────────────────────────────────────────
78
+ # FILE LOADING
79
+ # ─────────────────────────────────────────────────────────────────────────────
80
def safe_name(name: str) -> str:
    """Return *name* with every character that is not alphanumeric or one of
    '-', '_', '.' replaced by an underscore."""
    allowed_punct = '-_.'
    cleaned = []
    for char in name:
        keep = char.isalnum() or char in allowed_punct
        cleaned.append(char if keep else '_')
    return ''.join(cleaned)
82
+
83
def try_read_csv(path: Path) -> pd.DataFrame:
    """Read a delimited text file, guessing its encoding and separator.

    Every (encoding, separator) pair is attempted with the pandas python
    engine. The parse with the highest score wins, where the score is ten
    points per column minus the mean fraction of missing cells. If the winning
    parse looks "comma-packed" (most first-column values contain a comma while
    the remaining columns are mostly empty), the file is re-read line by line
    and each line is split with the csv module instead.

    Raises:
        ValueError: if no (encoding, separator) combination could be parsed.
    """
    def clean_headers(columns) -> list:
        return [str(col).strip().replace(';', '') for col in columns]

    winner, winner_score = None, -1
    for encoding in ('utf-8-sig', 'utf-8', 'latin1'):
        for delimiter in (None, ',', '\t', ';', '|'):
            try:
                frame = pd.read_csv(path, sep=delimiter, engine='python', encoding=encoding)
                frame_score = frame.shape[1] * 10 - float(frame.isna().mean().mean())
            except Exception:
                # Best-effort probing: a failing combination is just not a candidate.
                continue
            if frame_score > winner_score:
                winner, winner_score = frame, frame_score
    if winner is None:
        raise ValueError(f'Could not read {path.name}')
    winner.columns = clean_headers(winner.columns)

    if len(winner) == 0:
        return winner

    # Repair comma-packed rows (AI-Mind format)
    first_column = winner.iloc[:, 0].astype(str)
    if winner.shape[1] > 1:
        rest_null_rate = winner.iloc[:, 1:].isna().mean().mean()
    else:
        rest_null_rate = 1.0
    packed = first_column.str.contains(',').mean() > 0.50 and rest_null_rate > 0.70
    if not packed:
        return winner

    raw_lines = path.read_text(encoding='utf-8-sig', errors='replace').splitlines()
    if not raw_lines:
        return winner

    header = [cell.strip().replace(';', '') for cell in raw_lines[0].split(',')]
    records = []
    for raw in raw_lines[1:]:
        stripped = raw.strip().rstrip(';')
        if not stripped:
            continue
        # A whole row wrapped in one pair of quotes is unwrapped before splitting.
        if stripped.startswith('"') and stripped.endswith('"'):
            stripped = stripped[1:-1]
        try:
            fields = next(csv.reader([stripped], quotechar='"'))
        except Exception:
            continue
        # Rows shorter than the header are dropped; longer rows are truncated.
        if len(fields) >= len(header):
            records.append(fields[:len(header)])
    if records:
        winner = pd.DataFrame(records, columns=header)
        winner.columns = clean_headers(winner.columns)
    return winner
122
+
123
def load_any(path: Path) -> pd.DataFrame:
    """Load a metadata file into a DataFrame, dispatching on the file suffix.

    Supported suffixes: .csv / .tsv / .txt (via ``try_read_csv``), .xlsx / .xls
    (via ``pd.read_excel``) and .json. A JSON document must be either a list of
    records or a dict with at least one list-valued entry; the first such list
    (in key order) is normalised.

    Raises:
        ValueError: if the suffix is unsupported, or if a JSON file contains
            no list of records (previously this case was reported with the
            misleading message "Unsupported file type: .json").
    """
    s = path.suffix.lower()
    if s in ['.csv', '.tsv', '.txt']:
        return try_read_csv(path)
    if s in ['.xlsx', '.xls']:
        return pd.read_excel(path)
    if s == '.json':
        obj = json.loads(path.read_text(encoding='utf-8', errors='replace'))
        if isinstance(obj, list):
            return pd.json_normalize(obj)
        if isinstance(obj, dict):
            for v in obj.values():
                if isinstance(v, list):
                    return pd.json_normalize(v)
        # The suffix IS supported; the content just has nothing tabular in it.
        raise ValueError(f'No list of records found in JSON file: {path.name}')
    raise ValueError(f'Unsupported file type: {s}')
138
+
139
def save_upload(f) -> Path:
    """Write an uploaded file object to a fresh temporary directory.

    The directory is created with the 'baseline_' prefix and the file name is
    sanitised with ``safe_name``. Returns the path of the written file.
    """
    scratch_dir = Path(tempfile.mkdtemp(prefix='baseline_'))
    target = scratch_dir / safe_name(f.name)
    target.write_bytes(f.getbuffer())
    return target
144
+
145
+ # ─────────────────────────────────────────────────────────────────────────────
146
+ # ROLE DETECTION [GON]
147
+ # ─────────────────────────────────────────────────────────────────────────────
148
def norm(c: str) -> str:
    """Normalise a column name: lower-case it, collapse every run of
    non-alphanumeric characters into one underscore, and trim underscores."""
    lowered = str(c).strip().lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered)
    return collapsed.strip('_')
150
+
151
def kscore(c: str, keys: list) -> int:
    """Count how many of *keys* occur as a substring of the normalised column name."""
    normalized = norm(c)
    hits = 0
    for key in keys:
        if key in normalized:
            hits += 1
    return hits
154
+
155
def profile_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Profile every column of *df* and score it for each candidate role.

    Returns one row per column with basic statistics (non-null rate, distinct
    count, distinct ratio, mean string length) and four heuristic scores
    (leaf / group / text / metadata). Each score combines keyword hits in the
    column name with bonuses derived from those statistics.
    """
    total_rows = max(len(df), 1)
    group_cap = min(total_rows * 0.5, 80)
    meta_cap = min(total_rows * 0.8, 100)

    records = []
    for column in df.columns:
        series = df[column]
        fill_rate = float(series.notna().mean())
        distinct = int(series.nunique(dropna=True))
        distinct_ratio = distinct / total_rows
        if series.notna().any():
            mean_len = float(series.dropna().astype(str).map(len).mean())
        else:
            mean_len = 0

        leaf_score = 4 * kscore(column, LEAF_KEYS)
        if 0.5 <= distinct_ratio <= 1:
            leaf_score += 3
        if mean_len < 80:
            leaf_score += 1

        group_score = 4 * kscore(column, GROUP_KEYS)
        if 1 < distinct < group_cap:
            group_score += 3
        if mean_len < 60:
            group_score += 1

        text_score = 5 * kscore(column, TEXT_KEYS)
        if mean_len > 50:
            text_score += 4
        if fill_rate > 0.5:
            text_score += 1

        metadata_score = 4 * kscore(column, META_KEYS)
        if 1 < distinct < meta_cap:
            metadata_score += 2

        records.append({
            'column': str(column),
            'non_null': round(fill_rate, 3),
            'unique_values': distinct,
            'unique_ratio': round(distinct_ratio, 3),
            'avg_length': round(mean_len, 1),
            'leaf_score': leaf_score,
            'group_score': group_score,
            'text_score': text_score,
            'metadata_score': metadata_score,
        })
    return pd.DataFrame(records)
176
+
177
def detect_roles(df: pd.DataFrame) -> tuple:
    """Auto-detect which columns act as leaf, group, text and metadata columns.

    Returns a ``(roles, profile)`` pair: *roles* is a dict with the keys
    'leaf_cols', 'group_cols', 'text_cols' and 'metadata_cols'; *profile* is
    the per-column score table from ``profile_columns``.
    """
    prof = profile_columns(df)
    names = prof['column']

    # Single best leaf column; ties on leaf_score are broken by unique_ratio.
    leaf = (prof.sort_values(['leaf_score', 'unique_ratio'], ascending=False)
                .head(1)['column'].tolist())

    text_mask = (prof.text_score >= 4) | (prof.avg_length > 80)
    text = prof[text_mask].sort_values('text_score', ascending=False)['column'].tolist()
    if not text:
        # No descriptive column found: fall back to the leaf column itself.
        text = leaf.copy()

    group_mask = (prof.group_score >= 4) & (~names.isin(leaf)) & (prof.unique_values > 1)
    group = (prof[group_mask]
             .sort_values('group_score', ascending=False)['column'].head(3).tolist())

    meta_mask = (prof.metadata_score >= 4) & (~names.isin(text + leaf + group))
    meta = (prof[meta_mask]
            .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())

    # Representation columns (decimal/precision/unit/type/format/...) must never
    # become structural levels; prefer them as metadata. [GON][TAX]
    representation_markers = (
        'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
        'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
    )

    def looks_representational(col_name) -> bool:
        squashed = re.sub(r'[^a-z0-9]', '', str(col_name).lower())
        for marker in representation_markers:
            if marker in squashed:
                return True
        return False

    meta_extra = []
    for candidate in names.tolist():
        if not looks_representational(candidate):
            continue
        if candidate in text or candidate in leaf or candidate in meta:
            continue
        meta_extra.append(candidate)

    group = [col for col in group if not looks_representational(col)]
    meta = list(dict.fromkeys(meta + meta_extra))[:8]

    roles = {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'metadata_cols': meta}
    return roles, prof
202
+
203
+ # ─────────────────────────────────────────────────────────────────────────────
204
+ # CANONICAL SCHEMA [GON]
205
+ # ─────────────────────────────────────────────────────────────────────────────
206
def sv(x) -> str:
    """Return *x* as a stripped string, or '' when pandas considers it missing."""
    if pd.isna(x):
        return ''
    return str(x).strip()
208
+
209
def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
    """Convert a raw metadata table into the canonical one-row-per-variable schema.

    Args:
        df: the loaded metadata table.
        cfg: role configuration with the optional keys 'leaf_cols',
            'group_cols', 'text_cols' and 'metadata_cols' (lists of column names).
        source: value stored in the '_source_file' column of every row.

    Returns:
        A DataFrame with the columns '_source_file', '_row_index',
        '_leaf_label', '_leaf_id', '_group_path', '_text' and '_semantic_text'.
        Duplicate '_leaf_id' values are disambiguated with a '__<n>' suffix on
        the second and later occurrences. An input with zero rows yields an
        empty frame that still has all seven columns (previously this raised
        KeyError on '_leaf_id').
    """
    leaf_cols = cfg.get('leaf_cols', [])
    group_cols = cfg.get('group_cols', [])
    text_cols = cfg.get('text_cols', [])
    meta_cols = cfg.get('metadata_cols', [])
    out_columns = ['_source_file', '_row_index', '_leaf_label', '_leaf_id',
                   '_group_path', '_text', '_semantic_text']
    # Loop-invariant: ordered, de-duplicated union of every configured column.
    all_cols = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))

    rows = []
    for i, row in df.iterrows():
        leaf_parts = [sv(row.get(c, '')) for c in leaf_cols]
        leaf_parts = [p for p in leaf_parts if p]
        label = ' / '.join(leaf_parts) if leaf_parts else f'variable_{i+1}'

        group_parts = [sv(row.get(c, '')) for c in group_cols]
        group_parts = [p for p in group_parts if p and p.lower() not in ['nan', 'none']]
        gpath = ' > '.join(group_parts) if group_parts else 'Ungrouped'

        # _text: every configured field as "column: value", joined with ' | '.
        parts = []
        for c in all_cols:
            v = sv(row.get(c, ''))
            if v:
                parts.append(f'{c}: {v}')
        text = ' | '.join(parts) if parts else label

        # _semantic_text: text-column VALUES only (no "fieldname:" prefixes, no
        # other fields), with URLs stripped; falls back to the leaf values and
        # finally to the label.
        sem_parts = [sv(row.get(c, '')) for c in text_cols]
        sem_parts = [p for p in sem_parts if p]
        if not sem_parts:
            sem_parts = list(leaf_parts)
        semantic = _URL_RE.sub(' ', ' '.join(sem_parts)) if sem_parts else label

        rows.append({
            '_source_file': source,
            '_row_index': int(i),
            '_leaf_label': label,
            '_leaf_id': f'{gpath}.{label}' if gpath != 'Ungrouped' else label,
            '_group_path': gpath,
            '_text': text,
            '_semantic_text': semantic,
        })

    # Passing columns= keeps the schema intact even when there are no rows.
    can = pd.DataFrame(rows, columns=out_columns)
    if can['_leaf_id'].duplicated().any():
        cnt: dict = defaultdict(int)
        ids = []
        for lid in can['_leaf_id']:
            cnt[lid] += 1
            ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
        can['_leaf_id'] = ids
    return can
254
+
255
+ # ─────────────────────────────────────────────────────────────────────────────
256
+ # TAXONOMIZER CORE [TAX β€” Mahmood & Mueller, IEEE TVCG 2019]
257
+ #
258
+ # Taxonomizer builds the taxonomy from a SEMANTIC SPACE (cosine distance between
259
+ # word2vec skip-gram embeddings of attribute names) merged with a DATA SPACE
260
+ # (correlation over the raw values). In a metadata/schema setting we have no
261
+ # raw data values, so we use the semantic space alone (= Taxonomizer with
262
+ # semantic weight 1.0). Because attribute *names* here are opaque codes that go
263
+ # out-of-vocabulary — a limitation the paper explicitly flags (e.g. "BP") — we
263
+ # embed the short name clause of each description so real words carry the meaning
264
+ # (OOV tokens are skipped during averaging). Internal-node labels: the paper uses semi-automatic
266
+ # distributional degree-of-entailment + WordNet synonyms; a baseline must be
267
+ # fully automatic, so we use data-driven contrastive terms drawn from the data.
268
+ # ─────────────────────────────────────────────────────────────────────────────
269
+
270
+ _W2V_STOP = frozenset(
271
+ 'a an the and or but if in on at to of for with by is are was were be '
272
+ 'been being have has had do does did will would could should may might '
273
+ 'shall can this that these those i you he she it we they me him her us '
274
+ 'them my your his her its our their what which who whom when where why '
275
+ 'how all each every few more most other some such no not only same so '
276
+ 'than too very just because as until while'.split()
277
+ )
278
+
279
@st.cache_resource(show_spinner=False)
def _load_w2v():
    """Load the pre-trained 'glove-wiki-gigaword-100' vectors via gensim's downloader.

    These 100-dimensional GloVe vectors stand in for the word2vec model
    described in the Taxonomizer paper. The result is cached by Streamlit
    through the ``st.cache_resource`` decorator.

    Returns the loaded model, or None after showing a Streamlit error when the
    import or the load fails.
    """
    try:
        import gensim.downloader as downloader
        vectors = downloader.load('glove-wiki-gigaword-100')
    except Exception as exc:
        st.error(
            f'Could not load Word2Vec model: {exc}\n\n'
            'Run: pip install gensim and restart the app.\n'
            'The model (~66 MB) is downloaded automatically on first use.'
        )
        return None
    return vectors
297
+
298
def _tokenize(label: str) -> list[str]:
    """Split *label* into lower-case alphabetic tokens, dropping tokens of two
    characters or fewer and anything listed in ``_W2V_STOP``."""
    letters_only = re.sub(r'[^a-zA-Z]+', ' ', label)
    kept = []
    for token in letters_only.lower().split():
        if len(token) <= 2 or token in _W2V_STOP:
            continue
        kept.append(token)
    return kept
301
+
302
def attribute_name(text: str) -> str:
    """Extract the short attribute name from a description string.

    The text is split on ':' and newlines. The first non-empty clause that is
    not made up entirely of upper-case marker words (each 2 to 6 characters
    long) is returned, stripped. When every clause is empty or a marker, the
    whole stripped text is returned instead.
    """
    raw = str(text)
    for piece in re.split(r'[:\n]', raw):
        candidate = piece.strip()
        if not candidate:
            continue
        is_marker = True
        for word in candidate.split():
            if not (2 <= len(word) <= 6 and word.isupper()):
                is_marker = False
                break
        if not is_marker:
            return candidate
    return raw.strip()
319
+
320
def embed_labels_w2v(labels: list[str], model) -> np.ndarray:
    """Embed each label as the mean of its in-vocabulary token vectors.

    Returns a float32 array of shape (len(labels), model.vector_size) whose
    rows are L2-normalised. A label with no in-vocabulary token keeps an
    all-zero row.
    """
    matrix = np.zeros((len(labels), model.vector_size), dtype=np.float32)
    for row, label in enumerate(labels):
        known = []
        for token in _tokenize(label):
            if token in model:
                known.append(model[token])
        if known:
            matrix[row] = np.mean(known, axis=0)
    # Unit-length rows make cosine distance equal to 1 - dot product.
    lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
    lengths[lengths == 0] = 1.0  # leave all-zero rows untouched instead of dividing by 0
    return matrix / lengths
337
+
338
def _cluster(X: np.ndarray, k: int) -> np.ndarray:
    """Assign each row of *X* to one of *k* clusters with Ward-linkage
    agglomerative clustering and return the cluster labels.

    Ward linkage merges the pair of clusters that least increases
    within-cluster variance, computed on the embedding vectors passed in.
    """
    ward = AgglomerativeClustering(n_clusters=k, linkage='ward')
    return ward.fit_predict(X)
348
+
349
def best_k(X: np.ndarray, n: int, k_min: int = 2, k_max: int = 8) -> int:
    """Choose the cluster count in [k_min, min(k_max, n - 1)] with the highest
    silhouette score.

    Returns 1 when that range is empty, or when no candidate produced a usable
    score (fewer than two distinct labels, or ``silhouette_score`` raised).
    """
    upper = min(k_max, n - 1)
    if upper < k_min:
        return 1

    chosen, top_score = 1, -1.0
    for candidate in range(k_min, upper + 1):
        assignment = _cluster(X, candidate)
        if len(set(assignment)) >= 2:
            try:
                score = silhouette_score(X, assignment)
            except Exception:
                score = None
            if score is not None and score > top_score:
                chosen, top_score = candidate, score
    return chosen
370
+
371
def _doc_freq(texts: list[str]) -> Counter:
    """Count, for each token, the number of texts in which it occurs at least once."""
    frequency: Counter = Counter()
    for text in texts:
        # A set makes each token count once per text, however often it repeats.
        frequency.update(set(_tokenize(text)))
    return frequency
378
+
379
def cluster_term_label(member_texts: list[str], sibling_texts: list[str],
                       used: set, vocab=None, top_n: int = 2) -> str:
    """Build a label from the words that best distinguish a cluster from its siblings.

    Each candidate word is scored as (share of member texts containing it)
    minus (share of sibling texts containing it). Up to *top_n* words with a
    positive score are title-cased and joined with ' / '.

    Args:
        member_texts: texts of the variables inside the cluster.
        sibling_texts: texts of the variables in the sibling clusters.
        used: lower-case words that must not be reused in this label.
        vocab: optional container; when given, only words it contains are
            eligible, except as a last-resort fallback.
        top_n: maximum number of words in the label.

    Returns:
        The label, or 'Group' when no eligible word exists at all.
    """
    def is_real_word(word: str) -> bool:
        return vocab is None or word in vocab

    inside_total = max(len(member_texts), 1)
    outside_total = max(len(sibling_texts), 1)
    inside_df = _doc_freq(member_texts)
    outside_df = _doc_freq(sibling_texts)

    contrast: dict[str, float] = {}
    for word, freq in inside_df.items():
        if word in used or len(word) <= 2 or not is_real_word(word):
            continue
        share_in = freq / inside_total
        # ignore single-occurrence noise unless the term is widely shared
        if freq < 2 and share_in < 0.5:
            continue
        contrast[word] = share_in - outside_df.get(word, 0) / outside_total

    ranked = sorted(contrast.items(), key=lambda item: -item[1])[:top_n]
    picks = [word for word, score in ranked if score > 0]

    if not picks:
        # fallback: most frequent unused real word, then any unused token
        candidates = [word for word, _ in inside_df.most_common()
                      if word not in used and len(word) > 2]
        fallback = next((word for word in candidates if is_real_word(word)), None)
        if fallback is None and candidates:
            fallback = candidates[0]
        if fallback is not None:
            picks = [fallback]

    if not picks:
        return 'Group'
    return ' / '.join(term.title() for term in picks)
425
+
426
+ # ─────────────────────────────────────────────────────────────────────────────
427
+ # HIERARCHY CONSTRUCTION [TAX + GON]
428
+ # ─────────────────────────────────────────────────────────────────────────────
429
+ def _nmap(nodes: list) -> dict:
430
+ return {int(n['id']): n for n in nodes}
431
+
432
+ def _next_id(nodes: list) -> int:
433
+ return max((int(n['id']) for n in nodes), default=0) + 1
434
+
435
+ def _add_child(nodes: list, parent_id: int, child_id: int):
436
+ m = _nmap(nodes)
437
+ p = m.get(int(parent_id))
438
+ if p is None:
439
+ return
440
+ rel = list(p.get('related', []))
441
+ if int(child_id) not in rel:
442
+ rel.append(int(child_id))
443
+ p['related'] = rel
444
+
445
+ def _make_agg(nid: int, name: str, desc: str = '') -> dict:
446
+ return {'id': int(nid), 'name': str(name), 'related': [],
447
+ 'type': 'aggregation', 'isShown': True, 'desc': desc, 'dtype': 'determine'}
448
+
449
+ def _leaf_ids(nodes: list, nid: int) -> list:
450
+ m = _nmap(nodes)
451
+ out: list = []
452
+ def rec(x):
453
+ n = m.get(int(x))
454
+ if not n:
455
+ return
456
+ if n.get('type') == 'attribute':
457
+ out.append(int(x))
458
+ return
459
+ for c in n.get('related', []):
460
+ rec(int(c))
461
+ rec(nid)
462
+ return list(dict.fromkeys(out))
463
+
464
def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
                    max_depth: int = 3, min_cluster_size: int = 6,
                    branch_max: int = 8) -> list:
    """Build the hierarchy node list from the canonical table [TAX].

    Each row of *can* becomes an 'attribute' node (ids 1..N) under a root node
    (id 0). Rows are embedded from the short attribute name extracted from
    '_semantic_text', then recursively split with Ward clustering. Every
    cluster of two or more rows becomes an 'aggregation' node labelled by
    ``cluster_term_label``; a single-row cluster is attached directly to the
    parent.

    Args:
        can: canonical table with '_leaf_label', '_leaf_id', '_group_path',
            '_text' and (optionally) '_semantic_text'.
        w2v_model: word-vector model passed to ``embed_labels_w2v``.
        project: name of the root node.
        max_depth: maximum number of aggregation levels below the root.
        min_cluster_size: groups of this size or smaller are not split further.
        branch_max: upper bound on the number of children per split.

    Returns:
        The flat list of node dicts; parent/child links are in each 'related' list.
    """
    # -- leaf attribute nodes (ids 1..N) --------------------------------------
    nodes: list = [{'id': 0, 'name': project, 'type': 'root',
                    'dtype': 'root', 'isShown': True, 'related': [], 'desc': 'Root node'}]
    row_to_node: list = []
    embed_list: list[str] = []  # short attribute name: embedding input AND label source
    for i, (_, r) in enumerate(can.iterrows(), start=1):
        sem = str(r.get('_semantic_text', '') or r['_leaf_label'])
        name = attribute_name(sem) or str(r['_leaf_label'])
        nodes.append({'id': i, 'name': r['_leaf_label'], 'dtype': 'determine',
                      'related': [], 'isShown': True, 'type': 'attribute',
                      'desc': r['_text'],
                      'metadata': {'leaf_id': r['_leaf_id'], 'group_path': r['_group_path']}})
        row_to_node.append(i)
        embed_list.append(name)
    label_list = embed_list
    row_to_node = np.array(row_to_node)

    # -- semantic-space embeddings [TAX §3.2] ---------------------------------
    emb = embed_labels_w2v(embed_list, w2v_model)  # (N, dim), L2-normalised

    # -- recursive clustering down the Ward dendrogram ------------------------
    def attach_leaves(parent_id: int, idx: np.ndarray):
        for i in idx:
            _add_child(nodes, parent_id, int(row_to_node[i]))

    def recurse(parent_id: int, idx: np.ndarray, depth: int, used: set):
        n = len(idx)
        if n <= min_cluster_size or depth >= max_depth:
            attach_leaves(parent_id, idx)
            return

        sub = emb[idx]
        k_cap = min(branch_max, n - 1)
        # Branching floor: a node with n leaves and `remaining` levels left must
        # fan out enough to fit all its leaves into buckets of ~min_cluster_size
        # by the depth cap, i.e. k >= (n / min_cluster_size) ** (1/remaining).
        # Without this, silhouette keeps picking k=2 on overlapping data (e.g.
        # HCP), giving a near-binary tree that dumps ~100 leaves per bottom node.
        remaining = max(1, max_depth - depth)
        k_floor = int(np.ceil((n / max(min_cluster_size, 1)) ** (1.0 / remaining)))
        k_floor = max(2, min(k_floor, k_cap))
        k = best_k(sub, n, k_min=k_floor, k_max=k_cap)
        if k <= 1:
            k = min(k_floor, k_cap) if n > min_cluster_size else 1
        if k <= 1:
            attach_leaves(parent_id, idx)
            return

        cluster_labels = _cluster(sub, k)
        for c in range(k):
            mask = cluster_labels == c
            members = idx[mask]
            if len(members) == 0:
                continue
            if len(members) == 1:  # don't create singleton internal nodes
                _add_child(nodes, parent_id, int(row_to_node[members[0]]))
                continue
            mset = set(members.tolist())
            member_texts = [label_list[i] for i in members]
            sibling_texts = [label_list[i] for i in idx if i not in mset]
            # data-driven contrastive-term labelling
            label = cluster_term_label(member_texts, sibling_texts, used)
            nid = _next_id(nodes)
            nodes.append(_make_agg(nid, label,
                                   desc=f'Cluster of {len(members)} variables — '
                                        f'label terms: {label}'))
            _add_child(nodes, parent_id, nid)
            # cluster_term_label compares single lower-case words against
            # `used`, so each term of a joined label ('A / B') is recorded
            # separately. Adding the joined string as one entry never matched
            # anything, which let descendants repeat their ancestors' terms.
            child_used = used | {term.lower() for term in label.split(' / ')}
            recurse(nid, members, depth + 1, child_used)

    recurse(0, np.arange(len(can)), 0, set())

    # Normalise child ids to int and drop duplicates, preserving order.
    for n in nodes:
        n['related'] = list(dict.fromkeys(int(x) for x in n.get('related', [])))
    return nodes
549
+
550
+ # ─────────────────────────────────────────────────────────────────────────────
551
+ # VISUALISATION
552
+ # ─────────────────────────────────────────────────────────────────────────────
553
+ def _parent_map(nodes: list) -> dict:
554
+ pm: dict = {}
555
+ for n in nodes:
556
+ for c in n.get('related', []):
557
+ if int(c) not in pm:
558
+ pm[int(c)] = int(n['id'])
559
+ return pm
560
+
561
+ # ─────────────────────────────────────────────────────────────────────────────
562
+ # EVALUATION HELPERS
563
+ # ─────────────────────────────────────────────────────────────────────────────
564
def _eval_cluster_assignments(nodes: list, can: pd.DataFrame) -> list[int]:
    """Return, for each row of *can*, the id of its ancestor directly below the root.

    A leaf attached straight to the root (id 0), or with no recorded parent,
    is its own cluster. Rows whose '_leaf_id' matches no attribute node get -1.
    """
    parent_of = _parent_map(nodes)

    def top_level_ancestor(node_id: int) -> int:
        # Climb until the parent is the root (0) or missing (-1).
        parent = parent_of.get(node_id, -1)
        while parent != -1 and parent != 0:
            node_id = parent
            parent = parent_of.get(node_id, -1)
        return node_id

    node_for_leaf = {}
    for node in nodes:
        if node.get('type') == 'attribute' and 'metadata' in node:
            node_for_leaf[node['metadata']['leaf_id']] = int(node['id'])

    assignments = []
    for leaf_id in can['_leaf_id']:
        if leaf_id in node_for_leaf:
            assignments.append(top_level_ancestor(node_for_leaf[leaf_id]))
        else:
            assignments.append(-1)
    return assignments
576
+
577
+ def _purity(y_true, y_pred) -> float:
578
+ from collections import Counter
579
+ clusters: dict = {}
580
+ for t, p in zip(y_true, y_pred):
581
+ clusters.setdefault(p, []).append(t)
582
+ correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values())
583
+ return correct / max(len(y_true), 1)
584
+
585
def _structural_stats(nodes: list) -> dict:
    """Summarise the tree shape.

    Reports the number of aggregation nodes, the maximum and mean leaf depth,
    the mean number of children per aggregation node, and the percentage of
    aggregation nodes that have exactly one child.
    """
    parent_of = _parent_map(nodes)

    def depth_of(node_id: int) -> int:
        steps = 0
        while node_id in parent_of:
            node_id = parent_of[node_id]
            steps += 1
        return steps

    aggregations = [node for node in nodes if node.get('type') == 'aggregation']
    attributes = [node for node in nodes if node.get('type') == 'attribute']
    leaf_depths = [depth_of(int(node['id'])) for node in attributes]
    fan_outs = [len(node.get('related', [])) for node in aggregations]
    single_child = fan_outs.count(1)

    mean_depth = round(float(np.mean(leaf_depths)), 2) if leaf_depths else 0.0
    mean_fan_out = round(float(np.mean(fan_outs)), 2) if fan_outs else 0.0
    return {
        'n_aggregation_nodes': len(aggregations),
        'max_depth': int(max(leaf_depths, default=0)),
        'avg_leaf_depth': mean_depth,
        'avg_branching_factor': mean_fan_out,
        'singleton_nodes_%': round(100.0 * single_child / max(len(aggregations), 1), 1),
    }
604
+
605
+ def _wrap(text: str, width: int = 70) -> str:
606
+ """Wrap long hover text onto multiple <br> lines so it never runs off-screen."""
607
+ import textwrap
608
+ text = str(text).replace('<', '&lt;')
609
+ lines: list = []
610
+ for para in text.split('\n'):
611
+ wrapped = textwrap.wrap(para, width=width) or ['']
612
+ lines.extend(wrapped)
613
+ return '<br>'.join(lines)
614
+
615
def plot_sunburst(nodes: list, max_depth: int = 4) -> go.Figure:
    """Render the hierarchy as a Plotly sunburst chart.

    Each sector is sized by the number of attribute leaves below the node
    (minimum 1) and shows a hover card with the node name, type, leaf count
    and wrapped description. *max_depth* is passed to the trace as ``maxdepth``.
    The node with id 0 is treated as the root (empty parent).
    """
    pm = _parent_map(nodes)
    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n['id'])
        lc = len(_leaf_ids(nodes, nid))
        ids.append(str(nid))
        labels.append(str(n.get('name', ''))[:40])  # keep sector labels short
        parents.append('' if nid == 0 else str(pm.get(nid, 0)))
        values.append(max(1, lc))
        desc = _wrap(n.get('desc', ''))
        hover.append(f'<b>{_wrap(n.get("name",""))}</b><br>Type: {n.get("type","")}'
                     f'<br>Variables: {lc}<br><br>{desc}')
    fig = go.Figure(go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        maxdepth=max_depth, insidetextorientation='radial',
        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
    ))
    # The title previously contained mis-decoded bytes in place of the em dash.
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
                      title='Click a sector to drill down — click centre to go back')
    return fig
637
+
638
def plot_treemap(nodes: list) -> go.Figure:
    """Render the hierarchy as a Plotly treemap.

    Each tile is sized by the number of attribute leaves below the node
    (minimum 1) and shows a hover card with the node name, leaf count and
    wrapped description. The node with id 0 is treated as the root.
    """
    parent_of = _parent_map(nodes)
    columns: dict = {'ids': [], 'labels': [], 'parents': [], 'values': [], 'hovertext': []}
    for node in nodes:
        node_id = int(node['id'])
        leaf_count = len(_leaf_ids(nodes, node_id))
        if node_id == 0:
            parent = ''
        else:
            parent = str(parent_of.get(node_id, 0))
        description = _wrap(node.get('desc', ''))
        heading = _wrap(node.get("name", ""))

        columns['ids'].append(str(node_id))
        columns['labels'].append(str(node.get('name', ''))[:40])
        columns['parents'].append(parent)
        columns['values'].append(max(1, leaf_count))
        columns['hovertext'].append(
            f'<b>{heading}</b><br>Variables: {leaf_count}<br>{description}')

    trace = go.Treemap(
        branchvalues='total', hoverinfo='text', textinfo='label+value',
        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
        **columns,
    )
    fig = go.Figure(trace)
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
    return fig
658
+
659
+ # ─────────────────────────────────────────────────────────────────────────────
660
+ # NODE-LINK TREE (Reingold–Tilford layout — matches Approach 1 / 2 interface)
661
+ # ─────────────────────────────────────────────────────────────────────────────
662
+ def _bl_node_color(n: dict) -> str:
663
+ t = n.get('type', '')
664
+ if t == 'root': return '#2a7d2a'
665
+ if t == 'attribute': return '#74c476'
666
+ if t == 'collapsed': return '#bbbbbb'
667
+ return '#238b45'
668
+
669
def _display_graph(nodes: list, max_depth: int = 4):
    """Collect the nodes and edges to draw, cutting the tree at ``max_depth``.

    The walk starts at node 0. A node reached at ``max_depth`` or deeper that
    still has children is not expanded; it gets a single synthetic
    'collapsed' child whose name reports the variable count below it.

    Returns:
        ``(display_nodes, edges)`` where ``edges`` is a list of
        ``(parent_id, child_id)`` tuples in depth-first order.
    """
    by_id = _nmap(nodes)
    shown: dict = {}
    links: list = []
    # Placeholder ids start above 10**9, presumably to stay clear of real
    # node ids — TODO confirm against the id scheme.
    placeholder_id = 10 ** 9

    def walk(node_id, level):
        nonlocal placeholder_id
        key = int(node_id)
        node = by_id.get(key)
        if not node:
            return
        shown[key] = node

        if level < max_depth or not node.get('related'):
            # Expand normally, ignoring child ids that have no node record.
            for child in node.get('related', []):
                child_key = int(child)
                if child_key in by_id:
                    links.append((key, child_key))
                    walk(child_key, level + 1)
            return

        # Depth limit reached with children remaining: add a placeholder.
        placeholder_id += 1
        stub_id = placeholder_id
        hidden = len(_leaf_ids(nodes, node_id))
        shown[stub_id] = {'id': stub_id, 'name': f'… {hidden} variables',
                          'type': 'collapsed', 'related': [],
                          'desc': f"Collapsed: {node.get('name')}", 'isShown': True}
        links.append((key, stub_id))

    walk(0, 0)
    return list(shown.values()), links
700
+
701
+ def _positions(edges: list):
702
+ """Reingold–Tilford style positions: x = depth, y = subtree-aware vertical."""
703
+ H_SCALE, V_SPACE = 3.0, 1.8
704
+ children: dict = defaultdict(list)
705
+ for p, c in edges:
706
+ children[p].append(c)
707
+ pos: dict = {}
708
+ counter = {'v': 0}
709
+
710
+ def rec(nid, depth):
711
+ ch = children.get(nid, [])
712
+ if not ch:
713
+ y = counter['v'] * V_SPACE
714
+ counter['v'] += 1
715
+ pos[nid] = (depth * H_SCALE, y)
716
+ return y
717
+ y = float(np.mean([rec(c, depth + 1) for c in ch]))
718
+ pos[nid] = (depth * H_SCALE, y)
719
+ return y
720
+
721
+ rec(0, 0)
722
+ return pos
723
+
724
def plot_node_link(nodes: list, max_depth: int = 4, show_leaf_labels: bool = False) -> go.Figure:
    """Draw the hierarchy as a left-to-right node-link tree with elbow edges.

    Best for inspecting structure at moderate depth; Sunburst is recommended
    for large hierarchies (Taxonomizer).

    Args:
        nodes: Hierarchy node dicts.
        max_depth: Depth at which branches are replaced by a 'collapsed'
            placeholder (passed to ``_display_graph``).
        show_leaf_labels: When True, draw names next to leaf markers;
            otherwise leaves are labelled only on hover.

    Returns:
        A Plotly figure with one edge trace and up to two marker traces
        (internal/collapsed nodes and leaves).
    """
    dnodes, edges = _display_graph(nodes, max_depth)
    pos = _positions(edges)

    # Each edge is an elbow: horizontal, vertical, horizontal. The trailing
    # None breaks the polyline between edges.
    ex, ey = [], []
    for p, c in edges:
        if p not in pos or c not in pos:
            continue
        x0, y0 = pos[p]
        x1, y1 = pos[c]
        xm = (x0 + x1) / 2
        ex += [x0, xm, xm, x1, None]
        ey += [y0, y0, y1, y1, None]
    traces = [go.Scatter(x=ex, y=ey, mode='lines',
                         line=dict(width=1, color='#c8c8c8'),
                         hoverinfo='skip', showlegend=False)]

    agg_x, agg_y, agg_l, agg_c, agg_h = [], [], [], [], []
    lf_x, lf_y, lf_l, lf_c, lf_h = [], [], [], [], []
    for n in dnodes:
        nid = int(n['id'])
        if nid not in pos:
            continue
        x, y = pos[nid]
        lc = len(_leaf_ids(nodes, nid))
        lab = str(n.get('name', nid))
        htxt = (f"<b>{_wrap(n.get('name',''))}</b><br>Type: {n.get('type','')}"
                f"<br>Variables: {lc}<br><br>{_wrap(n.get('desc',''))}")
        col = _bl_node_color(n)
        if n.get('type') in ('root', 'aggregation', 'collapsed'):
            agg_x.append(x)
            agg_y.append(y)
            agg_l.append((lab + (f' ({lc})' if lc else ''))[:50])
            agg_c.append(col)
            agg_h.append(htxt)
        else:
            lf_x.append(x)
            lf_y.append(y)
            lf_l.append(lab[:40] if show_leaf_labels else '')
            lf_c.append(col)
            lf_h.append(htxt)

    if agg_x:
        traces.append(go.Scatter(
            x=agg_x, y=agg_y, mode='markers+text', text=agg_l,
            textposition='middle right', hovertext=agg_h, hoverinfo='text',
            marker=dict(size=16, color=agg_c, line=dict(color='white', width=2)),
            showlegend=False))
    if lf_x:
        traces.append(go.Scatter(
            x=lf_x, y=lf_y, mode='markers+text', text=lf_l,
            textposition='middle right', hovertext=lf_h, hoverinfo='text',
            marker=dict(size=7, color=lf_c, symbol='circle', opacity=0.75,
                        line=dict(color='white', width=1)),
            showlegend=False))

    # Size the figure by the number of occupied rows. Every childless node in
    # the display graph takes one row, and that includes 'collapsed'
    # placeholders. Counting attribute leaves alone left a depth-limited view
    # at the minimum height with overlapping labels.
    has_children = {p for p, _ in edges}
    terminal_rows = sum(1 for nid in pos if nid not in has_children)
    n_rows = max(12, len(lf_x), terminal_rows)
    fig = go.Figure(traces)
    fig.update_layout(
        height=max(700, min(4000, int(n_rows * 32))),
        margin=dict(l=20, r=220, t=30, b=20),
        plot_bgcolor='white', paper_bgcolor='white',
        xaxis=dict(visible=False, fixedrange=False),
        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
        dragmode='pan')
    return fig
787
+
788
+ # ─────────────────────────────────────────────────────────────────────────────
789
+ # SIDEBAR
790
+ # ─────────────────────────────────────────────────────────────────────────────
791
# Sidebar: file upload, Taxonomizer parameters and display limits.
_sidebar = st.sidebar

_sidebar.header('1. Upload')
uploaded = _sidebar.file_uploader(
    'Upload a metadata file',
    type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
    accept_multiple_files=False,
)

_sidebar.header('2. Taxonomizer settings')
tx_max_depth = _sidebar.slider(
    'Max taxonomy depth', min_value=2, max_value=6, value=3, step=1,
    help='How many abstract-to-concrete levels to build')
tx_min_size = _sidebar.slider(
    'Min cluster size', min_value=3, max_value=20, value=6, step=1,
    help='Clusters smaller than this stop splitting (leaves attach directly)')
tx_branch = _sidebar.slider(
    'Max branches per node', min_value=3, max_value=12, value=8, step=1,
    help='Upper bound on clusters per split; the actual number is chosen by silhouette')

_sidebar.header('3. Display')
max_items = _sidebar.slider(
    'Maximum variables', min_value=25, max_value=1200, value=900, step=25,
    help='Cap on variables included (lower only to speed up very large files). '
         'Default keeps full datasets like HCP (813).')
group_filter = _sidebar.text_input(
    'Row filter (optional)', value='',
    help='Filter rows by contextual path text before building')
812
+
813
+ # ─────────────────────────────────────────────────────────────────────────────
814
+ # MAIN
815
+ # ─────────────────────────────────────────────────────────────────────────────
816
# Landing page: shown until a file is uploaded, then the script run stops.
# The markdown previously contained mis-encoded characters (em dash, section
# sign, arrow) that rendered as garbage in the method table.
if not uploaded:
    st.info('Upload a metadata CSV / XLSX / JSON file to begin.')
    st.markdown("""
### Baseline algorithm — Taxonomizer (semantic space)

Based on **Mahmood & Mueller, IEEE TVCG 2019** (Taxonomizer), adapted to a
metadata-only setting. No hardcoded domain patterns, no external APIs.

| Step | Method | Paper |
|------|--------|-------|
| Variable representation | **short attribute name** (description's name clause; codes are OOV) | Taxonomizer §3.2 / §4.1 |
| Embedding | Word2Vec skip-gram — average of word vectors (`glove-wiki-gigaword-100`) | Taxonomizer §3.2 |
| Semantic space | Cosine-distance matrix (no data space — schema has no raw values) | Taxonomizer §3.2 *(adapted)* |
| Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
| Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* |

This page is the pure Taxonomizer-style semantic-space reference method:
variable meanings are embedded and recursively clustered into a hierarchy,
with node labels generated from contrastive terms.

**Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.

**Approach 2** adds NMF/FASTopic aspect discovery + GMM clustering + optional LLM labels.
""")
    st.stop()

# Persist the upload and keep the path that save_upload returns.
path = save_upload(uploaded)
843
+
844
@st.cache_data(show_spinner=False)
def _load_profile(path_str: str):
    """Load the file at ``path_str`` and detect its column roles.

    Results are cached by Streamlit, keyed on ``path_str``.
    NOTE(review): the key is the path only; presumably ``save_upload`` yields
    a distinct path whenever the content differs — confirm.

    Returns:
        ``(dataframe, detected_config, column_profile)`` as produced by
        ``load_any`` and ``detect_roles``.
    """
    frame = load_any(Path(path_str))
    detected_cfg, profile = detect_roles(frame)
    return frame, detected_cfg, profile
849
+
850
# Load the uploaded file and its auto-detected column roles.
with st.spinner('Loading file…'):
    df, auto_cfg, prof = _load_profile(str(path))

st.subheader('Step 1 — File preview')
with st.expander(f'{uploaded.name}  ({len(df):,} rows, {len(df.columns)} columns)',
                 expanded=False):
    st.dataframe(df.head(10), use_container_width=True)
    # Show only the score columns the profile actually has.
    score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
                  if c in prof.columns]
    if score_cols:
        _score_view = prof[score_cols]
        # Sorting by 'leaf_score' used to be unconditional and raised
        # KeyError when the profile lacked that column.
        if 'leaf_score' in score_cols:
            _score_view = _score_view.sort_values('leaf_score', ascending=False)
        st.dataframe(_score_view, use_container_width=True)

st.subheader('Step 2 — Confirm column roles')
cols = list(df.columns)
# Scope widget keys to the uploaded file so a NEW file always shows its own
# auto-detected defaults (Streamlit otherwise keeps the previous file's
# selections under a fixed key, which silently overrides the new defaults).
_fk = safe_name(uploaded.name)
with st.expander('Column configuration', expanded=True):
    left, right = st.columns(2)
    with left:
        leaf_cols = st.multiselect(
            'Leaf variable column(s)', cols,
            default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols],
            key=f'leaf_{_fk}')
        group_cols = st.multiselect(
            'Context column(s) (optional)', cols,
            default=[c for c in auto_cfg.get('group_cols', []) if c in cols],
            key=f'group_{_fk}',
            help='Optional contextual columns for display/filtering.')
    with right:
        text_cols = st.multiselect(
            'Text/description column(s)', cols,
            default=[c for c in auto_cfg.get('text_cols', []) if c in cols],
            key=f'text_{_fk}')
        meta_cols = st.multiselect(
            'Metadata/type column(s)', cols,
            default=[c for c in auto_cfg.get('metadata_cols', []) if c in cols],
            key=f'meta_{_fk}')

# A hierarchy cannot be built without at least one leaf column.
if not leaf_cols:
    st.error('Choose at least one leaf variable column.')
    st.stop()

cfg = {'leaf_cols': leaf_cols, 'group_cols': group_cols,
       'text_cols': text_cols, 'metadata_cols': meta_cols}
888
+
889
# Build step: runs only on the click; results are kept in session state so
# they survive the reruns triggered by other widgets.
if st.button('Build baseline hierarchy', type='primary'):
    # Load the word-vector model (cached after the first call).
    with st.spinner('Loading Word2Vec model (first run downloads ~66 MB)…'):
        _w2v = _load_w2v()
    if _w2v is None:
        st.stop()

    with st.spinner('Building hierarchy…'):
        _can = build_canonical(df, cfg, source=Path(uploaded.name).stem)

        _filter_text = group_filter.strip()
        if _filter_text:
            import re

            # The filter box takes free text. Keep regex matching when the
            # text is a valid pattern, and match it literally otherwise,
            # so input such as "(" or "C++" no longer crashes the app.
            try:
                re.compile(_filter_text)
                _as_regex = True
            except re.error:
                _as_regex = False
            _can = _can[_can['_group_path'].str.contains(
                _filter_text, case=False, na=False, regex=_as_regex)].copy()

        # Apply the sidebar cap on the number of variables.
        if len(_can) > max_items:
            _can = _can.head(max_items).copy()

        _can = _can.reset_index(drop=True)

        if len(_can) < 2:
            st.error('Need at least 2 variables after filtering.')
            st.stop()

        _pname = Path(uploaded.name).stem
        _nodes = build_hierarchy(_can, _w2v, project=_pname,
                                 max_depth=tx_max_depth,
                                 min_cluster_size=tx_min_size,
                                 branch_max=tx_branch)

    st.session_state['_bl_nodes'] = _nodes
    st.session_state['_bl_can'] = _can
    st.session_state['_bl_project'] = _pname

# Nothing built yet in this session: prompt and stop the run.
if '_bl_nodes' not in st.session_state:
    st.info('Configure columns above then click **Build baseline hierarchy**.')
    st.stop()
925
+
926
# Pull the most recent build out of session state.
nodes, can, project_name = (st.session_state[_key]
                            for _key in ('_bl_nodes', '_bl_can', '_bl_project'))

_sm = _structural_stats(nodes)
n_leaves = sum(1 for n in nodes if n['type'] == 'attribute')
n_internal = sum(1 for n in nodes if n['type'] == 'aggregation')

st.divider()
# Headline numbers, one per column.
_headline = [
    ('Variables', n_leaves),
    ('Aggregation nodes', n_internal),
    ('Max depth', _sm['max_depth']),
    ('Avg branching', _sm['avg_branching_factor']),
]
for _column, (_label, _value) in zip(st.columns(4), _headline):
    _column.metric(_label, _value)

tabs = st.tabs(['Visualization', 'Node detail', 'Canonical table', 'Export', 'Evaluation'])
942
+
943
with tabs[0]:
    # Visualization controls sit above the chart, matching Approach 1 / 2.
    vc1, vc2, vc3 = st.columns([3, 2, 1])
    viz_mode = vc1.radio(
        'View mode',
        ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
        horizontal=True, index=0,
        help='Sunburst best for large hierarchies [Taxonomizer]. '
             'Node-link best for inspecting structure at moderate depth.')
    display_depth = vc2.slider(
        'Depth (Level of Detail)', min_value=1, max_value=8, value=4, step=1,
        help='How many levels to reveal at once.')
    show_leaf_labels = vc3.checkbox(
        'Leaf labels', value=False,
        help='Show variable names on the node-link tree.')
    st.divider()

    # Build the figure for the chosen mode, then render it once.
    _is_sunburst = viz_mode == 'Sunburst (drill-down)'
    if _is_sunburst:
        _figure = plot_sunburst(nodes, max_depth=display_depth)
    elif viz_mode == 'Treemap':
        _figure = plot_treemap(nodes)
    else:
        _figure = plot_node_link(nodes, max_depth=display_depth,
                                 show_leaf_labels=show_leaf_labels)
    st.plotly_chart(_figure, use_container_width=True)
    if _is_sunburst:
        st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')
971
+
972
with tabs[1]:
    # Node detail: choose an internal node and list the variables under it.
    nm = _nmap(nodes)
    agg_nodes = [n for n in nodes if n['type'] in ('aggregation', 'root')]
    # Leaf lists are computed once and reused for the label and the table.
    _agg_leaves = [_leaf_ids(nodes, int(n['id'])) for n in agg_nodes]
    options = [f'{n["name"]}  [{len(_lids)} vars]'
               for n, _lids in zip(agg_nodes, _agg_leaves)]
    if options:
        # Select by position, not by parsing the label back into a name.
        # Generated labels can repeat, and a name may itself contain ' [',
        # so the name lookup could show a different node than the one chosen.
        _sel_idx = st.selectbox('Select a node', range(len(options)),
                                format_func=options.__getitem__)
        sel_node = agg_nodes[_sel_idx]
        lids = _agg_leaves[_sel_idx]
        leaf_ids_set = {nm[i]['metadata']['leaf_id']
                        for i in lids if i in nm and 'metadata' in nm[i]}
        sub = can[can['_leaf_id'].isin(leaf_ids_set)]
        st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
        st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
                     use_container_width=True)

with tabs[2]:
    # Canonical table, without the '_group_path' column when it is present.
    st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), use_container_width=True)
992
+
993
with tabs[3]:
    _base = safe_name(project_name)
    # File names and the JSON payload are shared by the download buttons and
    # the save-to-folder action, so build them once.
    _json_name = f'{_base}_baseline_hierarchy.json'
    _csv_name = f'{_base}_baseline_canonical.csv'
    _json_text = json.dumps(nodes, indent=2, ensure_ascii=False)

    col1, col2 = st.columns(2)
    with col1:
        st.download_button(
            'Hierarchy JSON',
            data=_json_text.encode('utf-8'),
            file_name=_json_name,
            mime='application/json',
            use_container_width=True,
        )
    with col2:
        st.download_button(
            'Canonical CSV',
            data=can.to_csv(index=False).encode('utf-8'),
            file_name=_csv_name,
            mime='text/csv',
            use_container_width=True,
        )

    st.divider()
    # Save directly into the outputs/baseline/ folder next to this script.
    _out_dir = Path(__file__).resolve().parent / 'outputs' / 'baseline'
    st.markdown('### Save to project folder')
    # The caption previously contained a mis-encoded em dash.
    st.caption(
        "The download buttons above go to your browser's Downloads folder (a browser "
        f'restriction). This button instead writes the files into `{_out_dir}` with the '
        'dataset name — convenient for `evaluate_all.py`.'
    )
    if st.button('Save all to outputs/baseline/', type='primary',
                 use_container_width=True):
        # Any failure is reported in the UI instead of aborting the page.
        try:
            _out_dir.mkdir(parents=True, exist_ok=True)
            (_out_dir / _json_name).write_text(_json_text, encoding='utf-8')
            can.to_csv(_out_dir / _csv_name, index=False)
            st.success(f'Saved to `{_out_dir}`:\n\n'
                       f'- {_json_name}\n'
                       f'- {_csv_name}')
        except Exception as _e:
            st.error(f'Could not save: {_e}')
1034
+
1035
with tabs[4]:
    # Imported here, so the evaluation module is loaded only once a
    # hierarchy exists.
    import hierarchy_eval as he

    st.subheader('Hierarchy Quality Evaluation')
    st.caption(
        'No manually curated reference taxonomy is available for this experiment. '
        'The metrics below are reference-free: they assess hierarchy structure, '
        'label coherence and interpretability directly.'
    )

    with st.spinner('Computing reference-free metrics…'):
        tm = he.traco_metrics(nodes)
        npmi = he.npmi_coherence(nodes, can['_text'].tolist())

    # Primary: reference-free hierarchy quality. Headings and the range
    # caption previously contained mis-encoded dash, minus and
    # approximately-equal signs.
    st.markdown('#### Primary — reference-free hierarchy quality')
    p1, p2, p3 = st.columns(3)
    p1.metric('Parent–child coherence', tm['pc_coherence'],
              help='TraCo (Wu et al., AAAI 2024). Mean similarity of each node to its parent. '
                   'Higher = children correctly nest under their parent theme.')
    p2.metric('Sibling diversity', tm['sibling_diversity'],
              help='TraCo (Wu et al., AAAI 2024). Mean distance between sibling nodes. '
                   'Higher = siblings are distinct (LOW = redundant/repeated siblings).')
    p3.metric('NPMI label coherence', npmi,
              help='Lau et al., EACL 2014. Whether node-label terms genuinely co-occur in the '
                   'data. Higher = meaningful labels, not arbitrary term salads.')
    st.caption(f'Embedding backend: **{tm["encoder"]}**. '
               'Coherence & diversity ∈ [−1, 1]; NPMI ∈ ≈[−1, 1].')

    # Label-quality proxies (interpretability).
    st.markdown('#### Label quality  *(interpretability — reference-free)*')
    lq = he.label_quality(nodes)
    l1, l2, l3 = st.columns(3)
    l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%",
              help='% of internal labels that read as a real concept (short noun '
                   'phrase, WordNet head) rather than a "/"-joined term fragment.')
    l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%",
              help='% of internal labels duplicating a sibling label (lower is better).')
    l3.metric('Avg label words', lq['avg_label_words'],
              help='Mean label length in words (shorter = more name-like).')

    # Structural metrics.
    st.markdown('#### Structural statistics')
    sm = he.structural_stats(nodes)
    s1, s2, s3, s4, s5 = st.columns(5)
    s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
    s2.metric('Max leaf depth', sm['max_depth'])
    s3.metric('Avg leaf depth', sm['avg_leaf_depth'])
    s4.metric('Avg branching', sm['avg_branching_factor'])
    s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%",
              help='Aggregation nodes with a single child (sparse-hierarchy indicator)')
1086
+
version2/data/HCP_S1200_DataDictionary_Oct_30_2023.csv ADDED
The diff for this file is too large to render. See raw diff
 
version2/data/ai-mind-variable-descriptions(in).csv ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Task,Variant,name,description,Decimal Places
2
+ DMS,DMS Recommended Standard,DMSCC,"DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays).",2
3
+ DMS,DMS Recommended Standard,DMSL0SD,"DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay.",4
4
+ DMS,DMS Recommended Standard,DMSL12SD,"DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay.",4
5
+ DMS,DMS Recommended Standard,DMSL4SD,"DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay.",4
6
+ DMS,DMS Recommended Standard,DMSLADSD,"DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay.",4
7
+ DMS,DMS Recommended Standard,DMSLSD,DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,4
8
+ DMS,DMS Recommended Standard,DMSLSSD,"DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations.",4
9
+ DMS,DMS Recommended Standard,DMSMDL,DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,4
10
+ DMS,DMS Recommended Standard,DMSMDL0,DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay.,4
11
+ DMS,DMS Recommended Standard,DMSMDL12,DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay.,4
12
+ DMS,DMS Recommended Standard,DMSMDL4,DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay.,4
13
+ DMS,DMS Recommended Standard,DMSMDLAD,DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay.,4
14
+ DMS,DMS Recommended Standard,DMSMDLS,DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation.,4
15
+ DMS,DMS Recommended Standard,DMSML,DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,4
16
+ DMS,DMS Recommended Standard,DMSML0,DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay.,4
17
+ DMS,DMS Recommended Standard,DMSML12,DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay.,4
18
+ DMS,DMS Recommended Standard,DMSML4,DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay.,4
19
+ DMS,DMS Recommended Standard,DMSMLAD,DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay.,4
20
+ DMS,DMS Recommended Standard,DMSMLS,DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation.,4
21
+ DMS,DMS Recommended Standard,DMSPC,DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays).,0
22
+ DMS,DMS Recommended Standard,DMSPC0,KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay.,0
23
+ DMS,DMS Recommended Standard,DMSPC12,KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay.,0
24
+ DMS,DMS Recommended Standard,DMSPC4,KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay.,0
25
+ DMS,DMS Recommended Standard,DMSPCAD,KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay.,0
26
+ DMS,DMS Recommended Standard,DMSPCS,KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli.,0
27
+ DMS,DMS Recommended Standard,DMSPEGC,DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays).,4
28
+ DMS,DMS Recommended Standard,DMSPEGE,KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays).,4
29
+ DMS,DMS Recommended Standard,DMSTC,DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays).,0
30
+ DMS,DMS Recommended Standard,DMSTC0,DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds.,0
31
+ DMS,DMS Recommended Standard,DMSTC12,DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds.,0
32
+ DMS,DMS Recommended Standard,DMSTC4,DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds.,0
33
+ DMS,DMS Recommended Standard,DMSTCAD,DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay.,0
34
+ DMS,DMS Recommended Standard,DMSTCS,DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli.,0
35
+ DMS,DMS Recommended Standard,DMSTE,"DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen.",0
36
+ DMS,DMS Recommended Standard,DMSTEAD,DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component.,0
37
+ DMS,DMS Recommended Standard,DMSTEC,"DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays).",0
38
+ DMS,DMS Recommended Standard,DMSTECAD,"DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component.",0
39
+ DMS,DMS Recommended Standard,DMSTED,"DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays).",0
40
+ DMS,DMS Recommended Standard,DMSTEDAD,"DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component.",0
41
+ DMS,DMS Recommended Standard,DMSTEP,"DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays).",0
42
+ DMS,DMS Recommended Standard,DMSTEPAD,"DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component.",0
43
+ MOT,MOT Tone 2.0,MOTML,The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials.,1
44
+ MOT,MOT Tone 2.0,MOTSDL,"This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials.",2
45
+ MOT,MOT Tone 2.0,MOTTC,The total number of assessment trials on which the subject made a correct response.,0
46
+ MOT,MOT Tone 2.0,MOTTE,The total number of assessment trials on which the subject failed to make a correct response.,0
47
+ PAL,PAL Recommended Standard Extended,PALFAMS28,"KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard.",0
48
+ PAL,PAL Recommended Standard Extended,PALMETS28,PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage. Does not include 12 box level to provide a direct comparison to Recommended Standard.,0
49
+ PAL,PAL Recommended Standard Extended,PALNPR28,PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached.,0
50
+ PAL,PAL Recommended Standard Extended,PALTA12,PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall.,0
51
+ PAL,PAL Recommended Standard Extended,PALTA2,PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall.,0
52
+ PAL,PAL Recommended Standard Extended,PALTA28,PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems. Does not include 12 box level to provide a direct comparison to Recommended Standard.,0
53
+ PAL,PAL Recommended Standard Extended,PALTA4,PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall.,0
54
+ PAL,PAL Recommended Standard Extended,PALTA6,PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall.,0
55
+ PAL,PAL Recommended Standard Extended,PALTA8,PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall.,0
56
+ PAL,PAL Recommended Standard Extended,PALTE12,PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials.,0
57
+ PAL,PAL Recommended Standard Extended,PALTE2,PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials.,0
58
+ PAL,PAL Recommended Standard Extended,PALTE28,PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials. Does not include 12 box level to provide a direct comparison to Recommended Standard.,0
59
+ PAL,PAL Recommended Standard Extended,PALTE4,PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials.,0
60
+ PAL,PAL Recommended Standard Extended,PALTE6,PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials.,0
61
+ PAL,PAL Recommended Standard Extended,PALTE8,PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials.,0
62
+ PAL,PAL Recommended Standard Extended,PALTEA12,"PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach.",0
63
+ PAL,PAL Recommended Standard Extended,PALTEA2,"PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach.",0
64
+ PAL,PAL Recommended Standard Extended,PALTEA28,"KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task. In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard.",0
65
+ PAL,PAL Recommended Standard Extended,PALTEA4,"PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach.",0
66
+ PAL,PAL Recommended Standard Extended,PALTEA6,"PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach.",0
67
+ PAL,PAL Recommended Standard Extended,PALTEA8,"PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach.",0
68
+ PRM,PRM Recommended Standard 18 Extended,PRMCLSDD,"PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds.",2
69
+ PRM,PRM Recommended Standard 18 Extended,PRMCLSDI,"PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds.",2
70
+ PRM,PRM Recommended Standard 18 Extended,PRMMCLD,"PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds.",2
71
+ PRM,PRM Recommended Standard 18 Extended,PRMMCLI,"PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds.",2
72
+ PRM,PRM Recommended Standard 18 Extended,PRMMDCLD,"PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds.",2
73
+ PRM,PRM Recommended Standard 18 Extended,PRMMDCLI,"PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds.",2
74
+ PRM,PRM Recommended Standard 18 Extended,PRMPCD,"KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage.",2
75
+ PRM,PRM Recommended Standard 18 Extended,PRMPCI,"KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage.",2
76
+ PRM,PRM Recommended Standard 18 Extended,PRMTSDSP,PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition.,2
77
+ RVP,RVP 3 Targets,RVPA,"KEY: RVP A': A' (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences.",4
78
+ RVP,RVP 3 Targets,RVPLSD,RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials.,4
79
+ RVP,RVP 3 Targets,RVPMDL,KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials.,4
80
+ RVP,RVP 3 Targets,RVPML,RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials.,4
81
+ RVP,RVP 3 Targets,RVPPFA,KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms ÷ (False Alarms + Correct Rejections)),4
82
+ RVP,RVP 3 Targets,RVPPH,"RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits ÷ total number of sequences)",4
83
+ RVP,RVP 3 Targets,RVPTFA,RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms.,0
84
+ RVP,RVP 3 Targets,RVPTH,RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks.,0
85
+ RVP,RVP 3 Targets,RVPTM,RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks.,0
86
+ SWM,SWM Recommended Standard 2.0 Extended,SWMBE12,KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only.,0
87
+ SWM,SWM Recommended Standard 2.0 Extended,SWMBE4,KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only.,0
88
+ SWM,SWM Recommended Standard 2.0 Extended,SWMBE468,"KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials.",0
89
+ SWM,SWM Recommended Standard 2.0 Extended,SWMBE6,KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only.,0
90
+ SWM,SWM Recommended Standard 2.0 Extended,SWMBE8,KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only.,0
91
+ SWM,SWM Recommended Standard 2.0 Extended,SWMDE12,SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only.,0
92
+ SWM,SWM Recommended Standard 2.0 Extended,SWMDE4,SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only.,0
93
+ SWM,SWM Recommended Standard 2.0 Extended,SWMDE468,"SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials.",0
94
+ SWM,SWM Recommended Standard 2.0 Extended,SWMDE6,SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only.,0
95
+ SWM,SWM Recommended Standard 2.0 Extended,SWMDE8,SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only.,0
96
+ SWM,SWM Recommended Standard 2.0 Extended,SWMPR,"SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete.",0
97
+ SWM,SWM Recommended Standard 2.0 Extended,SWMS,"KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens.",0
98
+ SWM,SWM Recommended Standard 2.0 Extended,SWMS6,"SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes.",0
99
+ SWM,SWM Recommended Standard 2.0 Extended,SWMSX,"SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more.",0
100
+ SWM,SWM Recommended Standard 2.0 Extended,SWMTE12,"SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only.",0
101
+ SWM,SWM Recommended Standard 2.0 Extended,SWMTE4,"SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only.",0
102
+ SWM,SWM Recommended Standard 2.0 Extended,SWMTE468,"SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials.",0
103
+ SWM,SWM Recommended Standard 2.0 Extended,SWMTE6,"SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only.",0
104
+ SWM,SWM Recommended Standard 2.0 Extended,SWMTE8,"SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only.",0
105
+ SWM,SWM Recommended Standard 2.0 Extended,SWMWE12,SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only.,0
106
+ SWM,SWM Recommended Standard 2.0 Extended,SWMWE4,SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only.,0
107
+ SWM,SWM Recommended Standard 2.0 Extended,SWMWE468,"SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials.",0
108
+ SWM,SWM Recommended Standard 2.0 Extended,SWMWE6,SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only.,0
109
+ SWM,SWM Recommended Standard 2.0 Extended,SWMWE8,SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only.,0
version2/data/dictionary_harmonized_categories.csv ADDED
@@ -0,0 +1,571 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,variable_codename_use,variable_description_use,harmonized_categories,harmonized_categories_description,in_dataset
2
+ 1,DMDBORN4,In what country {were you/was SP} born?,1,"Born in 50 US states or Washington, DC",Demographics
3
+ 2,DMDBORN4,In what country {were you/was SP} born?,2,Others,Demographics
4
+ 3,DMDBORN4,In what country {were you/was SP} born?,77,Refused,Demographics
5
+ 4,DMDBORN4,In what country {were you/was SP} born?,99,Don't Know,Demographics
6
+ 5,DMDEDUC2,Education level - Adults 20+,1,Less Than 9th Grade,Demographics
7
+ 6,DMDEDUC2,Education level - Adults 20+,2,9-11th Grade (Includes 12th grade with no diploma),Demographics
8
+ 7,DMDEDUC2,Education level - Adults 20+,3,High School Grad/GED or Equivalent,Demographics
9
+ 8,DMDEDUC2,Education level - Adults 20+,4,Some College or AA degree,Demographics
10
+ 9,DMDEDUC2,Education level - Adults 20+,5,College Graduate or above,Demographics
11
+ 10,DMDEDUC2,Education level - Adults 20+,7,Refused,Demographics
12
+ 11,DMDEDUC2,Education level - Adults 20+,9,Don't know,Demographics
13
+ 12,DMDEDUC3,Education level - Children/Youth 6-19,0,Never Attended / Kindergarten Only,Demographics
14
+ 13,DMDEDUC3,Education level - Children/Youth 6-19,1,1st Grade,Demographics
15
+ 14,DMDEDUC3,Education level - Children/Youth 6-19,2,2nd Grade,Demographics
16
+ 15,DMDEDUC3,Education level - Children/Youth 6-19,3,3rd Grade,Demographics
17
+ 16,DMDEDUC3,Education level - Children/Youth 6-19,4,4th Grade,Demographics
18
+ 17,DMDEDUC3,Education level - Children/Youth 6-19,5,5th Grade,Demographics
19
+ 18,DMDEDUC3,Education level - Children/Youth 6-19,6,6th Grade,Demographics
20
+ 19,DMDEDUC3,Education level - Children/Youth 6-19,7,7th Grade,Demographics
21
+ 20,DMDEDUC3,Education level - Children/Youth 6-19,8,8th Grade,Demographics
22
+ 21,DMDEDUC3,Education level - Children/Youth 6-19,9,9th Grade,Demographics
23
+ 22,DMDEDUC3,Education level - Children/Youth 6-19,10,10th Grade,Demographics
24
+ 23,DMDEDUC3,Education level - Children/Youth 6-19,11,11th Grade,Demographics
25
+ 24,DMDEDUC3,Education level - Children/Youth 6-19,12,"12th Grade, No Diploma",Demographics
26
+ 25,DMDEDUC3,Education level - Children/Youth 6-19,13,High School Graduate,Demographics
27
+ 26,DMDEDUC3,Education level - Children/Youth 6-19,14,GED or Equivalent,Demographics
28
+ 27,DMDEDUC3,Education level - Children/Youth 6-19,15,More than high school,Demographics
29
+ 28,DMDEDUC3,Education level - Children/Youth 6-19,55,Less Than 5th Grade,Demographics
30
+ 29,DMDEDUC3,Education level - Children/Youth 6-19,66,Less Than 9th Grade,Demographics
31
+ 30,DMDEDUC3,Education level - Children/Youth 6-19,77,Refused,Demographics
32
+ 31,DMDEDUC3,Education level - Children/Youth 6-19,99,Don't know,Demographics
33
+ 32,DMDFMSIZ,Total number of people in the Family,1,1,Demographics
34
+ 33,DMDFMSIZ,Total number of people in the Family,2,2,Demographics
35
+ 34,DMDFMSIZ,Total number of people in the Family,3,3,Demographics
36
+ 35,DMDFMSIZ,Total number of people in the Family,4,4,Demographics
37
+ 36,DMDFMSIZ,Total number of people in the Family,5,5,Demographics
38
+ 37,DMDFMSIZ,Total number of people in the Family,6,6,Demographics
39
+ 38,DMDFMSIZ,Total number of people in the Family,7,7 or more people in the Family,Demographics
40
+ 39,DMDHHSIZ,Total number of people in the Household,1,1,Demographics
41
+ 40,DMDHHSIZ,Total number of people in the Household,2,2,Demographics
42
+ 41,DMDHHSIZ,Total number of people in the Household,3,3,Demographics
43
+ 42,DMDHHSIZ,Total number of people in the Household,4,4,Demographics
44
+ 43,DMDHHSIZ,Total number of people in the Household,5,5,Demographics
45
+ 44,DMDHHSIZ,Total number of people in the Household,6,6,Demographics
46
+ 45,DMDHHSIZ,Total number of people in the Household,7,7 or more people in the Household,Demographics
47
+ 46,DMDHRAGE,Age in years of the household reference person at the time of HH screening.,1,<20 years,Demographics
48
+ 47,DMDHRAGE,Age in years of the household reference person at the time of HH screening.,2,20-39 years,Demographics
49
+ 48,DMDHRAGE,Age in years of the household reference person at the time of HH screening.,3,40-59 years,Demographics
50
+ 49,DMDHRAGE,Age in years of the household reference person at the time of HH screening.,4,60+ years,Demographics
51
+ 50,DMDYRSUS,Length of time the participant has been in the US.,1,Less than 1 year,Demographics
52
+ 51,DMDYRSUS,Length of time the participant has been in the US.,2,"1 yr., less than 5 yrs.",Demographics
53
+ 52,DMDYRSUS,Length of time the participant has been in the US.,3,"5 yrs., less than 10 yrs.",Demographics
54
+ 53,DMDYRSUS,Length of time the participant has been in the US.,4,"10 yrs., less than 15 yrs.",Demographics
55
+ 54,DMDHRBR4,HH reference person's country of birth,1,"Born in 50 US states or Washington, DC",Demographics
56
+ 55,DMDHRBR4,HH reference person's country of birth,2,Others,Demographics
57
+ 56,DMDHRBR4,HH reference person's country of birth,77,Refused,Demographics
58
+ 57,DMDHRBR4,HH reference person's country of birth,99,Don't Know,Demographics
59
+ 58,DMDHREDU,HH reference person's education level,1,Less than high school degree,Demographics
60
+ 59,DMDHREDU,HH reference person's education level,2,High school grad/GED or some college/AA degree,Demographics
61
+ 60,DMDHREDU,HH reference person's education level,3,College graduate or above,Demographics
62
+ 61,DMDHREDU,HH reference person's education level,7,Refused,Demographics
63
+ 62,DMDHREDU,HH reference person's education level,9,Don't know,Demographics
64
+ 63,DMDHREDU,HH reference person's education level,9,Don't Know,Demographics
65
+ 64,DMDHREDU,HH reference person's education level,3,High school grad/GED or some college/AA degree,Demographics
66
+ 65,DMDHRGND,Gender of the household reference person,1,Male,Demographics
67
+ 66,DMDHRGND,Gender of the household reference person,2,Female,Demographics
68
+ 67,DMDHRMAR,Marital Status of household reference person,1,Married/Living with partner,Demographics
69
+ 68,DMDHRMAR,Marital Status of household reference person,2,Widowed/Divorced/Separated,Demographics
70
+ 69,DMDHRMAR,Marital Status of household reference person,3,Never Married,Demographics
71
+ 70,DMDHRMAR,Marital Status of household reference person,77,Refused,Demographics
72
+ 71,DMDHRMAR,Marital Status of household reference person,99,Don't Know,Demographics
73
+ 72,DMDHSEDU,HH reference person's spouse's education level,1,Less than high school degree,Demographics
74
+ 73,DMDHSEDU,HH reference person's spouse's education level,2,High school grad/GED or some college/AA degree,Demographics
75
+ 74,DMDHSEDU,HH reference person's spouse's education level,3,College graduate or above,Demographics
76
+ 75,DMDHSEDU,HH reference person's spouse's education level,7,Refused,Demographics
77
+ 76,DMDHSEDU,HH reference person's spouse's education level,9,Don't Know,Demographics
78
+ 77,DMDMARTL,Marital status,1,Married,Demographics
79
+ 78,DMDMARTL,Marital status,2,Widowed,Demographics
80
+ 79,DMDMARTL,Marital status,3,Divorced,Demographics
81
+ 80,DMDMARTL,Marital status,4,Separated,Demographics
82
+ 81,DMDMARTL,Marital status,5,Never married,Demographics
83
+ 82,DMDMARTL,Marital status,6,Living with partner,Demographics
84
+ 83,DMDMARTL,Marital status,77,Refused,Demographics
85
+ 84,DMDMARTL,Marital status,99,Don't know,Demographics
86
+ 85,DMDYRSUS,Length of time the participant has been in the US.,5,"15 yrs., less than 20 yrs.",Demographics
87
+ 86,DMDYRSUS,Length of time the participant has been in the US.,6,"20 yrs., less than 30 yrs.",Demographics
88
+ 87,DMDYRSUS,Length of time the participant has been in the US.,7,"30 yrs., less than 40 yrs.",Demographics
89
+ 88,DMDYRSUS,Length of time the participant has been in the US.,8,"40 yrs., less than 50 yrs.",Demographics
90
+ 89,DMDYRSUS,Length of time the participant has been in the US.,9,50 years or more,Demographics
91
+ 90,DMDYRSUS,Length of time the participant has been in the US.,77,Refused,Demographics
92
+ 91,DMDYRSUS,Length of time the participant has been in the US.,99,Don't know,Demographics
93
+ 92,FIALANG,Language of the Family Interview Instrument,1,English,Demographics
94
+ 93,FIALANG,Language of the Family Interview Instrument,2,Spanish,Demographics
95
+ 94,FIALANG,Language of the Family Interview Instrument,3,Other,Demographics
96
+ 95,INDFMIN2,Total family income (reported as a range value in dollars),1,"$ 0 to $ 4,999",Demographics
97
+ 96,INDFMIN2,Total family income (reported as a range value in dollars),2,"$ 5,000 to $ 9,999",Demographics
98
+ 97,INDFMIN2,Total family income (reported as a range value in dollars),3,"$10,000 to $14,999",Demographics
99
+ 98,INDFMIN2,Total family income (reported as a range value in dollars),4,"$15,000 to $19,999",Demographics
100
+ 99,INDFMIN2,Total family income (reported as a range value in dollars),5,"$20,000 to $24,999",Demographics
101
+ 100,INDFMIN2,Total family income (reported as a range value in dollars),6,"$25,000 to $34,999",Demographics
102
+ 101,INDFMIN2,Total family income (reported as a range value in dollars),7,"$35,000 to $44,999",Demographics
103
+ 102,INDFMIN2,Total family income (reported as a range value in dollars),8,"$45,000 to $54,999",Demographics
104
+ 103,INDFMIN2,Total family income (reported as a range value in dollars),16,"$50,000 and over",Demographics
105
+ 104,INDFMIN2,Total family income (reported as a range value in dollars),99,Don't know,Demographics
106
+ 105,INDFMIN2,Total family income (reported as a range value in dollars),9,"$55,000 to $64,999",Demographics
107
+ 106,INDFMIN2,Total family income (reported as a range value in dollars),10,"$65,000 to $74,999",Demographics
108
+ 107,INDFMIN2,Total family income (reported as a range value in dollars),12,"$20,000 and Over",Demographics
109
+ 108,INDFMIN2,Total family income (reported as a range value in dollars),13,"Under $20,000",Demographics
110
+ 109,INDFMIN2,Total family income (reported as a range value in dollars),14,"$75,000 to $99,999",Demographics
111
+ 110,INDFMIN2,Total family income (reported as a range value in dollars),15,"$100,000 and Over",Demographics
112
+ 111,INDFMIN2,Total family income (reported as a range value in dollars),77,Refused,Demographics
113
+ 112,INDFMIN2,Total family income (reported as a range value in dollars),11,"$75,000 and Over",Demographics
114
+ 113,RIDRETH1,Recode of reported race and Hispanic origin information,1,Mexican American,Demographics
115
+ 114,RIDRETH1,Recode of reported race and Hispanic origin information,3,Non-Hispanic White,Demographics
116
+ 115,RIDRETH1,Recode of reported race and Hispanic origin information,4,Non-Hispanic Black,Demographics
117
+ 116,RIDRETH1,Recode of reported race and Hispanic origin information,5,Other Race - Including Multi-Racial,Demographics
118
+ 117,RIDRETH1,Recode of reported race and Hispanic origin information,2,Other Hispanic,Demographics
119
+ 118,RIDSTATR,Interview and Examination Status of the Sample Person.,1,Interviewed Only,Demographics
120
+ 119,RIDSTATR,Interview and Examination Status of the Sample Person.,2,Both Interviewed and MEC examined,Demographics
121
+ 120,MCD180B,Age when told you had congestive heart failure,16,16 years or younger,Questionnaire
122
+ 121,MCD180B,Age when told you had congestive heart failure,17-79,17-79 years old,Questionnaire
123
+ 122,MCD180B,Age when told you had congestive heart failure,17-84,17-84 years old,Questionnaire
124
+ 123,MCD180B,Age when told you had congestive heart failure,17-89,17-89 years old,Questionnaire
125
+ 124,MCD180B,Age when told you had congestive heart failure,18-79,18-79 years old,Questionnaire
126
+ 125,MCD180B,Age when told you had congestive heart failure,80,80 years or older,Questionnaire
127
+ 126,MCD180B,Age when told you had congestive heart failure,85,85 years or older,Questionnaire
128
+ 127,MCD180B,Age when told you had congestive heart failure,90,90 + years,Questionnaire
129
+ 128,MCD180B,Age when told you had congestive heart failure,99999,Don't know,Questionnaire
130
+ 129,MCD180B,Age when told you had congestive heart failure,77777,Refused,Questionnaire
131
+ 130,MCD180C,Age when told had coronary heart disease,16,16 years or younger,Questionnaire
132
+ 131,MCD180C,Age when told had coronary heart disease,17-79,17-79 years old,Questionnaire
133
+ 132,MCD180C,Age when told had coronary heart disease,17-84,17-84 years old,Questionnaire
134
+ 133,MCD180C,Age when told had coronary heart disease,20-79,20-79 years old,Questionnaire
135
+ 134,MCD180C,Age when told had coronary heart disease,80,80 years or older,Questionnaire
136
+ 135,MCD180C,Age when told had coronary heart disease,85,85 years or older,Questionnaire
137
+ 136,MCD180C,Age when told had coronary heart disease,99999,Don't know,Questionnaire
138
+ 137,MCD180C,Age when told had coronary heart disease,77777,Refused,Questionnaire
139
+ 138,MCD180D,Age when told you had angina pectoris,16,16 years or younger,Questionnaire
140
+ 139,MCD180D,Age when told you had angina pectoris,17-84,17-84 years old,Questionnaire
141
+ 140,MCD180D,Age when told you had angina pectoris,85,85 years or older,Questionnaire
142
+ 141,MCD180D,Age when told you had angina pectoris,99999,Don't know,Questionnaire
143
+ 142,MCD180D,Age when told you had angina pectoris,77777,Refused,Questionnaire
144
+ 143,MCD180D,Age when told you had angina pectoris,17-79,17-79 years old,Questionnaire
145
+ 144,MCD180D,Age when told you had angina pectoris,20-79,20-79 years old,Questionnaire
146
+ 145,MCD180D,Age when told you had angina pectoris,80,80 years or older,Questionnaire
147
+ 146,MCD180E,Age when told you had heart attack,16,16 years or younger,Questionnaire
148
+ 147,MCD180E,Age when told you had heart attack,17-79,17-79 years old,Questionnaire
149
+ 148,MCD180E,Age when told you had heart attack,17-84,17-84 years old,Questionnaire
150
+ 149,MCD180E,Age when told you had heart attack,17-88,17-88 years old,Questionnaire
151
+ 150,MCD180E,Age when told you had heart attack,19-79,19-79 years old,Questionnaire
152
+ 151,MCD180E,Age when told you had heart attack,80,80 years or older,Questionnaire
153
+ 152,MCD180E,Age when told you had heart attack,85,85 years or older,Questionnaire
154
+ 153,MCD180E,Age when told you had heart attack,90,90 + years,Questionnaire
155
+ 154,MCD180E,Age when told you had heart attack,99999,Don't know,Questionnaire
156
+ 155,MCD180E,Age when told you had heart attack,77777,Refused,Questionnaire
157
+ 156,MCD180F,Age when told you had a stroke,16,16 years or younger,Questionnaire
158
+ 157,MCD180F,Age when told you had a stroke,17-79,17-79 years old,Questionnaire
159
+ 158,MCD180F,Age when told you had a stroke,17-84,17-84 years old,Questionnaire
160
+ 159,MCD180F,Age when told you had a stroke,17-89,17-89 years old,Questionnaire
161
+ 160,MCD180F,Age when told you had a stroke,80,80 years or older,Questionnaire
162
+ 161,MCD180F,Age when told you had a stroke,85,85 years or older,Questionnaire
163
+ 162,MCD180F,Age when told you had a stroke,90,90 + years,Questionnaire
164
+ 163,MCD180F,Age when told you had a stroke,99999,Don't know,Questionnaire
165
+ 164,MCD180F,Age when told you had a stroke,77777,Refused,Questionnaire
166
+ 165,MCD180G,Age when told you had emphysema,16,16 years or younger,Questionnaire
167
+ 166,MCD180G,Age when told you had emphysema,17-79,17-79 years old,Questionnaire
168
+ 167,MCD180G,Age when told you had emphysema,17-84,17-84 years old,Questionnaire
169
+ 168,MCD180G,Age when told you had emphysema,17-89,17-89 years old,Questionnaire
170
+ 169,MCD180G,Age when told you had emphysema,80,80 years or older,Questionnaire
171
+ 170,MCD180G,Age when told you had emphysema,85,85 years or older,Questionnaire
172
+ 171,MCD180G,Age when told you had emphysema,90,90 + years,Questionnaire
173
+ 172,MCD180G,Age when told you had emphysema,99999,Don't know,Questionnaire
174
+ 173,MCD180G,Age when told you had emphysema,77777,Refused,Questionnaire
175
+ 174,MCD180K,Age when told you had chronic bronchitis,16,16 years or younger,Questionnaire
176
+ 175,MCD180K,Age when told you had chronic bronchitis,17-79,17-79 years old,Questionnaire
177
+ 176,MCD180K,Age when told you had chronic bronchitis,17-83,17-83 years old,Questionnaire
178
+ 177,MCD180K,Age when told you had chronic bronchitis,17-89,17-89 years old,Questionnaire
179
+ 178,MCD180K,Age when told you had chronic bronchitis,80,80 years or older,Questionnaire
180
+ 179,MCD180K,Age when told you had chronic bronchitis,85,85 years or older,Questionnaire
181
+ 180,MCD180K,Age when told you had chronic bronchitis,90,90 + years,Questionnaire
182
+ 181,MCD180K,Age when told you had chronic bronchitis,99999,Don't know,Questionnaire
183
+ 182,MCD180K,Age when told you had chronic bronchitis,77777,Refused,Questionnaire
184
+ 183,MCD180L,Age when told you had a liver condition,16,16 years or younger,Questionnaire
185
+ 184,MCD180L,Age when told you had a liver condition,17-78,17-78 years old,Questionnaire
186
+ 185,MCD180L,Age when told you had a liver condition,17-79,17-79 years old,Questionnaire
187
+ 186,MCD180L,Age when told you had a liver condition,17-83,17-83 years old,Questionnaire
188
+ 187,MCD180L,Age when told you had a liver condition,80,80 years or older,Questionnaire
189
+ 188,MCD180L,Age when told you had a liver condition,85,85 years or older,Questionnaire
190
+ 189,MCD180L,Age when told you had a liver condition,99999,Don't know,Questionnaire
191
+ 190,MCD180L,Age when told you had a liver condition,77777,Refused,Questionnaire
192
+ 191,MCQ180H,Age when told you had a goiter,16,16 years or younger,Questionnaire
193
+ 192,MCQ180H,Age when told you had a goiter,17-84,17-84 years old,Questionnaire
194
+ 193,MCQ180H,Age when told you had a goiter,90,90 + years,Questionnaire
195
+ 194,MCQ180H,Age when told you had a goiter,99999,Don't know,Questionnaire
196
+ 195,MCD180M,Age when told you had thyroid problem,17-89,17-89 years old,Questionnaire
197
+ 196,MCD180M,Age when told you had thyroid problem,16,16 years or younger,Questionnaire
198
+ 197,MCD180M,Age when told you had thyroid problem,99999,Don't know,Questionnaire
199
+ 198,MCD180M,Age when told you had thyroid problem,17-84,17-84 years old,Questionnaire
200
+ 199,MCD180M,Age when told you had thyroid problem,80,80 years or older,Questionnaire
201
+ 200,MCD180M,Age when told you had thyroid problem,85,85 years or older,Questionnaire
202
+ 201,MCD180M,Age when told you had thyroid problem,77777,Refused,Questionnaire
203
+ 202,MCD180M,Age when told you had thyroid problem,17-79,17-79 years old,Questionnaire
204
+ 203,MCD180N,Age when told you had gout,16,16 years or younger,Questionnaire
205
+ 204,MCD180N,Age when told you had gout,17-79,17-79 years old,Questionnaire
206
+ 205,MCD180N,Age when told you had gout,17-86,17-86 years old,Questionnaire
207
+ 206,MCD180N,Age when told you had gout,80,80 years or older,Questionnaire
208
+ 207,MCD180N,Age when told you had gout,99999,Don't know,Questionnaire
209
+ 208,MCD180N,Age when told you had gout,77777,Refused,Questionnaire
210
+ 209,MCQ025,Age when first had asthma,Jan-19,1-19 years old,Questionnaire
211
+ 210,MCQ025,Age when first had asthma,Jan-79,1-79 years old,Questionnaire
212
+ 211,MCQ025,Age when first had asthma,Jan-84,1-84 years old,Questionnaire
213
+ 212,MCQ025,Age when first had asthma,Jan-88,1-88 years old,Questionnaire
214
+ 213,MCQ025,Age when first had asthma,80,80 years or older,Questionnaire
215
+ 214,MCQ025,Age when first had asthma,85,85 years or older,Questionnaire
216
+ 215,MCQ025,Age when first had asthma,99999,Don't know,Questionnaire
217
+ 216,MCQ025,Age when first had asthma,1,Less than 1 year,Questionnaire
218
+ 217,MCQ025,Age when first had asthma,77777,Refused,Questionnaire
219
+ 218,MCD180A,Age when told you had arthritis,16,16 years or younger,Questionnaire
220
+ 219,MCD180A,Age when told you had arthritis,17-89,17-89 years old,Questionnaire
221
+ 220,MCD180A,Age when told you had arthritis,90,90 + years,Questionnaire
222
+ 221,MCD180A,Age when told you had arthritis,99999,Don't know,Questionnaire
223
+ 222,MCD180A,Age when told you had arthritis,17-79,17-79 years old,Questionnaire
224
+ 223,MCD180A,Age when told you had arthritis,80,80 years or older,Questionnaire
225
+ 224,MCD180A,Age when told you had arthritis,77777,Refused,Questionnaire
226
+ 225,MCD180A,Age when told you had arthritis,17-84,17-84 years old,Questionnaire
227
+ 226,MCD180A,Age when told you had arthritis,85,85 years or older,Questionnaire
228
+ 227,MCQ180H,Age when told you had a goiter,17-72,17-72 years old,Questionnaire
229
+ 228,MCQ180H,Age when told you had a goiter,85,85 years or older,Questionnaire
230
+ 229,MCQ180H,Age when told you had a goiter,77777,Refused,Questionnaire
231
+ 230,MCQ195,Which type of arthritis was it,9,Don't know,Questionnaire
232
+ 231,MCQ195,Which type of arthritis was it,2,Osteoarthritis or degenerative arthritis,Questionnaire
233
+ 232,MCQ195,Which type of arthritis was it,4,Other,Questionnaire
234
+ 233,MCQ195,Which type of arthritis was it,3,Psoriatic arthritis,Questionnaire
235
+ 234,MCQ195,Which type of arthritis was it,7,Refused,Questionnaire
236
+ 235,MCQ195,Which type of arthritis was it,1,Rheumatoid arthritis,Questionnaire
237
+ 236,MCQ240A,Age when bladder cancer first diagnosed,17-78,17-78 years old,Questionnaire
238
+ 237,MCQ240A,Age when bladder cancer first diagnosed,17-83,17-83 years old,Questionnaire
239
+ 238,MCQ240A,Age when bladder cancer first diagnosed,16,16 years or younger,Questionnaire
240
+ 239,MCQ240A,Age when bladder cancer first diagnosed,80,80 years or older,Questionnaire
241
+ 240,MCQ240A,Age when bladder cancer first diagnosed,85,85 years or older,Questionnaire
242
+ 241,MCQ240A,Age when bladder cancer first diagnosed,99999,Don't know,Questionnaire
243
+ 242,MCQ240A,Age when bladder cancer first diagnosed,77777,Refused,Questionnaire
244
+ 243,MCQ240B,Age when blood cancer was first diagnosed,16,16 years or younger,Questionnaire
245
+ 244,MCQ240B,Age when blood cancer was first diagnosed,17-66,17-66 years old,Questionnaire
246
+ 245,MCQ240B,Age when blood cancer was first diagnosed,17-70,17-70 years old,Questionnaire
247
+ 246,MCQ240B,Age when blood cancer was first diagnosed,80,80 years or older,Questionnaire
248
+ 247,MCQ240B,Age when blood cancer was first diagnosed,85,85 years or older,Questionnaire
249
+ 248,MCQ240B,Age when blood cancer was first diagnosed,99999,Don't know,Questionnaire
250
+ 249,MCQ240B,Age when blood cancer was first diagnosed,77777,Refused,Questionnaire
251
+ 250,MCQ240C,Age when bone cancer was first diagnosed,17-77,17-77 years old,Questionnaire
252
+ 251,MCQ240C,Age when bone cancer was first diagnosed,17-84,17-84 years old,Questionnaire
253
+ 252,MCQ240C,Age when bone cancer was first diagnosed,16,16 years or younger,Questionnaire
254
+ 253,MCQ240C,Age when bone cancer was first diagnosed,55-76,55-76 years old,Questionnaire
255
+ 254,MCQ240C,Age when bone cancer was first diagnosed,80,80 years or older,Questionnaire
256
+ 255,MCQ240C,Age when bone cancer was first diagnosed,85,85 years or older,Questionnaire
257
+ 256,MCQ240C,Age when bone cancer was first diagnosed,99999,Don't know,Questionnaire
258
+ 257,MCQ240C,Age when bone cancer was first diagnosed,77777,Refused,Questionnaire
259
+ 258,MCQ240CC,Age when uterine cancer was first diagnosed,16,16 years or younger,Questionnaire
260
+ 259,MCQ240CC,Age when uterine cancer was first diagnosed,17-77,17-77 years old,Questionnaire
261
+ 260,MCQ240CC,Age when uterine cancer was first diagnosed,17-84,17-84 years old,Questionnaire
262
+ 261,MCQ240CC,Age when uterine cancer was first diagnosed,20-72,20-72 years old,Questionnaire
263
+ 262,MCQ240CC,Age when uterine cancer was first diagnosed,80,80 years or older,Questionnaire
264
+ 263,MCQ240CC,Age when uterine cancer was first diagnosed,85,85 years or older,Questionnaire
265
+ 264,MCQ240CC,Age when uterine cancer was first diagnosed,99999,Don't know,Questionnaire
266
+ 265,MCQ240CC,Age when uterine cancer was first diagnosed,77777,Refused,Questionnaire
267
+ 266,MCQ240D,Age when brain cancer was first diagnosed,16,16 years or younger,Questionnaire
268
+ 267,MCQ240D,Age when brain cancer was first diagnosed,17-73,17-73 years old,Questionnaire
269
+ 268,MCQ240D,Age when brain cancer was first diagnosed,17-75,17-75 years old,Questionnaire
270
+ 269,MCQ240D,Age when brain cancer was first diagnosed,80,80 years or older,Questionnaire
271
+ 270,MCQ240D,Age when brain cancer was first diagnosed,85,85 years or older,Questionnaire
272
+ 271,MCQ240D,Age when brain cancer was first diagnosed,99999,Don't know,Questionnaire
273
+ 272,MCQ240D,Age when brain cancer was first diagnosed,77777,Refused,Questionnaire
274
+ 273,MCQ240DD,Age when some other type of cancer was first diagnosed,16,16 years or younger,Questionnaire
275
+ 274,MCQ240DD,Age when some other type of cancer was first diagnosed,17-77,17-77 years old,Questionnaire
276
+ 275,MCQ240DD,Age when some other type of cancer was first diagnosed,17-78,17-78 years old,Questionnaire
277
+ 276,MCQ240DD,Age when some other type of cancer was first diagnosed,17-83,17-83 years old,Questionnaire
278
+ 277,MCQ240DD,Age when some other type of cancer was first diagnosed,80,80 years or older,Questionnaire
279
+ 278,MCQ240DD,Age when some other type of cancer was first diagnosed,85,85 years or older,Questionnaire
280
+ 279,MCQ240DD,Age when some other type of cancer was first diagnosed,99999,Don't know,Questionnaire
281
+ 280,MCQ240DD,Age when some other type of cancer was first diagnosed,77777,Refused,Questionnaire
282
+ 281,MCQ240DK,Age when cancer was first diagnosed,20-80,20-80 years old,Questionnaire
283
+ 282,MCQ240DK,Age when cancer was first diagnosed,23-47,23-47 years old,Questionnaire
284
+ 283,MCQ240DK,Age when cancer was first diagnosed,80,80 years or older,Questionnaire
285
+ 284,MCQ240DK,Age when cancer was first diagnosed,85,85 years or older,Questionnaire
286
+ 285,MCQ240DK,Age when cancer was first diagnosed,99999,Don't know,Questionnaire
287
+ 286,MCQ240DK,Age when cancer was first diagnosed,77777,Refused,Questionnaire
288
+ 287,MCQ240E,Age when breast cancer was first diagnosed,16,16 years or younger,Questionnaire
289
+ 288,MCQ240E,Age when breast cancer was first diagnosed,17-78,17-78 years old,Questionnaire
290
+ 289,MCQ240E,Age when breast cancer was first diagnosed,17-79,17-79 years old,Questionnaire
291
+ 290,MCQ240E,Age when breast cancer was first diagnosed,17-84,17-84 years old,Questionnaire
292
+ 291,MCQ240E,Age when breast cancer was first diagnosed,80,80 years or older,Questionnaire
293
+ 292,MCQ240E,Age when breast cancer was first diagnosed,85,85 years or older,Questionnaire
294
+ 293,MCQ240E,Age when breast cancer was first diagnosed,99999,Don't know,Questionnaire
295
+ 294,MCQ240E,Age when breast cancer was first diagnosed,77777,Refused,Questionnaire
296
+ 295,MCQ240F,Age when cervical cancer was first diagnosed,16,16 years or younger,Questionnaire
297
+ 296,MCQ240F,Age when cervical cancer was first diagnosed,17-65,17-65 years old,Questionnaire
298
+ 297,MCQ240F,Age when cervical cancer was first diagnosed,17-73,17-73 years old,Questionnaire
299
+ 298,MCQ240F,Age when cervical cancer was first diagnosed,80,80 years or older,Questionnaire
300
+ 299,MCQ240F,Age when cervical cancer was first diagnosed,85,85 years or older,Questionnaire
301
+ 300,MCQ240F,Age when cervical cancer was first diagnosed,99999,Don't know,Questionnaire
302
+ 301,MCQ240F,Age when cervical cancer was first diagnosed,77777,Refused,Questionnaire
303
+ 302,MCQ240G,Age when colon cancer was first diagnosed,16,16 years or younger,Questionnaire
304
+ 303,MCQ240G,Age when colon cancer was first diagnosed,17-79,17-79 years old,Questionnaire
305
+ 304,MCQ240G,Age when colon cancer was first diagnosed,17-84,17-84 years old,Questionnaire
306
+ 305,MCQ240G,Age when colon cancer was first diagnosed,21-79,21-79 years old,Questionnaire
307
+ 306,MCQ240G,Age when colon cancer was first diagnosed,80,80 years or older,Questionnaire
308
+ 307,MCQ240G,Age when colon cancer was first diagnosed,85,85 years or older,Questionnaire
309
+ 308,MCQ240G,Age when colon cancer was first diagnosed,99999,Don't know,Questionnaire
310
+ 309,MCQ240G,Age when colon cancer was first diagnosed,77777,Refused,Questionnaire
311
+ 310,MCQ240L,Age when leukemia was first diagnosed,17-70,17-70 years old,Questionnaire
312
+ 311,MCQ240L,Age when leukemia was first diagnosed,17-75,17-75 years old,Questionnaire
313
+ 312,MCQ240L,Age when leukemia was first diagnosed,28-84,28-84 years old,Questionnaire
314
+ 313,MCQ240L,Age when leukemia was first diagnosed,16,16 years or younger,Questionnaire
315
+ 314,MCQ240L,Age when leukemia was first diagnosed,80,80 years or older,Questionnaire
316
+ 315,MCQ240L,Age when leukemia was first diagnosed,85,85 years or older,Questionnaire
317
+ 316,MCQ240L,Age when leukemia was first diagnosed,99999,Don't know,Questionnaire
318
+ 317,MCQ240L,Age when leukemia was first diagnosed,77777,Refused,Questionnaire
319
+ 318,MCQ240N,Age when lung cancer was first diagnosed,16,16 years or younger,Questionnaire
320
+ 319,MCQ240N,Age when lung cancer was first diagnosed,17-76,17-76 years old,Questionnaire
321
+ 320,MCQ240N,Age when lung cancer was first diagnosed,17-84,17-84 years old,Questionnaire
322
+ 321,MCQ240N,Age when lung cancer was first diagnosed,29-79,29-79 years old,Questionnaire
323
+ 322,MCQ240N,Age when lung cancer was first diagnosed,80,80 years or older,Questionnaire
324
+ 323,MCQ240N,Age when lung cancer was first diagnosed,85,85 years or older,Questionnaire
325
+ 324,MCQ240N,Age when lung cancer was first diagnosed,99999,Don't know,Questionnaire
326
+ 325,MCQ240N,Age when lung cancer was first diagnosed,77777,Refused,Questionnaire
327
+ 326,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,16,16 years or younger,Questionnaire
328
+ 327,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,17-76,17-76 years old,Questionnaire
329
+ 328,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,17-80,17-80 years old,Questionnaire
330
+ 329,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,19-79,19-79 years old,Questionnaire
331
+ 330,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,80,80 years or older,Questionnaire
332
+ 331,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,85,85 years or older,Questionnaire
333
+ 332,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,99999,Don't know,Questionnaire
334
+ 333,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,77777,Refused,Questionnaire
335
+ 334,MCQ240P,Age when melanoma was first diagnosed,16,16 years or younger,Questionnaire
336
+ 335,MCQ240P,Age when melanoma was first diagnosed,17-78,17-78 years old,Questionnaire
337
+ 336,MCQ240P,Age when melanoma was first diagnosed,17-79,17-79 years old,Questionnaire
338
+ 337,MCQ240P,Age when melanoma was first diagnosed,17-83,17-83 years old,Questionnaire
339
+ 338,MCQ240P,Age when melanoma was first diagnosed,80,80 years or older,Questionnaire
340
+ 339,MCQ240P,Age when melanoma was first diagnosed,85,85 years or older,Questionnaire
341
+ 340,MCQ240P,Age when melanoma was first diagnosed,99999,Don't know,Questionnaire
342
+ 341,MCQ240P,Age when melanoma was first diagnosed,77777,Refused,Questionnaire
343
+ 342,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",16,16 years or younger,Questionnaire
344
+ 343,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",17-79,17-79 years old,Questionnaire
345
+ 344,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",27-70,27-70 years old,Questionnaire
346
+ 345,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",30-70,30-70 years old,Questionnaire
347
+ 346,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",80,80 years or older,Questionnaire
348
+ 347,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",85,85 years or older,Questionnaire
349
+ 348,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",99999,Don't know,Questionnaire
350
+ 349,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",77777,Refused,Questionnaire
351
+ 350,MCQ240U,Age when prostate cancer was first diagnosed,17-79,17-79 years old,Questionnaire
352
+ 351,MCQ240U,Age when prostate cancer was first diagnosed,17-84,17-84 years old,Questionnaire
353
+ 352,MCQ240U,Age when prostate cancer was first diagnosed,16,16 years or younger,Questionnaire
354
+ 353,MCQ240U,Age when prostate cancer was first diagnosed,32-79,32-79 years old,Questionnaire
355
+ 354,MCQ240U,Age when prostate cancer was first diagnosed,80,80 years or older,Questionnaire
356
+ 355,MCQ240U,Age when prostate cancer was first diagnosed,85,85 years or older,Questionnaire
357
+ 356,MCQ240U,Age when prostate cancer was first diagnosed,99999,Don't know,Questionnaire
358
+ 357,MCQ240U,Age when prostate cancer was first diagnosed,77777,Refused,Questionnaire
359
+ 358,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,16,16 years or younger,Questionnaire
360
+ 359,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,17-78,17-78 years old,Questionnaire
361
+ 360,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,17-79,17-79 years old,Questionnaire
362
+ 361,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,17-84,17-84 years old,Questionnaire
363
+ 362,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,80,80 years or older,Questionnaire
364
+ 363,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,85,85 years or older,Questionnaire
365
+ 364,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,99999,Don't know,Questionnaire
366
+ 365,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,77777,Refused,Questionnaire
367
+ 366,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,16,16 years or younger,Questionnaire
368
+ 367,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,17-79,17-79 years old,Questionnaire
369
+ 368,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,17-84,17-84 years old,Questionnaire
370
+ 369,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,18-79,18-79 years old,Questionnaire
371
+ 370,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,80,80 years or older,Questionnaire
372
+ 371,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,85,85 years or older,Questionnaire
373
+ 372,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,99999,Don't know,Questionnaire
374
+ 373,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,77777,Refused,Questionnaire
375
+ 374,MCQ240Z,Age when stomach cancer was first diagnosed,16,16 years or younger,Questionnaire
376
+ 375,MCQ240Z,Age when stomach cancer was first diagnosed,17-79,17-79 years old,Questionnaire
377
+ 376,MCQ240Z,Age when stomach cancer was first diagnosed,22-82,22-82 years old,Questionnaire
378
+ 377,MCQ240Z,Age when stomach cancer was first diagnosed,32-76,32-76 years old,Questionnaire
379
+ 378,MCQ240Z,Age when stomach cancer was first diagnosed,80,80 years or older,Questionnaire
380
+ 379,MCQ240Z,Age when stomach cancer was first diagnosed,85,85 years or older,Questionnaire
381
+ 380,MCQ240Z,Age when stomach cancer was first diagnosed,99999,Don't know,Questionnaire
382
+ 381,MCQ240Z,Age when stomach cancer was first diagnosed,77777,Refused,Questionnaire
383
+ 382,MCQ280,About how old was she when she fractured her hip (the first time)?,1-101,1-101 years old,Questionnaire
384
+ 383,MCQ280,About how old was she when she fractured her hip (the first time)?,555,50 +,Questionnaire
385
+ 384,MCQ280,About how old was she when she fractured her hip (the first time)?,9-107,9-107 years old,Questionnaire
386
+ 385,MCQ280,About how old was she when she fractured her hip (the first time)?,99999,Don't know,Questionnaire
387
+ 386,MCQ280,About how old was she when she fractured her hip (the first time)?,77777,Refused,Questionnaire
388
+ 387,MCQ280,About how old was she when she fractured her hip (the first time)?,444,Under 50,Questionnaire
389
+ 388,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,16,16 years or younger,Questionnaire
390
+ 389,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,17-79,17-79 years old,Questionnaire
391
+ 390,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,17-85,17-85 years old,Questionnaire
392
+ 391,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,80,80 years or older,Questionnaire
393
+ 392,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,999,Don't know,Questionnaire
394
+ 393,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,777,Refused,Questionnaire
395
+ 394,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,Nov-99,11-99 years old,Questionnaire
396
+ 395,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,15-79,15-79 years old,Questionnaire
397
+ 396,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,20-87,20-87 years old,Questionnaire
398
+ 397,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,80,80 years or older,Questionnaire
399
+ 398,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,90,90 + years,Questionnaire
400
+ 399,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,99999,Don't know,Questionnaire
401
+ 400,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,77777,Refused,Questionnaire
402
+ 401,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",Jan-95,1-95 times,Questionnaire
403
+ 402,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",Jan-99,1-99 times,Questionnaire
404
+ 403,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",100,100 +,Questionnaire
405
+ 404,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",99999,Don't know,Questionnaire
406
+ 405,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",77777,Refused,Questionnaire
407
+ 406,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",Jan-91,1-91 times,Questionnaire
408
+ 407,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",Jan-99,1-99 times,Questionnaire
409
+ 408,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",100,100 +,Questionnaire
410
+ 409,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",999,Don't know,Questionnaire
411
+ 410,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",777,Refused,Questionnaire
412
+ 411,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",Jan-91,1-91 times,Questionnaire
413
+ 412,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",Jan-99,1-99 times,Questionnaire
414
+ 413,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",100,100 +,Questionnaire
415
+ 414,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",99999,Don't know,Questionnaire
416
+ 415,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",77777,Refused,Questionnaire
417
+ 416,BMIWAIST,Waist Circumference Comment,1,Could not obtain,Response
418
+ 417,NA,NA,1,Breakfast,Dietary
419
+ 418,NA,NA,2,Brunch,Dietary
420
+ 419,NA,NA,3,Lunch,Dietary
421
+ 420,NA,NA,4,Snack/beverage,Dietary
422
+ 421,NA,NA,5,Dinner/supper,Dietary
423
+ 422,NA,NA,6,Infant feeding,Dietary
424
+ 423,NA,NA,7,Extended consumption,Dietary
425
+ 424,NA,NA,8,Other,Dietary
426
+ 425,NA,NA,9,Desayuno (Spanish),Dietary
427
+ 426,NA,NA,10,Almuerzo (Spanish),Dietary
428
+ 427,NA,NA,11,Comida (Spanish),Dietary
429
+ 428,NA,NA,12,Merienda (Spanish),Dietary
430
+ 429,NA,NA,13,Cena (Spanish),Dietary
431
+ 430,NA,NA,14,Entre comida/bebida (Spanish),Dietary
432
+ 431,NA,NA,15,Bocadillo (Spanish),Dietary
433
+ 432,NA,NA,16,Botana (Spanish),Dietary
434
+ 433,NA,NA,99,Don't know,Dietary
435
+ 434,NA,NA,2,Lunch,Dietary
436
+ 435,NA,NA,3,Dinner/supper,Dietary
437
+ 436,NA,NA,5,Brunch,Dietary
438
+ 437,NA,NA,6,Snack/beverage,Dietary
439
+ 438,NA,NA,8,Infant feeding,Dietary
440
+ 439,NA,NA,9,Extended consumption,Dietary
441
+ 440,NA,NA,10,Desayano (Spanish),Dietary
442
+ 441,NA,NA,11,Almuerzo (Spanish),Dietary
443
+ 442,NA,NA,12,Comida (Spanish),Dietary
444
+ 443,NA,NA,13,Merienda (Spanish),Dietary
445
+ 444,NA,NA,14,Cena (Spanish),Dietary
446
+ 445,NA,NA,15,Entre comida/bebida/tentempie (Spanish),Dietary
447
+ 446,NA,NA,17,Bocadillo (Spanish),Dietary
448
+ 447,NA,NA,91,Other,Dietary
449
+ 448,NA,NA,3,Dinner,Dietary
450
+ 449,NA,NA,4,Supper,Dietary
451
+ 450,NA,NA,6,Snack,Dietary
452
+ 451,NA,NA,7,Drink,Dietary
453
+ 452,NA,NA,10,Desayano (breakfast),Dietary
454
+ 453,NA,NA,11,Almuerzo (breakfast),Dietary
455
+ 454,NA,NA,12,Comida (lunch),Dietary
456
+ 455,NA,NA,13,Merienda (snack),Dietary
457
+ 456,NA,NA,14,Cena (dinner),Dietary
458
+ 457,NA,NA,15,Entre comida (snack),Dietary
459
+ 458,NA,NA,16,Botana (snack),Dietary
460
+ 459,NA,NA,17,Bocadillo (snack),Dietary
461
+ 460,NA,NA,18,Tentempie (snack),Dietary
462
+ 461,NA,NA,19,Bebida (drink),Dietary
463
+ 462,NA,NA,0,Non-combination food,Dietary
464
+ 463,NA,NA,90,Other mixtures,Dietary
465
+ 464,NA,NA,9,Dried beans and vegetable w/ additions,Dietary
466
+ 465,NA,NA,1,Beverage w/ additions,Dietary
467
+ 466,NA,NA,3,Bread/baked products w/ additions,Dietary
468
+ 467,NA,NA,2,Cereal w/ additions,Dietary
469
+ 468,NA,NA,14,Chips w/ additions,Dietary
470
+ 469,NA,NA,12,"Meat, poultry, fish",Dietary
471
+ 470,NA,NA,7,Frozen meals,Dietary
472
+ 471,NA,NA,10,Fruit w/ additions,Dietary
473
+ 472,NA,NA,4,Salad,Dietary
474
+ 473,NA,NA,5,Sandwiches,Dietary
475
+ 474,NA,NA,6,Soup,Dietary
476
+ 475,NA,NA,11,Tortilla products,Dietary
477
+ 476,NA,NA,1,Beverage w/ adds,Dietary
478
+ 477,NA,NA,2,Cereal w/ adds,Dietary
479
+ 478,NA,NA,3,Bread/baked products w/ adds,Dietary
480
+ 479,NA,NA,8,Ice cream/frozen yogurt w/ additions,Dietary
481
+ 480,NA,NA,9,Dried beans and vegetable w/ adds,Dietary
482
+ 481,NA,NA,10,Fruit w/ adds,Dietary
483
+ 482,NA,NA,11,Tortilla Products,Dietary
484
+ 483,NA,NA,13,Lunchables,Dietary
485
+ 484,DRXDRSTZ,Dietary Recall Status,1,Reliable and met the minimum criteria,Dietary
486
+ 485,DRXDRSTZ,Dietary Recall Status,2,Not reliable or not met the minimum criteria,Dietary
487
+ 486,DRXDRSTZ,Dietary Recall Status,9,Interview lost due to computer malfunction or file transfer problem,Dietary
488
+ 487,DRXDRSTZ,Dietary Recall Status,4,Reported consuming breast-milk,Dietary
489
+ 488,DRXDRSTZ,Dietary Recall Status,88,Blank but applicable,Dietary
490
+ 489,DRXDRSTZ,Dietary Recall Status,5,Not done,Dietary
491
+ 490,NA,NA,2,No,Dietary
492
+ 491,NA,NA,1,Yes (home),Dietary
493
+ 492,NA,NA,7,Refused,Dietary
494
+ 493,NA,NA,9,Don't know,Dietary
495
+ 494,DRXTWSZ,Tap Water Source,1,Community supply,Dietary
496
+ 495,DRXTWSZ,Tap Water Source,91,Other,Dietary
497
+ 496,DRXTWSZ,Tap Water Source,4,Don't drink tap water,Dietary
498
+ 497,DRXTWSZ,Tap Water Source,99,Don't know,Dietary
499
+ 498,DBQ095Z,Type of salt you usually add at table,4,Doesn't use or add salt products at the table,Dietary
500
+ 499,DBQ095Z,Type of salt you usually add at table,1,"Ordinary salt [includes regular iodized salt, sea salt and seasoning salts made with regular salt]",Dietary
501
+ 500,DBQ095Z,Type of salt you usually add at table,2,Lite salt,Dietary
502
+ 501,DBQ095Z,Type of salt you usually add at table,3,Salt substitute,Dietary
503
+ 502,DBQ095Z,Type of salt you usually add at table,88,Blank but applicable,Dietary
504
+ 503,DBQ095Z,Type of salt you usually add at table,99,Don't know,Dietary
505
+ 504,DBQ095Z,Type of salt you usually add at table,7,Refused,Dietary
506
+ 505,DBQ095Z,Type of salt you usually add at table,91,Other,Dietary
507
+ 506,DRXHELP,Who helped in responding for this interview,1,SP,Dietary
508
+ 507,DRXHELP,Who helped in responding for this interview,4,Parent of SP,Dietary
509
+ 508,DRXHELP,Who helped in responding for this interview,5,Spouse of SP,Dietary
510
+ 509,DRXHELP,Who helped in responding for this interview,6,Child of SP,Dietary
511
+ 510,DRXHELP,Who helped in responding for this interview,7,Grandparent of SP,Dietary
512
+ 511,DRXHELP,Who helped in responding for this interview,8,"Friend, Partner, Non Relative",Dietary
513
+ 512,DRXHELP,Who helped in responding for this interview,9,"Translator, not a HH member",Dietary
514
+ 513,DRXHELP,Who helped in responding for this interview,10,"Child care provider, Caretaker",Dietary
515
+ 514,DRXHELP,Who helped in responding for this interview,11,Other Relative,Dietary
516
+ 515,DRXHELP,Who helped in responding for this interview,12,No One,Dietary
517
+ 516,DRXHELP,Who helped in responding for this interview,14,Other specify,Dietary
518
+ 517,DRXHELP,Who helped in responding for this interview,77,Refused,Dietary
519
+ 518,DRXHELP,Who helped in responding for this interview,99,Don't know,Dietary
520
+ 519,DRXMRESP,Who was the main respondent for this interview?,1,SP,Dietary
521
+ 520,DRXMRESP,Who was the main respondent for this interview?,97,Proxy,Dietary
522
+ 521,DRXMRESP,Who was the main respondent for this interview?,98,SP and proxy,Dietary
523
+ 522,DRXMRESP,Who was the main respondent for this interview?,88,Blank but applicable,Dietary
524
+ 523,DRXMRESP,Who was the main respondent for this interview?,2,Mother of SP,Dietary
525
+ 524,DRXMRESP,Who was the main respondent for this interview?,3,Father of SP,Dietary
526
+ 525,DRXMRESP,Who was the main respondent for this interview?,5,Spouse of SP,Dietary
527
+ 526,DRXMRESP,Who was the main respondent for this interview?,6,Child of SP,Dietary
528
+ 527,DRXMRESP,Who was the main respondent for this interview?,7,Grandparent of SP,Dietary
529
+ 528,DRXMRESP,Who was the main respondent for this interview?,8,"Friend, Partner, Non Relative",Dietary
530
+ 529,DRXMRESP,Who was the main respondent for this interview?,9,"Translator, not a HH member",Dietary
531
+ 530,DRXMRESP,Who was the main respondent for this interview?,10,"Child care provider, Caretaker",Dietary
532
+ 531,DRXMRESP,Who was the main respondent for this interview?,11,Other Relative,Dietary
533
+ 532,DRXMRESP,Who was the main respondent for this interview?,14,Other specify,Dietary
534
+ 533,DRXMRESP,Who was the main respondent for this interview?,77,Refused,Dietary
535
+ 534,DRXMRESP,Who was the main respondent for this interview?,99,Don't know,Dietary
536
+ 535,DRXTWSZ,Tap Water Source,1,Community supply,Dietary
537
+ 536,DRXTWSZ,Tap Water Source,91,Other,Dietary
538
+ 537,DRXTWSZ,Tap Water Source,4,Don't drink tap water,Dietary
539
+ 538,DRXTWSZ,Tap Water Source,99,Don't know,Dietary
540
+ 539,DRXHELP,Who helped in responding for this interview,1,SP,Dietary
541
+ 540,DRXHELP,Who helped in responding for this interview,4,Parent of SP,Dietary
542
+ 541,DRXHELP,Who helped in responding for this interview,5,Spouse of SP,Dietary
543
+ 542,DRXHELP,Who helped in responding for this interview,6,Child of SP,Dietary
544
+ 543,DRXHELP,Who helped in responding for this interview,7,Grandparent of SP,Dietary
545
+ 544,DRXHELP,Who helped in responding for this interview,8,"Friend, Partner, Non Relative",Dietary
546
+ 545,DRXHELP,Who helped in responding for this interview,9,"Translator, not a HH member",Dietary
547
+ 546,DRXHELP,Who helped in responding for this interview,10,"Child care provider, Caretaker",Dietary
548
+ 547,DRXHELP,Who helped in responding for this interview,11,Other Relative,Dietary
549
+ 548,DRXHELP,Who helped in responding for this interview,12,No One,Dietary
550
+ 549,DRXHELP,Who helped in responding for this interview,14,Other specify,Dietary
551
+ 550,DRXHELP,Who helped in responding for this interview,77,Refused,Dietary
552
+ 551,DRXHELP,Who helped in responding for this interview,99,Don't know,Dietary
553
+ 552,DRXMRESP,Who was the main respondent for this interview?,1,SP,Dietary
554
+ 553,DRXMRESP,Who was the main respondent for this interview?,2,Mother of SP,Dietary
555
+ 554,DRXMRESP,Who was the main respondent for this interview?,3,Father of SP,Dietary
556
+ 555,DRXMRESP,Who was the main respondent for this interview?,5,Spouse of SP,Dietary
557
+ 556,DRXMRESP,Who was the main respondent for this interview?,6,Child of SP,Dietary
558
+ 557,DRXMRESP,Who was the main respondent for this interview?,7,Grandparent of SP,Dietary
559
+ 558,DRXMRESP,Who was the main respondent for this interview?,8,"Friend, Partner, Non Relative",Dietary
560
+ 559,DRXMRESP,Who was the main respondent for this interview?,9,"Translator, not a HH member",Dietary
561
+ 560,DRXMRESP,Who was the main respondent for this interview?,10,"Child care provider, Caretaker",Dietary
562
+ 561,DRXMRESP,Who was the main respondent for this interview?,11,Other Relative,Dietary
563
+ 562,DRXMRESP,Who was the main respondent for this interview?,14,Other specify,Dietary
564
+ 563,DRXMRESP,Who was the main respondent for this interview?,77,Refused,Dietary
565
+ 564,DRXMRESP,Who was the main respondent for this interview?,99,Don't know,Dietary
566
+ 565,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,1,Rarely,Dietary
567
+ 566,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,2,Occasionally,Dietary
568
+ 567,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,3,Very often,Dietary
569
+ 568,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,88,Blank but applicable,Dietary
570
+ 569,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,9,Don't know,Dietary
571
+ 570,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,7,Refused,Dietary
version2/data/tidytuesday_json_val.json ADDED
@@ -0,0 +1,1911 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "date_posted": "2023-02-28",
4
+ "project_name": "African Language Sentiment",
5
+ "project_source": [
6
+ "https://r4ds.io/join",
7
+ "https://arxiv.org/pdf/2302.08956.pdf",
8
+ "https://github.com/shmuhammad2004",
9
+ "https://github.com/afrisenti-semeval/afrisent-semeval-2023"
10
+ ],
11
+ "description": "The data this week comes fromAfriSenti: Sentiment Analysis dataset for 14 African languagesvia@shmuhammad2004(the corresponding author on theassociated paper, and an active member of theR4DS Online Learning Community Slack). This repository contains data for the SemEval 2023 Shared Task 12: Sentiment Analysis in African Languages (AfriSenti-SemEval). The source repository also includes sentiment lexicons for several languages.",
12
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28",
13
+ "data_dictionary": [
14
+ {
15
+ "variable": [
16
+ "language_iso_code",
17
+ "tweet",
18
+ "label",
19
+ "intended_use"
20
+ ],
21
+ "class": [
22
+ "character",
23
+ "character",
24
+ "character",
25
+ "character"
26
+ ],
27
+ "description": [
28
+ "The unique code used to identify the language",
29
+ "The text content of a tweet",
30
+ "A sentiment label of positive, negative, or neutral assigned by a native speaker of that language",
31
+ "Whether the data came from the dev, test, or train set for that language"
32
+ ]
33
+ },
34
+ {
35
+ "variable": [
36
+ "language_iso_code",
37
+ "language"
38
+ ],
39
+ "class": [
40
+ "character",
41
+ "character"
42
+ ],
43
+ "description": [
44
+ "The unique code used to identify the language",
45
+ "The name of the language"
46
+ ]
47
+ },
48
+ {
49
+ "variable": [
50
+ "language_iso_code",
51
+ "script"
52
+ ],
53
+ "class": [
54
+ "character",
55
+ "character"
56
+ ],
57
+ "description": [
58
+ "The unique code used to identify the language",
59
+ "The script used to write the language"
60
+ ]
61
+ },
62
+ {
63
+ "variable": [
64
+ "language_iso_code",
65
+ "country"
66
+ ],
67
+ "class": [
68
+ "character",
69
+ "character"
70
+ ],
71
+ "description": [
72
+ "The unique code used to identify the language",
73
+ "A country in which the language is spoken"
74
+ ]
75
+ },
76
+ {
77
+ "variable": [
78
+ "country",
79
+ "region"
80
+ ],
81
+ "class": [
82
+ "character",
83
+ "character"
84
+ ],
85
+ "description": [
86
+ "A country in which the language is spoken",
87
+ "The region of Africa in which that country is categorized. Note that Mozambique is categorized as \\\"East Africa\\\", \\\"Southern Africa\\\", and \\\"Southeastern Africa\\\""
88
+ ]
89
+ }
90
+ ],
91
+ "data": {
92
+ "file_name": [
93
+ "afrisenti.csv",
94
+ "country_regions.csv",
95
+ "language_countries.csv",
96
+ "language_scripts.csv",
97
+ "languages.csv"
98
+ ],
99
+ "file_url": [
100
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/afrisenti.csv",
101
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/country_regions.csv",
102
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/language_countries.csv",
103
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/language_scripts.csv",
104
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/languages.csv"
105
+ ]
106
+ },
107
+ "data_load": {
108
+ "file_name": [
109
+ "afrisenti.csv",
110
+ "country_regions.csv",
111
+ "language_countries.csv",
112
+ "language_scripts.csv",
113
+ "languages.csv"
114
+ ],
115
+ "file_url": [
116
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/afrisenti.csv",
117
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/country_regions.csv",
118
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/language_countries.csv",
119
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/language_scripts.csv",
120
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/languages.csv"
121
+ ]
122
+ }
123
+ },
124
+ {
125
+ "date_posted": "2023-05-02",
126
+ "project_name": "The Portal Project",
127
+ "project_source": [
128
+ "https://www.weecology.org/",
129
+ "https://weecology.github.io/portalr/",
130
+ "https://portal.weecology.org/",
131
+ "https://datacarpentry.org/ecology-workshop/",
132
+ "https://www.data-retriever.org/"
133
+ ],
134
+ "description": "The data this week comes from thePortal Project. This is a long-term ecological research site studying the dynamics of desert rodents, plants, ants and weather in Arizona. The Portal Project is a long-term ecological study being conducted near Portal, AZ. Since 1977, the site has been used to study the interactions among rodents, ants and plants and their respective responses to climate. To study the interactions among organisms, they experimentally manipulate access to 24 study plots. This study has produced over 100 scientific papers and is one of the longest running ecological studies in the U.S. TheWeecology research groupmonitors rodents, plants, ants, and weather. All data from the Portal Project are made openly available in near real-time so that they can provide the maximum benefit to scientific research and outreach. The core dataset is managed using an automated living data workflow run using GitHub and Continuous Analysis. This dataset focuses on the rodent data. Full data is available through these resources: The Portal Project data can also be accessed through the Data Retriever, a package manager for data. Data Retriever A teaching focused version of the dataset is also maintained with some of the complexities of the data removed to make it easy to use for computational training purposes. This dataset serves as the core dataset for theData Carpentry Ecologymaterial and has been downloaded almost 50,000 times. Thanks to @ethanwhite for the data cleaning script. This script downloads the data using the{portalr}package. It filters for the species and plot data, and years greater than 1977.",
135
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-02",
136
+ "data_dictionary": [
137
+ {
138
+ "variable": [
139
+ "plot",
140
+ "treatment"
141
+ ],
142
+ "class": [
143
+ "double",
144
+ "character"
145
+ ],
146
+ "description": [
147
+ "Plot number",
148
+ "Treatment type"
149
+ ]
150
+ },
151
+ {
152
+ "variable": [
153
+ "species",
154
+ "scientificname",
155
+ "taxa",
156
+ "commonname",
157
+ "censustarget",
158
+ "unidentified",
159
+ "rodent",
160
+ "granivore",
161
+ "minhfl",
162
+ "meanhfl",
163
+ "maxhfl",
164
+ "minwgt",
165
+ "meanwgt",
166
+ "maxwgt",
167
+ "juvwgt"
168
+ ],
169
+ "class": [
170
+ "character",
171
+ "character",
172
+ "character",
173
+ "character",
174
+ "double",
175
+ "double",
176
+ "double",
177
+ "double",
178
+ "double",
179
+ "double",
180
+ "double",
181
+ "double",
182
+ "double",
183
+ "double",
184
+ "double"
185
+ ],
186
+ "description": [
187
+ "Species",
188
+ "Scientific Name",
189
+ "Taxa",
190
+ "Common Name",
191
+ "Target species (0 or 1)",
192
+ "Unidentified (0 or 1)",
193
+ "Rodent (0 or 1)",
194
+ "Granivore (0 or 1)",
195
+ "Minimum hindfoot length",
196
+ "Mean hindfoot length",
197
+ "Maximum hindfoot length",
198
+ "Minimum weight",
199
+ "Mean weight",
200
+ "Maximum weight",
201
+ "Juvenile weight"
202
+ ]
203
+ },
204
+ {
205
+ "variable": [
206
+ "censusdate",
207
+ "month",
208
+ "day",
209
+ "year",
210
+ "treatment",
211
+ "plot",
212
+ "stake",
213
+ "species",
214
+ "sex",
215
+ "reprod",
216
+ "age",
217
+ "testes",
218
+ "vagina",
219
+ "pregnant",
220
+ "nipples",
221
+ "lactation",
222
+ "hfl",
223
+ "wgt",
224
+ "tag",
225
+ "note2",
226
+ "ltag",
227
+ "note3"
228
+ ],
229
+ "class": [
230
+ "double",
231
+ "double",
232
+ "double",
233
+ "double",
234
+ "character",
235
+ "double",
236
+ "double",
237
+ "character",
238
+ "character",
239
+ "character",
240
+ "character",
241
+ "character",
242
+ "character",
243
+ "character",
244
+ "character",
245
+ "character",
246
+ "double",
247
+ "double",
248
+ "character",
249
+ "character",
250
+ "character",
251
+ "character"
252
+ ],
253
+ "description": [
254
+ "Census date",
255
+ "Month",
256
+ "Day",
257
+ "Year",
258
+ "Treatment type",
259
+ "Plot number",
260
+ "Stake number",
261
+ "Species code",
262
+ "Sex",
263
+ "Reproductive condition",
264
+ "Age",
265
+ "Testes (Scrotal, Recent, or Minor)",
266
+ "Vagina (Swollen, Plugged, or Both)",
267
+ "Pregnant",
268
+ "Nipples (Enlarged, Swollen, or Both)",
269
+ "Lactating",
270
+ "Hindfoot length",
271
+ "Weight",
272
+ "Primary individual identifier",
273
+ "Newly tagged individual for 'tag'",
274
+ "Secondary tag information when ear tags were used in both ears",
275
+ "Newly tagged individual for 'ltag'"
276
+ ]
277
+ }
278
+ ],
279
+ "data": {
280
+ "file_name": [
281
+ "plots.csv",
282
+ "species.csv",
283
+ "surveys.csv"
284
+ ],
285
+ "file_url": [
286
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-02/plots.csv",
287
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-02/species.csv",
288
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-02/surveys.csv"
289
+ ]
290
+ },
291
+ "data_load": {
292
+ "file_name": [
293
+ "plots.csv",
294
+ "species.csv",
295
+ "surveys.csv"
296
+ ],
297
+ "file_url": [
298
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-05-02/plots.csv",
299
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-05-02/species.csv",
300
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-05-02/surveys.csv"
301
+ ]
302
+ }
303
+ },
304
+ {
305
+ "date_posted": "2023-04-04",
306
+ "project_name": "Premier League Match Data 2021-2022",
307
+ "project_source": [
308
+ "https://www.kaggle.com/datasets/evangower/premier-league-match-data",
309
+ "https://theathletic.com/3459766/2022/07/29/liverpool-manchester-city-premier-league-fouls-yellow-card/",
310
+ "https://github.com/evangower",
311
+ "https://www.kaggle.com/code/evangower/who-wins-the-epl-if-games-end-at-half-time/"
312
+ ],
313
+ "description": "The data this week comes from thePremier League Match Data 2021-2022viaEvan Goweron Kaggle. You can explore match day statistics of every game and every team during the 2021-22 season of the English Premier League Data. Data includes teams playing, date, referee, and stats for home and away side such as fouls, shots, cards, and more! Also included is a dataset of the weekly rankings for the season. The data was collected from the official website of the Premier League. Evan then cleaned the data using google sheets. Evan did an analysis ofWho wins the EPL if games end at half time?and there'san article from the Athleticabout fouls conceded per yellow card article. No data cleaning",
314
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-04-04",
315
+ "data_dictionary": [
316
+ {
317
+ "variable": [
318
+ "Date",
319
+ "HomeTeam",
320
+ "AwayTeam",
321
+ "FTHG",
322
+ "FTAG",
323
+ "FTR",
324
+ "HTHG",
325
+ "HTAG",
326
+ "HTR",
327
+ "Referee",
328
+ "HS",
329
+ "AS",
330
+ "HST",
331
+ "AST",
332
+ "HF",
333
+ "AF",
334
+ "HC",
335
+ "AC",
336
+ "HY",
337
+ "AY",
338
+ "HR",
339
+ "AR"
340
+ ],
341
+ "class": [
342
+ "character",
343
+ "character",
344
+ "character",
345
+ "double",
346
+ "double",
347
+ "character",
348
+ "double",
349
+ "double",
350
+ "character",
351
+ "character",
352
+ "double",
353
+ "double",
354
+ "double",
355
+ "double",
356
+ "double",
357
+ "double",
358
+ "double",
359
+ "double",
360
+ "double",
361
+ "double",
362
+ "double",
363
+ "double"
364
+ ],
365
+ "description": [
366
+ "The date when the match was played",
367
+ "The home team",
368
+ "The away team",
369
+ "Full time home goals",
370
+ "Full time away goals",
371
+ "Full time result",
372
+ "Halftime home goals",
373
+ "Halftime away goals",
374
+ "Halftime results",
375
+ "Referee of the match",
376
+ "Number of shots taken by the home team",
377
+ "Number of shots taken by the away team",
378
+ "Number of shots on target by the home team",
379
+ "Number of shots on target by the away team",
380
+ "Number of fouls by the home team",
381
+ "Number of fouls by the away team",
382
+ "Number of corners taken by the home team",
383
+ "Number of corners taken by the away team",
384
+ "Number of yellow cards received by the home team",
385
+ "Number of yellow cards received by the away team",
386
+ "Number of red cards received by the home team",
387
+ "Number of red cards received by the away team"
388
+ ]
389
+ }
390
+ ],
391
+ "data": {
392
+ "file_name": [
393
+ "soccer21-22.csv"
394
+ ],
395
+ "file_url": [
396
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-04-04/soccer21-22.csv"
397
+ ]
398
+ },
399
+ "data_load": {
400
+ "file_name": [
401
+ "soccer21-22.csv"
402
+ ],
403
+ "file_url": [
404
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-04-04/soccer21-22.csv"
405
+ ]
406
+ }
407
+ },
408
+ {
409
+ "date_posted": "2023-02-07",
410
+ "project_name": "Big Tech Stock Prices",
411
+ "project_source": [
412
+ "https://github.com/rfordatascience/tidytuesday/issues/509",
413
+ "https://www.morningstar.com/articles/1129535/5-charts-on-big-tech-stocks-collapse",
414
+ "https://www.kaggle.com/datasets/evangower/big-tech-stock-prices"
415
+ ],
416
+ "description": "The data this week comes from Yahoo Finance viaKaggle(byEvan Gower). This dataset consists of the daily stock prices and volume of 14 different tech companies, including Apple (AAPL), Amazon (AMZN), Alphabet (GOOGL), and Meta Platforms (META) and more! A number of articles have examined the collapse of \"Big Tech\" stock prices, includingthis article from morningstar.com. Note: Allstock_symbols have 3271 prices, except META (2688) and TSLA (3148) because they were not publicly traded for part of the period examined.",
417
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-07",
418
+ "data_dictionary": [
419
+ {
420
+ "variable": [
421
+ "stock_symbol",
422
+ "date",
423
+ "open",
424
+ "high",
425
+ "low",
426
+ "close",
427
+ "adj_close",
428
+ "volume"
429
+ ],
430
+ "class": [
431
+ "character",
432
+ "double",
433
+ "double",
434
+ "double",
435
+ "double",
436
+ "double",
437
+ "double",
438
+ "double"
439
+ ],
440
+ "description": [
441
+ "stock_symbol",
442
+ "date",
443
+ "The price at market open.",
444
+ "The highest price for that day.",
445
+ "The lowest price for that day.",
446
+ "The price at market close, adjusted for splits.",
447
+ "The closing price after adjustments for all applicable splits and dividend distributions. Data is adjusted using appropriate split and dividend multipliers, adhering to Center for Research in Security Prices (CRSP) standards.",
448
+ "The number of shares traded on that day."
449
+ ]
450
+ },
451
+ {
452
+ "variable": [
453
+ "stock_symbol",
454
+ "company"
455
+ ],
456
+ "class": [
457
+ "character",
458
+ "character"
459
+ ],
460
+ "description": [
461
+ "stock_symbol",
462
+ "Full name of the company."
463
+ ]
464
+ }
465
+ ],
466
+ "data": {
467
+ "file_name": [
468
+ "big_tech_companies.csv",
469
+ "big_tech_stock_prices.csv"
470
+ ],
471
+ "file_url": [
472
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-07/big_tech_companies.csv",
473
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-07/big_tech_stock_prices.csv"
474
+ ]
475
+ },
476
+ "data_load": {
477
+ "file_name": [
478
+ "big_tech_companies.csv",
479
+ "big_tech_stock_prices.csv"
480
+ ],
481
+ "file_url": [
482
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_companies.csv",
483
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_stock_prices.csv"
484
+ ]
485
+ }
486
+ },
487
+ {
488
+ "date_posted": "2023-03-21",
489
+ "project_name": "Programming Languages",
490
+ "project_source": [
491
+ "https://github.com/rfordatascience/tidytuesday/issues/530",
492
+ "https://pldb.com/posts/does-every-programming-language-support-line-comments.html",
493
+ "https://pldb.com/csv.html",
494
+ "https://pldb.com/index.html",
495
+ "https://pldb.com/posts/index.html"
496
+ ],
497
+ "description": "The data this week comes from theProgramming Language DataBase. Thanks toJesus M. Castagnettofor the suggestion! The PLDB has ablogwith numerous articles exploring the data, such asDoes every programming language have line comments?. The data is user-submitted, so you might want to confirm the accuracy of anything particularly surprising that you find before stating it with certainty! Thefull data dictionaryis available from PLDB.com.",
498
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-21",
499
+ "data_dictionary": [
500
+ {
501
+ "variable": [
502
+ "pldb_id",
503
+ "title",
504
+ "description",
505
+ "type",
506
+ "appeared",
507
+ "creators",
508
+ "website",
509
+ "domain_name",
510
+ "domain_name_registered",
511
+ "reference",
512
+ "isbndb",
513
+ "book_count",
514
+ "semantic_scholar",
515
+ "language_rank",
516
+ "github_repo",
517
+ "github_repo_stars",
518
+ "github_repo_forks",
519
+ "github_repo_updated",
520
+ "github_repo_subscribers",
521
+ "github_repo_created",
522
+ "github_repo_description",
523
+ "github_repo_issues",
524
+ "github_repo_first_commit",
525
+ "github_language",
526
+ "github_language_tm_scope",
527
+ "github_language_type",
528
+ "github_language_ace_mode",
529
+ "github_language_file_extensions",
530
+ "github_language_repos",
531
+ "wikipedia",
532
+ "wikipedia_daily_page_views",
533
+ "wikipedia_backlinks_count",
534
+ "wikipedia_summary",
535
+ "wikipedia_page_id",
536
+ "wikipedia_appeared",
537
+ "wikipedia_created",
538
+ "wikipedia_revision_count",
539
+ "wikipedia_related",
540
+ "features_has_comments",
541
+ "features_has_semantic_indentation",
542
+ "features_has_line_comments",
543
+ "line_comment_token",
544
+ "last_activity",
545
+ "number_of_users",
546
+ "number_of_jobs",
547
+ "origin_community",
548
+ "central_package_repository_count",
549
+ "file_type",
550
+ "is_open_source"
551
+ ],
552
+ "class": [
553
+ "character",
554
+ "character",
555
+ "character",
556
+ "character",
557
+ "double",
558
+ "character",
559
+ "character",
560
+ "character",
561
+ "double",
562
+ "character",
563
+ "double",
564
+ "double",
565
+ "integer",
566
+ "double",
567
+ "character",
568
+ "double",
569
+ "double",
570
+ "double",
571
+ "double",
572
+ "double",
573
+ "character",
574
+ "double",
575
+ "double",
576
+ "character",
577
+ "character",
578
+ "character",
579
+ "character",
580
+ "character",
581
+ "double",
582
+ "character",
583
+ "double",
584
+ "double",
585
+ "character",
586
+ "double",
587
+ "double",
588
+ "double",
589
+ "double",
590
+ "character",
591
+ "logical",
592
+ "logical",
593
+ "logical",
594
+ "character",
595
+ "double",
596
+ "double",
597
+ "double",
598
+ "character",
599
+ "double",
600
+ "character",
601
+ "logical"
602
+ ],
603
+ "description": [
604
+ "A standardized, uniquified version of the language name, used as an ID on the PLDB site.",
605
+ "The official title of the language.",
606
+ "Description of the repo on GitHub.",
607
+ "Which category in PLDB's subjective ontology does this entity fit into.",
608
+ "What year was the language publicly released and/or announced?",
609
+ "Name(s) of the original creators of the language delimited by \\\" and \\\"",
610
+ "URL of the official homepage for the language project.",
611
+ "If the project website is on its own domain.",
612
+ "When was this domain first registered?",
613
+ "A link to more info about this entity.",
614
+ "Books about this language from ISBNdb.",
615
+ "Computed; the number of books found for this language at isbndb.com",
616
+ "Papers about this language from Semantic Scholar.",
617
+ "Computed; A rank for the language, taking into account various online rankings. The computation for this column is not currently clear.",
618
+ "URL of the official GitHub repo for the project if it is hosted there.",
619
+ "How many stars of the repo?",
620
+ "How many forks of the repo?",
621
+ "What year was the last commit made?",
622
+ "How many subscribers to the repo?",
623
+ "When was the Github repo for this entity created?",
624
+ "Description of the repo on GitHub.",
625
+ "How many issues on the repo?",
626
+ "What year the first commit made in this git repo?",
627
+ "GitHub has a set of supported languages as defined here",
628
+ "The TextMate scope that represents this programming language.",
629
+ "Either data, programming, markup, prose, or nil.",
630
+ "A String name of the Ace Mode used for highlighting whenever a file is edited. This must match one of the filenames in http://git.io/3XO_Cg. Use \\\"text\\\" if a mode does not exist.",
631
+ "An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).",
632
+ "How many repos for this language does GitHub report?",
633
+ "URL of the entity on Wikipedia, if and only if it has a page dedicated to it.",
634
+ "How many page views per day does this Wikipedia page get? Useful as a signal for rankings. Available via WP api.",
635
+ "How many pages on WP link to this page?",
636
+ "What is the text summary of the language from the Wikipedia page?",
637
+ "What is the internal ID for this entity on WP?",
638
+ "When does Wikipedia claim this entity first appeared?",
639
+ "When was the Wikipedia page for this entity created?",
640
+ "How many revisions does this page have?",
641
+ "What languages does Wikipedia have as related?",
642
+ "Does this language have a comment character?",
643
+ "Does indentation have semantic meaning in this language?",
644
+ "Does this language support inline comments (as opposed to comments that must span an entire line)?",
645
+ "Defined as a token that can be placed anywhere on a line and starts a comment that cannot be stopped except by a line break character or end of file.",
646
+ "Computed; The most recent of any year field in the PLDB for this language.",
647
+ "Computed; \\\"Crude user estimate from a linear model.",
648
+ "Computed; The estimated number of job openings for programmers in this language.",
649
+ "In what community(ies) did the language first originate?",
650
+ "Number of packages in a central repository. If this value is not known, it is set to 0 (so \\\"0\\\" can mean \\\"no repository exists\\\", \\\"the repository exists but is empty\\\" (unlikely), or \\\"we do not know if a repository exists\\\". This value is definitely incorrect for R.",
651
+ "What is the file encoding for programs in this language?",
652
+ "Is it an open source project?"
653
+ ]
654
+ }
655
+ ],
656
+ "data": {
657
+ "file_name": [
658
+ "languages.csv"
659
+ ],
660
+ "file_url": [
661
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-21/languages.csv"
662
+ ]
663
+ },
664
+ "data_load": {
665
+ "file_name": [
666
+ "languages.csv"
667
+ ],
668
+ "file_url": [
669
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-03-21/languages.csv"
670
+ ]
671
+ }
672
+ },
673
+ {
674
+ "date_posted": "2023-05-23",
675
+ "project_name": "Central Park Squirrel Census",
676
+ "project_source": [
677
+ "https://data.cityofnewyork.us/Environment/2018-Central-Park-Squirrel-Census-Squirrel-Data/vfnx-vebw",
678
+ "https://www.thesquirrelcensus.com/"
679
+ ],
680
+ "description": "Squirrel data! The data this week comes from the2018 Central Park Squirrel Census. The Squirrel Censusis a multimedia science, design, and storytelling project focusing on the Eastern gray (Sciurus carolinensis). They count squirrels and present their findings to the public. The dataset contains squirrel data for each of the 3,023 sightings, including location coordinates, age, primary and secondary fur color, elevation, activities, communications, and interactions between squirrels and with humans. No data cleaning",
681
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-23",
682
+ "data_dictionary": [
683
+ {
684
+ "variable": [
685
+ "X",
686
+ "Y",
687
+ "Unique Squirrel ID",
688
+ "Hectare",
689
+ "Shift",
690
+ "Date",
691
+ "Hectare Squirrel Number",
692
+ "Age",
693
+ "Primary Fur Color",
694
+ "Highlight Fur Color",
695
+ "Combination of Primary and Highlight Color",
696
+ "Color notes",
697
+ "Location",
698
+ "Above Ground Sighter Measurement",
699
+ "Specific Location",
700
+ "Running",
701
+ "Chasing",
702
+ "Climbing",
703
+ "Eating",
704
+ "Foraging",
705
+ "Other Activities",
706
+ "Kuks",
707
+ "Quaas",
708
+ "Moans",
709
+ "Tail flags",
710
+ "Tail twitches",
711
+ "Approaches",
712
+ "Indifferent",
713
+ "Runs from",
714
+ "Other Interactions",
715
+ "Lat/Long"
716
+ ],
717
+ "class": [
718
+ "double",
719
+ "double",
720
+ "character",
721
+ "character",
722
+ "character",
723
+ "double",
724
+ "double",
725
+ "character",
726
+ "character",
727
+ "character",
728
+ "character",
729
+ "character",
730
+ "character",
731
+ "character",
732
+ "character",
733
+ "logical",
734
+ "logical",
735
+ "logical",
736
+ "logical",
737
+ "logical",
738
+ "character",
739
+ "logical",
740
+ "logical",
741
+ "logical",
742
+ "logical",
743
+ "logical",
744
+ "logical",
745
+ "logical",
746
+ "logical",
747
+ "character",
748
+ "character"
749
+ ],
750
+ "description": [
751
+ "Longitude coordinate for squirrel sighting point",
752
+ "Latitude coordinate for squirrel sighting point",
753
+ "Identification tag for each squirrel sightings. The tag is comprised of \\\"Hectare ID\\\" + \\\"Shift\\\" + \\\"Date\\\" + \\\"Hectare Squirrel Number.\\\"",
754
+ "ID tag, which is derived from the hectare grid used to divide and count the park area. One axis that runs predominantly north-to-south is numerical (1-42), and the axis that runs predominantly east-to-west is roman characters (A-I).",
755
+ "Value is either \\\"AM\\\" or \\\"PM,\\\" to communicate whether or not the sighting session occurred in the morning or late afternoon.",
756
+ "Concatenation of the sighting session day and month.",
757
+ "Number within the chronological sequence of squirrel sightings for a discrete sighting session.",
758
+ "Value is either \\\"Adult\\\" or \\\"Juvenile.\\\"",
759
+ "Primary Fur Color - value is either \\\"Gray,\\\" \\\"Cinnamon\\\" or \\\"Black.\\\"",
760
+ "Discrete value or string values comprised of \\\"Gray,\\\" \\\"Cinnamon\\\" or \\\"Black.\\\"",
761
+ "A combination of the previous two columns; this column gives the total permutations of primary and highlight colors observed.",
762
+ "Sighters occasionally added commentary on the squirrel fur conditions. These notes are provided here.",
763
+ "Value is either \\\"Ground Plane\\\" or \\\"Above Ground.\\\" Sighters were instructed to indicate the location of where the squirrel was when first sighted.",
764
+ "For squirrel sightings on the ground plane, fields were populated with a value of \\\"FALSE.\\\"",
765
+ "Sighters occasionally added commentary on the squirrel location. These notes are provided here.",
766
+ "Squirrel was seen running.",
767
+ "Squirrel was seen chasing another squirrel.",
768
+ "Squirrel was seen climbing a tree or other environmental landmark.",
769
+ "Squirrel was seen eating.",
770
+ "Squirrel was seen foraging for food.",
771
+ "Other activities squirrels were observed doing.",
772
+ "Squirrel was heard kukking, a chirpy vocal communication used for a variety of reasons.",
773
+ "Squirrel was heard quaaing, an elongated vocal communication which can indicate the presence of a ground predator such as a dog.",
774
+ "Squirrel was heard moaning, a high-pitched vocal communication which can indicate the presence of an air predator such as a hawk.",
775
+ "Squirrel was seen flagging its tail. Flagging is a whipping motion used to exaggerate squirrel's size and confuse rivals or predators. Looks as if the squirrel is scribbling with tail into the air.",
776
+ "Squirrel was seen twitching its tail. Looks like a wave running through the tail, like a breakdancer doing the arm wave. Often used to communicate interest, curiosity.",
777
+ "Squirrel was seen approaching human, seeking food.",
778
+ "Squirrel was indifferent to human presence.",
779
+ "Squirrel was seen running from humans, seeing them as a threat.",
780
+ "Sighter notes on other types of interactions between squirrels and humans.",
781
+ "Latitude and longitude"
782
+ ]
783
+ }
784
+ ],
785
+ "data": {
786
+ "file_name": [
787
+ "squirrel_data.csv"
788
+ ],
789
+ "file_url": [
790
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-23/squirrel_data.csv"
791
+ ]
792
+ },
793
+ "data_load": {
794
+ "file_name": [
795
+ "squirrel_data.csv"
796
+ ],
797
+ "file_url": [
798
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-05-23/squirrel_data.csv"
799
+ ]
800
+ }
801
+ },
802
+ {
803
+ "date_posted": "2023-01-17",
804
+ "project_name": "Art History",
805
+ "project_source": [
806
+ "https://research.repository.duke.edu/concern/datasets/q811kk70n?locale=en",
807
+ "https://github.com/hollandstam1/thesis",
808
+ "https://saralemus7.github.io/arthistory/",
809
+ "https://github.com/saralemus7/arthistory"
810
+ ],
811
+ "description": "The data this week comes from thearthistory data package This dataset contains data that was used for Holland Stam's thesis work, titledQuantifying art historical narratives. The data was collected to assess the demographic representation of artists through editions of Janson's History of Art and Gardner's Art Through the Ages, two of the most popular art history textbooks used in the American education system. In this package specifically, both artist-level and work-level data was collected along with variables regarding the artists' demographics and numeric metrics for describing how much space they or their work took up in each edition of each textbook. This package contains three datasets: Acknowledging arthistory Citation Lemus S, Stam H (2022). arthistory: Art History Textbook Data.https://github.com/saralemus7/arthistory,https://saralemus7.github.io/arthistory/. Examples of analyses are included inHolland Stam's thesisin Quarto files. No data cleaning",
812
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-01-17",
813
+ "data_dictionary": [
814
+ {
815
+ "variable": [
816
+ "artist_name",
817
+ "edition_number",
818
+ "year",
819
+ "artist_nationality",
820
+ "artist_nationality_other",
821
+ "artist_gender",
822
+ "artist_race",
823
+ "artist_ethnicity",
824
+ "book",
825
+ "space_ratio_per_page_total",
826
+ "artist_unique_id",
827
+ "moma_count_to_year",
828
+ "whitney_count_to_year",
829
+ "artist_race_nwi"
830
+ ],
831
+ "class": [
832
+ "character",
833
+ "double",
834
+ "double",
835
+ "character",
836
+ "character",
837
+ "character",
838
+ "character",
839
+ "character",
840
+ "character",
841
+ "double",
842
+ "double",
843
+ "double",
844
+ "double",
845
+ "character"
846
+ ],
847
+ "description": [
848
+ "The name of each artist",
849
+ "The edition number of the textbook from either Janson's History of Art or Gardner's Art Through the Ages.",
850
+ "The year of publication for a given edition of Janson or Gardner.",
851
+ "The nationality of a given artist.",
852
+ "The nationality of the artist. Of the total count of artists through all editions of Janson's History of Art and Gardner's Art Through the Ages, 77.32% account for French, Spanish, British, American and German. Therefore, the categorical strings of this variable are French, Spanish, British, American, German and Other",
853
+ "The gender of the artist",
854
+ "The race of the artist",
855
+ "The ethnicity of the artist",
856
+ "Which book, either Janson or Gardner the particular artist at that particular time was included.",
857
+ "The area in centimeters squared of both the text and the figure of a particular artist in a given edition of Janson's History of Art divided by the area in centimeters squared of a single page of the respective edition. This variable is continuous.",
858
+ "The unique identifying number assigned to artists across books is denoted in alphabetical order. This variable is discrete.",
859
+ "The total count of exhibitions ever held by the Museum of Modern Art (MoMA) of a particular artist at a given year of publication. This variable is discrete.",
860
+ "The count of exhibitions held by The Whitney of a particular artist at a particular moment of time, as highlighted by year. This variable is discrete.",
861
+ "The non-white indicator for artist race, meaning if an artist's race is denoted as either white or non-white."
862
+ ]
863
+ }
864
+ ],
865
+ "data": {
866
+ "file_name": [
867
+ "artists.csv"
868
+ ],
869
+ "file_url": [
870
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-01-17/artists.csv"
871
+ ]
872
+ },
873
+ "data_load": {
874
+ "file_name": [
875
+ "artists.csv"
876
+ ],
877
+ "file_url": [
878
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-17/artists.csv"
879
+ ]
880
+ }
881
+ },
882
+ {
883
+ "date_posted": "2023-07-04",
884
+ "project_name": "Historical Markers",
885
+ "project_source": [
886
+ "http://www.geonames.org/",
887
+ "https://www.hmdb.org/geolists.asp?c=United%20States%20of%20America",
888
+ "https://www.hmdb.org/stats.asp",
889
+ "https://www.hmdb.org/",
890
+ "https://github.com/rfordatascience/tidytuesday/issues/574#issuecomment-1601050053"
891
+ ],
892
+ "description": "The data this week comes from theHistorical Marker Database USA Index. Learn more about the markers on theHMDb.org site, which includes a number of articles, includingDatabase Counts and Statistics. We included a dataset of places that donothave entries in the Historical Markers Database. You might try to combine that with information fromgeonames.org(code: HSTS) to find markers that need to be submitted. Thanks toJesus M. Castagnettofor the geonames tip!",
893
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-04",
894
+ "data_dictionary": [
895
+ {
896
+ "variable": [
897
+ "marker_id",
898
+ "marker_no",
899
+ "title",
900
+ "subtitle",
901
+ "addl_subtitle",
902
+ "year_erected",
903
+ "erected_by",
904
+ "latitude_minus_s",
905
+ "longitude_minus_w",
906
+ "street_address",
907
+ "city_or_town",
908
+ "section_or_quarter",
909
+ "county_or_parish",
910
+ "state_or_prov",
911
+ "location",
912
+ "missing",
913
+ "link"
914
+ ],
915
+ "class": [
916
+ "double",
917
+ "character",
918
+ "character",
919
+ "character",
920
+ "character",
921
+ "integer",
922
+ "character",
923
+ "double",
924
+ "double",
925
+ "character",
926
+ "character",
927
+ "character",
928
+ "character",
929
+ "character",
930
+ "character",
931
+ "character",
932
+ "character"
933
+ ],
934
+ "description": [
935
+ "Unique ID for this marker in the HMdb.",
936
+ "Number of this marker in the state numbering scheme.",
937
+ "Main title of the marker.",
938
+ "Subtitle of the marker, if any.",
939
+ "Additional subtitle text.",
940
+ "The year in which the marker was erected.",
941
+ "The organization which erected the marker.",
942
+ "The latitude of the marker.",
943
+ "The longitude of the marker.",
944
+ "The street address of the marker, if available.",
945
+ "The city, town, etc in which the marker is located.",
946
+ "The section of the city, town, etc, when available.",
947
+ "The county, parish, or similar designation in which the marker appears.",
948
+ "The state, province, territory, etc in which the marker appears.",
949
+ "A description of the marker's location.",
950
+ "Whether the marker is \\\"Reported missing\\\" or \\\"Confirmed missing\\\". NA values indicate that the marker has neither been reported missing nor confirmed as missing.",
951
+ "The HMDb link to the marker. Links include additional details, such as photos and topic lists to which this marker belongs."
952
+ ]
953
+ },
954
+ {
955
+ "variable": [
956
+ "county",
957
+ "state"
958
+ ],
959
+ "class": [
960
+ "character",
961
+ "character"
962
+ ],
963
+ "description": [
964
+ "County or equivalent.",
965
+ "State or territory."
966
+ ]
967
+ }
968
+ ],
969
+ "data": {
970
+ "file_name": [
971
+ "historical_markers.csv",
972
+ "no_markers.csv"
973
+ ],
974
+ "file_url": [
975
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-04/historical_markers.csv",
976
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-04/no_markers.csv"
977
+ ]
978
+ },
979
+ "data_load": {
980
+ "file_name": [
981
+ "historical_markers.csv",
982
+ "no_markers.csv"
983
+ ],
984
+ "file_url": [
985
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-07-04/historical_markers.csv",
986
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-07-04/no_markers.csv"
987
+ ]
988
+ }
989
+ },
990
+ {
991
+ "date_posted": "2023-02-14",
992
+ "project_name": "Hollywood Age Gaps",
993
+ "project_source": [
994
+ "https://www.data-is-plural.com/archive/2018-02-07-edition/",
995
+ "https://tidytues.day/2021/2021-03-09",
996
+ "https://hollywoodagegap.com/"
997
+ ],
998
+ "description": "The data this week comes fromHollywood Age GapviaData Is Plural. An informational site showing the age gap between movie love interests. The data follows certain rules: The two (or more) actors play actual love interests (not just friends, coworkers, or some other non-romantic type of relationship) The youngest of the two actors is at least 17 years old Not animated characters We previously provided a dataset about theBechdel Test. It might be interesting to see whether there is any correlation between these datasets! The Bechdel Test dataset also included additional information about the films that were used in that dataset. Note: The age gaps dataset includes \"gender\" columns, which always contain the values \"man\" or \"woman\". These values appear to indicate how thecharactersin each film identify. Some of these values do not match how theactoridentifies. We apologize if any characters are misgendered in the data!",
999
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-14",
1000
+ "data_dictionary": [
1001
+ {
1002
+ "variable": [
1003
+ "movie_name",
1004
+ "release_year",
1005
+ "director",
1006
+ "age_difference",
1007
+ "couple_number",
1008
+ "actor_1_name",
1009
+ "actor_2_name",
1010
+ "character_1_gender",
1011
+ "character_2_gender",
1012
+ "actor_1_birthdate",
1013
+ "actor_2_birthdate",
1014
+ "actor_1_age",
1015
+ "actor_2_age"
1016
+ ],
1017
+ "class": [
1018
+ "character",
1019
+ "integer",
1020
+ "character",
1021
+ "integer",
1022
+ "integer",
1023
+ "character",
1024
+ "character",
1025
+ "character",
1026
+ "character",
1027
+ "date",
1028
+ "date",
1029
+ "integer",
1030
+ "integer"
1031
+ ],
1032
+ "description": [
1033
+ "Name of the film",
1034
+ "Release year",
1035
+ "Director of the film",
1036
+ "Age difference between the characters in whole years",
1037
+ "An identifier for the couple in case multiple couples are listed for this film",
1038
+ "The name of the older actor in this couple",
1039
+ "The name of the younger actor in this couple",
1040
+ "The gender of the older character, as identified by the person who submitted the data for this couple",
1041
+ "The gender of the younger character, as identified by the person who submitted the data for this couple",
1042
+ "The birthdate of the older member of the couple",
1043
+ "The birthdate of the younger member of the couple",
1044
+ "The age of the older actor when the film was released",
1045
+ "The age of the younger actor when the film was released"
1046
+ ]
1047
+ }
1048
+ ],
1049
+ "data": {
1050
+ "file_name": [
1051
+ "age_gaps.csv"
1052
+ ],
1053
+ "file_url": [
1054
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-14/age_gaps.csv"
1055
+ ]
1056
+ },
1057
+ "data_load": {
1058
+ "file_name": [
1059
+ "age_gaps.csv"
1060
+ ],
1061
+ "file_url": [
1062
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-14/age_gaps.csv"
1063
+ ]
1064
+ }
1065
+ },
1066
+ {
1067
+ "date_posted": "2023-08-15",
1068
+ "project_name": "Spam E-mail",
1069
+ "project_source": [
1070
+ "https://vincentarelbundock.github.io/Rdatasets/index.html",
1071
+ "https://archive.ics.uci.edu/dataset/94/spambase",
1072
+ "https://search.r-project.org/CRAN/refmans/kernlab/html/spam.html",
1073
+ "https://vincentarelbundock.github.io/Rdatasets/doc/DAAG/spam7.html"
1074
+ ],
1075
+ "description": "The data this week comes from Vincent Arel-Bundock's Rdatasets package(https://vincentarelbundock.github.io/Rdatasets/index.html). Rdatasets is a collection of 2246 datasets which were originally distributed alongside the statistical software environment R and some of its add-on packages. The goal is to make these data more broadly accessible for teaching and statistical software development. We're working with thespam emaildataset. This is a subset of thespam e-mail database. This is a dataset collected at Hewlett-Packard Labs by Mark Hopkins, Erik Reeber, George Forman, and Jaap Suermondt and shared with theUCI Machine Learning Repository. The dataset classifies 4601 e-mails as spam or non-spam, with additional variables indicating the frequency of certain words and characters in the e-mail. First column was removed.",
1076
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-15",
1077
+ "data_dictionary": [
1078
+ {
1079
+ "variable": [
1080
+ "crl.tot",
1081
+ "dollar",
1082
+ "bang",
1083
+ "money",
1084
+ "n000",
1085
+ "make",
1086
+ "yesno"
1087
+ ],
1088
+ "class": [
1089
+ "double",
1090
+ "double",
1091
+ "double",
1092
+ "double",
1093
+ "double",
1094
+ "double",
1095
+ "character"
1096
+ ],
1097
+ "description": [
1098
+ "Total length of uninterrupted sequences of capitals",
1099
+ "Occurrences of the dollar sign, as percent of total number of characters",
1100
+ "Occurrences of ‘!’, as percent of total number of characters",
1101
+ "Occurrences of ‘money’, as percent of total number of characters",
1102
+ "Occurrences of the string ‘000’, as percent of total number of words",
1103
+ "Occurrences of ‘make’, as a percent of total number of words",
1104
+ "Outcome variable, a factor with levels 'n' not spam, 'y' spam"
1105
+ ]
1106
+ }
1107
+ ],
1108
+ "data": {
1109
+ "file_name": [
1110
+ "spam.csv"
1111
+ ],
1112
+ "file_url": [
1113
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-15/spam.csv"
1114
+ ]
1115
+ },
1116
+ "data_load": {
1117
+ "file_name": [
1118
+ "spam.csv"
1119
+ ],
1120
+ "file_url": [
1121
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv"
1122
+ ]
1123
+ }
1124
+ },
1125
+ {
1126
+ "date_posted": "2023-03-07",
1127
+ "project_name": "Numbats in Australia",
1128
+ "project_source": [
1129
+ "/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-07/data/numbats.csv",
1130
+ "https://www.ala.org.au",
1131
+ "https://github.com/numbats/numbats-tidytuesday",
1132
+ "https://bie.ala.org.au/species/https://biodiversity.org.au/afd/taxa/6c72d199-f0f1-44d3-8197-224a2f7cff5f"
1133
+ ],
1134
+ "description": "The data this week comes from theAtlas of Living Australia. Thanks to Di Cook forpreparing this week's dataset! ThisNumbat page at the Atlas of Living Australiatalks about these endangered species in greater detail. Acsvfile of numbat sightings is provided. The code to refresh the data is below. Questions that would be interesting to answer are:",
1135
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-07",
1136
+ "data_dictionary": [
1137
+ {
1138
+ "variable": [
1139
+ "decimalLatitude",
1140
+ "decimalLongitude",
1141
+ "eventDate",
1142
+ "scientificName",
1143
+ "taxonConceptID",
1144
+ "recordID",
1145
+ "dataResourceName",
1146
+ "year",
1147
+ "month",
1148
+ "wday",
1149
+ "hour",
1150
+ "day",
1151
+ "dryandra",
1152
+ "prcp",
1153
+ "tmax",
1154
+ "tmin"
1155
+ ],
1156
+ "class": [
1157
+ "double",
1158
+ "double",
1159
+ "datetime",
1160
+ "factor",
1161
+ "factor",
1162
+ "character",
1163
+ "factor",
1164
+ "integer",
1165
+ "factor",
1166
+ "factor",
1167
+ "integer",
1168
+ "date",
1169
+ "logical",
1170
+ "double",
1171
+ "double",
1172
+ "double"
1173
+ ],
1174
+ "description": [
1175
+ "decimalLatitude",
1176
+ "decimalLongitude",
1177
+ "eventDate",
1178
+ "Either \\\"Myrmecobius fasciatus\\\" or \\\"Myrmecobius fasciatus rufus\\\"",
1179
+ "The URL for this (sub)species",
1180
+ "recordID",
1181
+ "dataResourceName",
1182
+ "The 4-digit year of the event (when available)",
1183
+ "The 3-letter month abbreviation of the event (when available)",
1184
+ "The 3-letter weekday abbreviation of the event (when available)",
1185
+ "The hour of the event (when available)",
1186
+ "The date of the event (when available)",
1187
+ "whether the observation was in Dryandra Woodland",
1188
+ "Precipitation on that day in Dryandra Woodland (when relevant), in millimeters",
1189
+ "Maximum temperature on that day in Dryandra Woodland (when relevant), in degrees Celsius",
1190
+ "Minimum temperature on that day in Dryandra Woodland (when relevant), in degrees Celsius"
1191
+ ]
1192
+ }
1193
+ ],
1194
+ "data": {
1195
+ "file_name": [
1196
+ "numbats.csv"
1197
+ ],
1198
+ "file_url": [
1199
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-07/numbats.csv"
1200
+ ]
1201
+ },
1202
+ "data_load": {
1203
+ "file_name": [
1204
+ "numbats.csv"
1205
+ ],
1206
+ "file_url": [
1207
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-03-07/numbats.csv"
1208
+ ]
1209
+ }
1210
+ },
1211
+ {
1212
+ "date_posted": "2023-11-28",
1213
+ "project_name": "Doctor Who Episodes",
1214
+ "project_source": [
1215
+ "https://en.wikipedia.org/wiki/List_of_Doctor_Who_episodes_(2005%E2%80%93present)",
1216
+ "https://github.com/KittJonathan/datardis/tree/main/misc",
1217
+ "https://cran.r-project.org/package=datardis",
1218
+ "https://github.com/KittJonathan/datardis"
1219
+ ],
1220
+ "description": "Doctor Who is an extremely long-running British television program. The show was revived in 2005, and has proven very popular since then. To celebrate this year's 60th anniversary of Doctor Who, we have three datasets. The data this week comes from Wikipedia's [List of Doctor Who episodes](https://en.wikipedia.org/wiki/List_of_Doctor_Who_episodes_(2005%E2%80%93present)via the{datardis} packagebyJonathan Kitt. Thank you to Jonathan for compiling and sharing this data! As of 2023-11-24, the data only includes episodes from the \"revived\" era. For an added challenge, consider submitting a pull request to the {datardis} package to update thedata-extraction scriptsto also fetch the \"classic\" era data! Clean data from the{datardis} package.",
1221
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-28",
1222
+ "data_dictionary": [
1223
+ {
1224
+ "variable": [
1225
+ "era",
1226
+ "season_number",
1227
+ "serial_title",
1228
+ "story_number",
1229
+ "episode_number",
1230
+ "episode_title",
1231
+ "type",
1232
+ "first_aired",
1233
+ "production_code",
1234
+ "uk_viewers",
1235
+ "rating",
1236
+ "duration"
1237
+ ],
1238
+ "class": [
1239
+ "character",
1240
+ "double",
1241
+ "character",
1242
+ "character",
1243
+ "double",
1244
+ "character",
1245
+ "character",
1246
+ "double",
1247
+ "character",
1248
+ "double",
1249
+ "double",
1250
+ "double"
1251
+ ],
1252
+ "description": [
1253
+ "Whether the episode is in the \\\"classic\\\" or \\\"revived\\\" era. All data in this dataset is within the \\\"revived\\\" era.",
1254
+ "The season number within the era. Note that some episodes are outside of a season.",
1255
+ "Serial title if available",
1256
+ "Story number",
1257
+ "Episode number in season",
1258
+ "Episode title",
1259
+ "\\\"episode\\\" or \\\"special\\\"",
1260
+ "Date the episode first aired in the U.K.",
1261
+ "Episode's production code if available",
1262
+ "Number of U.K. viewers (millions)",
1263
+ "Episode's rating",
1264
+ "Episode's duration in minutes"
1265
+ ]
1266
+ },
1267
+ {
1268
+ "variable": [
1269
+ "story_number",
1270
+ "director"
1271
+ ],
1272
+ "class": [
1273
+ "character",
1274
+ "character"
1275
+ ],
1276
+ "description": [
1277
+ "Story number",
1278
+ "Episode's director"
1279
+ ]
1280
+ },
1281
+ {
1282
+ "variable": [
1283
+ "story_number",
1284
+ "writer"
1285
+ ],
1286
+ "class": [
1287
+ "character",
1288
+ "character"
1289
+ ],
1290
+ "description": [
1291
+ "Story number",
1292
+ "Episode's writer"
1293
+ ]
1294
+ }
1295
+ ],
1296
+ "data": {
1297
+ "file_name": [
1298
+ "drwho_directors.csv",
1299
+ "drwho_episodes.csv",
1300
+ "drwho_writers.csv"
1301
+ ],
1302
+ "file_url": [
1303
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-28/drwho_directors.csv",
1304
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-28/drwho_episodes.csv",
1305
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-28/drwho_writers.csv"
1306
+ ]
1307
+ },
1308
+ "data_load": {
1309
+ "file_name": [
1310
+ "drwho_directors.csv",
1311
+ "drwho_episodes.csv",
1312
+ "drwho_writers.csv"
1313
+ ],
1314
+ "file_url": [
1315
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-28/drwho_directors.csv",
1316
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-28/drwho_episodes.csv",
1317
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-28/drwho_writers.csv"
1318
+ ]
1319
+ }
1320
+ },
1321
+ {
1322
+ "date_posted": "2023-11-14",
1323
+ "project_name": "Diwali Sales Data",
1324
+ "project_source": [
1325
+ "https://www.kaggle.com/code/bhushanshelke69/diwali-data-exploration",
1326
+ "https://github.com/vikasvachheta08/Diwali_Sales_Analysis_Using_Python",
1327
+ "https://www.kaggle.com/datasets/saadharoon27/diwali-sales-dataset"
1328
+ ],
1329
+ "description": "This week is Diwali, the festival of lights! The data this week comes fromsales datafor a retail store during the Diwali festival period in India. The data is shared on Kaggle by Saad Haroon. This week we're sharing Python data analysis examples! There's a few out there, but these ones fromBrushan ShelkeorVikas Vachheta(see the Diwali_Sales_Analysis.ipynb file for the code) are some data exploration analyses. Data was downloaded fromKaggle, and theStatusandunnamed1columns removed.",
1330
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-14",
1331
+ "data_dictionary": [
1332
+ {
1333
+ "variable": [
1334
+ "User_ID",
1335
+ "Cust_name",
1336
+ "Product_ID",
1337
+ "Gender",
1338
+ "Age Group",
1339
+ "Age",
1340
+ "Marital_Status",
1341
+ "State",
1342
+ "Zone",
1343
+ "Occupation",
1344
+ "Product_Category",
1345
+ "Orders",
1346
+ "Amount"
1347
+ ],
1348
+ "class": [
1349
+ "double",
1350
+ "character",
1351
+ "character",
1352
+ "character",
1353
+ "character",
1354
+ "double",
1355
+ "double",
1356
+ "character",
1357
+ "character",
1358
+ "character",
1359
+ "character",
1360
+ "double",
1361
+ "double"
1362
+ ],
1363
+ "description": [
1364
+ "User identification number",
1365
+ "Customer name",
1366
+ "Product identification number",
1367
+ "Gender of the customer (e.g. Male, Female)",
1368
+ "Age group of the customer",
1369
+ "Age of the customer",
1370
+ "Marital status of the customer (e.g. Married, Single)",
1371
+ "State of the customer",
1372
+ "Geographic zone of the customer",
1373
+ "Occupation of the customer",
1374
+ "Category of the product",
1375
+ "Number of orders made by the customer",
1376
+ "Amount in Indian rupees spent by the customer"
1377
+ ]
1378
+ }
1379
+ ],
1380
+ "data": {
1381
+ "file_name": [
1382
+ "diwali_sales_data.csv"
1383
+ ],
1384
+ "file_url": [
1385
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-14/diwali_sales_data.csv"
1386
+ ]
1387
+ },
1388
+ "data_load": {
1389
+ "file_name": [
1390
+ "diwali_sales_data.csv"
1391
+ ],
1392
+ "file_url": [
1393
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-14/diwali_sales_data.csv"
1394
+ ]
1395
+ }
1396
+ },
1397
+ {
1398
+ "date_posted": "2023-12-12",
1399
+ "project_name": "Holiday Movies",
1400
+ "project_source": [
1401
+ "https://networkdatascience.ceu.edu/article/2019-12-16/christmas-movies",
1402
+ "https://developer.imdb.com/non-commercial-datasets/"
1403
+ ],
1404
+ "description": "Happy holidays! This week we're exploring \"holiday\" movies: movies with \"holiday\", \"Christmas\", \"Hanukkah\", or \"Kwanzaa\" (or variants thereof) in their title! The data this week comes from theInternet Movie Database. We don't have an article using exactly this dataset, but you might get inspiration from thisChristmas Moviesblog post by MilΓ‘n Janosov at Central European University.",
1405
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-12-12",
1406
+ "data_dictionary": [
1407
+ {
1408
+ "variable": [
1409
+ "tconst",
1410
+ "title_type",
1411
+ "primary_title",
1412
+ "original_title",
1413
+ "year",
1414
+ "runtime_minutes",
1415
+ "genres",
1416
+ "simple_title",
1417
+ "average_rating",
1418
+ "num_votes",
1419
+ "christmas",
1420
+ "hanukkah",
1421
+ "kwanzaa",
1422
+ "holiday"
1423
+ ],
1424
+ "class": [
1425
+ "character",
1426
+ "character",
1427
+ "character",
1428
+ "character",
1429
+ "double",
1430
+ "double",
1431
+ "character",
1432
+ "character",
1433
+ "double",
1434
+ "double",
1435
+ "logical",
1436
+ "logical",
1437
+ "logical",
1438
+ "logical"
1439
+ ],
1440
+ "description": [
1441
+ "alphanumeric unique identifier of the title",
1442
+ "the type/format of the title (movie, video, or tvMovie)",
1443
+ "the more popular title / the title used by the filmmakers on promotional materials at the point of release",
1444
+ "original title, in the original language",
1445
+ "the release year of a title",
1446
+ "primary runtime of the title, in minutes",
1447
+ "includes up to three genres associated with the title (comma-delimited)",
1448
+ "the title in lowercase, with punctuation removed, for easier filtering and grouping",
1449
+ "weighted average of all the individual user ratings on IMDb",
1450
+ "number of votes the title has received on IMDb (titles with fewer than 10 votes were not included in this dataset)",
1451
+ "whether the title includes \\\"christmas\\\", \\\"xmas\\\", \\\"x mas\\\", etc",
1452
+ "whether the title includes \\\"hanukkah\\\", \\\"chanukah\\\", etc",
1453
+ "whether the title includes \\\"kwanzaa\\\"",
1454
+ "whether the title includes the word \\\"holiday\\\""
1455
+ ]
1456
+ },
1457
+ {
1458
+ "variable": [
1459
+ "tconst",
1460
+ "genres"
1461
+ ],
1462
+ "class": [
1463
+ "character",
1464
+ "character"
1465
+ ],
1466
+ "description": [
1467
+ "alphanumeric unique identifier of the title",
1468
+ "genres associated with the title, one row per genre"
1469
+ ]
1470
+ }
1471
+ ],
1472
+ "data": {
1473
+ "file_name": [
1474
+ "holiday_movie_genres.csv",
1475
+ "holiday_movies.csv"
1476
+ ],
1477
+ "file_url": [
1478
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-12-12/holiday_movie_genres.csv",
1479
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-12-12/holiday_movies.csv"
1480
+ ]
1481
+ },
1482
+ "data_load": {
1483
+ "file_name": [
1484
+ "holiday_movie_genres.csv",
1485
+ "holiday_movies.csv"
1486
+ ],
1487
+ "file_url": [
1488
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-12-12/holiday_movie_genres.csv",
1489
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-12-12/holiday_movies.csv"
1490
+ ]
1491
+ }
1492
+ },
1493
+ {
1494
+ "date_posted": "2024-02-13",
1495
+ "project_name": "Valentine's Day Consumer Data",
1496
+ "project_source": [
1497
+ "https://github.com/rfordatascience/tidytuesday/tree/master/data/2022/2022-01-25",
1498
+ "https://nrf.com/research-insights/holiday-data-and-trends/valentines-day/valentines-day-data-center",
1499
+ "https://www.kaggle.com/datasets/infinator/happy-valentines-day-2022",
1500
+ "https://github.com/rfordatascience/tidytuesday/tree/master/data/2022/2022-01-18"
1501
+ ],
1502
+ "description": "Happy Valentine's Day! This week we're exploringValentine's Day survey data. The National Retail Federation in the United States conducts surveys and has created aValentine's Day Data Centerso you can explore the data on how consumers celebrate. The NRF has surveyed consumers about how they plan to celebrate Valentine’s Day annually for over a decade. Take a deeper dive into the data from the last 10 years, and use the interactive charts to explore a demographic breakdown of total spending, average spending, types of gifts planned and spending per type of gift. The NRF has continued to collect data. The data for this week is from 2010 to 2022, as organized by Suraj Das for a Kaggle dataset. In the historical surveys gender was collected as only 'Men' and 'Women', which does not accurately include all genders. If you're looking for other Valentine's Day type datasets, check out previous datasets onchocolateorboard games(a good Valentine's Day activity!). Data was downloaded fromSunja aa Kaggle dataset. Data from historical_gift_trends_per_person_spending.csv, historical_spending_average_expected_spending.csv and historical_spending_percent_celebrating.csv were combined into historical_spending.csv. Data from planned_gifts_age.csv and spending_or_celebrating_age_1.csv were combined into gifts_age.csv. Data from planned_gifts_gender.csv and spending_or_celebrating_gender_1.csv were combined into gifts_gender.csv. Percentage signs and dollar signs were removed from all numerical values.",
1503
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-02-13",
1504
+ "data_dictionary": [
1505
+ {
1506
+ "variable": [
1507
+ "Year",
1508
+ "PercentCelebrating",
1509
+ "PerPerson",
1510
+ "Candy",
1511
+ "Flowers",
1512
+ "Jewelry",
1513
+ "GreetingCards",
1514
+ "EveningOut",
1515
+ "Clothing",
1516
+ "GiftCards"
1517
+ ],
1518
+ "class": [
1519
+ "double",
1520
+ "double",
1521
+ "double",
1522
+ "double",
1523
+ "double",
1524
+ "double",
1525
+ "double",
1526
+ "double",
1527
+ "double",
1528
+ "double"
1529
+ ],
1530
+ "description": [
1531
+ "Year",
1532
+ "Percent of people celebrating Valentines Day",
1533
+ "Average amount each person is spending",
1534
+ "Average amount spending on candy",
1535
+ "Average amount spending on flowers",
1536
+ "Average amount spending on jewelry",
1537
+ "Average amount spending on greeting cards",
1538
+ "Average amount spending on an evening out",
1539
+ "Average amount spending on clothing",
1540
+ "Average amount spending on gift cards"
1541
+ ]
1542
+ },
1543
+ {
1544
+ "variable": [
1545
+ "Age",
1546
+ "SpendingCelebrating",
1547
+ "Candy",
1548
+ "Flowers",
1549
+ "Jewelry",
1550
+ "GreetingCards",
1551
+ "EveningOut",
1552
+ "Clothing",
1553
+ "GiftCards"
1554
+ ],
1555
+ "class": [
1556
+ "character",
1557
+ "double",
1558
+ "double",
1559
+ "double",
1560
+ "double",
1561
+ "double",
1562
+ "double",
1563
+ "double",
1564
+ "double"
1565
+ ],
1566
+ "description": [
1567
+ "Age",
1568
+ "Percent spending money on or celebrating Valentines Day",
1569
+ "Average percent spending on candy",
1570
+ "Average percent spending on flowers",
1571
+ "Average percent spending on jewelry",
1572
+ "Average percent spending on greeting cards",
1573
+ "Average percent spending on an evening out",
1574
+ "Average percent spending on clothing",
1575
+ "Average percent spending on gift cards"
1576
+ ]
1577
+ },
1578
+ {
1579
+ "variable": [
1580
+ "Gender",
1581
+ "SpendingCelebrating",
1582
+ "Candy",
1583
+ "Flowers",
1584
+ "Jewelry",
1585
+ "GreetingCards",
1586
+ "EveningOut",
1587
+ "Clothing",
1588
+ "GiftCards"
1589
+ ],
1590
+ "class": [
1591
+ "character",
1592
+ "double",
1593
+ "double",
1594
+ "double",
1595
+ "double",
1596
+ "double",
1597
+ "double",
1598
+ "double",
1599
+ "double"
1600
+ ],
1601
+ "description": [
1602
+ "Gender only including Men or Women",
1603
+ "Percent spending money on or celebrating Valentines Day",
1604
+ "Average percent spending on candy",
1605
+ "Average percent spending on flowers",
1606
+ "Average percent spending on jewelry",
1607
+ "Average percent spending on greeting cards",
1608
+ "Average percent spending on an evening out",
1609
+ "Average percent spending on clothing",
1610
+ "Average percent spending on gift cards"
1611
+ ]
1612
+ }
1613
+ ],
1614
+ "data": {
1615
+ "file_name": [
1616
+ "gifts_age.csv",
1617
+ "gifts_gender.csv",
1618
+ "historical_spending.csv"
1619
+ ],
1620
+ "file_url": [
1621
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-02-13/gifts_age.csv",
1622
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-02-13/gifts_gender.csv",
1623
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-02-13/historical_spending.csv"
1624
+ ]
1625
+ },
1626
+ "data_load": {
1627
+ "file_name": [
1628
+ "gifts_age.csv",
1629
+ "gifts_gender.csv",
1630
+ "historical_spending.csv"
1631
+ ],
1632
+ "file_url": [
1633
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-13/gifts_age.csv",
1634
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-13/gifts_gender.csv",
1635
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-13/historical_spending.csv"
1636
+ ]
1637
+ }
1638
+ },
1639
+ {
1640
+ "date_posted": "2023-08-08",
1641
+ "project_name": "Hot Ones Episodes",
1642
+ "project_source": [
1643
+ "https://en.wikipedia.org/wiki/List_of_Hot_Ones_episodes",
1644
+ "https://github.com/borstell",
1645
+ "https://github.com/rfordatascience/tidytuesday/issues/591",
1646
+ "https://en.wikipedia.org/wiki/Hot_Ones"
1647
+ ],
1648
+ "description": "The data this week comes from Wikipedia articles:Hot OnesandList of Hot Ones episodes. Thank you toCarl BΓΆrstellfor thesuggestion and cleaning script! Hot Ones is an American YouTube talk show, created by Chris Schonberger, hosted by Sean Evans and produced by First We Feast and Complex Media. Its basic premise involves celebrities being interviewed by Evans over a platter of increasingly spicy chicken wings.",
1649
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-08",
1650
+ "data_dictionary": [
1651
+ {
1652
+ "variable": [
1653
+ "season",
1654
+ "episode_overall",
1655
+ "episode_season",
1656
+ "title",
1657
+ "original_release",
1658
+ "guest",
1659
+ "guest_appearance_number",
1660
+ "finished"
1661
+ ],
1662
+ "class": [
1663
+ "integer",
1664
+ "integer",
1665
+ "integer",
1666
+ "character",
1667
+ "date",
1668
+ "character",
1669
+ "integer",
1670
+ "logical"
1671
+ ],
1672
+ "description": [
1673
+ "The season number.",
1674
+ "The overall count of this episode, from 1-300.",
1675
+ "The count of this episode within this season.",
1676
+ "The title of the episode.",
1677
+ "The date on which the episode was originally available on YouTube.",
1678
+ "The name of the guest.",
1679
+ "The number of appearances by this guest so far as of this date.",
1680
+ "Whether the guest finished trying all of the sauces."
1681
+ ]
1682
+ },
1683
+ {
1684
+ "variable": [
1685
+ "season",
1686
+ "sauce_number",
1687
+ "sauce_name",
1688
+ "scoville"
1689
+ ],
1690
+ "class": [
1691
+ "integer",
1692
+ "integer",
1693
+ "character",
1694
+ "integer"
1695
+ ],
1696
+ "description": [
1697
+ "The season number.",
1698
+ "The number of this sauce, from 1 (least hot) to 10 (hottest).",
1699
+ "The name of the sauce.",
1700
+ "The rating of the sauce in Scoville heat units."
1701
+ ]
1702
+ },
1703
+ {
1704
+ "variable": [
1705
+ "season",
1706
+ "episodes",
1707
+ "note",
1708
+ "original_release",
1709
+ "last_release"
1710
+ ],
1711
+ "class": [
1712
+ "integer",
1713
+ "integer",
1714
+ "character",
1715
+ "date",
1716
+ "date"
1717
+ ],
1718
+ "description": [
1719
+ "The season number.",
1720
+ "The count of episodes in this season.",
1721
+ "Notes about this season.",
1722
+ "The date of the first episode in this season.",
1723
+ "The date of the last episode of this season (if that episode has aired at the time of scraping)."
1724
+ ]
1725
+ }
1726
+ ],
1727
+ "data": {
1728
+ "file_name": [
1729
+ "episodes.csv",
1730
+ "sauces.csv",
1731
+ "seasons.csv"
1732
+ ],
1733
+ "file_url": [
1734
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-08/episodes.csv",
1735
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-08/sauces.csv",
1736
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-08/seasons.csv"
1737
+ ]
1738
+ },
1739
+ "data_load": {
1740
+ "file_name": [
1741
+ "episodes.csv",
1742
+ "sauces.csv",
1743
+ "seasons.csv"
1744
+ ],
1745
+ "file_url": [
1746
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-08/episodes.csv",
1747
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-08/sauces.csv",
1748
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-08/seasons.csv"
1749
+ ]
1750
+ }
1751
+ },
1752
+ {
1753
+ "date_posted": "2023-07-25",
1754
+ "project_name": "Scurvy",
1755
+ "project_source": [
1756
+ "https://github.com/higgi13425/medicaldata/tree/master/data-raw",
1757
+ "https://htmlpreview.github.io/?https://github.com/higgi13425/medicaldata/blob/master/man/description_docs/scurvy_desc.html",
1758
+ "https://higgi13425.github.io/medicaldata/"
1759
+ ],
1760
+ "description": "The data this week comes from themedicaldata R package. This is a data package from Peter Higgins, with 19 medical datasets for teaching Reproducible Medical Research with R. We're using thescurvy dataset. Source: This data set is from a study published in 1757 in A Treatise on the Scurvy in Three Parts, by James Lind. This data set contains 12 participants with scurvy. In 1757, it was not known that scurvy is a manifestation of vitamin C deficiency. A variety of remedies had been anecdotally reported, but Lind was the first to test different regimens of acidic substances (including citrus fruits) against each other in a randomized, controlled trial. 6 distinct therapies were tested in 12 seamen with symptomatic scurvy, who were selected for similar severity. Six days of therapy were provided, and endpoints were reported in the text at the end of 6 days. These include rotting of the gums, skin sores, weakness of the knees, and lassitude, which are described in terms of severity. These have been translated into Likert scales from 0(none) to 3(severe). A dichotomous endpoint, fitness for duty, was also reported. Scurvy was a common affliction of seamen on long voyages, leading to mouth sores, skin lesions, weakness of the knees, and lassitude. Scurvy could be fatal on long voyages. James Lind reported the treatment of 12 seamen with scurvy in 1757, in _A Treatise on the Scurvy in Three Parts). This 476 page bloviation can be found scanned to the Google Books website A Treatise on the Scurvy. Pages 149-153 are a rare gem among what can be generously described as 400+ pages of evidence-free blathering, and these 4 pages may represent the first report of a controlled clinical trial. Lind was the ship’s surgeon on board the HMS Salisbury, and had a number of scurvy-affected seamen at his disposal. Many remedies had been described and advocated for, with no more than anecdotal evidence. 
On May 20, 1747, Lind decided to try the 6 therapies on the Salisbury in a comparative study in 12 affected seamen. He selected 12 with roughly similar severity, with notable skin and mouth sores, weakness of the knees, and significant lassitude, making them unfit for duty. They each received the standard shipboard diet of gruel and mutton broth, supplemented with occasional biscuits and puddings. Each treatment was a dietary supplement (including citrus fruits) or a medicinal. This data frame was reconstructed from Lind’s account as recorded on these 4 pages, with his estimates of severity translated to a 4 point Likert scale (0-3) for each of the symptoms he described at his chosen endpoint on day 6. A somewhat fanciful study_id variable was added, along with detailed descriptions of the dosing schedule of each treatment. Of note, there is some dispute about whether this was truly the first clinical trial, or whether it actually happened, as there are no contemporaneous corroborating accounts. See link about the historical debate. Lind reported that the seamen treated with 2 lemons and an orange daily did best, followed by those treated with cider. Those treated with elixir of vitriol only had improvement in mouth sores. One imagines that acidic substances (like dilute sulfuric acid, vinegar, cider, and citrus fruits) might have been rather painful on these mouth sores. Unfortunately, the burial of the 4 valuable pages of data in 476 pages of noise, a publication delay of 10 years, and Lind’s half-hearted conclusions (he was focused on acidity), meant that it took until 1795 before the British Navy mandated daily limes for seamen. The first column was removed from the scurvy.csv file available athttps://github.com/higgi13425/medicaldata/tree/master/data-raw.",
1761
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-25",
1762
+ "data_dictionary": [
1763
+ {
1764
+ "variable": [
1765
+ "study_id",
1766
+ "treatment",
1767
+ "dosing_regimen_for_scurvy",
1768
+ "gum_rot_d6",
1769
+ "skin_sores_d6",
1770
+ "weakness_of_the_knees_d6",
1771
+ "lassitude_d6",
1772
+ "fit_for_duty_d6"
1773
+ ],
1774
+ "class": [
1775
+ "double",
1776
+ "character",
1777
+ "character",
1778
+ "character",
1779
+ "character",
1780
+ "character",
1781
+ "character",
1782
+ "character"
1783
+ ],
1784
+ "description": [
1785
+ "Participant ID",
1786
+ "Treatment; cider, dilute_sulfuric_acid, vinegar, sea_water, citrus, purgative_mixture",
1787
+ "Dosing Regimen; 1 quart per day; 25 drops of elixir of vitriol, three times a day; two spoonfuls, three times daily; half pint daily; two lemons and an orange daily; a nutmeg-sized paste of garlic, mustard seed, horseradish, balsam of Peru, and gum myrrh three times a day",
1788
+ "Gum Rot on Day 6; 0_none, 1_mild, 2_moderate, 3_severe",
1789
+ "Skin Sores on Day 6; 0_none, 1_mild, 2_moderate, 3_severe",
1790
+ "Weakness of the Knees on Day 6; 0_none, 1_mild, 2_moderate, 3_severe",
1791
+ "Lassitude on Day 6; 0_none, 1_mild, 2_moderate, 3_severe",
1792
+ "Fit for Duty on Day 6; 0_no, 1_yes"
1793
+ ]
1794
+ }
1795
+ ],
1796
+ "data": {
1797
+ "file_name": [
1798
+ "scurvy.csv"
1799
+ ],
1800
+ "file_url": [
1801
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-25/scurvy.csv"
1802
+ ]
1803
+ },
1804
+ "data_load": {
1805
+ "file_name": [
1806
+ "scurvy.csv"
1807
+ ],
1808
+ "file_url": [
1809
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-07-25/scurvy.csv"
1810
+ ]
1811
+ }
1812
+ },
1813
+ {
1814
+ "date_posted": "2023-11-07",
1815
+ "project_name": "US House Election Results",
1816
+ "project_source": [
1817
+ "https://electionlab.mit.edu/",
1818
+ "https://electionlab.mit.edu/articles/new-report-how-we-voted-2022",
1819
+ "https://docs.posit.co/ide/user/ide/guide/tools/copilot.html",
1820
+ "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/IG0UN2"
1821
+ ],
1822
+ "description": "It's election day in the United States! To celebrate, the data this week comes from theMIT Election Data and Science Lab(MEDSL). Hat tip this week to theRStudio GitHub Copilot integration, which suggested the MEDSL. From the MEDSL's reportNew Report: How We Voted in 2022: The Survey of the Performance of American Elections (SPAE) provides information about how Americans experienced voting in the most recent federal election. The survey has been conducted after federal elections since 2008, and is the only public opinion project in the country that is dedicated explicitly to understanding how voters themselves experience the election process. We're specifically providing data on House elections from 1976-2022. Check out theMEDSL websitefor additional datasets and tools. Be sure to cite the MEDSL in your work: Clean data and dictionary downloaded from theHarvard Dataverse",
1823
+ "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-07",
1824
+ "data_dictionary": [
1825
+ {
1826
+ "variable": [
1827
+ "year",
1828
+ "state",
1829
+ "state_po",
1830
+ "state_fips",
1831
+ "state_cen",
1832
+ "state_ic",
1833
+ "office",
1834
+ "district",
1835
+ "stage",
1836
+ "runoff",
1837
+ "special",
1838
+ "candidate",
1839
+ "party",
1840
+ "writein",
1841
+ "mode",
1842
+ "candidatevotes",
1843
+ "totalvotes",
1844
+ "unofficial",
1845
+ "version",
1846
+ "fusion_ticket"
1847
+ ],
1848
+ "class": [
1849
+ "double",
1850
+ "character",
1851
+ "character",
1852
+ "double",
1853
+ "double",
1854
+ "double",
1855
+ "character",
1856
+ "character",
1857
+ "character",
1858
+ "logical",
1859
+ "logical",
1860
+ "character",
1861
+ "character",
1862
+ "logical",
1863
+ "character",
1864
+ "double",
1865
+ "double",
1866
+ "logical",
1867
+ "double",
1868
+ "logical"
1869
+ ],
1870
+ "description": [
1871
+ "year in which election was held",
1872
+ "state name",
1873
+ "U.S. postal code state abbreviation",
1874
+ "State FIPS code",
1875
+ "U.S. Census state code",
1876
+ "ICPSR state code",
1877
+ "U.S. House (constant)",
1878
+ "district number. At-large districts are coded as 0 (zero)",
1879
+ "electoral stage (gen = general elections, pri = primary elections)",
1880
+ "runoff election",
1881
+ "special election",
1882
+ "name of the candidate as it appears in the House Clerk report",
1883
+ "party of the candidate (always entirely lowercase) (Parties are as they appear in the House Clerk report. In states that allow candidates to appear on multiple party lines, separate vote totals are indicated for each party. Therefore, for analysis that involves candidate totals, it will be necessary to aggregate across all party lines within a district. For analysis that focuses on two-party vote totals, it will be necessary to account for major party candidates who receive votes under multiple party labels. Minnesota party labels are given as they appear on the Minnesota ballots. Future versions of this file will include codes for candidates who are endorsed by major parties, regardless of the party label under which they receive votes.)",
1884
+ "vote totals associated with write-in candidates",
1885
+ "mode of voting; states with data that doesn't break down returns by mode are marked as \\\"total\\\"",
1886
+ "votes received by this candidate for this particular party",
1887
+ "total number of votes cast for this election",
1888
+ "TRUE/FALSE indicator for unofficial result (to be updated later); this appears only for 2018 data in some cases",
1889
+ "date when this dataset was finalized",
1890
+ "A TRUE/FALSE indicator as to whether the given candidate is running on a fusion party ticket, which will in turn mean that a candidate will appear multiple times, but by different parties, for a given election. States with fusion tickets include Connecticut, New Jersey, New York, and South Carolina."
1891
+ ]
1892
+ }
1893
+ ],
1894
+ "data": {
1895
+ "file_name": [
1896
+ "house.csv"
1897
+ ],
1898
+ "file_url": [
1899
+ "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-07/house.csv"
1900
+ ]
1901
+ },
1902
+ "data_load": {
1903
+ "file_name": [
1904
+ "house.csv"
1905
+ ],
1906
+ "file_url": [
1907
+ "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-07/house.csv"
1908
+ ]
1909
+ }
1910
+ }
1911
+ ]
version2/demo.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metadata Hierarchy Explorer β€” TFM 2026
3
+ Navigation router (Streamlit st.navigation).
4
+
5
+ Sidebar layout:
6
+ Metadata Hierarchy Explorer / TFM 2026 (branding, top)
7
+ Demo View (pre-built results viewer)
8
+ Build hierarchy (collapsible) (upload a CSV and run a method)
9
+ β€’ the three methods (descriptive names from methods.py)
10
+ """
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ import streamlit as st
15
+
16
+ # Shared method names live in views/methods.py β€” make it importable.
17
+ sys.path.insert(0, str(Path(__file__).resolve().parent / "views"))
18
+ from methods import METHODS # noqa: E402
19
+
20
+ st.set_page_config(
21
+ page_title="Metadata Hierarchy Explorer",
22
+ layout="wide",
23
+ )
24
+
25
+ # ── Pages ────────────────────────────────────────────────────────────────────
26
+ viewer = st.Page("views/viewer.py", title="Demo View", default=True)
27
+ base = st.Page("views/run_baseline.py", title=METHODS["Baseline"]["title"])
28
+ appr1 = st.Page("views/run_approach_1.py", title=METHODS["Approach 1"]["title"])
29
+ appr2 = st.Page("views/run_approach_2.py", title=METHODS["Approach 2"]["title"])
30
+
31
+ # Hidden default nav β€” we render our own links so we control the order.
32
+ pg = st.navigation([viewer, base, appr1, appr2], position="hidden")
33
+
34
+ # ── Sidebar: branding + navigation (Built Hierarchy above Demo View) ─────────
35
+ with st.sidebar:
36
+ st.title("Metadata Hierarchy Explorer")
37
+ st.caption("TFM 2026 β€” Metadata hierarchy construction")
38
+ st.markdown("---")
39
+ with st.expander("Built Hierarchy", expanded=False):
40
+ st.caption("Upload a CSV and run a method live.")
41
+ st.page_link(base, label=METHODS["Baseline"]["title"])
42
+ st.page_link(appr1, label=METHODS["Approach 1"]["title"])
43
+ st.page_link(appr2, label=METHODS["Approach 2"]["title"])
44
+ st.page_link(viewer, label="Demo View")
45
+
46
+ # ── The selected page renders here (its own controls included) ───────────────
47
+ pg.run()
version2/hierarchy_eval.py ADDED
@@ -0,0 +1,622 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ hierarchy_eval.py — shared, reference-free hierarchy evaluation for the TFM.
3
+
4
+ WHY REFERENCE-FREE?
5
+ -------------------
6
+ No manually curated reference taxonomy is bundled with the thesis experiments.
7
+ The dataset group columns are metadata supplied by the input file, not an
8
+ independent reference taxonomy. Approach 1 and Approach 2 use group information
9
+ during construction; the Baseline avoids it during construction, but it still
10
+ does not become a manually verified taxonomy. The defensible headline
11
+ evaluation is therefore reference-free.
12
+
13
+ PRIMARY METRICS (no manual reference required) — fair cross-approach comparison
14
+ -------------------------------------------
15
+ • Parent–child coherence — TraCo (Wu et al., AAAI 2024, arXiv:2401.14113)
16
+ • Sibling diversity — TraCo (same paper)
17
+ • NPMI label coherence — Lau et al., EACL 2014 (aclanthology.org/E14-1056);
18
+ orig. Mimno et al., EMNLP 2010
19
+ • Label quality — interpretability proxies (concept-valid label %,
20
+ sibling redundancy, avg label words). Captures the
21
+ dimension coherence misses (meaningful inner labels,
22
+ Taxonomizer's stated goal).
23
+ • Structural statistics — HiExpan-style reporting (Shen et al., KDD 2018)
24
+
25
+ All of the above use the SAME encoder/corpus for every approach, so the
26
+ cross-approach comparison is fair. NOTE: coherence (TraCo/NPMI) can favour the
27
+ data-derived baseline, so interpretability + a human study are needed to show
28
+ the approaches' advantage.
29
+
30
+ GROUP-COLUMN METRICS (ARI / AMI / NMI / Purity) — descriptive only
31
+ ------------------------------------------------------------------
32
+ These compare a system partition with input grouping metadata. They are useful
33
+ sanity checks, but they are not thesis accuracy scores and are not comparable as
34
+ reference-taxonomy recovery. NMI and Purity are especially inflated by over-splitting.
35
+ """
36
+ from __future__ import annotations
37
+
38
+ import re
39
+ from collections import Counter
40
+
41
+ import numpy as np
42
+
43
+ # ──────────────────────────────────────────────────────────────────────────────
44
+ # Tree helpers
45
+ # ──────────────────────────────────────────────────────────────────────────────
46
def build_parent_map(nodes: list) -> dict:
    """Map each child node id to the id of its first-listed parent.

    Walks ``nodes`` in order and, for every id found in a node's ``related``
    entry, records that node as the child's parent. A child referenced by
    several nodes keeps the first parent encountered.

    Args:
        nodes: Node dicts. Each one that has children must carry an ``id``
            and a ``related`` iterable of child ids; ids are passed through
            ``int()``, so int-like strings are accepted.

    Returns:
        Dict of ``child_id -> parent_id`` with both sides coerced to ``int``.

    Raises:
        KeyError: If a node with children has no ``id``.
        ValueError / TypeError: If an id cannot be converted by ``int()``.
    """
    parent_of: dict = {}
    for node in nodes:
        # ``related`` may be missing or explicitly None (e.g. a null in
        # serialized data); both are treated as "no children".
        for child in node.get('related') or ():
            child_id = int(child)
            if child_id not in parent_of:  # first parent wins
                parent_of[child_id] = int(node['id'])
    return parent_of
54
+
55
+
56
def structural_stats(nodes: list) -> dict:
    """Summarise the shape of a hierarchy given as a flat node list.

    Args:
        nodes: Node dicts. Nodes whose ``type`` is ``'aggregation'`` are
            counted as inner nodes and nodes whose ``type`` is ``'attribute'``
            as leaves; parent links come from ``build_parent_map(nodes)``.

    Returns:
        Dict with:
          * ``n_aggregation_nodes``  – number of aggregation nodes
          * ``max_depth``            – deepest attribute node (0 if none)
          * ``avg_leaf_depth``       – mean attribute depth, 2 decimals
          * ``avg_branching_factor`` – mean ``related`` length over
            aggregation nodes, 2 decimals
          * ``singleton_nodes_%``    – share of aggregation nodes with exactly
            one ``related`` entry, 1 decimal
    """
    pm = build_parent_map(nodes)

    def depth_of(nid: int) -> int:
        """Count parent hops from ``nid`` up to a node with no parent."""
        hops = 0
        seen = {nid}
        while nid in pm:
            nid = pm[nid]
            # Malformed input (a node listing itself, or two nodes listing
            # each other) makes the parent map cyclic; stop instead of
            # spinning forever.
            if nid in seen:
                break
            seen.add(nid)
            hops += 1
        return hops

    aggregations = [n for n in nodes if n.get('type') == 'aggregation']
    leaves = [n for n in nodes if n.get('type') == 'attribute']
    depths = [depth_of(int(n['id'])) for n in leaves]
    # ``related`` may be missing or None; both count as zero children.
    branches = [len(n.get('related') or ()) for n in aggregations]
    singletons = sum(1 for b in branches if b == 1)
    return {
        'n_aggregation_nodes': len(aggregations),
        'max_depth': int(max(depths, default=0)),
        'avg_leaf_depth': round(float(np.mean(depths)), 2) if depths else 0.0,
        'avg_branching_factor': round(float(np.mean(branches)), 2) if branches else 0.0,
        # max(..., 1) avoids a division by zero when there are no aggregation nodes.
        'singleton_nodes_%': round(100.0 * singletons / max(len(aggregations), 1), 1),
    }
77
+
78
+
79
+ # ──────────────────────────────────────────────────────────────────────────────
80
+ # Encoder — SBERT if available, TF-IDF fallback. Loaded once, reused.
81
+ # ──────────────────────────────────────────────────────────────────────────────
82
+ _SBERT = None
83
+ _SBERT_TRIED = False
84
+
85
+
86
+ def _get_sbert():
87
+ global _SBERT, _SBERT_TRIED
88
+ if _SBERT_TRIED:
89
+ return _SBERT
90
+ _SBERT_TRIED = True
91
+ try:
92
+ from sentence_transformers import SentenceTransformer
93
+ _SBERT = SentenceTransformer('all-MiniLM-L6-v2')
94
+ except Exception:
95
+ _SBERT = None
96
+ return _SBERT
97
+
98
+
99
def encode(texts: list):
    """Return (unit-normalised vectors, backend_name).

    Blank entries are replaced by ``'_'`` so every input yields a row. The
    SBERT model from ``_get_sbert()`` is used when available; otherwise the
    texts are embedded with a TF-IDF vectoriser fitted on ``texts`` itself.

    If the TF-IDF vocabulary is empty (no texts, or only stop words / blank
    placeholders) an all-zero matrix with one column is returned instead of
    propagating scikit-learn's ``ValueError``.
    """
    texts = [str(t) if str(t).strip() else '_' for t in texts]
    model = _get_sbert()
    if model is not None:
        v = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
        return np.asarray(v, dtype=float), 'SBERT (all-MiniLM-L6-v2)'
    from sklearn.feature_extraction.text import TfidfVectorizer
    try:
        X = TfidfVectorizer(stop_words='english', max_features=2000,
                            min_df=1).fit_transform(texts).toarray().astype(float)
    except ValueError:
        # "empty vocabulary": nothing tokenisable survived. Zero vectors keep
        # downstream dot products defined (they simply evaluate to 0).
        X = np.zeros((len(texts), 1), dtype=float)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    # Rows with zero norm are divided by 1 so they stay all-zero (no NaN).
    return X / np.where(norms == 0, 1.0, norms), 'TF-IDF (SBERT unavailable)'
111
+
112
+
113
+ # ──────────────────────────────────────────────────────────────────────────────
114
+ # TraCo reference-free metrics (Wu et al., AAAI 2024)
115
+ # ──────────────────────────────────────────────────────────────────────────────
116
def traco_metrics(nodes: list) -> dict:
    """Parent–child coherence and sibling diversity over node *labels*.

    Labels of all aggregation and attribute nodes are embedded with
    ``encode``. For every non-root node that has an embedding:

    * ``pc_coherence`` averages the dot product between the node's vector
      and each embedded child's vector;
    * ``sibling_diversity`` averages, per node with two or more embedded
      children, the mean of ``1 - similarity`` over all child pairs.

    With fewer than two labelled nodes both scores are 0.0 and the encoder
    is reported as ``'n/a'``.
    """
    labelled = [n for n in nodes if n.get('type') in ('aggregation', 'attribute')]
    if len(labelled) < 2:
        return {'pc_coherence': 0.0, 'sibling_diversity': 0.0, 'encoder': 'n/a'}

    ids = [int(n['id']) for n in labelled]
    vecs, backend = encode([str(n.get('name', '')) for n in labelled])
    id2v = dict(zip(ids, vecs))

    parent_child: list = []
    diversity: list = []
    for node in nodes:
        if node.get('type') == 'root':
            continue
        parent_id = int(node['id'])
        if parent_id not in id2v:
            continue
        parent_vec = id2v[parent_id]
        children = [cid for cid in map(int, node.get('related', [])) if cid in id2v]
        parent_child.extend(float(np.dot(parent_vec, id2v[cid])) for cid in children)
        if len(children) < 2:
            continue
        child_mat = np.array([id2v[cid] for cid in children])
        sims = child_mat @ child_mat.T
        # Strict upper triangle = each unordered sibling pair exactly once.
        upper = np.triu_indices(len(children), k=1)
        diversity.append(float(np.mean(1.0 - sims[upper])))

    return {
        'pc_coherence': round(float(np.mean(parent_child)), 4) if parent_child else 0.0,
        'sibling_diversity': round(float(np.mean(diversity)), 4) if diversity else 0.0,
        'encoder': backend,
    }
149
+
150
+
151
+ # ──────────────────────────────────────────────────────────────────────────────
152
+ # NPMI label coherence (Lau et al., EACL 2014; Mimno et al., EMNLP 2010)
153
+ # Reference corpus = the variable descriptions themselves.
154
+ # ──────────────────────────────────────────────────────────────────────────────
155
+ _TOKEN_RE = re.compile(r'[a-z][a-z]{2,}')
156
+ _STOP = set(
157
+ 'the a an and or of to in for on with by at from as is are be this that these '
158
+ 'those it its was were has have had not no than then so such can will may '
159
+ 'group description name label value type using used per each'.split()
160
+ )
161
+
162
+
163
+ def _tokens(text: str) -> set:
164
+ return {w for w in _TOKEN_RE.findall(str(text).lower()) if w not in _STOP}
165
+
166
+
167
+ def npmi_coherence(nodes: list, corpus_texts: list, topn: int = 5) -> float:
168
+ """Average NPMI of each aggregation node's label terms over the corpus.
169
+
170
+ Returns a value in roughly [-1, 1]; higher = node labels use term
171
+ combinations that genuinely co-occur in the data (meaningful, not random).
172
+ """
173
+ docs = [_tokens(t) for t in corpus_texts]
174
+ docs = [d for d in docs if d]
175
+ N = len(docs)
176
+ if N < 2:
177
+ return 0.0
178
+
179
+ df: Counter = Counter()
180
+ for d in docs:
181
+ for w in d:
182
+ df[w] += 1
183
+
184
+ # Collect the term sets we actually need (node labels)
185
+ label_termsets: list = []
186
+ needed_terms: set = set()
187
+ for n in nodes:
188
+ if n.get('type') != 'aggregation':
189
+ continue
190
+ terms = [w for w in _tokens(n.get('name', '')) if df.get(w, 0) > 0]
191
+ terms = sorted(terms, key=lambda w: df[w], reverse=True)[:topn]
192
+ if len(terms) >= 2:
193
+ label_termsets.append(terms)
194
+ needed_terms.update(terms)
195
+
196
+ if not label_termsets:
197
+ return 0.0
198
+
199
+ # Pair co-occurrence counts (only for needed pairs)
200
+ needed_pairs = set()
201
+ for terms in label_termsets:
202
+ for i in range(len(terms)):
203
+ for j in range(i + 1, len(terms)):
204
+ needed_pairs.add(frozenset((terms[i], terms[j])))
205
+
206
+ co: Counter = Counter()
207
+ for d in docs:
208
+ present = d & needed_terms
209
+ if len(present) < 2:
210
+ continue
211
+ pl = list(present)
212
+ for i in range(len(pl)):
213
+ for j in range(i + 1, len(pl)):
214
+ pair = frozenset((pl[i], pl[j]))
215
+ if pair in needed_pairs:
216
+ co[pair] += 1
217
+
218
+ eps = 1e-12
219
+ node_scores: list = []
220
+ for terms in label_termsets:
221
+ pair_npmis: list = []
222
+ for i in range(len(terms)):
223
+ for j in range(i + 1, len(terms)):
224
+ wi, wj = terms[i], terms[j]
225
+ c_ij = co.get(frozenset((wi, wj)), 0)
226
+ p_ij = (c_ij + eps) / N
227
+ p_i = df[wi] / N
228
+ p_j = df[wj] / N
229
+ pmi = np.log(p_ij / (p_i * p_j + eps) + eps)
230
+ npmi = pmi / (-np.log(p_ij + eps))
231
+ pair_npmis.append(float(npmi))
232
+ if pair_npmis:
233
+ node_scores.append(float(np.mean(pair_npmis)))
234
+
235
+ return round(float(np.mean(node_scores)), 4) if node_scores else 0.0
236
+
237
+
238
+ # ──────────────────────────────────────────────────────────────────────────────
239
+ # Secondary (descriptive, caveated): group-structure preservation
240
+ # ──────────────────────────────────────────────────────────────────────────────
241
def _depth1_assignments(nodes: list, can) -> list:
    """Return, for every row of ``can``, the id of its top-level ancestor node.

    Attribute nodes are matched to rows first by ``metadata['leaf_id']``
    (against the ``_leaf_id`` column, or ``_id`` if that is absent) and then
    by ``metadata['row_index']`` (against the ``_row`` column, or the row's
    index label). Rows that match no node get ``-1``.

    NOTE(review): the ancestor walk stops when the parent is 0, i.e. it
    presumes the root node has id 0 — confirm against the tree builders.
    """
    pm = build_parent_map(nodes)

    def depth1(nid: int) -> int:
        visited: set = set()
        # Climb until the parent is missing (-1) or is node 0; the visited
        # set stops the climb if malformed parent links form a cycle.
        while nid not in visited and pm.get(nid, -1) not in (-1, 0):
            visited.add(nid)
            nid = pm[nid]
        return nid

    def as_int(value):
        """Return int(value), or None when the value is not integer-like."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return None

    lid_to_nid = {}
    row_to_nid = {}
    for n in nodes:
        if n.get('type') != 'attribute' or 'metadata' not in n:
            continue
        meta = n.get('metadata', {})
        if meta.get('leaf_id'):
            lid_to_nid[str(meta['leaf_id'])] = int(n['id'])
        if meta.get('row_index') is not None:
            row_key = as_int(meta['row_index'])
            if row_key is not None:
                row_to_nid[row_key] = int(n['id'])

    leaf_col = '_leaf_id' if '_leaf_id' in can.columns else '_id'
    row_col = '_row' if '_row' in can.columns else None
    out = []
    for i, row in can.iterrows():
        lid = str(row.get(leaf_col, ''))
        if lid in lid_to_nid:
            out.append(depth1(lid_to_nid[lid]))
            continue
        rid = as_int(row.get(row_col, i)) if row_col else None
        if rid is None:
            # Fall back to the index label; a non-numeric label yields None,
            # which matches nothing and is reported as -1 rather than raising.
            rid = as_int(i)
        out.append(depth1(row_to_nid[rid]) if rid in row_to_nid else -1)
    return out
276
+
277
+
278
+ def _purity(y_true, y_pred) -> float:
279
+ clusters: dict = {}
280
+ for t, p in zip(y_true, y_pred):
281
+ clusters.setdefault(p, []).append(t)
282
+ correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values())
283
+ return correct / max(len(y_true), 1)
284
+
285
+
286
def group_preservation(nodes: list, can) -> dict:
    """ARI / AMI / NMI / Purity of the depth-1 partition vs input grouping metadata.

    The reference label of a row is the first `` > ``-separated component of
    its ``_group_path`` (or ``_group``) value, or ``'Ungrouped'`` when that
    value is missing or empty. The system label is the row's top-level
    ancestor from ``_depth1_assignments``.

    CAVEAT: the group column is not a manually curated reference taxonomy, so
    this is a descriptive 'structure preservation' figure, NOT an accuracy metric.
    """
    from sklearn.metrics import (normalized_mutual_info_score, adjusted_rand_score,
                                 adjusted_mutual_info_score)
    from sklearn.preprocessing import LabelEncoder
    import pandas as pd

    def top_level_group(value) -> str:
        if pd.notna(value) and str(value) not in ('', 'nan'):
            return str(value).split(' > ')[0].strip()
        return 'Ungrouped'

    # Group column robust to either canonical schema (_group_path or _group).
    gcol = '_group_path' if '_group_path' in can.columns else '_group'
    y_true_raw = can[gcol].apply(top_level_group).tolist()
    y_pred_raw = _depth1_assignments(nodes, can)

    y_true = LabelEncoder().fit_transform(y_true_raw)
    y_pred = LabelEncoder().fit_transform(y_pred_raw)

    # ARI and AMI are chance-corrected — the trustworthy numbers.
    ari = adjusted_rand_score(y_true, y_pred)
    ami = adjusted_mutual_info_score(y_true, y_pred)
    # NMI and Purity are reported for completeness but are inflated by
    # over-splitting (more clusters → higher), so they are NOT headline.
    nmi = normalized_mutual_info_score(y_true, y_pred, average_method='arithmetic')
    purity = _purity(y_true_raw, y_pred_raw)
    return {
        'ARI': round(float(ari), 4),
        'AMI': round(float(ami), 4),
        'NMI': round(float(nmi), 4),
        'Purity': round(purity, 4),
    }
317
+
318
def label_quality(nodes: list) -> dict:
    """Reference-free interpretability proxies for internal-node labels.

    Computed over nodes of type ``'aggregation'``; fully automatic, no manual
    reference required:

    • concept_label_pct — % of internal labels that read as a real concept:
      no '/' in the raw label, 1–3 words after normalisation, and a last
      word that WordNet knows as a noun. If WordNet cannot be used, any last
      word longer than two characters is accepted instead.
    • redundancy_pct — % of internal labels that duplicate a sibling's
      label (same normalised text under the same parent).
    • avg_label_words — mean label length in words (shorter = more name-like).

    Returns all three as 0.0 when there are no aggregation nodes.
    """
    internal = [n for n in nodes if n.get('type') == 'aggregation']
    if not internal:
        return {'concept_label_pct': 0.0, 'redundancy_pct': 0.0, 'avg_label_words': 0.0}
    pm = build_parent_map(nodes)

    # WordNet noun check (optional; degrade gracefully if unavailable).
    # NLTK loads corpora lazily, so a missing WordNet download only raises on
    # first use. Probe once inside the try so the fallback really engages,
    # instead of crashing later inside the scoring loop.
    try:
        from nltk.corpus import wordnet as wn
        wn.synsets('concept', pos=wn.NOUN)
    except Exception:
        def _is_noun(w):
            return len(w) > 2  # fallback: any real-ish word
    else:
        def _is_noun(w):
            return bool(wn.synsets(w, pos=wn.NOUN))

    def _norm(s):
        return re.sub(r'[^a-z0-9]+', ' ', str(s).lower()).strip()

    concept = 0
    wordcounts = []
    for n in internal:
        raw = str(n.get('name', ''))
        words = _norm(raw).split()
        wordcounts.append(len(words))
        # '/'-joined fragments are NOT concept labels
        is_fragment = '/' in raw
        head = words[-1] if words else ''
        if (not is_fragment) and 1 <= len(words) <= 3 and head and _is_noun(head):
            concept += 1

    # Sibling redundancy: group normalised labels by parent (-1 = no parent)
    # and count every repeat after the first occurrence.
    by_parent: dict = {}
    for n in internal:
        p = pm.get(int(n['id']), -1)
        by_parent.setdefault(p, []).append(_norm(n.get('name', '')))
    redundant = 0
    for sibs in by_parent.values():
        seen = set()
        for s in sibs:
            if s in seen:
                redundant += 1
            seen.add(s)

    n_int = len(internal)
    return {
        'concept_label_pct': round(100.0 * concept / n_int, 1),
        'redundancy_pct': round(100.0 * redundant / n_int, 1),
        'avg_label_words': round(float(np.mean(wordcounts)), 2),
    }
378
+
379
+
380
+ # ──────────────────────────────────────────────────────────────────────────────
381
+ # Optional manual-reference comparison — Edge-F1 / Ancestor-F1
382
+ #
383
+ # HiExpan (Shen et al., KDD 2018) scores a system taxonomy against a hand-built
384
+ # reference taxonomy with Edge-F1 (direct parent–child links) and Ancestor-F1
385
+ # (all ancestor links). Because our internal-node *labels* differ between a
386
+ # manual reference tree and each system, we use the label-free leaf-pair formulation (the
387
+ # pair-counting tradition, Fowlkes & Mallows 1983):
388
+ #
389
+ # • Edge-F1 — over pairs of leaves that share the same IMMEDIATE parent
390
+ # (i.e. they are siblings). Strict: rewards correct granularity.
391
+ # • Ancestor-F1 — over pairs of leaves that share ANY non-root ancestor
392
+ # (i.e. they are grouped together somewhere). Lenient.
393
+ #
394
+ # Leaves are matched between reference and system by their attribute-node NAME (the
395
+ # variable label) — the one field all three approaches expose for every leaf.
396
+ # Only leaves present in BOTH the manual subset and the system tree are scored,
397
+ # so a small hand-built subset could evaluate a full hierarchy if one is created.
398
+ # ──────────────────────────────────────────────────────────────────────────────
399
def _pred_leaf_lineage(nodes: list) -> dict:
    """leaf name → list of ancestor node ids (root-most first, excl. root & leaf).

    The walk up from each attribute node stops at the root, at a parent id
    with no node record, or when a node id repeats (cycle guard). Leaves
    that share a name overwrite one another; the last one wins.
    """
    pm = build_parent_map(nodes)
    by_id = {int(n['id']): n for n in nodes}
    lineage: dict = {}
    for leaf in nodes:
        if leaf.get('type') != 'attribute':
            continue
        leaf_name = str(leaf.get('name', ''))
        node_id = int(leaf['id'])
        chain: list = []
        visited: set = set()
        while node_id in pm and node_id not in visited:
            visited.add(node_id)
            node_id = pm[node_id]
            parent = by_id.get(node_id)
            if parent is None or parent.get('type') == 'root':
                break
            chain.append(node_id)
        # chain was collected leaf-side first; callers want root-most first.
        lineage[leaf_name] = chain[::-1]
    return lineage
420
+
421
+
422
+ def _gold_leaf_lineage(gold_df) -> dict:
423
+ """leaf name β†’ list of cumulative path-prefix strings from a manual reference."""
424
+ lineage: dict = {}
425
+ for _, r in gold_df.iterrows():
426
+ name = str(r['leaf_label'])
427
+ path = str(r.get('gold_path', '') or '')
428
+ comps = [c.strip() for c in path.split('>')
429
+ if c.strip() and c.strip().lower() != 'ungrouped']
430
+ anc, pref = [], ''
431
+ for c in comps:
432
+ pref = c if not pref else f'{pref} > {c}'
433
+ anc.append(pref)
434
+ lineage[name] = anc
435
+ return lineage
436
+
437
+
438
+ def _sibling_pairs(lineage: dict) -> set:
439
+ from collections import defaultdict
440
+ groups: dict = defaultdict(list)
441
+ for name, anc in lineage.items():
442
+ if anc:
443
+ groups[anc[-1]].append(name)
444
+ pairs: set = set()
445
+ for members in groups.values():
446
+ m = sorted(members)
447
+ for i in range(len(m)):
448
+ for j in range(i + 1, len(m)):
449
+ pairs.add((m[i], m[j]))
450
+ return pairs
451
+
452
+
453
+ def _cogrouped_pairs(lineage: dict) -> set:
454
+ from collections import defaultdict
455
+ occ: dict = defaultdict(set)
456
+ for name, anc in lineage.items():
457
+ for a in anc:
458
+ occ[a].add(name)
459
+ pairs: set = set()
460
+ for members in occ.values():
461
+ m = sorted(members)
462
+ for i in range(len(m)):
463
+ for j in range(i + 1, len(m)):
464
+ pairs.add((m[i], m[j]))
465
+ return pairs
466
+
467
+
468
+ def _prf(pred_set: set, gold_set: set) -> dict:
469
+ if not pred_set and not gold_set:
470
+ return {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
471
+ tp = len(pred_set & gold_set)
472
+ p = tp / len(pred_set) if pred_set else 0.0
473
+ r = tp / len(gold_set) if gold_set else 0.0
474
+ f = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
475
+ return {'precision': round(p, 4), 'recall': round(r, 4), 'f1': round(f, 4)}
476
+
477
+
478
def gold_comparison(nodes: list, gold_df) -> dict:
    """Edge-F1 and Ancestor-F1 of a system tree vs an optional manual reference.

    Only leaves whose name appears in both the system tree and the reference
    are scored. Edge-F1 compares sibling pairs; Ancestor-F1 compares pairs
    that share any ancestor.
    """
    system = _pred_leaf_lineage(nodes)
    reference = _gold_leaf_lineage(gold_df)
    common = set(system) & set(reference)
    system = {leaf: anc for leaf, anc in system.items() if leaf in common}
    reference = {leaf: anc for leaf, anc in reference.items() if leaf in common}

    edge = _prf(_sibling_pairs(system), _sibling_pairs(reference))
    ancestor = _prf(_cogrouped_pairs(system), _cogrouped_pairs(reference))
    return {
        'n_matched_leaves': len(common),
        'edge_f1': edge,
        'ancestor_f1': ancestor,
    }
490
+
491
+
492
+ # ──────────────────────────────────────────────────────────────────────────────
493
+ # Granularity-tolerant, label-independent structural F1 (set-overlap matching)
494
+ #
495
+ # Edge-F1 punishes a system for adding *correct* extra depth, because two leaves
496
+ # that a manual reference lists as siblings stop being immediate siblings once the system
497
+ # refines them into sub-tiers. That makes edge-F1 unfair to deliberately deeper
498
+ # trees (Approaches 1 & 2). Set-overlap F1 fixes this: it matches each reference
499
+ # cluster (the set of leaves under a reference path-prefix) to the system node whose
500
+ # leaf set overlaps it most (Jaccard), regardless of that node's depth or label.
501
+ #
502
+ # • precision — for each system aggregation node, its best Jaccard with any
503
+ # reference cluster, averaged. Low when the system invents groups
504
+ # the reference does not have (e.g. one node per delay value = over-split).
505
+ # • recall — for each reference cluster, its best Jaccard with any system node,
506
+ # averaged. Low when the system fails to recover a reference group.
507
+ #
508
+ # This is the cluster-matching / overlap-F1 tradition (e.g. ontology alignment,
509
+ # hierarchical-clustering evaluation). Label-free, so it compares the three
510
+ # approaches fairly even though their internal-node labels differ.
511
+ # ──────────────────────────────────────────────────────────────────────────────
512
+ def _system_clusters(nodes: list) -> list:
513
+ """Each aggregation node β†’ frozenset of leaf NAMES in its subtree (size β‰₯ 2)."""
514
+ id_to_node = {int(n['id']): n for n in nodes}
515
+ out: list = []
516
+ for n in nodes:
517
+ if n.get('type') != 'aggregation':
518
+ continue
519
+ leaves: list = []
520
+ stack = [int(n['id'])]
521
+ seen: set = set()
522
+ while stack:
523
+ x = stack.pop()
524
+ if x in seen:
525
+ continue
526
+ seen.add(x)
527
+ nd = id_to_node.get(x)
528
+ if nd is None:
529
+ continue
530
+ if nd.get('type') == 'attribute':
531
+ leaves.append(str(nd.get('name', '')))
532
+ else:
533
+ stack.extend(int(c) for c in nd.get('related', []))
534
+ s = frozenset(leaves)
535
+ if len(s) >= 2:
536
+ out.append(s)
537
+ return out
538
+
539
+
540
def _gold_clusters(gold_df) -> list:
    """Each reference path-prefix → frozenset of leaf NAMES under it (size ≥ 2)."""
    members_of: dict = {}
    for leaf, prefixes in _gold_leaf_lineage(gold_df).items():
        for prefix in prefixes:
            members_of.setdefault(prefix, set()).add(leaf)
    return [frozenset(members) for members in members_of.values() if len(members) >= 2]
548
+
549
+
550
def set_overlap_f1(nodes: list, gold_df) -> dict:
    """Granularity-tolerant, label-free hierarchical F1 via best leaf-set Jaccard.

    Clusters from both sides are restricted to leaves present in both the
    system tree and the reference, and clusters left with fewer than two
    leaves are dropped. Precision averages each system cluster's best Jaccard
    against any reference cluster; recall does the reverse. All three scores
    are 0.0 when fewer than two leaves are shared or either side has no
    usable cluster.
    """
    pred_names = set(_pred_leaf_lineage(nodes))
    gold_names = {str(x) for x in gold_df['leaf_label']}
    shared = pred_names & gold_names
    if len(shared) < 2:
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    def restrict(clusters: list) -> list:
        trimmed = (c & shared for c in clusters)
        return [c for c in trimmed if len(c) >= 2]

    sys_cl = restrict(_system_clusters(nodes))
    gold_cl = restrict(_gold_clusters(gold_df))
    if not (sys_cl and gold_cl):
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    def jaccard(a: frozenset, b: frozenset) -> float:
        union_size = len(a | b)
        return len(a & b) / union_size if union_size else 0.0

    best_per_system = [max(jaccard(s, g) for g in gold_cl) for s in sys_cl]
    best_per_gold = [max(jaccard(s, g) for s in sys_cl) for g in gold_cl]
    prec = float(np.mean(best_per_system))
    rec = float(np.mean(best_per_gold))
    total = prec + rec
    f1 = 2 * prec * rec / total if total > 0 else 0.0
    return {'precision': round(prec, 4), 'recall': round(rec, 4), 'f1': round(f1, 4)}
573
+
574
+
575
def refinement_breakdown(nodes: list, gold_df) -> dict:
    """Decompose edge-F1 disagreements into harmless refinement vs real errors.

    Computed over leaves present in both the system tree and the reference:

    • wrong_merge_rate — share of system sibling pairs that the reference
      does NOT co-group anywhere (variables wrongly placed together).
    • refinement_rate — share of reference sibling pairs the system keeps
      co-grouped but not as immediate siblings (split into sub-tiers).
    • missed_rate — share of reference sibling pairs the system fails to
      co-group at all.

    A rate is 0.0 when its denominator set is empty.
    """
    system = _pred_leaf_lineage(nodes)
    reference = _gold_leaf_lineage(gold_df)
    common = set(system) & set(reference)
    system = {leaf: anc for leaf, anc in system.items() if leaf in common}
    reference = {leaf: anc for leaf, anc in reference.items() if leaf in common}

    sys_sib = _sibling_pairs(system)
    sys_cog = _cogrouped_pairs(system)
    gold_sib = _sibling_pairs(reference)
    gold_cog = _cogrouped_pairs(reference)

    def rate(count: int, denominator: set) -> float:
        return round(count / len(denominator), 4) if denominator else 0.0

    return {
        'wrong_merge_rate': rate(len(sys_sib - gold_cog), sys_sib),
        'refinement_rate': rate(len((gold_sib & sys_cog) - sys_sib), gold_sib),
        'missed_rate': rate(len(gold_sib - sys_cog), gold_sib),
    }
605
+
606
+
607
+ # ──────────────────────────────────────────────────────────────────────────────
608
+ # One-call bundle
609
+ # ──────────────────────────────────────────────────────────────────────────────
610
def evaluate(nodes: list, corpus_texts: list | None = None, can=None,
             gold_df=None) -> dict:
    """Compute the metric bundle for one hierarchy.

    Always includes the TraCo metrics, ``npmi_coherence`` (None when no
    corpus is given) and the structural statistics under ``struct_*`` keys.
    ``group_preservation`` is added when ``can`` is given and ``gold`` when
    ``gold_df`` is given.
    """
    bundle: dict = dict(traco_metrics(nodes))
    if corpus_texts is None:
        bundle['npmi_coherence'] = None
    else:
        bundle['npmi_coherence'] = npmi_coherence(nodes, corpus_texts)
    for key, value in structural_stats(nodes).items():
        bundle[f'struct_{key}'] = value
    if can is not None:
        bundle['group_preservation'] = group_preservation(nodes, can)
    if gold_df is not None:
        bundle['gold'] = gold_comparison(nodes, gold_df)
    return bundle
version2/launcher.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ launcher.py — start Baseline, Approach 1 and Approach 2 on different ports,
3
+ open them in browser tabs, and shut down all at once when you
4
+ press Enter.
5
+
6
+ Usage:
7
+ python launcher.py
8
+
9
+ Each app has its own file uploader — upload a different CSV to each tab to
10
+ compare approaches side by side.
11
+ """
12
+
13
+ from __future__ import annotations
14
+ import socket
15
+ import subprocess
16
+ import sys
17
+ import time
18
+ import webbrowser
19
+ from pathlib import Path
20
+
21
+ HERE = Path(__file__).resolve().parent
22
+
23
+ JOBS = [
24
+ ('baseline.py', 8501, 'Baseline'),
25
+ ('approach_1.py', 8502, 'Approach 1'),
26
+ ('approach_2.py', 8503, 'Approach 2'),
27
+ ]
28
+
29
+ # TIP: to compare TWO datasets at once you do NOT need extra ports. Streamlit
30
+ # gives every browser tab its own independent session (separate upload + state),
31
+ # so just open the same URL twice — e.g. open http://localhost:8501 in two tabs,
32
+ # load AI-MIND in one and HCP in the other.
33
+
34
+ OPEN_BROWSER = True
35
+ STARTUP_WAIT_SECS = 5
36
+
37
+
38
+ def _port_in_use(port: int) -> bool:
39
+ """Return True if something is already listening on this port."""
40
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
41
+ s.settimeout(0.5)
42
+ return s.connect_ex(('127.0.0.1', port)) == 0
43
+
44
+
45
+ def _kill_tree(p: subprocess.Popen) -> None:
46
+ """Kill a process and all its children (works reliably on Windows and POSIX)."""
47
+ if sys.platform == 'win32':
48
+ subprocess.call(
49
+ ['taskkill', '/F', '/T', '/PID', str(p.pid)],
50
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
51
+ )
52
+ else:
53
+ try:
54
+ import os, signal
55
+ os.killpg(os.getpgid(p.pid), signal.SIGTERM)
56
+ except Exception:
57
+ p.terminate()
58
+ try:
59
+ p.wait(timeout=5)
60
+ except subprocess.TimeoutExpired:
61
+ p.kill()
62
+
63
+
64
def main() -> int:
    """Launch every app in ``JOBS``, open browser tabs, and stop all on Enter.

    Returns 1 without starting anything if a script is missing or a port is
    already taken, 1 if no process could be started, and 0 after a normal
    shutdown. Once any server is running, all started servers are stopped on
    the way out — including when stdin is closed (``EOFError``) or an
    exception interrupts start-up — so no orphaned servers are left holding
    the ports.
    """
    # Validate scripts
    missing = [s for s, _, _ in JOBS if not (HERE / s).is_file()]
    if missing:
        print(f'ERROR: missing files: {missing}')
        return 1

    # Abort if any port is already occupied — prevents the duplicate-tab problem
    busy = [(label, port) for _, port, label in JOBS if _port_in_use(port)]
    if busy:
        for label, port in busy:
            print(f'ERROR: port {port} ({label}) is already in use.')
        print('\nKill the existing servers first (Task Manager → python.exe → End Task),')
        print('then run launcher.py again.')
        return 1

    procs: list[subprocess.Popen] = []
    started: list[tuple[int, str]] = []  # (port, label) of servers that launched
    print(f'Working directory: {HERE}')
    print(f'Launching {len(JOBS)} Streamlit instance(s)…\n')

    for script, port, label in JOBS:
        cmd = [
            sys.executable, '-m', 'streamlit', 'run', str(HERE / script),
            '--server.port', str(port),
            '--server.headless', 'true',  # suppress Streamlit's own browser open
            '--browser.gatherUsageStats', 'false',
        ]
        try:
            # Do NOT use CREATE_NEW_PROCESS_GROUP — it breaks taskkill /T
            p = subprocess.Popen(cmd)
        except Exception as e:
            # Report and carry on so the remaining apps still get launched.
            print(f' FAILED {label}: {e}')
            continue
        procs.append(p)
        started.append((port, label))
        print(f' {label:<12} pid={p.pid:<6} → http://localhost:{port}')

    if not procs:
        print('Nothing started.')
        return 1

    try:
        # Wait for each server to actually be reachable before opening the browser
        print(f'\nWaiting for servers to come up (max {STARTUP_WAIT_SECS}s each)…')
        for port, label in started:
            for _ in range(STARTUP_WAIT_SECS * 2):
                if _port_in_use(port):
                    print(f' {label} ready')
                    break
                time.sleep(0.5)
            else:
                print(f' {label} did not respond in time — opening anyway')

        if OPEN_BROWSER:
            print('\nOpening browser tabs…')
            for port, label in started:
                url = f'http://localhost:{port}'
                webbrowser.open_new_tab(url)
                print(f' • {label} → {url}')
                time.sleep(0.3)  # small gap so tabs open in order

        print('\nAll servers running.')
        print('Press Enter (in THIS terminal) to stop all servers and exit.\n')
        try:
            input()
        except (KeyboardInterrupt, EOFError):
            # Ctrl+C or a closed stdin both mean "shut down now".
            pass
    finally:
        print('\nStopping servers…')
        for p in procs:
            _kill_tree(p)
        print('Done.')
    return 0
134
+
135
+
136
+ if __name__ == '__main__':
137
+ raise SystemExit(main())
version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_canonical.csv ADDED
The diff for this file is too large to render. See raw diff
 
version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_concept_labels.csv ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Node,Confidence,Source,Embedding sim,Alternatives
2
+ Self Reported Ethnicity,0.542,keybert,0.625,"self reported racial, id strictly self-report, Self Reported"
3
+ Quarter Behavioral Data,0.266,keybert,0.525,"acquisition quarter, quarter behavioral, quarter relative start"
4
+ Yes Relationship Status,0.319,keybert,0.601,"employment status, respondent still school, participant employment status"
5
+ Gender,0.26,keybert,0.501,hcp id
6
+ Group Participant Age Range,0.834,description_title,0.874,"group participant age, participant age range, age participant years"
7
+ Percentage Task Fmri,0.41,keybert,0.84,"task fmri percent, fmri processing percent, fmri language percent"
8
+ Gambling Task Fmri,0.438,keybert,0.845,"task fmri gambling, fmri gambling, fmri gambling percent"
9
+ State Fmri Count,0.312,keybert,0.676,"fmri count, state fmri scans"
10
+ Scans Collected Scan Count,0.79,description_title,0.836,"collected scan count, scan count, scans collected scan"
11
+ Timepoints Task Fmri,0.329,keybert,0.698,"fmri protocol completed, task fmri protocol, relational task fmri"
12
+ Percentage Diffusion Mri,0.41,keybert,0.788,"diffusion mri completed, diffusion mri percent, complete diffusion mri"
13
+ Diffusion Mri Completed,0.385,keybert,0.729,"diffusion mri protocol, complete diffusion mri, full diffusion mri"
14
+ Fmri Movie Protocol,0.332,keybert,0.669,"fmri protocol completed, fmri retinotopy protocol, movie task fmri"
15
+ Data Complete Resting,0.275,keybert,0.532,"anatomy data complete, noise data complete, data complete noise"
16
+ Story Math Data,0.372,keybert,0.652,"available story math, story math, complete story math"
17
+ Ssaga Marijuana Dependence,0.378,keybert,0.666,"ssaga tobacco dependence, completed ssaga marijuana, data completed ssaga"
18
+ Compl Description,0.448,description_title,0.212,"compl description penn, compl description mini, test completed"
19
+ Non-Toolbox Battery Completed,0.433,keybert,0.747,"behavioral battery completed, toolbox battery completed, battery completed full"
20
+ Asr-Syn Compl Description,0.52,description_title,0.34,"asr-syn compl, asr-syn, der asr-syn compl"
21
+ Mr Session Scanner,0.41,keybert,0.847,"scanner particular mr, session scanner, scanner particular"
22
+ Mri Session Labels,0.427,keybert,0.843,"label mri session, specific label mri, label mri"
23
+ Parentheses Indicate Scan,0.339,keybert,0.658,"scan count type, indicate scan count, type scan mr"
24
+ Scan Session Scans,0.295,keybert,0.611,"scanner particular, scanner particular session, scan session"
25
+ Blood Sample,0.363,keybert,0.681,"hematocrit sample, women hematocrit sample, blood sample percentage"
26
+ Participant Menstrual Cycles,0.376,keybert,0.747,"participant age menstrual, participant menstrual, menstrual age cycles"
27
+ Bmi,0.715,description_title,0.698,"bmi body, bmi body mass, bmi their"
28
+ Applicable Hypothyroidism Age,0.349,keybert,0.675,"applicable hyperthyroidism age, hypothyroidism age onset, hypothyroidism age"
29
+ Birth Control Progesterone,0.369,keybert,0.675,"progesterone fertility drugs, participant birth control, control pills progesterone"
30
+ Systolic Blood Pressure,0.375,keybert,0.7,"blood pressure systolic, diastolic blood pressure, blood pressure diastolic"
31
+ Yes Father Bipolar,0.254,keybert,0.537,"father depression yes, yes father depression, father bipolar disorder"
32
+ Disease Dementia Yes,0.277,keybert,0.55,"parkinson disease yes, father parkinson disease, father alzheimer disease"
33
+ Drug Alcohol Problems,0.304,keybert,0.545,"father drug alcohol, mother drug alcohol, alcohol problems yes"
34
+ Father Anxiety Yes,0.355,keybert,0.688,"yes father anxiety, mother anxiety yes, anxiety yes father"
35
+ Schizophrenia Psychosis Yes,0.343,keybert,0.661,"mother schizophrenia psychosis, father schizophrenia psychosis, psychosis yes father"
36
+ Father Tourette Syndrome,0.345,keybert,0.674,"mother tourette syndrome, tourette syndrome yes, yes father tourette"
37
+ Psqi Compl Description,0.422,description_title,0.16,"psqi past month, psqi compl, psqi"
38
+ Description,0.386,description_title,0.079,"description quality, describe, description quality index"
39
+ Psqi Past Month,0.475,keybert,0.44,psqi past
40
+ Der Psqi Compl,0.34,keybert,0.165,
41
+ Index Psqi,0.376,keybert,0.286,
42
+ Test Matrix,0.517,keybert,0.411,
43
+ Delayed Reward Subjective,0.323,keybert,0.714,"delays fixed reward, larger delayed reward, undervaluing rewards delayed"
44
+ Larger Delayed Reward,0.306,keybert,0.68,"delays fixed reward, undervaluing rewards delayed, rewards delayed time"
45
+ Undervaluing Rewards Delayed,0.278,keybert,0.617,choice immediate amount
46
+ Area Under Curve,0.51,description_title,0.292,"curve area under, under curve area, curve area"
47
+ Total Positions Off,0.208,keybert,0.391,"trials total positions, total positions, positions off trials"
48
+ Sum Cpn Fp,0.275,keybert,0.513,"cpt true positives, sum cpn tp, cpt false negatives"
49
+ Non-Responses Longest Run,0.31,keybert,0.58,"longest run non-responses, non-responses longest, run non-responses longest"
50
+ Anger Identifications Correct,0.288,keybert,0.575,"correct anger identifications, anger identifications, correct fear identifications"
51
+ Aggression Scores Mean,0.31,keybert,0.62,"hostility scores mean, angry feelings scores, levels hostility scores"
52
+ Self-Report Measure Adults,0.285,keybert,0.602,"scores indicate self-reported, self-reported scores mean, self-reported scores"
53
+ Loneliness Scores,0.323,keybert,0.66,"loneliness scores mean, levels loneliness scores, scores indicative loneliness"
54
+ Perceived Hostility Scores,0.335,keybert,0.659,"perceived hostility survey, perceived rejection scores, hostility scores mean"
55
+ Brain Segmentation Volume,0.781,description_title,0.877,"brain segmentation, estimated intra-cranial volume, intra-cranial volume"
56
+ Total Defect Holes,0.582,keybert,0.824,"Prior Fixing, lh prior fixing, defect holes rh"
57
+ Etiv,0.582,description_title,0.418,"ratio maskvol etiv, ratio brainsegvol etiv, maskvol etiv"
58
+ Supratentorial Volume,0.715,keybert,0.948,"Supratentorial, supratentorial ventricals volume, supratentorial ventricals"
59
+ Gray Matter Volume,0.637,keybert,0.761,"white matter volume, Matter Volume, total gray matter"
60
+ Wm-Hypointensities,0.439,keybert,0.785,"left-wm-hypointensities, left-non-wm-hypointensities, right-wm-hypointensities"
61
+ Cc Anterior,0.154,keybert,0.319,"cc mid anterior, cc posterior, cc mid posterior"
62
+ Left-Vessel,0.417,keybert,0.773,right-vessel
63
+ Right-Putamen,0.139,keybert,0.308,"left-putamen, right-thalamus-proper, left-thalamus-proper"
64
+ Right-Cerebellum-Cortex,0.228,keybert,0.483,"left-cerebellum-cortex, right-cerebellum-white-matter, left-cerebellum-white-matter"
65
+ Left-Hippocampus,0.235,keybert,0.489,right-hippocampus
66
+ Left-Amygdala,0.276,keybert,0.547,right-amygdala
67
+ Right-Choroid-Plexus,0.285,keybert,0.551,left-choroid-plexus
68
+ Rd-Ventricle,0.28,keybert,0.559,"th-ventricle, right-lateral-ventricle, left-lateral-ventricle"
69
+ Right-Ventraldc,0.246,keybert,0.498,left-ventraldc
70
+ Right-Inf-Lat-Vent,0.144,keybert,0.321,left-inf-lat-vent
71
+ Gyrus Right Precentral,0.19,keybert,0.416,"gyrus right superiortemporal, gyrus right inferiortemporal, gyrus right middletemporal"
72
+ Cortex Right Entorhinal,0.225,keybert,0.5,"cortex left entorhinal, cortex right inferiorparietal, cortex right superiorparietal"
73
+ Inferior Frontal Gyrus,0.201,keybert,0.447,"gyrus left parsorbitalis, gyrus right parsorbitalis, gyrus right parsopercularis"
74
+ Pole Right Frontalpole,0.189,keybert,0.371,"pole right temporalpole, pole left temporalpole, right frontalpole average"
75
+ Gyrus Right Posteriorcingulate,0.219,keybert,0.46,"gyrus left posteriorcingulate, gyrus right isthmuscingulate, cingulate gyrus right"
76
+ Gyrus Right Caudalmiddlefrontal,0.244,keybert,0.529,"gyrus right rostralmiddlefrontal, gyrus left caudalmiddlefrontal"
77
+ Cortex Right Caudalanteriorcingulate,0.247,keybert,0.521,"cortex left caudalanteriorcingulate, cortex left rostralanteriorcingulate, anterior cingulate cortex"
78
+ Gyrus Right Parahippocampal,0.258,keybert,0.539,"gyrus left parahippocampal, parahippocampal average, right parahippocampal average"
79
+ Superior Temporal Sulcus,0.162,keybert,0.348,"superior temporal, temporal sulcus left, temporal sulcus"
80
+ Right Insula Average,0.193,keybert,0.374,"insula average, left insula average, right insula"
81
+ Cortex Right Medialorbitofrontal,0.222,keybert,0.488,"cortex left medialorbitofrontal, orbital frontal cortex, cortex right lateralorbitofrontal"
82
+ Cortex Right Lateraloccipital,0.234,keybert,0.486,"cortex left lateraloccipital, occipital cortex right, occipital cortex"
83
+ Sulcus Right Paracentral,0.166,keybert,0.327,"sulcus left paracentral, paracentral, right paracentral average"
84
+ Cortex Right Transversetemporal,0.248,keybert,0.524,"cortex left transversetemporal, temporal cortex left, temporal cortex"
85
+ Cortex Right Pericalcarine,0.241,keybert,0.51,"cortex left pericalcarine, right pericalcarine average, cortex"
86
+ Gyrus Right Lingual,0.236,keybert,0.488,"gyrus left lingual, lingual, lingual average"
87
+ Cortex Right Insula,0.241,keybert,0.53,"cortex left insula, cortex right transversetemporal, cortex left transversetemporal"
88
+ Cortex Left Medialorbitofrontal,0.221,keybert,0.489,"cortex right medialorbitofrontal, orbital frontal cortex, cortex right lateralorbitofrontal"
89
+ Cortex Left Caudalanteriorcingulate,0.256,keybert,0.543,"cortex right caudalanteriorcingulate, cortex left rostralanteriorcingulate, cortex right rostralanteriorcingulate"
90
+ Gyrus Right Superiortemporal,0.218,keybert,0.485,"gyrus right inferiortemporal, gyrus right middletemporal, gyrus left superiortemporal"
91
+ Cortex Right Inferiorparietal,0.246,keybert,0.526,"cortex right superiorparietal, cortex left inferiorparietal, cortex left superiorparietal"
92
+ Median Reaction Times,0.34,keybert,0.646,"average median reaction, overall reaction time, face median reaction"
93
+ Accuracy Percentage Face,0.361,keybert,0.677,"accuracy percentage overall, percentage overall accuracy, accuracy percentage shape"
94
+ Reaction Time Reward,0.294,keybert,0.589,"reaction times trials, median reaction times, reaction time punish"
95
+ Percentage Reward Trials,0.301,keybert,0.669,"percentage larger reward, percentage smaller reward, prediction percentage larger"
96
+ Percentage Trials Response,0.307,keybert,0.663,"reward trials response, trials response logged, overall percentage trials"
97
+ Percentage Larger Punish,0.339,keybert,0.729,"percentage smaller punish, percentage punish trials, percentage punish"
98
+ Difficulty Level Stimuli,0.277,keybert,0.531,"story median reaction, correct reaction time, stimuli presented math"
99
+ Accuracy Percentage Math,0.341,keybert,0.647,"accuracy percentage story, accuracy condition overall, accuracy percentage"
100
+ Accuracy Percentage Overall,0.322,keybert,0.582,"percentage overall accuracy, accuracy percentage match, accuracy percentage blocks"
101
+ Cial Tom Perc Random,0.582,description_title,0.509,"tom perc random, cial tom perc, percentage tom random"
102
+ Percentage Stimuli Response,0.318,keybert,0.686,"stimuli response logged, overall percentage stimuli, percentage stimuli"
103
+ Rating Percentage Unsure,0.286,keybert,0.59,unsure rating percentage
104
+ Stimuli Received Unsure,0.263,keybert,0.584,"rating median reaction, average median reaction, median reaction times"
105
+ Percentage Random Stimuli,0.364,keybert,0.763,"random stimuli subject, rated random percentage"
106
+ Time Random Stimuli,0.303,keybert,0.629,"random median reaction, random stimuli subject, reaction time random"
107
+ Accuracy Across Trials,0.265,keybert,0.589,"accuracy back place, trials back place, accuracy back"
108
+ Trials Back Tool,0.248,keybert,0.55,"accuracy back tool, tool condition accuracy, back tool nontargets"
109
+ Accuracy Back Face,0.303,keybert,0.618,"face condition accuracy, trials back face, back face targets"
110
+ Median Reaction Time,0.217,keybert,0.482,"tool condition median, reaction time back, target trials back"
111
+ Face Condition Median,0.225,keybert,0.459,"reaction time back, trials back face, reaction time across"
112
+ Reaction Time Back,0.217,keybert,0.482,"condition median reaction, correct trials back, reaction time across"
113
+ Accuracy Back Place,0.261,keybert,0.574,"trials back place, target trials back, accuracy back"
114
+ Back Median Reaction,0.245,keybert,0.535,"reaction time conditions, average median"
115
+ Accuracy Back Body,0.279,keybert,0.621,"body condition accuracy, trials back body, back body nontargets"
116
+ Body Condition Median,0.21,keybert,0.442,"trials back body, reaction time across, back body targets"
117
+ Body Condition Accuracy,0.264,keybert,0.579,"trials back body, back body nontargets"
118
+ Accuracy Back Tool,0.259,keybert,0.567,"tool condition accuracy, back tool targets, target trials back"
119
+ Conscientiousness Scale Neo-Ffi,0.382,keybert,0.785,"neo-ffi conscientiousness, neuroticism scale neo-ffi"
120
+ Am Methodical Person,0.188,keybert,0.41,"methodical person am, high-spirited person am, tough-minded my attitudes"
121
+ Energy Often Feel,0.238,keybert,0.475,"feel chill wave, often feel am, stress sometimes feel"
122
+ Myself Especially Lighthearted,0.239,keybert,0.484,"laugh easily, easily laugh"
123
+ Life Fast-Paced My,0.302,keybert,0.587,"my life fast-paced, fast-paced my life, life fast-paced"
124
+ Depressed Am Seldom,0.294,keybert,0.59,"seldom sad depressed, am seldom sad, rarely feel lonely"
125
+ T-Score Asr Anxiety,0.305,keybert,0.679,"t-score asr depressive, anxiety problems gender, depressive problems gender"
126
+ Asr-Syn Compl Description,0.512,description_title,0.36,"compl description asr, asr-syn compl, description asr"
127
+ Asr Somatic Problems,0.264,keybert,0.562,"raw asr somatic, raw asr ad, asr ad problems"
128
+ Somatic Problems Gender,0.272,keybert,0.561,"gender age adjusted, ad problems gender, t-score asr somatic"
129
+ Der Asr-Syn Compl,0.506,keybert,0.447,description asr
130
+ Description Asr Withdrawn,0.483,keybert,0.614,asr withdrawn
131
+ Description Asr Anxious,0.455,keybert,0.628,
132
+ T-Score Asr Avoidant,0.263,keybert,0.585,"asr avoidant problems, adjusted t-score asr, asr avoidant"
133
+ T-Score Asr Antisocial,0.335,keybert,0.708,"asr antisocial problems, asr antisocial, antisocial problems"
134
+ Dsmiv Major Depressive,0.38,keybert,0.649,"depressive symptoms endorsed, lifetime depressive symptoms, experienced diagnosed dsmiv"
135
+ Visual Acuity Numerator,0.362,keybert,0.714,"acuity numerator distance, visual acuity denominator, coded eyeglass correction"
136
+ Left Eye Color,0.355,keybert,0.652,color eye
137
+ Read Letter Test,0.227,keybert,0.485,"reads letters test, letters test, letters test card"
138
+ Positive Amphetamines Tests,0.402,keybert,0.734,"positive opiates tests, positive cocaine tests, positive methamphetamine tests"
139
+ Breathalyzers Administered Hcp,0.407,keybert,0.787,"any breathalyzers administered, breathalyzers administered, any breathalyzers"
140
+ Avg Total Weekday,0.276,keybert,0.529,"avg total weekend, total weekday alcoholic, drinks past days"
141
+ Drinks Past Days,0.281,keybert,0.625,"alcoholic drinks past, last hcp visit, total alcoholic drinks"
142
+ Dsm Alc Criteria,0.291,keybert,0.577,"criteria dsm sometime, yes dsm criteria, criteria dsm"
143
+ Drunk Past Months,0.49,keybert,0.531,"past months frequency, drinks past months, Past Months"
144
+ Max Drinks Consumed,0.361,keybert,0.732,"lifetime max drinks, max drinks, female max drinks"
145
+ Weekday Pipes Per,0.353,keybert,0.674,"avg weekday pipes, weekday pipes, pipes past days"
146
+ Weekday Times Chew,0.355,keybert,0.684,"times chew per, chew past days, weekend times chew"
147
+ Weekday Times Snuff,0.263,keybert,0.584,"avg weekend times, avg weekday times, snuff per past"
148
+ Total Times Smoked,0.292,keybert,0.622,"visit times smoked, cigarettes past days, times smoked any"
149
+ Avg Weekday Cigarettes,0.362,keybert,0.726,"avg weekend cigarettes, avg weekday cigars, weekday cigarettes per"
150
+ Years Smoked,0.587,keybert,0.688,smoked years
151
+ Fagerstrom Ftnd,0.622,keybert,0.522,"Fagerstrom, fagerstrom ftnd indicative, fagerstrom hsi"
152
+ Dsm Criteria Withdrawal,0.29,keybert,0.528,"dsm criteria tolerance, dsm criteria difficulty, dsm tolerance"
153
+ Cigarettes Per Day,0.376,keybert,0.745,"per day smoking, regularly cigarettes per, day cigarettes smoked"
154
+ Times,0.502,description_title,0.339,"times sedatives, times sedatives never, times hallucinogens"
155
+ Times Opiates,0.328,keybert,0.623,"times opiates never, times drugs, times cocaine"
156
+ Lifetime Yes Dsm,0.2,keybert,0.339,"lifetime yes, never times"
157
+ Trackfrac Min Trfrac,0.303,keybert,0.632,trackfrac min minimum
158
+ Scan Trackfrac,0.305,keybert,0.648,
159
+ Scan Trfrac,0.3,keybert,0.641,
version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json ADDED
The diff for this file is too large to render. See raw diff
 
version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json ADDED
The diff for this file is too large to render. See raw diff
 
version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_canonical.csv ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _source_file,_row_index,_leaf_label,_leaf_id,_group_path,_text,_semantic_text,_dtype,_concept_label,_concept_score,_concept_source,_code_family,_facet_cond,_facet_task,_facet_variant,_facet_stat,_facet_outcome,_facet_prec
2
+ ai-mind-variable-descriptions_in_.csv,0,DMSCC,DMS > DMS Recommended Standard.DMSCC,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSCC | description: DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays). | Decimal Places: 2","DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays).",determine,Mean Choices Correct,0.0,singleton_title,DMS,No Condition,DMS,DMS Recommended Standard,Mean,Other,2
3
+ ai-mind-variable-descriptions_in_.csv,1,DMSL0SD,DMS > DMS Recommended Standard.DMSL0SD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSL0SD | description: DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSL,0,DMS,DMS Recommended Standard,Standard Deviation,Other,4
4
+ ai-mind-variable-descriptions_in_.csv,2,DMSL12SD,DMS > DMS Recommended Standard.DMSL12SD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSL12SD | description: DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSL,12,DMS,DMS Recommended Standard,Standard Deviation,Other,4
5
+ ai-mind-variable-descriptions_in_.csv,3,DMSL4SD,DMS > DMS Recommended Standard.DMSL4SD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSL4SD | description: DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSL,4,DMS,DMS Recommended Standard,Standard Deviation,Other,4
6
+ ai-mind-variable-descriptions_in_.csv,4,DMSLADSD,DMS > DMS Recommended Standard.DMSLADSD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSLADSD | description: DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSL,No Condition,DMS,DMS Recommended Standard,Standard Deviation,Other,4
7
+ ai-mind-variable-descriptions_in_.csv,5,DMSLSD,DMS > DMS Recommended Standard.DMSLSD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSLSD | description: DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4,DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,determine,Correct Latency Standard Deviation,0.687,description_title,DMSLS,No Condition,DMS,DMS Recommended Standard,Standard Deviation,Other,4
8
+ ai-mind-variable-descriptions_in_.csv,6,DMSLSSD,DMS > DMS Recommended Standard.DMSLSSD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSLSSD | description: DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSLS,No Condition,DMS,DMS Recommended Standard,Standard Deviation,Other,4
9
+ ai-mind-variable-descriptions_in_.csv,7,DMSMDL,DMS > DMS Recommended Standard.DMSMDL,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL | description: DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4,DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,determine,Correct Latency Mean,0.625,keybert,DMSMDL,No Condition,DMS,DMS Recommended Standard,Median,Other,4
10
+ ai-mind-variable-descriptions_in_.csv,8,DMSMDL0,DMS > DMS Recommended Standard.DMSMDL0,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL0 | description: DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4,DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,0,DMS,DMS Recommended Standard,Median,Other,4
11
+ ai-mind-variable-descriptions_in_.csv,9,DMSMDL12,DMS > DMS Recommended Standard.DMSMDL12,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL12 | description: DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4,DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,12,DMS,DMS Recommended Standard,Median,Other,4
12
+ ai-mind-variable-descriptions_in_.csv,10,DMSMDL4,DMS > DMS Recommended Standard.DMSMDL4,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL4 | description: DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4,DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,4,DMS,DMS Recommended Standard,Median,Other,4
13
+ ai-mind-variable-descriptions_in_.csv,11,DMSMDLAD,DMS > DMS Recommended Standard.DMSMDLAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLAD | description: DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4,DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,No Condition,DMS,DMS Recommended Standard,Median,Other,4
14
+ ai-mind-variable-descriptions_in_.csv,12,DMSMDLS,DMS > DMS Recommended Standard.DMSMDLS,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLS | description: DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4,DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,No Condition,DMS,DMS Recommended Standard,Median,Other,4
15
+ ai-mind-variable-descriptions_in_.csv,13,DMSML,DMS > DMS Recommended Standard.DMSML,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSML | description: DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4,DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,determine,Correct Latency Mean,0.625,keybert,DMSML,No Condition,DMS,DMS Recommended Standard,Mean,Other,4
16
+ ai-mind-variable-descriptions_in_.csv,14,DMSML0,DMS > DMS Recommended Standard.DMSML0,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSML0 | description: DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4,DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay.,determine,Correct Latency Mean,0.625,keybert,DMSML,0,DMS,DMS Recommended Standard,Mean,Other,4
17
+ ai-mind-variable-descriptions_in_.csv,15,DMSML12,DMS > DMS Recommended Standard.DMSML12,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSML12 | description: DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4,DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay.,determine,Correct Latency Mean,0.625,keybert,DMSML,12,DMS,DMS Recommended Standard,Mean,Other,4
18
+ ai-mind-variable-descriptions_in_.csv,16,DMSML4,DMS > DMS Recommended Standard.DMSML4,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSML4 | description: DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4,DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay.,determine,Correct Latency Mean,0.625,keybert,DMSML,4,DMS,DMS Recommended Standard,Mean,Other,4
19
+ ai-mind-variable-descriptions_in_.csv,17,DMSMLAD,DMS > DMS Recommended Standard.DMSMLAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMLAD | description: DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4,DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay.,determine,Correct Latency Mean,0.625,keybert,DMSML,No Condition,DMS,DMS Recommended Standard,Mean,Other,4
20
+ ai-mind-variable-descriptions_in_.csv,18,DMSMLS,DMS > DMS Recommended Standard.DMSMLS,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMLS | description: DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4,DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation.,determine,Correct Latency Mean,0.625,keybert,DMSML,No Condition,DMS,DMS Recommended Standard,Mean,Other,4
21
+ ai-mind-variable-descriptions_in_.csv,19,DMSPC,DMS > DMS Recommended Standard.DMSPC,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPC | description: DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0,DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays).,determine,Percent Correct Percentage,0.54,keybert,DMSPC,No Condition,DMS,DMS Recommended Standard,Percent,Other,0
22
+ ai-mind-variable-descriptions_in_.csv,20,DMSPC0,DMS > DMS Recommended Standard.DMSPC0,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPC0 | description: KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 0,KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,0,DMS,DMS Recommended Standard,Percent,Other,0
23
+ ai-mind-variable-descriptions_in_.csv,21,DMSPC12,DMS > DMS Recommended Standard.DMSPC12,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPC12 | description: KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 0,KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,12,DMS,DMS Recommended Standard,Percent,Other,0
24
+ ai-mind-variable-descriptions_in_.csv,22,DMSPC4,DMS > DMS Recommended Standard.DMSPC4,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPC4 | description: KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay. | Decimal Places: 0,KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,4,DMS,DMS Recommended Standard,Percent,Other,0
25
+ ai-mind-variable-descriptions_in_.csv,23,DMSPCAD,DMS > DMS Recommended Standard.DMSPCAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPCAD | description: KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay. | Decimal Places: 0,KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,No Condition,DMS,DMS Recommended Standard,Percent,Other,0
26
+ ai-mind-variable-descriptions_in_.csv,24,DMSPCS,DMS > DMS Recommended Standard.DMSPCS,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPCS | description: KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli. | Decimal Places: 0,KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,No Condition,DMS,DMS Recommended Standard,Percent,Other,0
27
+ ai-mind-variable-descriptions_in_.csv,25,DMSPEGC,DMS > DMS Recommended Standard.DMSPEGC,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGC | description: DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4,DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays).,determine,Probability Error Occurring,0.619,keybert,DMSPEG,No Condition,DMS,DMS Recommended Standard,Probability,Error,4
28
+ ai-mind-variable-descriptions_in_.csv,26,DMSPEGE,DMS > DMS Recommended Standard.DMSPEGE,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGE | description: KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4,KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays).,determine,Probability Error Occurring,0.619,keybert,DMSPEG,No Condition,DMS,DMS Recommended Standard,Probability,Error,4
29
+ ai-mind-variable-descriptions_in_.csv,27,DMSTC,DMS > DMS Recommended Standard.DMSTC,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTC | description: DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0,DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays).,determine,Total Correct,0.507,description_title,DMSTC,No Condition,DMS,DMS Recommended Standard,Total,Other,0
30
+ ai-mind-variable-descriptions_in_.csv,28,DMSTC0,DMS > DMS Recommended Standard.DMSTC0,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTC0 | description: DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds. | Decimal Places: 0,DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds.,determine,Total Correct,0.507,description_title,DMSTC,0,DMS,DMS Recommended Standard,Total,Other,0
31
+ ai-mind-variable-descriptions_in_.csv,29,DMSTC12,DMS > DMS Recommended Standard.DMSTC12,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTC12 | description: DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds. | Decimal Places: 0,DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds.,determine,Total Correct,0.507,description_title,DMSTC,12,DMS,DMS Recommended Standard,Total,Other,0
32
+ ai-mind-variable-descriptions_in_.csv,30,DMSTC4,DMS > DMS Recommended Standard.DMSTC4,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTC4 | description: DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds. | Decimal Places: 0,DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds.,determine,Total Correct,0.507,description_title,DMSTC,4,DMS,DMS Recommended Standard,Total,Other,0
33
+ ai-mind-variable-descriptions_in_.csv,31,DMSTCAD,DMS > DMS Recommended Standard.DMSTCAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTCAD | description: DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay. | Decimal Places: 0,DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay.,determine,Total Correct,0.507,description_title,DMSTC,No Condition,DMS,DMS Recommended Standard,Total,Other,0
34
+ ai-mind-variable-descriptions_in_.csv,32,DMSTCS,DMS > DMS Recommended Standard.DMSTCS,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTCS | description: DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli. | Decimal Places: 0,DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli.,determine,Total Correct,0.507,description_title,DMSTC,No Condition,DMS,DMS Recommended Standard,Total,Other,0
35
+ ai-mind-variable-descriptions_in_.csv,33,DMSTE,DMS > DMS Recommended Standard.DMSTE,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTE | description: DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen. | Decimal Places: 0","DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen.",determine,Errors Total,0.604,keybert,DMSTE,No Condition,DMS,DMS Recommended Standard,Total,Errors,0
36
+ ai-mind-variable-descriptions_in_.csv,34,DMSTEAD,DMS > DMS Recommended Standard.DMSTEAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTEAD | description: DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component. | Decimal Places: 0,DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component.,determine,Errors Total,0.604,keybert,DMSTE,No Condition,DMS,DMS Recommended Standard,Total,Errors,0
37
+ ai-mind-variable-descriptions_in_.csv,35,DMSTEC,DMS > DMS Recommended Standard.DMSTEC,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTEC | description: DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0","DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays).",determine,Error,0.447,description_title,DMSTEC,No Condition,DMS,DMS Recommended Standard,Other,Error,0
38
+ ai-mind-variable-descriptions_in_.csv,36,DMSTECAD,DMS > DMS Recommended Standard.DMSTECAD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTECAD | description: DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0","DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component.",determine,Error,0.447,description_title,DMSTEC,No Condition,DMS,DMS Recommended Standard,Other,Error,0
39
+ ai-mind-variable-descriptions_in_.csv,37,DMSTED,DMS > DMS Recommended Standard.DMSTED,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTED | description: DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0","DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays).",determine,Error,0.447,description_title,DMSTED,No Condition,DMS,DMS Recommended Standard,Other,Error,0
40
+ ai-mind-variable-descriptions_in_.csv,38,DMSTEDAD,DMS > DMS Recommended Standard.DMSTEDAD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTEDAD | description: DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0","DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component.",determine,Error,0.447,description_title,DMSTED,No Condition,DMS,DMS Recommended Standard,Other,Error,0
41
+ ai-mind-variable-descriptions_in_.csv,39,DMSTEP,DMS > DMS Recommended Standard.DMSTEP,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTEP | description: DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0","DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays).",determine,Error,0.447,description_title,DMSTEP,No Condition,DMS,DMS Recommended Standard,Other,Error,0
42
+ ai-mind-variable-descriptions_in_.csv,40,DMSTEPAD,DMS > DMS Recommended Standard.DMSTEPAD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTEPAD | description: DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0","DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component.",determine,Error,0.447,description_title,DMSTEP,No Condition,DMS,DMS Recommended Standard,Other,Error,0
43
+ ai-mind-variable-descriptions_in_.csv,41,MOTML,MOT > MOT Tone 2.0.MOTML,MOT > MOT Tone 2.0,Task: MOT | Variant: MOT Tone 2.0 | name: MOTML | description: The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 1,The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials.,determine,Latency Display Stimulus,0.418,keybert,MOT,No Condition,MOT,MOT Tone 2.0,Mean,Other,1
44
+ ai-mind-variable-descriptions_in_.csv,42,MOTSDL,MOT > MOT Tone 2.0.MOTSDL,MOT > MOT Tone 2.0,"Task: MOT | Variant: MOT Tone 2.0 | name: MOTSDL | description: This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 2","This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials.",determine,Latency Display Stimulus,0.418,keybert,MOT,No Condition,MOT,MOT Tone 2.0,Standard Deviation,Other,2
45
+ ai-mind-variable-descriptions_in_.csv,43,MOTTC,MOT > MOT Tone 2.0.MOTTC,MOT > MOT Tone 2.0,Task: MOT | Variant: MOT Tone 2.0 | name: MOTTC | description: The total number of assessment trials on which the subject made a correct response. | Decimal Places: 0,MOT The total number of assessment trials on which the subject made a correct response.,determine,Total Assessment Trials,0.313,keybert,MOTT,No Condition,MOT,MOT Tone 2.0,Total,Other,0
46
+ ai-mind-variable-descriptions_in_.csv,44,MOTTE,MOT > MOT Tone 2.0.MOTTE,MOT > MOT Tone 2.0,Task: MOT | Variant: MOT Tone 2.0 | name: MOTTE | description: The total number of assessment trials on which the subject failed to make a correct response. | Decimal Places: 0,MOT The total number of assessment trials on which the subject failed to make a correct response.,determine,Total Assessment Trials,0.313,keybert,MOTT,No Condition,MOT,MOT Tone 2.0,Total,Other,0
47
+ ai-mind-variable-descriptions_in_.csv,45,PALFAMS28,PAL > PAL Recommended Standard Extended.PALFAMS28,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALFAMS28 | description: KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard.. | Decimal Places: 0","KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard..",determine,First Attempt Memory,0.0,singleton_title,PAL,28,PAL,PAL Recommended Standard Extended,Other,Other,0
48
+ ai-mind-variable-descriptions_in_.csv,46,PALMETS28,PAL > PAL Recommended Standard Extended.PALMETS28,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALMETS28 | description: PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0,PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage. Does not include 12 box level to provide a direct comparison to Recommended Standard.,determine,Mean Errors Success,0.0,singleton_title,PAL,28,PAL,PAL Recommended Standard Extended,Mean,Errors,0
49
+ ai-mind-variable-descriptions_in_.csv,47,PALNPR28,PAL > PAL Recommended Standard Extended.PALNPR28,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALNPR28 | description: PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached. | Decimal Places: 0,PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached.,determine,Patterns Reached,0.0,singleton_title,PAL,28,PAL,PAL Recommended Standard Extended,Other,Other,0
50
+ ai-mind-variable-descriptions_in_.csv,48,PALTA12,PAL > PAL Recommended Standard Extended.PALTA12,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA12 | description: PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall. | Decimal Places: 0,PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,12,PAL,PAL Recommended Standard Extended,Total,Other,0
51
+ ai-mind-variable-descriptions_in_.csv,49,PALTA2,PAL > PAL Recommended Standard Extended.PALTA2,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA2 | description: PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall. | Decimal Places: 0,PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,2,PAL,PAL Recommended Standard Extended,Total,Other,0
52
+ ai-mind-variable-descriptions_in_.csv,50,PALTA28,PAL > PAL Recommended Standard Extended.PALTA28,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA28 | description: PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0,PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems. Does not include 12 box level to provide a direct comparison to Recommended Standard.,determine,Total Attempts Made,0.605,keybert,PALTA,28,PAL,PAL Recommended Standard Extended,Total,Other,0
53
+ ai-mind-variable-descriptions_in_.csv,51,PALTA4,PAL > PAL Recommended Standard Extended.PALTA4,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA4 | description: PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall. | Decimal Places: 0,PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,4,PAL,PAL Recommended Standard Extended,Total,Other,0
54
+ ai-mind-variable-descriptions_in_.csv,52,PALTA6,PAL > PAL Recommended Standard Extended.PALTA6,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA6 | description: PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall. | Decimal Places: 0,PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,6,PAL,PAL Recommended Standard Extended,Total,Other,0
55
+ ai-mind-variable-descriptions_in_.csv,53,PALTA8,PAL > PAL Recommended Standard Extended.PALTA8,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA8 | description: PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall. | Decimal Places: 0,PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,8,PAL,PAL Recommended Standard Extended,Total,Other,0
56
+ ai-mind-variable-descriptions_in_.csv,54,PALTE12,PAL > PAL Recommended Standard Extended.PALTE12,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE12 | description: PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,12,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
57
+ ai-mind-variable-descriptions_in_.csv,55,PALTE2,PAL > PAL Recommended Standard Extended.PALTE2,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE2 | description: PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,2,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
58
+ ai-mind-variable-descriptions_in_.csv,56,PALTE28,PAL > PAL Recommended Standard Extended.PALTE28,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE28 | description: PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials. Does not include 12 box level to provide a direct comparison to Recommended Standard.,determine,Errors Patterns Total,0.296,keybert,PALTE,28,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
59
+ ai-mind-variable-descriptions_in_.csv,57,PALTE4,PAL > PAL Recommended Standard Extended.PALTE4,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE4 | description: PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,4,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
60
+ ai-mind-variable-descriptions_in_.csv,58,PALTE6,PAL > PAL Recommended Standard Extended.PALTE6,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE6 | description: PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,6,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
61
+ ai-mind-variable-descriptions_in_.csv,59,PALTE8,PAL > PAL Recommended Standard Extended.PALTE8,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE8 | description: PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,8,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
62
+ ai-mind-variable-descriptions_in_.csv,60,PALTEA12,PAL > PAL Recommended Standard Extended.PALTEA12,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA12 | description: PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,12,PAL,PAL Recommended Standard Extended,Total,Errors,0
63
+ ai-mind-variable-descriptions_in_.csv,61,PALTEA2,PAL > PAL Recommended Standard Extended.PALTEA2,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA2 | description: PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,2,PAL,PAL Recommended Standard Extended,Total,Errors,0
64
+ ai-mind-variable-descriptions_in_.csv,62,PALTEA28,PAL > PAL Recommended Standard Extended.PALTEA28,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA28 | description: KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task. In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0","KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task. In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard.",determine,Total Errors,0.0,singleton_title,PALTEA,28,PAL,PAL Recommended Standard Extended,Total,Errors,0
65
+ ai-mind-variable-descriptions_in_.csv,63,PALTEA4,PAL > PAL Recommended Standard Extended.PALTEA4,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA4 | description: PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,4,PAL,PAL Recommended Standard Extended,Total,Errors,0
66
+ ai-mind-variable-descriptions_in_.csv,64,PALTEA6,PAL > PAL Recommended Standard Extended.PALTEA6,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA6 | description: PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,6,PAL,PAL Recommended Standard Extended,Total,Errors,0
67
+ ai-mind-variable-descriptions_in_.csv,65,PALTEA8,PAL > PAL Recommended Standard Extended.PALTEA8,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA8 | description: PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,8,PAL,PAL Recommended Standard Extended,Total,Errors,0
68
+ ai-mind-variable-descriptions_in_.csv,66,PRMCLSDD,PRM > PRM Recommended Standard 18 Extended.PRMCLSDD,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDD | description: PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMCLSD,No Condition,PRM,PRM Recommended Standard 18 Extended,Standard Deviation,Other,2
69
+ ai-mind-variable-descriptions_in_.csv,67,PRMCLSDI,PRM > PRM Recommended Standard 18 Extended.PRMCLSDI,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDI | description: PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMCLSD,No Condition,PRM,PRM Recommended Standard 18 Extended,Standard Deviation,Other,2
70
+ ai-mind-variable-descriptions_in_.csv,68,PRMMCLD,PRM > PRM Recommended Standard 18 Extended.PRMMCLD,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLD | description: PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMMCL,No Condition,PRM,PRM Recommended Standard 18 Extended,Mean,Other,2
71
+ ai-mind-variable-descriptions_in_.csv,69,PRMMCLI,PRM > PRM Recommended Standard 18 Extended.PRMMCLI,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLI | description: PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMMCL,No Condition,PRM,PRM Recommended Standard 18 Extended,Mean,Other,2
72
+ ai-mind-variable-descriptions_in_.csv,70,PRMMDCLD,PRM > PRM Recommended Standard 18 Extended.PRMMDCLD,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLD | description: PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMMDCL,No Condition,PRM,PRM Recommended Standard 18 Extended,Median,Other,2
73
+ ai-mind-variable-descriptions_in_.csv,71,PRMMDCLI,PRM > PRM Recommended Standard 18 Extended.PRMMDCLI,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLI | description: PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMMDCL,No Condition,PRM,PRM Recommended Standard 18 Extended,Median,Other,2
74
+ ai-mind-variable-descriptions_in_.csv,72,PRMPCD,PRM > PRM Recommended Standard 18 Extended.PRMPCD,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCD | description: KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage. | Decimal Places: 2","KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage.",determine,Percent Correct Immediate,0.596,keybert,PRMPC,No Condition,PRM,PRM Recommended Standard 18 Extended,Percent,Other,2
75
+ ai-mind-variable-descriptions_in_.csv,73,PRMPCI,PRM > PRM Recommended Standard 18 Extended.PRMPCI,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCI | description: KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage. | Decimal Places: 2","KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage.",determine,Percent Correct Immediate,0.596,keybert,PRMPC,No Condition,PRM,PRM Recommended Standard 18 Extended,Percent,Other,2
76
+ ai-mind-variable-descriptions_in_.csv,74,PRMTSDSP,PRM > PRM Recommended Standard 18 Extended.PRMTSDSP,PRM > PRM Recommended Standard 18 Extended,Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMTSDSP | description: PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition. | Decimal Places: 2,PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition.,determine,Time Since Delayed Stimuli,0.0,singleton_title,PRM,No Condition,PRM,PRM Recommended Standard 18 Extended,Other,Other,2
77
+ ai-mind-variable-descriptions_in_.csv,75,RVPA,RVP > RVP 3 Targets.RVPA,RVP > RVP 3 Targets,"Task: RVP | Variant: RVP 3 Targets | name: RVPA | description: KEY: RVP A?: A? (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences. | Decimal Places: 4","KEY: RVP A?: A? (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences.",determine,Detection Measure,0.0,singleton_keybert,RVP,No Condition,RVP,RVP 3 Targets,Range,Other,4
78
+ ai-mind-variable-descriptions_in_.csv,76,RVPLSD,RVP > RVP 3 Targets.RVPLSD,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPLSD | description: RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4,RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials.,determine,Response Latency Mean,0.676,keybert,RVP,No Condition,RVP,RVP 3 Targets,Standard Deviation,Other,4
79
+ ai-mind-variable-descriptions_in_.csv,77,RVPMDL,RVP > RVP 3 Targets.RVPMDL,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPMDL | description: KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4,KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials.,determine,Response Latency Mean,0.676,keybert,RVPM,No Condition,RVP,RVP 3 Targets,Median,Other,4
80
+ ai-mind-variable-descriptions_in_.csv,78,RVPML,RVP > RVP 3 Targets.RVPML,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPML | description: RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4,RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials.,determine,Response Latency Mean,0.676,keybert,RVPM,No Condition,RVP,RVP 3 Targets,Mean,Other,4
81
+ ai-mind-variable-descriptions_in_.csv,79,RVPPFA,RVP > RVP 3 Targets.RVPPFA,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPPFA | description: KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms Γ· (False Alarms + Correct Rejections)) | Decimal Places: 4,KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms Γ· (False Alarms + Correct Rejections)),determine,Total,0.407,description_title,RVPP,No Condition,RVP,RVP 3 Targets,Probability,False Alarm,4
82
+ ai-mind-variable-descriptions_in_.csv,80,RVPPH,RVP > RVP 3 Targets.RVPPH,RVP > RVP 3 Targets,"Task: RVP | Variant: RVP 3 Targets | name: RVPPH | description: RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits Γ· total number of sequences) | Decimal Places: 4","RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits Γ· total number of sequences)",determine,Total,0.407,description_title,RVPP,No Condition,RVP,RVP 3 Targets,Probability,Other,4
83
+ ai-mind-variable-descriptions_in_.csv,81,RVPTFA,RVP > RVP 3 Targets.RVPTFA,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPTFA | description: RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms. | Decimal Places: 0,RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms.,determine,Total,0.407,description_title,RVPT,No Condition,RVP,RVP 3 Targets,Total,Other,0
84
+ ai-mind-variable-descriptions_in_.csv,82,RVPTH,RVP > RVP 3 Targets.RVPTH,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPTH | description: RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks. | Decimal Places: 0,RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks.,determine,Total,0.407,description_title,RVPT,No Condition,RVP,RVP 3 Targets,Total,Other,0
85
+ ai-mind-variable-descriptions_in_.csv,83,RVPTM,RVP > RVP 3 Targets.RVPTM,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPTM | description: RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks. | Decimal Places: 0,RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks.,determine,Total,0.407,description_title,RVPT,No Condition,RVP,RVP 3 Targets,Total,Other,0
86
+ ai-mind-variable-descriptions_in_.csv,84,SWMBE12,SWM > SWM Recommended Standard 2.0 Extended.SWMBE12,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE12 | description: KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only. | Decimal Places: 0,KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only.,determine,Errors Boxes Times,0.515,keybert,SWMBE,12,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
87
+ ai-mind-variable-descriptions_in_.csv,85,SWMBE4,SWM > SWM Recommended Standard 2.0 Extended.SWMBE4,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE4 | description: KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only. | Decimal Places: 0,KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only.,determine,Errors Boxes Times,0.515,keybert,SWMBE,4,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
88
+ ai-mind-variable-descriptions_in_.csv,86,SWMBE468,SWM > SWM Recommended Standard 2.0 Extended.SWMBE468,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE468 | description: KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0","KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials.",determine,Errors Boxes Times,0.515,keybert,SWMBE,468,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
89
+ ai-mind-variable-descriptions_in_.csv,87,SWMBE6,SWM > SWM Recommended Standard 2.0 Extended.SWMBE6,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE6 | description: KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only. | Decimal Places: 0,KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only.,determine,Errors Boxes Times,0.515,keybert,SWMBE,6,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
90
+ ai-mind-variable-descriptions_in_.csv,88,SWMBE8,SWM > SWM Recommended Standard 2.0 Extended.SWMBE8,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE8 | description: KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only. | Decimal Places: 0,KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only.,determine,Errors Boxes Times,0.515,keybert,SWMBE,8,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
91
+ ai-mind-variable-descriptions_in_.csv,89,SWMDE12,SWM > SWM Recommended Standard 2.0 Extended.SWMDE12,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE12 | description: SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only. | Decimal Places: 0,SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only.,determine,Double Errors Boxes,0.594,description_title,SWMDE,12,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
92
+ ai-mind-variable-descriptions_in_.csv,90,SWMDE4,SWM > SWM Recommended Standard 2.0 Extended.SWMDE4,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE4 | description: SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only. | Decimal Places: 0,SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only.,determine,Double Errors Boxes,0.594,description_title,SWMDE,4,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
93
+ ai-mind-variable-descriptions_in_.csv,91,SWMDE468,SWM > SWM Recommended Standard 2.0 Extended.SWMDE468,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE468 | description: SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0","SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials.",determine,Double Errors,0.0,singleton_title,SWMDE,468,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
94
+ ai-mind-variable-descriptions_in_.csv,92,SWMDE6,SWM > SWM Recommended Standard 2.0 Extended.SWMDE6,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE6 | description: SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only. | Decimal Places: 0,SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only.,determine,Double Errors Boxes,0.594,description_title,SWMDE,6,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
95
+ ai-mind-variable-descriptions_in_.csv,93,SWMDE8,SWM > SWM Recommended Standard 2.0 Extended.SWMDE8,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE8 | description: SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only. | Decimal Places: 0,SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only.,determine,Double Errors Boxes,0.594,description_title,SWMDE,8,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
96
+ ai-mind-variable-descriptions_in_.csv,94,SWMPR,SWM > SWM Recommended Standard 2.0 Extended.SWMPR,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMPR | description: SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete. | Decimal Places: 0","SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete.",determine,Problem Reached,0.0,singleton_title,SWM,No Condition,SWM,SWM Recommended Standard 2.0 Extended,Other,Other,0
97
+ ai-mind-variable-descriptions_in_.csv,95,SWMS,SWM > SWM Recommended Standard 2.0 Extended.SWMS,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS | description: KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens. | Decimal Places: 0","KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens.",determine,Strategy High,0.569,keybert,SWMS,No Condition,SWM,SWM Recommended Standard 2.0 Extended,Other,Other,0
98
+ ai-mind-variable-descriptions_in_.csv,96,SWMS6,SWM > SWM Recommended Standard 2.0 Extended.SWMS6,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS6 | description: SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. | Decimal Places: 0","SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes.",determine,Strategy High,0.569,keybert,SWMS,6,SWM,SWM Recommended Standard 2.0 Extended,Other,Other,0
99
+ ai-mind-variable-descriptions_in_.csv,97,SWMSX,SWM > SWM Recommended Standard 2.0 Extended.SWMSX,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMSX | description: SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more. | Decimal Places: 0","SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more.",determine,Strategy High,0.569,keybert,SWMS,No Condition,SWM,SWM Recommended Standard 2.0 Extended,Other,Other,0
100
+ ai-mind-variable-descriptions_in_.csv,98,SWMTE12,SWM > SWM Recommended Standard 2.0 Extended.SWMTE12,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE12 | description: SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only. | Decimal Places: 0","SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only.",determine,Errors Total,0.593,keybert,SWMTE,12,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
101
+ ai-mind-variable-descriptions_in_.csv,99,SWMTE4,SWM > SWM Recommended Standard 2.0 Extended.SWMTE4,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE4 | description: SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only. | Decimal Places: 0","SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only.",determine,Errors Total,0.593,keybert,SWMTE,4,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
102
+ ai-mind-variable-descriptions_in_.csv,100,SWMTE468,SWM > SWM Recommended Standard 2.0 Extended.SWMTE468,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE468 | description: SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0","SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials.",determine,Errors Total,0.593,keybert,SWMTE,468,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
103
+ ai-mind-variable-descriptions_in_.csv,101,SWMTE6,SWM > SWM Recommended Standard 2.0 Extended.SWMTE6,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE6 | description: SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only. | Decimal Places: 0","SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only.",determine,Errors Total,0.593,keybert,SWMTE,6,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
104
+ ai-mind-variable-descriptions_in_.csv,102,SWMTE8,SWM > SWM Recommended Standard 2.0 Extended.SWMTE8,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE8 | description: SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only. | Decimal Places: 0","SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only.",determine,Errors Total,0.593,keybert,SWMTE,8,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
105
+ ai-mind-variable-descriptions_in_.csv,103,SWMWE12,SWM > SWM Recommended Standard 2.0 Extended.SWMWE12,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE12 | description: SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only. | Decimal Places: 0,SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only.,determine,Within Errors,0.412,keybert,SWMWE,12,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
106
+ ai-mind-variable-descriptions_in_.csv,104,SWMWE4,SWM > SWM Recommended Standard 2.0 Extended.SWMWE4,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE4 | description: SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only. | Decimal Places: 0,SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only.,determine,Within Errors,0.412,keybert,SWMWE,4,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
107
+ ai-mind-variable-descriptions_in_.csv,105,SWMWE468,SWM > SWM Recommended Standard 2.0 Extended.SWMWE468,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE468 | description: SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0","SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials.",determine,Within Errors,0.0,singleton_title,SWMWE,468,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
108
+ ai-mind-variable-descriptions_in_.csv,106,SWMWE6,SWM > SWM Recommended Standard 2.0 Extended.SWMWE6,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE6 | description: SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only. | Decimal Places: 0,SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only.,determine,Within Errors,0.412,keybert,SWMWE,6,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
109
+ ai-mind-variable-descriptions_in_.csv,107,SWMWE8,SWM > SWM Recommended Standard 2.0 Extended.SWMWE8,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE8 | description: SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only. | Decimal Places: 0,SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only.,determine,Within Errors,0.412,keybert,SWMWE,8,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_concept_labels.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Node,Confidence,Source,Embedding sim,Alternatives
2
+ Total Correct,0.507,description_title,0.319,"correct total, correct total times"
3
+ Error,0.447,description_title,0.216,"error times subject, error times, failed"
4
+ Mean Latency,0.625,keybert,0.676,latency mean
5
+ Errors Total,0.604,keybert,0.543,"errors total times, Total Errors"
6
+ Standard Deviation,0.687,description_title,0.684,"latency standard deviation, deviation response latencies"
7
+ Probability Error Occurring,0.619,keybert,0.578,"Probability Error, probability error made, reports probability error"
8
+ Percent Correct Percentage,0.54,keybert,0.473,"correct percentage assessment, correct percentage, Percent Correct"
9
+ Latency Display Stimulus,0.418,keybert,0.732,"mean latency display, standard deviation latency, deviation latency calculated"
10
+ Total Assessment Trials,0.313,keybert,0.629,"assessment trials subject, trials subject failed, trials subject"
11
+ Total Attempts Made,0.605,keybert,0.535,attempts total
12
+ Errors Patterns Total,0.296,keybert,0.619,"box stimulus assessment, stimulus assessment problems, incorrect box stimulus"
13
+ Include Shapes,0.609,description_title,0.549,"total errors shapes, errors shapes times, errors shapes"
14
+ Latency Immediate Standard,0.653,keybert,0.715,"correct latency immediate, latency immediate, correct latency delayed"
15
+ Percent Correct Immediate,0.596,keybert,0.671,"Percent Correct, key percent correct, percent correct delayed"
16
+ Total,0.407,description_title,0.111,"total hits, hits total"
17
+ Response Latency Mean,0.676,keybert,0.683,"Response Latency, response latency trials, latency mean response"
18
+ Times Errors,0.515,keybert,0.447,"Errors Boxes, key errors boxes, errors times"
19
+ Strategy High,0.569,keybert,0.509,"Strategy, strategy finding, high strategy"
20
+ Within Errors,0.412,keybert,0.303,boxes times subject
21
+ Errors Total,0.593,keybert,0.537,"errors total times, Total Errors"
version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json ADDED
The diff for this file is too large to render. See raw diff
 
version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json ADDED
The diff for this file is too large to render. See raw diff
 
version2/outputs/approach_2/HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json ADDED
The diff for this file is too large to render. See raw diff
 
version2/outputs/approach_2/ai-mind-variable-descriptions_in__approach2_lod.json ADDED
@@ -0,0 +1,2716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": 0,
4
+ "name": "project",
5
+ "related": [
6
+ 109,
7
+ 120,
8
+ 124,
9
+ 130,
10
+ 136,
11
+ 143
12
+ ],
13
+ "type": "root",
14
+ "desc": "Root node",
15
+ "dtype": "determine",
16
+ "isShown": true,
17
+ "post_build_stats": {
18
+ "sibling_factor_nodes_inserted": 0,
19
+ "low_quality_nodes_dissolved": 0,
20
+ "group_prefix_labels_stripped": 6,
21
+ "dag_links_removed": 9
22
+ }
23
+ },
24
+ {
25
+ "id": 1,
26
+ "name": "DMSCC",
27
+ "related": [],
28
+ "type": "attribute",
29
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSCC | description: DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays). | Decimal Places: 2",
30
+ "dtype": "determine",
31
+ "isShown": true,
32
+ "metadata": {
33
+ "row_index": 0,
34
+ "group": "DMS > DMS Recommended Standard"
35
+ }
36
+ },
37
+ {
38
+ "id": 2,
39
+ "name": "DMSL0SD",
40
+ "related": [],
41
+ "type": "attribute",
42
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL0SD | description: DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
43
+ "dtype": "determine",
44
+ "isShown": true,
45
+ "metadata": {
46
+ "row_index": 1,
47
+ "group": "DMS > DMS Recommended Standard"
48
+ }
49
+ },
50
+ {
51
+ "id": 3,
52
+ "name": "DMSL12SD",
53
+ "related": [],
54
+ "type": "attribute",
55
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL12SD | description: DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
56
+ "dtype": "determine",
57
+ "isShown": true,
58
+ "metadata": {
59
+ "row_index": 2,
60
+ "group": "DMS > DMS Recommended Standard"
61
+ }
62
+ },
63
+ {
64
+ "id": 4,
65
+ "name": "DMSL4SD",
66
+ "related": [],
67
+ "type": "attribute",
68
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL4SD | description: DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
69
+ "dtype": "determine",
70
+ "isShown": true,
71
+ "metadata": {
72
+ "row_index": 3,
73
+ "group": "DMS > DMS Recommended Standard"
74
+ }
75
+ },
76
+ {
77
+ "id": 5,
78
+ "name": "DMSLADSD",
79
+ "related": [],
80
+ "type": "attribute",
81
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLADSD | description: DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
82
+ "dtype": "determine",
83
+ "isShown": true,
84
+ "metadata": {
85
+ "row_index": 4,
86
+ "group": "DMS > DMS Recommended Standard"
87
+ }
88
+ },
89
+ {
90
+ "id": 6,
91
+ "name": "DMSLSD",
92
+ "related": [],
93
+ "type": "attribute",
94
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLSD | description: DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
95
+ "dtype": "determine",
96
+ "isShown": true,
97
+ "metadata": {
98
+ "row_index": 5,
99
+ "group": "DMS > DMS Recommended Standard"
100
+ }
101
+ },
102
+ {
103
+ "id": 7,
104
+ "name": "DMSLSSD",
105
+ "related": [],
106
+ "type": "attribute",
107
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLSSD | description: DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations. | Decimal Places: 4",
108
+ "dtype": "determine",
109
+ "isShown": true,
110
+ "metadata": {
111
+ "row_index": 6,
112
+ "group": "DMS > DMS Recommended Standard"
113
+ }
114
+ },
115
+ {
116
+ "id": 8,
117
+ "name": "DMSMDL",
118
+ "related": [],
119
+ "type": "attribute",
120
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL | description: DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
121
+ "dtype": "determine",
122
+ "isShown": true,
123
+ "metadata": {
124
+ "row_index": 7,
125
+ "group": "DMS > DMS Recommended Standard"
126
+ }
127
+ },
128
+ {
129
+ "id": 9,
130
+ "name": "DMSMDL0",
131
+ "related": [],
132
+ "type": "attribute",
133
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL0 | description: DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
134
+ "dtype": "determine",
135
+ "isShown": true,
136
+ "metadata": {
137
+ "row_index": 8,
138
+ "group": "DMS > DMS Recommended Standard"
139
+ }
140
+ },
141
+ {
142
+ "id": 10,
143
+ "name": "DMSMDL12",
144
+ "related": [],
145
+ "type": "attribute",
146
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL12 | description: DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
147
+ "dtype": "determine",
148
+ "isShown": true,
149
+ "metadata": {
150
+ "row_index": 9,
151
+ "group": "DMS > DMS Recommended Standard"
152
+ }
153
+ },
154
+ {
155
+ "id": 11,
156
+ "name": "DMSMDL4",
157
+ "related": [],
158
+ "type": "attribute",
159
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL4 | description: DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
160
+ "dtype": "determine",
161
+ "isShown": true,
162
+ "metadata": {
163
+ "row_index": 10,
164
+ "group": "DMS > DMS Recommended Standard"
165
+ }
166
+ },
167
+ {
168
+ "id": 12,
169
+ "name": "DMSMDLAD",
170
+ "related": [],
171
+ "type": "attribute",
172
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLAD | description: DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
173
+ "dtype": "determine",
174
+ "isShown": true,
175
+ "metadata": {
176
+ "row_index": 11,
177
+ "group": "DMS > DMS Recommended Standard"
178
+ }
179
+ },
180
+ {
181
+ "id": 13,
182
+ "name": "DMSMDLS",
183
+ "related": [],
184
+ "type": "attribute",
185
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLS | description: DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4",
186
+ "dtype": "determine",
187
+ "isShown": true,
188
+ "metadata": {
189
+ "row_index": 12,
190
+ "group": "DMS > DMS Recommended Standard"
191
+ }
192
+ },
193
+ {
194
+ "id": 14,
195
+ "name": "DMSML",
196
+ "related": [],
197
+ "type": "attribute",
198
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML | description: DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
199
+ "dtype": "determine",
200
+ "isShown": true,
201
+ "metadata": {
202
+ "row_index": 13,
203
+ "group": "DMS > DMS Recommended Standard"
204
+ }
205
+ },
206
+ {
207
+ "id": 15,
208
+ "name": "DMSML0",
209
+ "related": [],
210
+ "type": "attribute",
211
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML0 | description: DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
212
+ "dtype": "determine",
213
+ "isShown": true,
214
+ "metadata": {
215
+ "row_index": 14,
216
+ "group": "DMS > DMS Recommended Standard"
217
+ }
218
+ },
219
+ {
220
+ "id": 16,
221
+ "name": "DMSML12",
222
+ "related": [],
223
+ "type": "attribute",
224
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML12 | description: DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
225
+ "dtype": "determine",
226
+ "isShown": true,
227
+ "metadata": {
228
+ "row_index": 15,
229
+ "group": "DMS > DMS Recommended Standard"
230
+ }
231
+ },
232
+ {
233
+ "id": 17,
234
+ "name": "DMSML4",
235
+ "related": [],
236
+ "type": "attribute",
237
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML4 | description: DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
238
+ "dtype": "determine",
239
+ "isShown": true,
240
+ "metadata": {
241
+ "row_index": 16,
242
+ "group": "DMS > DMS Recommended Standard"
243
+ }
244
+ },
245
+ {
246
+ "id": 18,
247
+ "name": "DMSMLAD",
248
+ "related": [],
249
+ "type": "attribute",
250
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMLAD | description: DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
251
+ "dtype": "determine",
252
+ "isShown": true,
253
+ "metadata": {
254
+ "row_index": 17,
255
+ "group": "DMS > DMS Recommended Standard"
256
+ }
257
+ },
258
+ {
259
+ "id": 19,
260
+ "name": "DMSMLS",
261
+ "related": [],
262
+ "type": "attribute",
263
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMLS | description: DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4",
264
+ "dtype": "determine",
265
+ "isShown": true,
266
+ "metadata": {
267
+ "row_index": 18,
268
+ "group": "DMS > DMS Recommended Standard"
269
+ }
270
+ },
271
+ {
272
+ "id": 20,
273
+ "name": "DMSPC",
274
+ "related": [],
275
+ "type": "attribute",
276
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC | description: DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0",
277
+ "dtype": "determine",
278
+ "isShown": true,
279
+ "metadata": {
280
+ "row_index": 19,
281
+ "group": "DMS > DMS Recommended Standard"
282
+ }
283
+ },
284
+ {
285
+ "id": 21,
286
+ "name": "DMSPC0",
287
+ "related": [],
288
+ "type": "attribute",
289
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC0 | description: KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 0",
290
+ "dtype": "determine",
291
+ "isShown": true,
292
+ "metadata": {
293
+ "row_index": 20,
294
+ "group": "DMS > DMS Recommended Standard"
295
+ }
296
+ },
297
+ {
298
+ "id": 22,
299
+ "name": "DMSPC12",
300
+ "related": [],
301
+ "type": "attribute",
302
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC12 | description: KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 0",
303
+ "dtype": "determine",
304
+ "isShown": true,
305
+ "metadata": {
306
+ "row_index": 21,
307
+ "group": "DMS > DMS Recommended Standard"
308
+ }
309
+ },
310
+ {
311
+ "id": 23,
312
+ "name": "DMSPC4",
313
+ "related": [],
314
+ "type": "attribute",
315
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC4 | description: KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay. | Decimal Places: 0",
316
+ "dtype": "determine",
317
+ "isShown": true,
318
+ "metadata": {
319
+ "row_index": 22,
320
+ "group": "DMS > DMS Recommended Standard"
321
+ }
322
+ },
323
+ {
324
+ "id": 24,
325
+ "name": "DMSPCAD",
326
+ "related": [],
327
+ "type": "attribute",
328
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPCAD | description: KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay. | Decimal Places: 0",
329
+ "dtype": "determine",
330
+ "isShown": true,
331
+ "metadata": {
332
+ "row_index": 23,
333
+ "group": "DMS > DMS Recommended Standard"
334
+ }
335
+ },
336
+ {
337
+ "id": 25,
338
+ "name": "DMSPCS",
339
+ "related": [],
340
+ "type": "attribute",
341
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPCS | description: KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli. | Decimal Places: 0",
342
+ "dtype": "determine",
343
+ "isShown": true,
344
+ "metadata": {
345
+ "row_index": 24,
346
+ "group": "DMS > DMS Recommended Standard"
347
+ }
348
+ },
349
+ {
350
+ "id": 26,
351
+ "name": "DMSPEGC",
352
+ "related": [],
353
+ "type": "attribute",
354
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGC | description: DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4",
355
+ "dtype": "determine",
356
+ "isShown": true,
357
+ "metadata": {
358
+ "row_index": 25,
359
+ "group": "DMS > DMS Recommended Standard"
360
+ }
361
+ },
362
+ {
363
+ "id": 27,
364
+ "name": "DMSPEGE",
365
+ "related": [],
366
+ "type": "attribute",
367
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGE | description: KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4",
368
+ "dtype": "determine",
369
+ "isShown": true,
370
+ "metadata": {
371
+ "row_index": 26,
372
+ "group": "DMS > DMS Recommended Standard"
373
+ }
374
+ },
375
+ {
376
+ "id": 28,
377
+ "name": "DMSTC",
378
+ "related": [],
379
+ "type": "attribute",
380
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC | description: DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0",
381
+ "dtype": "determine",
382
+ "isShown": true,
383
+ "metadata": {
384
+ "row_index": 27,
385
+ "group": "DMS > DMS Recommended Standard"
386
+ }
387
+ },
388
+ {
389
+ "id": 29,
390
+ "name": "DMSTC0",
391
+ "related": [],
392
+ "type": "attribute",
393
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC0 | description: DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds. | Decimal Places: 0",
394
+ "dtype": "determine",
395
+ "isShown": true,
396
+ "metadata": {
397
+ "row_index": 28,
398
+ "group": "DMS > DMS Recommended Standard"
399
+ }
400
+ },
401
+ {
402
+ "id": 30,
403
+ "name": "DMSTC12",
404
+ "related": [],
405
+ "type": "attribute",
406
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC12 | description: DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds. | Decimal Places: 0",
407
+ "dtype": "determine",
408
+ "isShown": true,
409
+ "metadata": {
410
+ "row_index": 29,
411
+ "group": "DMS > DMS Recommended Standard"
412
+ }
413
+ },
414
+ {
415
+ "id": 31,
416
+ "name": "DMSTC4",
417
+ "related": [],
418
+ "type": "attribute",
419
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC4 | description: DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds. | Decimal Places: 0",
420
+ "dtype": "determine",
421
+ "isShown": true,
422
+ "metadata": {
423
+ "row_index": 30,
424
+ "group": "DMS > DMS Recommended Standard"
425
+ }
426
+ },
427
+ {
428
+ "id": 32,
429
+ "name": "DMSTCAD",
430
+ "related": [],
431
+ "type": "attribute",
432
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTCAD | description: DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay. | Decimal Places: 0",
433
+ "dtype": "determine",
434
+ "isShown": true,
435
+ "metadata": {
436
+ "row_index": 31,
437
+ "group": "DMS > DMS Recommended Standard"
438
+ }
439
+ },
440
+ {
441
+ "id": 33,
442
+ "name": "DMSTCS",
443
+ "related": [],
444
+ "type": "attribute",
445
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTCS | description: DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli. | Decimal Places: 0",
446
+ "dtype": "determine",
447
+ "isShown": true,
448
+ "metadata": {
449
+ "row_index": 32,
450
+ "group": "DMS > DMS Recommended Standard"
451
+ }
452
+ },
453
+ {
454
+ "id": 34,
455
+ "name": "DMSTE",
456
+ "related": [],
457
+ "type": "attribute",
458
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTE | description: DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen. | Decimal Places: 0",
459
+ "dtype": "determine",
460
+ "isShown": true,
461
+ "metadata": {
462
+ "row_index": 33,
463
+ "group": "DMS > DMS Recommended Standard"
464
+ }
465
+ },
466
+ {
467
+ "id": 35,
468
+ "name": "DMSTEAD",
469
+ "related": [],
470
+ "type": "attribute",
471
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEAD | description: DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component. | Decimal Places: 0",
472
+ "dtype": "determine",
473
+ "isShown": true,
474
+ "metadata": {
475
+ "row_index": 34,
476
+ "group": "DMS > DMS Recommended Standard"
477
+ }
478
+ },
479
+ {
480
+ "id": 36,
481
+ "name": "DMSTEC",
482
+ "related": [],
483
+ "type": "attribute",
484
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEC | description: DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
485
+ "dtype": "determine",
486
+ "isShown": true,
487
+ "metadata": {
488
+ "row_index": 35,
489
+ "group": "DMS > DMS Recommended Standard"
490
+ }
491
+ },
492
+ {
493
+ "id": 37,
494
+ "name": "DMSTECAD",
495
+ "related": [],
496
+ "type": "attribute",
497
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTECAD | description: DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
498
+ "dtype": "determine",
499
+ "isShown": true,
500
+ "metadata": {
501
+ "row_index": 36,
502
+ "group": "DMS > DMS Recommended Standard"
503
+ }
504
+ },
505
+ {
506
+ "id": 38,
507
+ "name": "DMSTED",
508
+ "related": [],
509
+ "type": "attribute",
510
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTED | description: DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
511
+ "dtype": "determine",
512
+ "isShown": true,
513
+ "metadata": {
514
+ "row_index": 37,
515
+ "group": "DMS > DMS Recommended Standard"
516
+ }
517
+ },
518
+ {
519
+ "id": 39,
520
+ "name": "DMSTEDAD",
521
+ "related": [],
522
+ "type": "attribute",
523
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEDAD | description: DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
524
+ "dtype": "determine",
525
+ "isShown": true,
526
+ "metadata": {
527
+ "row_index": 38,
528
+ "group": "DMS > DMS Recommended Standard"
529
+ }
530
+ },
531
+ {
532
+ "id": 40,
533
+ "name": "DMSTEP",
534
+ "related": [],
535
+ "type": "attribute",
536
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEP | description: DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
537
+ "dtype": "determine",
538
+ "isShown": true,
539
+ "metadata": {
540
+ "row_index": 39,
541
+ "group": "DMS > DMS Recommended Standard"
542
+ }
543
+ },
544
+ {
545
+ "id": 41,
546
+ "name": "DMSTEPAD",
547
+ "related": [],
548
+ "type": "attribute",
549
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEPAD | description: DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
550
+ "dtype": "determine",
551
+ "isShown": true,
552
+ "metadata": {
553
+ "row_index": 40,
554
+ "group": "DMS > DMS Recommended Standard"
555
+ }
556
+ },
557
+ {
558
+ "id": 42,
559
+ "name": "MOTML",
560
+ "related": [],
561
+ "type": "attribute",
562
+ "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTML | description: The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 1",
563
+ "dtype": "determine",
564
+ "isShown": true,
565
+ "metadata": {
566
+ "row_index": 41,
567
+ "group": "MOT > MOT Tone 2.0"
568
+ }
569
+ },
570
+ {
571
+ "id": 43,
572
+ "name": "MOTSDL",
573
+ "related": [],
574
+ "type": "attribute",
575
+ "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTSDL | description: This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 2",
576
+ "dtype": "determine",
577
+ "isShown": true,
578
+ "metadata": {
579
+ "row_index": 42,
580
+ "group": "MOT > MOT Tone 2.0"
581
+ }
582
+ },
583
+ {
584
+ "id": 44,
585
+ "name": "MOTTC",
586
+ "related": [],
587
+ "type": "attribute",
588
+ "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTTC | description: The total number of assessment trials on which the subject made a correct response. | Decimal Places: 0",
589
+ "dtype": "determine",
590
+ "isShown": true,
591
+ "metadata": {
592
+ "row_index": 43,
593
+ "group": "MOT > MOT Tone 2.0"
594
+ }
595
+ },
596
+ {
597
+ "id": 45,
598
+ "name": "MOTTE",
599
+ "related": [],
600
+ "type": "attribute",
601
+ "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTTE | description: The total number of assessment trials on which the subject failed to make a correct response. | Decimal Places: 0",
602
+ "dtype": "determine",
603
+ "isShown": true,
604
+ "metadata": {
605
+ "row_index": 44,
606
+ "group": "MOT > MOT Tone 2.0"
607
+ }
608
+ },
609
+ {
610
+ "id": 46,
611
+ "name": "PALFAMS28",
612
+ "related": [],
613
+ "type": "attribute",
614
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALFAMS28 | description: KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
615
+ "dtype": "determine",
616
+ "isShown": true,
617
+ "metadata": {
618
+ "row_index": 45,
619
+ "group": "PAL > PAL Recommended Standard Extended"
620
+ }
621
+ },
622
+ {
623
+ "id": 47,
624
+ "name": "PALMETS28",
625
+ "related": [],
626
+ "type": "attribute",
627
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALMETS28 | description: PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
628
+ "dtype": "determine",
629
+ "isShown": true,
630
+ "metadata": {
631
+ "row_index": 46,
632
+ "group": "PAL > PAL Recommended Standard Extended"
633
+ }
634
+ },
635
+ {
636
+ "id": 48,
637
+ "name": "PALNPR28",
638
+ "related": [],
639
+ "type": "attribute",
640
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALNPR28 | description: PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached. | Decimal Places: 0",
641
+ "dtype": "determine",
642
+ "isShown": true,
643
+ "metadata": {
644
+ "row_index": 47,
645
+ "group": "PAL > PAL Recommended Standard Extended"
646
+ }
647
+ },
648
+ {
649
+ "id": 49,
650
+ "name": "PALTA12",
651
+ "related": [],
652
+ "type": "attribute",
653
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA12 | description: PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall. | Decimal Places: 0",
654
+ "dtype": "determine",
655
+ "isShown": true,
656
+ "metadata": {
657
+ "row_index": 48,
658
+ "group": "PAL > PAL Recommended Standard Extended"
659
+ }
660
+ },
661
+ {
662
+ "id": 50,
663
+ "name": "PALTA2",
664
+ "related": [],
665
+ "type": "attribute",
666
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA2 | description: PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall. | Decimal Places: 0",
667
+ "dtype": "determine",
668
+ "isShown": true,
669
+ "metadata": {
670
+ "row_index": 49,
671
+ "group": "PAL > PAL Recommended Standard Extended"
672
+ }
673
+ },
674
+ {
675
+ "id": 51,
676
+ "name": "PALTA28",
677
+ "related": [],
678
+ "type": "attribute",
679
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA28 | description: PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
680
+ "dtype": "determine",
681
+ "isShown": true,
682
+ "metadata": {
683
+ "row_index": 50,
684
+ "group": "PAL > PAL Recommended Standard Extended"
685
+ }
686
+ },
687
+ {
688
+ "id": 52,
689
+ "name": "PALTA4",
690
+ "related": [],
691
+ "type": "attribute",
692
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA4 | description: PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall. | Decimal Places: 0",
693
+ "dtype": "determine",
694
+ "isShown": true,
695
+ "metadata": {
696
+ "row_index": 51,
697
+ "group": "PAL > PAL Recommended Standard Extended"
698
+ }
699
+ },
700
+ {
701
+ "id": 53,
702
+ "name": "PALTA6",
703
+ "related": [],
704
+ "type": "attribute",
705
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA6 | description: PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall. | Decimal Places: 0",
706
+ "dtype": "determine",
707
+ "isShown": true,
708
+ "metadata": {
709
+ "row_index": 52,
710
+ "group": "PAL > PAL Recommended Standard Extended"
711
+ }
712
+ },
713
+ {
714
+ "id": 54,
715
+ "name": "PALTA8",
716
+ "related": [],
717
+ "type": "attribute",
718
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA8 | description: PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall. | Decimal Places: 0",
719
+ "dtype": "determine",
720
+ "isShown": true,
721
+ "metadata": {
722
+ "row_index": 53,
723
+ "group": "PAL > PAL Recommended Standard Extended"
724
+ }
725
+ },
726
+ {
727
+ "id": 55,
728
+ "name": "PALTE12",
729
+ "related": [],
730
+ "type": "attribute",
731
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE12 | description: PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials. | Decimal Places: 0",
732
+ "dtype": "determine",
733
+ "isShown": true,
734
+ "metadata": {
735
+ "row_index": 54,
736
+ "group": "PAL > PAL Recommended Standard Extended"
737
+ }
738
+ },
739
+ {
740
+ "id": 56,
741
+ "name": "PALTE2",
742
+ "related": [],
743
+ "type": "attribute",
744
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE2 | description: PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials. | Decimal Places: 0",
745
+ "dtype": "determine",
746
+ "isShown": true,
747
+ "metadata": {
748
+ "row_index": 55,
749
+ "group": "PAL > PAL Recommended Standard Extended"
750
+ }
751
+ },
752
+ {
753
+ "id": 57,
754
+ "name": "PALTE28",
755
+ "related": [],
756
+ "type": "attribute",
757
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE28 | description: PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
758
+ "dtype": "determine",
759
+ "isShown": true,
760
+ "metadata": {
761
+ "row_index": 56,
762
+ "group": "PAL > PAL Recommended Standard Extended"
763
+ }
764
+ },
765
+ {
766
+ "id": 58,
767
+ "name": "PALTE4",
768
+ "related": [],
769
+ "type": "attribute",
770
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE4 | description: PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials. | Decimal Places: 0",
771
+ "dtype": "determine",
772
+ "isShown": true,
773
+ "metadata": {
774
+ "row_index": 57,
775
+ "group": "PAL > PAL Recommended Standard Extended"
776
+ }
777
+ },
778
+ {
779
+ "id": 59,
780
+ "name": "PALTE6",
781
+ "related": [],
782
+ "type": "attribute",
783
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE6 | description: PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials. | Decimal Places: 0",
784
+ "dtype": "determine",
785
+ "isShown": true,
786
+ "metadata": {
787
+ "row_index": 58,
788
+ "group": "PAL > PAL Recommended Standard Extended"
789
+ }
790
+ },
791
+ {
792
+ "id": 60,
793
+ "name": "PALTE8",
794
+ "related": [],
795
+ "type": "attribute",
796
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE8 | description: PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials. | Decimal Places: 0",
797
+ "dtype": "determine",
798
+ "isShown": true,
799
+ "metadata": {
800
+ "row_index": 59,
801
+ "group": "PAL > PAL Recommended Standard Extended"
802
+ }
803
+ },
804
+ {
805
+ "id": 61,
806
+ "name": "PALTEA12",
807
+ "related": [],
808
+ "type": "attribute",
809
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA12 | description: PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
810
+ "dtype": "determine",
811
+ "isShown": true,
812
+ "metadata": {
813
+ "row_index": 60,
814
+ "group": "PAL > PAL Recommended Standard Extended"
815
+ }
816
+ },
817
+ {
818
+ "id": 62,
819
+ "name": "PALTEA2",
820
+ "related": [],
821
+ "type": "attribute",
822
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA2 | description: PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
823
+ "dtype": "determine",
824
+ "isShown": true,
825
+ "metadata": {
826
+ "row_index": 61,
827
+ "group": "PAL > PAL Recommended Standard Extended"
828
+ }
829
+ },
830
+ {
831
+ "id": 63,
832
+ "name": "PALTEA28",
833
+ "related": [],
834
+ "type": "attribute",
835
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA28 | description: KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task. In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
836
+ "dtype": "determine",
837
+ "isShown": true,
838
+ "metadata": {
839
+ "row_index": 62,
840
+ "group": "PAL > PAL Recommended Standard Extended"
841
+ }
842
+ },
843
+ {
844
+ "id": 64,
845
+ "name": "PALTEA4",
846
+ "related": [],
847
+ "type": "attribute",
848
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA4 | description: PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
849
+ "dtype": "determine",
850
+ "isShown": true,
851
+ "metadata": {
852
+ "row_index": 63,
853
+ "group": "PAL > PAL Recommended Standard Extended"
854
+ }
855
+ },
856
+ {
857
+ "id": 65,
858
+ "name": "PALTEA6",
859
+ "related": [],
860
+ "type": "attribute",
861
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA6 | description: PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
862
+ "dtype": "determine",
863
+ "isShown": true,
864
+ "metadata": {
865
+ "row_index": 64,
866
+ "group": "PAL > PAL Recommended Standard Extended"
867
+ }
868
+ },
869
+ {
870
+ "id": 66,
871
+ "name": "PALTEA8",
872
+ "related": [],
873
+ "type": "attribute",
874
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA8 | description: PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
875
+ "dtype": "determine",
876
+ "isShown": true,
877
+ "metadata": {
878
+ "row_index": 65,
879
+ "group": "PAL > PAL Recommended Standard Extended"
880
+ }
881
+ },
882
+ {
883
+ "id": 67,
884
+ "name": "PRMCLSDD",
885
+ "related": [],
886
+ "type": "attribute",
887
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDD | description: PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
888
+ "dtype": "determine",
889
+ "isShown": true,
890
+ "metadata": {
891
+ "row_index": 66,
892
+ "group": "PRM > PRM Recommended Standard 18 Extended"
893
+ }
894
+ },
895
+ {
896
+ "id": 68,
897
+ "name": "PRMCLSDI",
898
+ "related": [],
899
+ "type": "attribute",
900
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDI | description: PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
901
+ "dtype": "determine",
902
+ "isShown": true,
903
+ "metadata": {
904
+ "row_index": 67,
905
+ "group": "PRM > PRM Recommended Standard 18 Extended"
906
+ }
907
+ },
908
+ {
909
+ "id": 69,
910
+ "name": "PRMMCLD",
911
+ "related": [],
912
+ "type": "attribute",
913
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLD | description: PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
914
+ "dtype": "determine",
915
+ "isShown": true,
916
+ "metadata": {
917
+ "row_index": 68,
918
+ "group": "PRM > PRM Recommended Standard 18 Extended"
919
+ }
920
+ },
921
+ {
922
+ "id": 70,
923
+ "name": "PRMMCLI",
924
+ "related": [],
925
+ "type": "attribute",
926
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLI | description: PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
927
+ "dtype": "determine",
928
+ "isShown": true,
929
+ "metadata": {
930
+ "row_index": 69,
931
+ "group": "PRM > PRM Recommended Standard 18 Extended"
932
+ }
933
+ },
934
+ {
935
+ "id": 71,
936
+ "name": "PRMMDCLD",
937
+ "related": [],
938
+ "type": "attribute",
939
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLD | description: PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
940
+ "dtype": "determine",
941
+ "isShown": true,
942
+ "metadata": {
943
+ "row_index": 70,
944
+ "group": "PRM > PRM Recommended Standard 18 Extended"
945
+ }
946
+ },
947
+ {
948
+ "id": 72,
949
+ "name": "PRMMDCLI",
950
+ "related": [],
951
+ "type": "attribute",
952
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLI | description: PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
953
+ "dtype": "determine",
954
+ "isShown": true,
955
+ "metadata": {
956
+ "row_index": 71,
957
+ "group": "PRM > PRM Recommended Standard 18 Extended"
958
+ }
959
+ },
960
+ {
961
+ "id": 73,
962
+ "name": "PRMPCD",
963
+ "related": [],
964
+ "type": "attribute",
965
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCD | description: KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage. | Decimal Places: 2",
966
+ "dtype": "determine",
967
+ "isShown": true,
968
+ "metadata": {
969
+ "row_index": 72,
970
+ "group": "PRM > PRM Recommended Standard 18 Extended"
971
+ }
972
+ },
973
+ {
974
+ "id": 74,
975
+ "name": "PRMPCI",
976
+ "related": [],
977
+ "type": "attribute",
978
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCI | description: KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage. | Decimal Places: 2",
979
+ "dtype": "determine",
980
+ "isShown": true,
981
+ "metadata": {
982
+ "row_index": 73,
983
+ "group": "PRM > PRM Recommended Standard 18 Extended"
984
+ }
985
+ },
986
+ {
987
+ "id": 75,
988
+ "name": "PRMTSDSP",
989
+ "related": [],
990
+ "type": "attribute",
991
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMTSDSP | description: PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition. | Decimal Places: 2",
992
+ "dtype": "determine",
993
+ "isShown": true,
994
+ "metadata": {
995
+ "row_index": 74,
996
+ "group": "PRM > PRM Recommended Standard 18 Extended"
997
+ }
998
+ },
999
+ {
1000
+ "id": 76,
1001
+ "name": "RVPA",
1002
+ "related": [],
1003
+ "type": "attribute",
1004
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPA | description: KEY: RVP A′: A′ (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences. | Decimal Places: 4",
1005
+ "dtype": "determine",
1006
+ "isShown": true,
1007
+ "metadata": {
1008
+ "row_index": 75,
1009
+ "group": "RVP > RVP 3 Targets"
1010
+ }
1011
+ },
1012
+ {
1013
+ "id": 77,
1014
+ "name": "RVPLSD",
1015
+ "related": [],
1016
+ "type": "attribute",
1017
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPLSD | description: RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
1018
+ "dtype": "determine",
1019
+ "isShown": true,
1020
+ "metadata": {
1021
+ "row_index": 76,
1022
+ "group": "RVP > RVP 3 Targets"
1023
+ }
1024
+ },
1025
+ {
1026
+ "id": 78,
1027
+ "name": "RVPMDL",
1028
+ "related": [],
1029
+ "type": "attribute",
1030
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPMDL | description: KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
1031
+ "dtype": "determine",
1032
+ "isShown": true,
1033
+ "metadata": {
1034
+ "row_index": 77,
1035
+ "group": "RVP > RVP 3 Targets"
1036
+ }
1037
+ },
1038
+ {
1039
+ "id": 79,
1040
+ "name": "RVPML",
1041
+ "related": [],
1042
+ "type": "attribute",
1043
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPML | description: RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
1044
+ "dtype": "determine",
1045
+ "isShown": true,
1046
+ "metadata": {
1047
+ "row_index": 78,
1048
+ "group": "RVP > RVP 3 Targets"
1049
+ }
1050
+ },
1051
+ {
1052
+ "id": 80,
1053
+ "name": "RVPPFA",
1054
+ "related": [],
1055
+ "type": "attribute",
1056
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPPFA | description: KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms ÷ (False Alarms + Correct Rejections)) | Decimal Places: 4",
1057
+ "dtype": "determine",
1058
+ "isShown": true,
1059
+ "metadata": {
1060
+ "row_index": 79,
1061
+ "group": "RVP > RVP 3 Targets"
1062
+ }
1063
+ },
1064
+ {
1065
+ "id": 81,
1066
+ "name": "RVPPH",
1067
+ "related": [],
1068
+ "type": "attribute",
1069
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPPH | description: RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits ÷ total number of sequences) | Decimal Places: 4",
1070
+ "dtype": "determine",
1071
+ "isShown": true,
1072
+ "metadata": {
1073
+ "row_index": 80,
1074
+ "group": "RVP > RVP 3 Targets"
1075
+ }
1076
+ },
1077
+ {
1078
+ "id": 82,
1079
+ "name": "RVPTFA",
1080
+ "related": [],
1081
+ "type": "attribute",
1082
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTFA | description: RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms. | Decimal Places: 0",
1083
+ "dtype": "determine",
1084
+ "isShown": true,
1085
+ "metadata": {
1086
+ "row_index": 81,
1087
+ "group": "RVP > RVP 3 Targets"
1088
+ }
1089
+ },
1090
+ {
1091
+ "id": 83,
1092
+ "name": "RVPTH",
1093
+ "related": [],
1094
+ "type": "attribute",
1095
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTH | description: RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks. | Decimal Places: 0",
1096
+ "dtype": "determine",
1097
+ "isShown": true,
1098
+ "metadata": {
1099
+ "row_index": 82,
1100
+ "group": "RVP > RVP 3 Targets"
1101
+ }
1102
+ },
1103
+ {
1104
+ "id": 84,
1105
+ "name": "RVPTM",
1106
+ "related": [],
1107
+ "type": "attribute",
1108
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTM | description: RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks. | Decimal Places: 0",
1109
+ "dtype": "determine",
1110
+ "isShown": true,
1111
+ "metadata": {
1112
+ "row_index": 83,
1113
+ "group": "RVP > RVP 3 Targets"
1114
+ }
1115
+ },
1116
+ {
1117
+ "id": 85,
1118
+ "name": "SWMBE12",
1119
+ "related": [],
1120
+ "type": "attribute",
1121
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE12 | description: KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
1122
+ "dtype": "determine",
1123
+ "isShown": true,
1124
+ "metadata": {
1125
+ "row_index": 84,
1126
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1127
+ }
1128
+ },
1129
+ {
1130
+ "id": 86,
1131
+ "name": "SWMBE4",
1132
+ "related": [],
1133
+ "type": "attribute",
1134
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE4 | description: KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
1135
+ "dtype": "determine",
1136
+ "isShown": true,
1137
+ "metadata": {
1138
+ "row_index": 85,
1139
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1140
+ }
1141
+ },
1142
+ {
1143
+ "id": 87,
1144
+ "name": "SWMBE468",
1145
+ "related": [],
1146
+ "type": "attribute",
1147
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE468 | description: KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
1148
+ "dtype": "determine",
1149
+ "isShown": true,
1150
+ "metadata": {
1151
+ "row_index": 86,
1152
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1153
+ }
1154
+ },
1155
+ {
1156
+ "id": 88,
1157
+ "name": "SWMBE6",
1158
+ "related": [],
1159
+ "type": "attribute",
1160
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE6 | description: KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
1161
+ "dtype": "determine",
1162
+ "isShown": true,
1163
+ "metadata": {
1164
+ "row_index": 87,
1165
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1166
+ }
1167
+ },
1168
+ {
1169
+ "id": 89,
1170
+ "name": "SWMBE8",
1171
+ "related": [],
1172
+ "type": "attribute",
1173
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE8 | description: KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
1174
+ "dtype": "determine",
1175
+ "isShown": true,
1176
+ "metadata": {
1177
+ "row_index": 88,
1178
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1179
+ }
1180
+ },
1181
+ {
1182
+ "id": 90,
1183
+ "name": "SWMDE12",
1184
+ "related": [],
1185
+ "type": "attribute",
1186
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE12 | description: SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
1187
+ "dtype": "determine",
1188
+ "isShown": true,
1189
+ "metadata": {
1190
+ "row_index": 89,
1191
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1192
+ }
1193
+ },
1194
+ {
1195
+ "id": 91,
1196
+ "name": "SWMDE4",
1197
+ "related": [],
1198
+ "type": "attribute",
1199
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE4 | description: SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
1200
+ "dtype": "determine",
1201
+ "isShown": true,
1202
+ "metadata": {
1203
+ "row_index": 90,
1204
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1205
+ }
1206
+ },
1207
+ {
1208
+ "id": 92,
1209
+ "name": "SWMDE468",
1210
+ "related": [],
1211
+ "type": "attribute",
1212
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE468 | description: SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
1213
+ "dtype": "determine",
1214
+ "isShown": true,
1215
+ "metadata": {
1216
+ "row_index": 91,
1217
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1218
+ }
1219
+ },
1220
+ {
1221
+ "id": 93,
1222
+ "name": "SWMDE6",
1223
+ "related": [],
1224
+ "type": "attribute",
1225
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE6 | description: SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
1226
+ "dtype": "determine",
1227
+ "isShown": true,
1228
+ "metadata": {
1229
+ "row_index": 92,
1230
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1231
+ }
1232
+ },
1233
+ {
1234
+ "id": 94,
1235
+ "name": "SWMDE8",
1236
+ "related": [],
1237
+ "type": "attribute",
1238
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE8 | description: SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
1239
+ "dtype": "determine",
1240
+ "isShown": true,
1241
+ "metadata": {
1242
+ "row_index": 93,
1243
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1244
+ }
1245
+ },
1246
+ {
1247
+ "id": 95,
1248
+ "name": "SWMPR",
1249
+ "related": [],
1250
+ "type": "attribute",
1251
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMPR | description: SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete. | Decimal Places: 0",
1252
+ "dtype": "determine",
1253
+ "isShown": true,
1254
+ "metadata": {
1255
+ "row_index": 94,
1256
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1257
+ }
1258
+ },
1259
+ {
1260
+ "id": 96,
1261
+ "name": "SWMS",
1262
+ "related": [],
1263
+ "type": "attribute",
1264
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS | description: KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens. | Decimal Places: 0",
1265
+ "dtype": "determine",
1266
+ "isShown": true,
1267
+ "metadata": {
1268
+ "row_index": 95,
1269
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1270
+ }
1271
+ },
1272
+ {
1273
+ "id": 97,
1274
+ "name": "SWMS6",
1275
+ "related": [],
1276
+ "type": "attribute",
1277
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS6 | description: SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. | Decimal Places: 0",
1278
+ "dtype": "determine",
1279
+ "isShown": true,
1280
+ "metadata": {
1281
+ "row_index": 96,
1282
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1283
+ }
1284
+ },
1285
+ {
1286
+ "id": 98,
1287
+ "name": "SWMSX",
1288
+ "related": [],
1289
+ "type": "attribute",
1290
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMSX | description: SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more. | Decimal Places: 0",
1291
+ "dtype": "determine",
1292
+ "isShown": true,
1293
+ "metadata": {
1294
+ "row_index": 97,
1295
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1296
+ }
1297
+ },
1298
+ {
1299
+ "id": 99,
1300
+ "name": "SWMTE12",
1301
+ "related": [],
1302
+ "type": "attribute",
1303
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE12 | description: SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
1304
+ "dtype": "determine",
1305
+ "isShown": true,
1306
+ "metadata": {
1307
+ "row_index": 98,
1308
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1309
+ }
1310
+ },
1311
+ {
1312
+ "id": 100,
1313
+ "name": "SWMTE4",
1314
+ "related": [],
1315
+ "type": "attribute",
1316
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE4 | description: SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
1317
+ "dtype": "determine",
1318
+ "isShown": true,
1319
+ "metadata": {
1320
+ "row_index": 99,
1321
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1322
+ }
1323
+ },
1324
+ {
1325
+ "id": 101,
1326
+ "name": "SWMTE468",
1327
+ "related": [],
1328
+ "type": "attribute",
1329
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE468 | description: SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
1330
+ "dtype": "determine",
1331
+ "isShown": true,
1332
+ "metadata": {
1333
+ "row_index": 100,
1334
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1335
+ }
1336
+ },
1337
+ {
1338
+ "id": 102,
1339
+ "name": "SWMTE6",
1340
+ "related": [],
1341
+ "type": "attribute",
1342
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE6 | description: SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
1343
+ "dtype": "determine",
1344
+ "isShown": true,
1345
+ "metadata": {
1346
+ "row_index": 101,
1347
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1348
+ }
1349
+ },
1350
+ {
1351
+ "id": 103,
1352
+ "name": "SWMTE8",
1353
+ "related": [],
1354
+ "type": "attribute",
1355
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE8 | description: SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
1356
+ "dtype": "determine",
1357
+ "isShown": true,
1358
+ "metadata": {
1359
+ "row_index": 102,
1360
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1361
+ }
1362
+ },
1363
+ {
1364
+ "id": 104,
1365
+ "name": "SWMWE12",
1366
+ "related": [],
1367
+ "type": "attribute",
1368
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE12 | description: SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
1369
+ "dtype": "determine",
1370
+ "isShown": true,
1371
+ "metadata": {
1372
+ "row_index": 103,
1373
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1374
+ }
1375
+ },
1376
+ {
1377
+ "id": 105,
1378
+ "name": "SWMWE4",
1379
+ "related": [],
1380
+ "type": "attribute",
1381
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE4 | description: SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
1382
+ "dtype": "determine",
1383
+ "isShown": true,
1384
+ "metadata": {
1385
+ "row_index": 104,
1386
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1387
+ }
1388
+ },
1389
+ {
1390
+ "id": 106,
1391
+ "name": "SWMWE468",
1392
+ "related": [],
1393
+ "type": "attribute",
1394
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE468 | description: SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
1395
+ "dtype": "determine",
1396
+ "isShown": true,
1397
+ "metadata": {
1398
+ "row_index": 105,
1399
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1400
+ }
1401
+ },
1402
+ {
1403
+ "id": 107,
1404
+ "name": "SWMWE6",
1405
+ "related": [],
1406
+ "type": "attribute",
1407
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE6 | description: SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
1408
+ "dtype": "determine",
1409
+ "isShown": true,
1410
+ "metadata": {
1411
+ "row_index": 106,
1412
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1413
+ }
1414
+ },
1415
+ {
1416
+ "id": 108,
1417
+ "name": "SWMWE8",
1418
+ "related": [],
1419
+ "type": "attribute",
1420
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE8 | description: SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
1421
+ "dtype": "determine",
1422
+ "isShown": true,
1423
+ "metadata": {
1424
+ "row_index": 107,
1425
+ "group": "SWM > SWM Recommended Standard 2.0 Extended"
1426
+ }
1427
+ },
1428
+ {
1429
+ "id": 109,
1430
+ "name": "DMS",
1431
+ "related": [
1432
+ 110
1433
+ ],
1434
+ "type": "aggregation",
1435
+ "desc": "Group: DMS",
1436
+ "dtype": "determine",
1437
+ "isShown": true,
1438
+ "structure_provenance": {
1439
+ "route": "group_anchor",
1440
+ "aspect_method": null,
1441
+ "silhouette": null,
1442
+ "slot_coverage": null
1443
+ }
1444
+ },
1445
+ {
1446
+ "id": 110,
1447
+ "name": "Recommended Standard",
1448
+ "related": [
1449
+ 111,
1450
+ 115,
1451
+ 117,
1452
+ 118,
1453
+ 119,
1454
+ 1,
1455
+ 14,
1456
+ 26
1457
+ ],
1458
+ "type": "aggregation",
1459
+ "desc": "Group: DMS > DMS Recommended Standard",
1460
+ "dtype": "determine",
1461
+ "isShown": true,
1462
+ "structure_provenance": {
1463
+ "route": "group_anchor",
1464
+ "aspect_method": null,
1465
+ "silhouette": null,
1466
+ "slot_coverage": null,
1467
+ "phrase_regularity": 1.0,
1468
+ "route_used": "per_row_llm_extraction"
1469
+ }
1470
+ },
1471
+ {
1472
+ "id": 111,
1473
+ "name": "Correct Latency",
1474
+ "related": [
1475
+ 112,
1476
+ 113,
1477
+ 114
1478
+ ],
1479
+ "type": "aggregation",
1480
+ "desc": "Role: measure | Value: \"Correct Latency\" | Variables: 17 | Source: per-row LLM extraction (Zhu et al. 2025)",
1481
+ "dtype": "determine",
1482
+ "isShown": true,
1483
+ "label_provenance": {
1484
+ "label_source": "per_row_llm_role",
1485
+ "evidence_terms": [
1486
+ "Correct Latency"
1487
+ ],
1488
+ "confidence": 1.0,
1489
+ "llm_used": true,
1490
+ "llm_rejected": false,
1491
+ "role": "measure"
1492
+ },
1493
+ "structure_provenance": {
1494
+ "route": "per_row_llm_extraction",
1495
+ "aspect_method": "per_row_llm_extraction",
1496
+ "slot_role": "measure",
1497
+ "phrase_silhouette": null,
1498
+ "regularity": 1.0,
1499
+ "n_clusters": null
1500
+ }
1501
+ },
1502
+ {
1503
+ "id": 112,
1504
+ "name": "Standard Deviation",
1505
+ "related": [
1506
+ 2,
1507
+ 3,
1508
+ 4,
1509
+ 5,
1510
+ 7,
1511
+ 6
1512
+ ],
1513
+ "type": "aggregation",
1514
+ "desc": "Role: statistic | Value: \"Standard Deviation\" | Variables: 6 | Source: per-row LLM extraction (Zhu et al. 2025)",
1515
+ "dtype": "determine",
1516
+ "isShown": true,
1517
+ "label_provenance": {
1518
+ "label_source": "per_row_llm_role",
1519
+ "evidence_terms": [
1520
+ "Standard Deviation"
1521
+ ],
1522
+ "confidence": 1.0,
1523
+ "llm_used": true,
1524
+ "llm_rejected": false,
1525
+ "role": "statistic"
1526
+ },
1527
+ "structure_provenance": {
1528
+ "route": "per_row_llm_extraction",
1529
+ "aspect_method": "per_row_llm_extraction",
1530
+ "slot_role": "statistic",
1531
+ "phrase_silhouette": null,
1532
+ "regularity": 1.0,
1533
+ "n_clusters": null
1534
+ }
1535
+ },
1536
+ {
1537
+ "id": 113,
1538
+ "name": "Median",
1539
+ "related": [
1540
+ 9,
1541
+ 10,
1542
+ 11,
1543
+ 12,
1544
+ 13,
1545
+ 8
1546
+ ],
1547
+ "type": "aggregation",
1548
+ "desc": "Role: statistic | Value: \"Median\" | Variables: 6 | Source: per-row LLM extraction (Zhu et al. 2025)",
1549
+ "dtype": "determine",
1550
+ "isShown": true,
1551
+ "label_provenance": {
1552
+ "label_source": "per_row_llm_role",
1553
+ "evidence_terms": [
1554
+ "Median"
1555
+ ],
1556
+ "confidence": 1.0,
1557
+ "llm_used": true,
1558
+ "llm_rejected": false,
1559
+ "role": "statistic"
1560
+ },
1561
+ "structure_provenance": {
1562
+ "route": "per_row_llm_extraction",
1563
+ "aspect_method": "per_row_llm_extraction",
1564
+ "slot_role": "statistic",
1565
+ "phrase_silhouette": null,
1566
+ "regularity": 1.0,
1567
+ "n_clusters": null
1568
+ }
1569
+ },
1570
+ {
1571
+ "id": 114,
1572
+ "name": "Mean",
1573
+ "related": [
1574
+ 15,
1575
+ 16,
1576
+ 17,
1577
+ 18,
1578
+ 19
1579
+ ],
1580
+ "type": "aggregation",
1581
+ "desc": "Role: statistic | Value: \"Mean\" | Variables: 5 | Source: per-row LLM extraction (Zhu et al. 2025)",
1582
+ "dtype": "determine",
1583
+ "isShown": true,
1584
+ "label_provenance": {
1585
+ "label_source": "per_row_llm_role",
1586
+ "evidence_terms": [
1587
+ "Mean"
1588
+ ],
1589
+ "confidence": 1.0,
1590
+ "llm_used": true,
1591
+ "llm_rejected": false,
1592
+ "role": "statistic"
1593
+ },
1594
+ "structure_provenance": {
1595
+ "route": "per_row_llm_extraction",
1596
+ "aspect_method": "per_row_llm_extraction",
1597
+ "slot_role": "statistic",
1598
+ "phrase_silhouette": null,
1599
+ "regularity": 1.0,
1600
+ "n_clusters": null
1601
+ }
1602
+ },
1603
+ {
1604
+ "id": 115,
1605
+ "name": "Error",
1606
+ "related": [
1607
+ 116,
1608
+ 27,
1609
+ 36,
1610
+ 38,
1611
+ 40
1612
+ ],
1613
+ "type": "aggregation",
1614
+ "desc": "Role: measure | Value: \"Error\" | Variables: 7 | Source: per-row LLM extraction (Zhu et al. 2025)",
1615
+ "dtype": "determine",
1616
+ "isShown": true,
1617
+ "label_provenance": {
1618
+ "label_source": "per_row_llm_role",
1619
+ "evidence_terms": [
1620
+ "Error"
1621
+ ],
1622
+ "confidence": 1.0,
1623
+ "llm_used": true,
1624
+ "llm_rejected": false,
1625
+ "role": "measure"
1626
+ },
1627
+ "structure_provenance": {
1628
+ "route": "per_row_llm_extraction",
1629
+ "aspect_method": "per_row_llm_extraction",
1630
+ "slot_role": "measure",
1631
+ "phrase_silhouette": null,
1632
+ "regularity": 1.0,
1633
+ "n_clusters": null
1634
+ }
1635
+ },
1636
+ {
1637
+ "id": 116,
1638
+ "name": "All Delays",
1639
+ "related": [
1640
+ 37,
1641
+ 39,
1642
+ 41
1643
+ ],
1644
+ "type": "aggregation",
1645
+ "desc": "Role: condition | Value: \"all delays\" | Variables: 3 | Source: per-row LLM extraction (Zhu et al. 2025)",
1646
+ "dtype": "determine",
1647
+ "isShown": true,
1648
+ "label_provenance": {
1649
+ "label_source": "per_row_llm_role",
1650
+ "evidence_terms": [
1651
+ "all delays"
1652
+ ],
1653
+ "confidence": 1.0,
1654
+ "llm_used": true,
1655
+ "llm_rejected": false,
1656
+ "role": "condition"
1657
+ },
1658
+ "structure_provenance": {
1659
+ "route": "per_row_llm_extraction",
1660
+ "aspect_method": "per_row_llm_extraction",
1661
+ "slot_role": "condition",
1662
+ "phrase_silhouette": null,
1663
+ "regularity": 1.0,
1664
+ "n_clusters": null
1665
+ }
1666
+ },
1667
+ {
1668
+ "id": 117,
1669
+ "name": "Percent Correct",
1670
+ "related": [
1671
+ 20,
1672
+ 21,
1673
+ 22,
1674
+ 23,
1675
+ 24,
1676
+ 25
1677
+ ],
1678
+ "type": "aggregation",
1679
+ "desc": "Role: measure | Value: \"Percent Correct\" | Variables: 6 | Source: per-row LLM extraction (Zhu et al. 2025)",
1680
+ "dtype": "determine",
1681
+ "isShown": true,
1682
+ "label_provenance": {
1683
+ "label_source": "per_row_llm_role",
1684
+ "evidence_terms": [
1685
+ "Percent Correct"
1686
+ ],
1687
+ "confidence": 1.0,
1688
+ "llm_used": true,
1689
+ "llm_rejected": false,
1690
+ "role": "measure"
1691
+ },
1692
+ "structure_provenance": {
1693
+ "route": "per_row_llm_extraction",
1694
+ "aspect_method": "per_row_llm_extraction",
1695
+ "slot_role": "measure",
1696
+ "phrase_silhouette": null,
1697
+ "regularity": 1.0,
1698
+ "n_clusters": null
1699
+ }
1700
+ },
1701
+ {
1702
+ "id": 118,
1703
+ "name": "Correct",
1704
+ "related": [
1705
+ 29,
1706
+ 30,
1707
+ 31,
1708
+ 32,
1709
+ 33,
1710
+ 28
1711
+ ],
1712
+ "type": "aggregation",
1713
+ "desc": "Role: measure | Value: \"Correct\" | Variables: 6 | Source: per-row LLM extraction (Zhu et al. 2025)",
1714
+ "dtype": "determine",
1715
+ "isShown": true,
1716
+ "label_provenance": {
1717
+ "label_source": "per_row_llm_role",
1718
+ "evidence_terms": [
1719
+ "Correct"
1720
+ ],
1721
+ "confidence": 1.0,
1722
+ "llm_used": true,
1723
+ "llm_rejected": false,
1724
+ "role": "measure"
1725
+ },
1726
+ "structure_provenance": {
1727
+ "route": "per_row_llm_extraction",
1728
+ "aspect_method": "per_row_llm_extraction",
1729
+ "slot_role": "measure",
1730
+ "phrase_silhouette": null,
1731
+ "regularity": 1.0,
1732
+ "n_clusters": null
1733
+ }
1734
+ },
1735
+ {
1736
+ "id": 119,
1737
+ "name": "Errors",
1738
+ "related": [
1739
+ 34,
1740
+ 35
1741
+ ],
1742
+ "type": "aggregation",
1743
+ "desc": "Role: measure | Value: \"Errors\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
1744
+ "dtype": "determine",
1745
+ "isShown": true,
1746
+ "label_provenance": {
1747
+ "label_source": "per_row_llm_role",
1748
+ "evidence_terms": [
1749
+ "Errors"
1750
+ ],
1751
+ "confidence": 1.0,
1752
+ "llm_used": true,
1753
+ "llm_rejected": false,
1754
+ "role": "measure"
1755
+ },
1756
+ "structure_provenance": {
1757
+ "route": "per_row_llm_extraction",
1758
+ "aspect_method": "per_row_llm_extraction",
1759
+ "slot_role": "measure",
1760
+ "phrase_silhouette": null,
1761
+ "regularity": 1.0,
1762
+ "n_clusters": null
1763
+ }
1764
+ },
1765
+ {
1766
+ "id": 120,
1767
+ "name": "MOT",
1768
+ "related": [
1769
+ 121
1770
+ ],
1771
+ "type": "aggregation",
1772
+ "desc": "Group: MOT",
1773
+ "dtype": "determine",
1774
+ "isShown": true,
1775
+ "structure_provenance": {
1776
+ "route": "group_anchor",
1777
+ "aspect_method": null,
1778
+ "silhouette": null,
1779
+ "slot_coverage": null
1780
+ }
1781
+ },
1782
+ {
1783
+ "id": 121,
1784
+ "name": "Tone 2.0",
1785
+ "related": [
1786
+ 122,
1787
+ 123
1788
+ ],
1789
+ "type": "aggregation",
1790
+ "desc": "Group: MOT > MOT Tone 2.0",
1791
+ "dtype": "determine",
1792
+ "isShown": true,
1793
+ "structure_provenance": {
1794
+ "route": "group_anchor",
1795
+ "aspect_method": null,
1796
+ "silhouette": null,
1797
+ "slot_coverage": null,
1798
+ "phrase_regularity": 1.0,
1799
+ "route_used": "per_row_llm_extraction"
1800
+ }
1801
+ },
1802
+ {
1803
+ "id": 122,
1804
+ "name": "Latency",
1805
+ "related": [
1806
+ 42,
1807
+ 43
1808
+ ],
1809
+ "type": "aggregation",
1810
+ "desc": "Role: measure | Value: \"latency\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
1811
+ "dtype": "determine",
1812
+ "isShown": true,
1813
+ "label_provenance": {
1814
+ "label_source": "per_row_llm_role",
1815
+ "evidence_terms": [
1816
+ "latency"
1817
+ ],
1818
+ "confidence": 1.0,
1819
+ "llm_used": true,
1820
+ "llm_rejected": false,
1821
+ "role": "measure"
1822
+ },
1823
+ "structure_provenance": {
1824
+ "route": "per_row_llm_extraction",
1825
+ "aspect_method": "per_row_llm_extraction",
1826
+ "slot_role": "measure",
1827
+ "phrase_silhouette": null,
1828
+ "regularity": 1.0,
1829
+ "n_clusters": null
1830
+ }
1831
+ },
1832
+ {
1833
+ "id": 123,
1834
+ "name": "Assessment Trials",
1835
+ "related": [
1836
+ 44,
1837
+ 45
1838
+ ],
1839
+ "type": "aggregation",
1840
+ "desc": "Role: measure | Value: \"assessment trials\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
1841
+ "dtype": "determine",
1842
+ "isShown": true,
1843
+ "label_provenance": {
1844
+ "label_source": "per_row_llm_role",
1845
+ "evidence_terms": [
1846
+ "assessment trials"
1847
+ ],
1848
+ "confidence": 1.0,
1849
+ "llm_used": true,
1850
+ "llm_rejected": false,
1851
+ "role": "measure"
1852
+ },
1853
+ "structure_provenance": {
1854
+ "route": "per_row_llm_extraction",
1855
+ "aspect_method": "per_row_llm_extraction",
1856
+ "slot_role": "measure",
1857
+ "phrase_silhouette": null,
1858
+ "regularity": 1.0,
1859
+ "n_clusters": null
1860
+ }
1861
+ },
1862
+ {
1863
+ "id": 124,
1864
+ "name": "PAL",
1865
+ "related": [
1866
+ 125
1867
+ ],
1868
+ "type": "aggregation",
1869
+ "desc": "Group: PAL",
1870
+ "dtype": "determine",
1871
+ "isShown": true,
1872
+ "structure_provenance": {
1873
+ "route": "group_anchor",
1874
+ "aspect_method": null,
1875
+ "silhouette": null,
1876
+ "slot_coverage": null
1877
+ }
1878
+ },
1879
+ {
1880
+ "id": 125,
1881
+ "name": "Recommended Standard Extended",
1882
+ "related": [
1883
+ 126,
1884
+ 128,
1885
+ 46,
1886
+ 52,
1887
+ 65,
1888
+ 66,
1889
+ 48
1890
+ ],
1891
+ "type": "aggregation",
1892
+ "desc": "Group: PAL > PAL Recommended Standard Extended",
1893
+ "dtype": "determine",
1894
+ "isShown": true,
1895
+ "structure_provenance": {
1896
+ "route": "group_anchor",
1897
+ "aspect_method": null,
1898
+ "silhouette": null,
1899
+ "slot_coverage": null,
1900
+ "phrase_regularity": 1.0,
1901
+ "route_used": "per_row_llm_extraction"
1902
+ }
1903
+ },
1904
+ {
1905
+ "id": 126,
1906
+ "name": "Errors",
1907
+ "related": [
1908
+ 127,
1909
+ 47
1910
+ ],
1911
+ "type": "aggregation",
1912
+ "desc": "Role: measure | Value: \"Errors\" | Variables: 11 | Source: per-row LLM extraction (Zhu et al. 2025)",
1913
+ "dtype": "determine",
1914
+ "isShown": true,
1915
+ "label_provenance": {
1916
+ "label_source": "per_row_llm_role",
1917
+ "evidence_terms": [
1918
+ "Errors"
1919
+ ],
1920
+ "confidence": 1.0,
1921
+ "llm_used": true,
1922
+ "llm_rejected": false,
1923
+ "role": "measure"
1924
+ },
1925
+ "structure_provenance": {
1926
+ "route": "per_row_llm_extraction",
1927
+ "aspect_method": "per_row_llm_extraction",
1928
+ "slot_role": "measure",
1929
+ "phrase_silhouette": null,
1930
+ "regularity": 1.0,
1931
+ "n_clusters": null
1932
+ }
1933
+ },
1934
+ {
1935
+ "id": 127,
1936
+ "name": "Total",
1937
+ "related": [
1938
+ 55,
1939
+ 56,
1940
+ 58,
1941
+ 59,
1942
+ 60,
1943
+ 61,
1944
+ 62,
1945
+ 63,
1946
+ 64,
1947
+ 57
1948
+ ],
1949
+ "type": "aggregation",
1950
+ "desc": "Role: statistic | Value: \"Total\" | Variables: 10 | Source: per-row LLM extraction (Zhu et al. 2025)",
1951
+ "dtype": "determine",
1952
+ "isShown": true,
1953
+ "label_provenance": {
1954
+ "label_source": "per_row_llm_role",
1955
+ "evidence_terms": [
1956
+ "Total"
1957
+ ],
1958
+ "confidence": 1.0,
1959
+ "llm_used": true,
1960
+ "llm_rejected": false,
1961
+ "role": "statistic"
1962
+ },
1963
+ "structure_provenance": {
1964
+ "route": "per_row_llm_extraction",
1965
+ "aspect_method": "per_row_llm_extraction",
1966
+ "slot_role": "statistic",
1967
+ "phrase_silhouette": null,
1968
+ "regularity": 1.0,
1969
+ "n_clusters": null
1970
+ }
1971
+ },
1972
+ {
1973
+ "id": 128,
1974
+ "name": "Attempts",
1975
+ "related": [
1976
+ 129,
1977
+ 51
1978
+ ],
1979
+ "type": "aggregation",
1980
+ "desc": "Role: measure | Value: \"Attempts\" | Variables: 5 | Source: per-row LLM extraction (Zhu et al. 2025)",
1981
+ "dtype": "determine",
1982
+ "isShown": true,
1983
+ "label_provenance": {
1984
+ "label_source": "per_row_llm_role",
1985
+ "evidence_terms": [
1986
+ "Attempts"
1987
+ ],
1988
+ "confidence": 1.0,
1989
+ "llm_used": true,
1990
+ "llm_rejected": false,
1991
+ "role": "measure"
1992
+ },
1993
+ "structure_provenance": {
1994
+ "route": "per_row_llm_extraction",
1995
+ "aspect_method": "per_row_llm_extraction",
1996
+ "slot_role": "measure",
1997
+ "phrase_silhouette": null,
1998
+ "regularity": 1.0,
1999
+ "n_clusters": null
2000
+ }
2001
+ },
2002
+ {
2003
+ "id": 129,
2004
+ "name": "Total",
2005
+ "related": [
2006
+ 49,
2007
+ 50,
2008
+ 53,
2009
+ 54
2010
+ ],
2011
+ "type": "aggregation",
2012
+ "desc": "Role: statistic | Value: \"Total\" | Variables: 4 | Source: per-row LLM extraction (Zhu et al. 2025)",
2013
+ "dtype": "determine",
2014
+ "isShown": true,
2015
+ "label_provenance": {
2016
+ "label_source": "per_row_llm_role",
2017
+ "evidence_terms": [
2018
+ "Total"
2019
+ ],
2020
+ "confidence": 1.0,
2021
+ "llm_used": true,
2022
+ "llm_rejected": false,
2023
+ "role": "statistic"
2024
+ },
2025
+ "structure_provenance": {
2026
+ "route": "per_row_llm_extraction",
2027
+ "aspect_method": "per_row_llm_extraction",
2028
+ "slot_role": "statistic",
2029
+ "phrase_silhouette": null,
2030
+ "regularity": 1.0,
2031
+ "n_clusters": null
2032
+ }
2033
+ },
2034
+ {
2035
+ "id": 130,
2036
+ "name": "PRM",
2037
+ "related": [
2038
+ 131
2039
+ ],
2040
+ "type": "aggregation",
2041
+ "desc": "Group: PRM",
2042
+ "dtype": "determine",
2043
+ "isShown": true,
2044
+ "structure_provenance": {
2045
+ "route": "group_anchor",
2046
+ "aspect_method": null,
2047
+ "silhouette": null,
2048
+ "slot_coverage": null
2049
+ }
2050
+ },
2051
+ {
2052
+ "id": 131,
2053
+ "name": "Recommended Standard 18 Extended",
2054
+ "related": [
2055
+ 132,
2056
+ 134,
2057
+ 135,
2058
+ 75
2059
+ ],
2060
+ "type": "aggregation",
2061
+ "desc": "Group: PRM > PRM Recommended Standard 18 Extended",
2062
+ "dtype": "determine",
2063
+ "isShown": true,
2064
+ "structure_provenance": {
2065
+ "route": "group_anchor",
2066
+ "aspect_method": null,
2067
+ "silhouette": null,
2068
+ "slot_coverage": null,
2069
+ "phrase_regularity": 1.0,
2070
+ "route_used": "per_row_llm_extraction"
2071
+ }
2072
+ },
2073
+ {
2074
+ "id": 132,
2075
+ "name": "Latency",
2076
+ "related": [
2077
+ 133,
2078
+ 67,
2079
+ 68
2080
+ ],
2081
+ "type": "aggregation",
2082
+ "desc": "Role: measure | Value: \"latency\" | Variables: 4 | Source: per-row LLM extraction (Zhu et al. 2025)",
2083
+ "dtype": "determine",
2084
+ "isShown": true,
2085
+ "label_provenance": {
2086
+ "label_source": "per_row_llm_role",
2087
+ "evidence_terms": [
2088
+ "latency"
2089
+ ],
2090
+ "confidence": 1.0,
2091
+ "llm_used": true,
2092
+ "llm_rejected": false,
2093
+ "role": "measure"
2094
+ },
2095
+ "structure_provenance": {
2096
+ "route": "per_row_llm_extraction",
2097
+ "aspect_method": "per_row_llm_extraction",
2098
+ "slot_role": "measure",
2099
+ "phrase_silhouette": null,
2100
+ "regularity": 1.0,
2101
+ "n_clusters": null
2102
+ }
2103
+ },
2104
+ {
2105
+ "id": 133,
2106
+ "name": "Median",
2107
+ "related": [
2108
+ 71,
2109
+ 72
2110
+ ],
2111
+ "type": "aggregation",
2112
+ "desc": "Role: statistic | Value: \"median\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
2113
+ "dtype": "determine",
2114
+ "isShown": true,
2115
+ "label_provenance": {
2116
+ "label_source": "per_row_llm_role",
2117
+ "evidence_terms": [
2118
+ "median"
2119
+ ],
2120
+ "confidence": 1.0,
2121
+ "llm_used": true,
2122
+ "llm_rejected": false,
2123
+ "role": "statistic"
2124
+ },
2125
+ "structure_provenance": {
2126
+ "route": "per_row_llm_extraction",
2127
+ "aspect_method": "per_row_llm_extraction",
2128
+ "slot_role": "statistic",
2129
+ "phrase_silhouette": null,
2130
+ "regularity": 1.0,
2131
+ "n_clusters": null
2132
+ }
2133
+ },
2134
+ {
2135
+ "id": 134,
2136
+ "name": "Latency",
2137
+ "related": [
2138
+ 69,
2139
+ 70
2140
+ ],
2141
+ "type": "aggregation",
2142
+ "desc": "Role: measure | Value: \"Latency\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
2143
+ "dtype": "determine",
2144
+ "isShown": true,
2145
+ "label_provenance": {
2146
+ "label_source": "per_row_llm_role",
2147
+ "evidence_terms": [
2148
+ "Latency"
2149
+ ],
2150
+ "confidence": 1.0,
2151
+ "llm_used": true,
2152
+ "llm_rejected": false,
2153
+ "role": "measure"
2154
+ },
2155
+ "structure_provenance": {
2156
+ "route": "per_row_llm_extraction",
2157
+ "aspect_method": "per_row_llm_extraction",
2158
+ "slot_role": "measure",
2159
+ "phrase_silhouette": null,
2160
+ "regularity": 1.0,
2161
+ "n_clusters": null
2162
+ }
2163
+ },
2164
+ {
2165
+ "id": 135,
2166
+ "name": "Percent Correct",
2167
+ "related": [
2168
+ 73,
2169
+ 74
2170
+ ],
2171
+ "type": "aggregation",
2172
+ "desc": "Role: measure | Value: \"Percent Correct\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
2173
+ "dtype": "determine",
2174
+ "isShown": true,
2175
+ "label_provenance": {
2176
+ "label_source": "per_row_llm_role",
2177
+ "evidence_terms": [
2178
+ "Percent Correct"
2179
+ ],
2180
+ "confidence": 1.0,
2181
+ "llm_used": true,
2182
+ "llm_rejected": false,
2183
+ "role": "measure"
2184
+ },
2185
+ "structure_provenance": {
2186
+ "route": "per_row_llm_extraction",
2187
+ "aspect_method": "per_row_llm_extraction",
2188
+ "slot_role": "measure",
2189
+ "phrase_silhouette": null,
2190
+ "regularity": 1.0,
2191
+ "n_clusters": null
2192
+ }
2193
+ },
2194
+ {
2195
+ "id": 136,
2196
+ "name": "RVP",
2197
+ "related": [
2198
+ 137
2199
+ ],
2200
+ "type": "aggregation",
2201
+ "desc": "Group: RVP",
2202
+ "dtype": "determine",
2203
+ "isShown": true,
2204
+ "structure_provenance": {
2205
+ "route": "group_anchor",
2206
+ "aspect_method": null,
2207
+ "silhouette": null,
2208
+ "slot_coverage": null
2209
+ }
2210
+ },
2211
+ {
2212
+ "id": 137,
2213
+ "name": "3 Targets",
2214
+ "related": [
2215
+ 138,
2216
+ 139,
2217
+ 142
2218
+ ],
2219
+ "type": "aggregation",
2220
+ "desc": "Group: RVP > RVP 3 Targets",
2221
+ "dtype": "determine",
2222
+ "isShown": true,
2223
+ "structure_provenance": {
2224
+ "route": "group_anchor",
2225
+ "aspect_method": null,
2226
+ "silhouette": null,
2227
+ "slot_coverage": null,
2228
+ "phrase_regularity": 0.2222,
2229
+ "route_used": "aspect_clustering_fallback"
2230
+ }
2231
+ },
2232
+ {
2233
+ "id": 138,
2234
+ "name": "Response Latency",
2235
+ "related": [
2236
+ 77,
2237
+ 78,
2238
+ 79
2239
+ ],
2240
+ "type": "aggregation",
2241
+ "desc": "Role: measure | Value: \"Response Latency\" | Variables: 3 | Source: per-row LLM extraction (Zhu et al. 2025)",
2242
+ "dtype": "determine",
2243
+ "isShown": true,
2244
+ "label_provenance": {
2245
+ "label_source": "per_row_llm_role",
2246
+ "evidence_terms": [
2247
+ "Response Latency"
2248
+ ],
2249
+ "confidence": 1.0,
2250
+ "llm_used": true,
2251
+ "llm_rejected": false,
2252
+ "role": "measure"
2253
+ },
2254
+ "structure_provenance": {
2255
+ "route": "per_row_llm_extraction",
2256
+ "aspect_method": "per_row_llm_extraction",
2257
+ "slot_role": "measure",
2258
+ "phrase_silhouette": null,
2259
+ "regularity": 1.0,
2260
+ "n_clusters": null
2261
+ }
2262
+ },
2263
+ {
2264
+ "id": 139,
2265
+ "name": "False Alarm Number",
2266
+ "related": [
2267
+ 76,
2268
+ 140,
2269
+ 141
2270
+ ],
2271
+ "type": "aggregation",
2272
+ "desc": "Aspect: trials / latency / response latency / response | Silhouette: 0.626 | Variables: 6",
2273
+ "dtype": "determine",
2274
+ "isShown": true,
2275
+ "label_provenance": {
2276
+ "label_source": "llm",
2277
+ "evidence_terms": [
2278
+ "false",
2279
+ "alarms",
2280
+ "number"
2281
+ ],
2282
+ "confidence": 0.97,
2283
+ "llm_used": true,
2284
+ "llm_rejected": false,
2285
+ "llm_raw_label": "False Alarm Number",
2286
+ "llm_reason": "accepted"
2287
+ },
2288
+ "structure_provenance": {
2289
+ "route": "aspect_clustering",
2290
+ "aspect_method": "nmf",
2291
+ "silhouette": 0.6255,
2292
+ "slot_coverage": null
2293
+ }
2294
+ },
2295
+ {
2296
+ "id": 140,
2297
+ "name": "False Alarm Presentations",
2298
+ "related": [
2299
+ 80,
2300
+ 82
2301
+ ],
2302
+ "type": "aggregation",
2303
+ "desc": "Aspect: trials / latency / response latency / response | Silhouette: 0.601 | Variables: 2",
2304
+ "dtype": "determine",
2305
+ "isShown": true,
2306
+ "label_provenance": {
2307
+ "label_source": "llm",
2308
+ "evidence_terms": [
2309
+ "false",
2310
+ "alarms",
2311
+ "presentations"
2312
+ ],
2313
+ "confidence": 0.97,
2314
+ "llm_used": true,
2315
+ "llm_rejected": false,
2316
+ "llm_raw_label": "False Alarm Presentations",
2317
+ "llm_reason": "accepted"
2318
+ },
2319
+ "structure_provenance": {
2320
+ "route": "aspect_clustering",
2321
+ "aspect_method": "nmf",
2322
+ "silhouette": 0.6006,
2323
+ "slot_coverage": null
2324
+ }
2325
+ },
2326
+ {
2327
+ "id": 141,
2328
+ "name": "Target Hit Number",
2329
+ "related": [
2330
+ 81,
2331
+ 83,
2332
+ 84
2333
+ ],
2334
+ "type": "aggregation",
2335
+ "desc": "Aspect: trials / latency / response latency / response | Silhouette: 0.601 | Variables: 3",
2336
+ "dtype": "determine",
2337
+ "isShown": true,
2338
+ "label_provenance": {
2339
+ "label_source": "llm",
2340
+ "evidence_terms": [
2341
+ "number",
2342
+ "target",
2343
+ "hits"
2344
+ ],
2345
+ "confidence": 0.92,
2346
+ "llm_used": true,
2347
+ "llm_rejected": false,
2348
+ "llm_raw_label": "Target Hit Number",
2349
+ "llm_reason": "accepted"
2350
+ },
2351
+ "structure_provenance": {
2352
+ "route": "aspect_clustering",
2353
+ "aspect_method": "nmf",
2354
+ "silhouette": 0.6006,
2355
+ "slot_coverage": null
2356
+ }
2357
+ },
2358
+ {
2359
+ "id": 142,
2360
+ "name": "RVP Response Latency",
2361
+ "related": [],
2362
+ "type": "aggregation",
2363
+ "desc": "Aspect: trials / latency / response latency / response | Silhouette: 0.626 | Variables: 3",
2364
+ "dtype": "determine",
2365
+ "isShown": true,
2366
+ "label_provenance": {
2367
+ "label_source": "llm",
2368
+ "evidence_terms": [
2369
+ "response latency",
2370
+ "latency",
2371
+ "response"
2372
+ ],
2373
+ "confidence": 0.97,
2374
+ "llm_used": true,
2375
+ "llm_rejected": false,
2376
+ "llm_raw_label": "RVP Response Latency",
2377
+ "llm_reason": "accepted"
2378
+ },
2379
+ "structure_provenance": {
2380
+ "route": "aspect_clustering",
2381
+ "aspect_method": "nmf",
2382
+ "silhouette": 0.6255,
2383
+ "slot_coverage": null
2384
+ }
2385
+ },
2386
+ {
2387
+ "id": 143,
2388
+ "name": "SWM",
2389
+ "related": [
2390
+ 144
2391
+ ],
2392
+ "type": "aggregation",
2393
+ "desc": "Group: SWM",
2394
+ "dtype": "determine",
2395
+ "isShown": true,
2396
+ "structure_provenance": {
2397
+ "route": "group_anchor",
2398
+ "aspect_method": null,
2399
+ "silhouette": null,
2400
+ "slot_coverage": null
2401
+ }
2402
+ },
2403
+ {
2404
+ "id": 144,
2405
+ "name": "Recommended Standard 2.0 Extended",
2406
+ "related": [
2407
+ 145,
2408
+ 152,
2409
+ 153,
2410
+ 87,
2411
+ 92,
2412
+ 95,
2413
+ 97
2414
+ ],
2415
+ "type": "aggregation",
2416
+ "desc": "Group: SWM > SWM Recommended Standard 2.0 Extended",
2417
+ "dtype": "determine",
2418
+ "isShown": true,
2419
+ "structure_provenance": {
2420
+ "route": "group_anchor",
2421
+ "aspect_method": null,
2422
+ "silhouette": null,
2423
+ "slot_coverage": null,
2424
+ "phrase_regularity": 1.0,
2425
+ "route_used": "per_row_llm_extraction"
2426
+ }
2427
+ },
2428
+ {
2429
+ "id": 145,
2430
+ "name": "Errors",
2431
+ "related": [
2432
+ 146,
2433
+ 147,
2434
+ 148,
2435
+ 151
2436
+ ],
2437
+ "type": "aggregation",
2438
+ "desc": "Role: measure | Value: \"errors\" | Variables: 16 | Source: per-row LLM extraction (Zhu et al. 2025)",
2439
+ "dtype": "determine",
2440
+ "isShown": true,
2441
+ "label_provenance": {
2442
+ "label_source": "per_row_llm_role",
2443
+ "evidence_terms": [
2444
+ "errors"
2445
+ ],
2446
+ "confidence": 1.0,
2447
+ "llm_used": true,
2448
+ "llm_rejected": false,
2449
+ "role": "measure"
2450
+ },
2451
+ "structure_provenance": {
2452
+ "route": "per_row_llm_extraction",
2453
+ "aspect_method": "per_row_llm_extraction",
2454
+ "slot_role": "measure",
2455
+ "phrase_silhouette": null,
2456
+ "regularity": 1.0,
2457
+ "n_clusters": null
2458
+ }
2459
+ },
2460
+ {
2461
+ "id": 146,
2462
+ "name": "Total",
2463
+ "related": [
2464
+ 99,
2465
+ 100,
2466
+ 101
2467
+ ],
2468
+ "type": "aggregation",
2469
+ "desc": "Role: statistic | Value: \"total\" | Variables: 3 | Source: per-row LLM extraction (Zhu et al. 2025)",
2470
+ "dtype": "determine",
2471
+ "isShown": true,
2472
+ "label_provenance": {
2473
+ "label_source": "per_row_llm_role",
2474
+ "evidence_terms": [
2475
+ "total"
2476
+ ],
2477
+ "confidence": 1.0,
2478
+ "llm_used": true,
2479
+ "llm_rejected": false,
2480
+ "role": "statistic"
2481
+ },
2482
+ "structure_provenance": {
2483
+ "route": "per_row_llm_extraction",
2484
+ "aspect_method": "per_row_llm_extraction",
2485
+ "slot_role": "statistic",
2486
+ "phrase_silhouette": null,
2487
+ "regularity": 1.0,
2488
+ "n_clusters": null
2489
+ }
2490
+ },
2491
+ {
2492
+ "id": 147,
2493
+ "name": "Total",
2494
+ "related": [
2495
+ 102,
2496
+ 103
2497
+ ],
2498
+ "type": "aggregation",
2499
+ "desc": "Role: statistic | Value: \"Total\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
2500
+ "dtype": "determine",
2501
+ "isShown": true,
2502
+ "label_provenance": {
2503
+ "label_source": "per_row_llm_role",
2504
+ "evidence_terms": [
2505
+ "Total"
2506
+ ],
2507
+ "confidence": 1.0,
2508
+ "llm_used": true,
2509
+ "llm_rejected": false,
2510
+ "role": "statistic"
2511
+ },
2512
+ "structure_provenance": {
2513
+ "route": "per_row_llm_extraction",
2514
+ "aspect_method": "per_row_llm_extraction",
2515
+ "slot_role": "statistic",
2516
+ "phrase_silhouette": null,
2517
+ "regularity": 1.0,
2518
+ "n_clusters": null
2519
+ }
2520
+ },
2521
+ {
2522
+ "id": 148,
2523
+ "name": "SWM box revisits",
2524
+ "related": [
2525
+ 149,
2526
+ 150
2527
+ ],
2528
+ "type": "aggregation",
2529
+ "desc": "Aspect: swm / token previously / description key / box token | Silhouette: 0.794 | Variables: 7",
2530
+ "dtype": "determine",
2531
+ "isShown": true,
2532
+ "label_provenance": {
2533
+ "label_source": "llm",
2534
+ "evidence_terms": [
2535
+ "swm",
2536
+ "box",
2537
+ "revisits"
2538
+ ],
2539
+ "confidence": 0.95,
2540
+ "llm_used": true,
2541
+ "llm_rejected": false,
2542
+ "llm_raw_label": "SWM box revisits",
2543
+ "llm_reason": "accepted"
2544
+ },
2545
+ "structure_provenance": {
2546
+ "route": "aspect_clustering",
2547
+ "aspect_method": "nmf",
2548
+ "silhouette": 0.7936,
2549
+ "slot_coverage": null
2550
+ }
2551
+ },
2552
+ {
2553
+ "id": 149,
2554
+ "name": "SWM token previously",
2555
+ "related": [
2556
+ 86,
2557
+ 88
2558
+ ],
2559
+ "type": "aggregation",
2560
+ "desc": "Aspect: swm / token previously / description key / box token | Silhouette: 0.731 | Variables: 2",
2561
+ "dtype": "determine",
2562
+ "isShown": true,
2563
+ "label_provenance": {
2564
+ "label_source": "llm",
2565
+ "evidence_terms": [
2566
+ "token",
2567
+ "previously",
2568
+ "key",
2569
+ "swm"
2570
+ ],
2571
+ "confidence": 0.92,
2572
+ "llm_used": true,
2573
+ "llm_rejected": false,
2574
+ "llm_raw_label": "SWM token previously",
2575
+ "llm_reason": "accepted"
2576
+ },
2577
+ "structure_provenance": {
2578
+ "route": "aspect_clustering",
2579
+ "aspect_method": "nmf",
2580
+ "silhouette": 0.7313,
2581
+ "slot_coverage": null
2582
+ }
2583
+ },
2584
+ {
2585
+ "id": 150,
2586
+ "name": "SWM Within Errors Box Revisits",
2587
+ "related": [
2588
+ 104,
2589
+ 105,
2590
+ 106,
2591
+ 107,
2592
+ 108
2593
+ ],
2594
+ "type": "aggregation",
2595
+ "desc": "Aspect: swm / token previously / description key / box token | Silhouette: 0.731 | Variables: 5",
2596
+ "dtype": "determine",
2597
+ "isShown": true,
2598
+ "label_provenance": {
2599
+ "label_source": "llm",
2600
+ "evidence_terms": [
2601
+ "search",
2602
+ "box search",
2603
+ "description swm",
2604
+ "swmwe4",
2605
+ "swmwe8",
2606
+ "swmwe6",
2607
+ "extended swmwe8",
2608
+ "extended swmwe6"
2609
+ ],
2610
+ "confidence": 0.92,
2611
+ "llm_used": true,
2612
+ "llm_rejected": false,
2613
+ "llm_raw_label": "SWM Within Errors Box Revisits",
2614
+ "llm_reason": "accepted"
2615
+ },
2616
+ "structure_provenance": {
2617
+ "route": "aspect_clustering",
2618
+ "aspect_method": "nmf",
2619
+ "silhouette": 0.7313,
2620
+ "slot_coverage": null
2621
+ }
2622
+ },
2623
+ {
2624
+ "id": 151,
2625
+ "name": "SWM Double Errors",
2626
+ "related": [
2627
+ 90,
2628
+ 91,
2629
+ 93,
2630
+ 94
2631
+ ],
2632
+ "type": "aggregation",
2633
+ "desc": "Aspect: swm / token previously / description key / box token | Silhouette: 0.794 | Variables: 4",
2634
+ "dtype": "determine",
2635
+ "isShown": true,
2636
+ "label_provenance": {
2637
+ "label_source": "llm",
2638
+ "evidence_terms": [
2639
+ "swm double",
2640
+ "double errors",
2641
+ "error"
2642
+ ],
2643
+ "confidence": 0.98,
2644
+ "llm_used": true,
2645
+ "llm_rejected": false,
2646
+ "llm_raw_label": "SWM Double Errors",
2647
+ "llm_reason": "accepted"
2648
+ },
2649
+ "structure_provenance": {
2650
+ "route": "aspect_clustering",
2651
+ "aspect_method": "nmf",
2652
+ "silhouette": 0.7936,
2653
+ "slot_coverage": null
2654
+ }
2655
+ },
2656
+ {
2657
+ "id": 152,
2658
+ "name": "Between Errors",
2659
+ "related": [
2660
+ 85,
2661
+ 89
2662
+ ],
2663
+ "type": "aggregation",
2664
+ "desc": "Role: measure | Value: \"Between errors\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
2665
+ "dtype": "determine",
2666
+ "isShown": true,
2667
+ "label_provenance": {
2668
+ "label_source": "per_row_llm_role",
2669
+ "evidence_terms": [
2670
+ "Between errors"
2671
+ ],
2672
+ "confidence": 1.0,
2673
+ "llm_used": true,
2674
+ "llm_rejected": false,
2675
+ "role": "measure"
2676
+ },
2677
+ "structure_provenance": {
2678
+ "route": "per_row_llm_extraction",
2679
+ "aspect_method": "per_row_llm_extraction",
2680
+ "slot_role": "measure",
2681
+ "phrase_silhouette": null,
2682
+ "regularity": 1.0,
2683
+ "n_clusters": null
2684
+ }
2685
+ },
2686
+ {
2687
+ "id": 153,
2688
+ "name": "Swm Strategy",
2689
+ "related": [
2690
+ 96,
2691
+ 98
2692
+ ],
2693
+ "type": "aggregation",
2694
+ "desc": "Role: measure | Value: \"SWM Strategy\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
2695
+ "dtype": "determine",
2696
+ "isShown": true,
2697
+ "label_provenance": {
2698
+ "label_source": "per_row_llm_role",
2699
+ "evidence_terms": [
2700
+ "SWM Strategy"
2701
+ ],
2702
+ "confidence": 1.0,
2703
+ "llm_used": true,
2704
+ "llm_rejected": false,
2705
+ "role": "measure"
2706
+ },
2707
+ "structure_provenance": {
2708
+ "route": "per_row_llm_extraction",
2709
+ "aspect_method": "per_row_llm_extraction",
2710
+ "slot_role": "measure",
2711
+ "phrase_silhouette": null,
2712
+ "regularity": 1.0,
2713
+ "n_clusters": null
2714
+ }
2715
+ }
2716
+ ]
version2/outputs/baseline/HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json ADDED
The diff for this file is too large to render. See raw diff
 
version2/outputs/baseline/ai-mind-variable-descriptions_in__baseline_hierarchy.json ADDED
@@ -0,0 +1,1876 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": 0,
4
+ "name": "ai-mind-variable-descriptions(in)",
5
+ "type": "root",
6
+ "dtype": "root",
7
+ "isShown": true,
8
+ "related": [
9
+ 109,
10
+ 114,
11
+ 115,
12
+ 122,
13
+ 128,
14
+ 132,
15
+ 139,
16
+ 140
17
+ ],
18
+ "desc": "Root node"
19
+ },
20
+ {
21
+ "id": 1,
22
+ "name": "DMSCC",
23
+ "dtype": "determine",
24
+ "related": [],
25
+ "isShown": true,
26
+ "type": "attribute",
27
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSCC | description: DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays). | Decimal Places: 2",
28
+ "metadata": {
29
+ "leaf_id": "DMS > DMS Recommended Standard.DMSCC",
30
+ "group_path": "DMS > DMS Recommended Standard"
31
+ }
32
+ },
33
+ {
34
+ "id": 2,
35
+ "name": "DMSL0SD",
36
+ "dtype": "determine",
37
+ "related": [],
38
+ "isShown": true,
39
+ "type": "attribute",
40
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL0SD | description: DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
41
+ "metadata": {
42
+ "leaf_id": "DMS > DMS Recommended Standard.DMSL0SD",
43
+ "group_path": "DMS > DMS Recommended Standard"
44
+ }
45
+ },
46
+ {
47
+ "id": 3,
48
+ "name": "DMSL12SD",
49
+ "dtype": "determine",
50
+ "related": [],
51
+ "isShown": true,
52
+ "type": "attribute",
53
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL12SD | description: DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
54
+ "metadata": {
55
+ "leaf_id": "DMS > DMS Recommended Standard.DMSL12SD",
56
+ "group_path": "DMS > DMS Recommended Standard"
57
+ }
58
+ },
59
+ {
60
+ "id": 4,
61
+ "name": "DMSL4SD",
62
+ "dtype": "determine",
63
+ "related": [],
64
+ "isShown": true,
65
+ "type": "attribute",
66
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL4SD | description: DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
67
+ "metadata": {
68
+ "leaf_id": "DMS > DMS Recommended Standard.DMSL4SD",
69
+ "group_path": "DMS > DMS Recommended Standard"
70
+ }
71
+ },
72
+ {
73
+ "id": 5,
74
+ "name": "DMSLADSD",
75
+ "dtype": "determine",
76
+ "related": [],
77
+ "isShown": true,
78
+ "type": "attribute",
79
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLADSD | description: DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
80
+ "metadata": {
81
+ "leaf_id": "DMS > DMS Recommended Standard.DMSLADSD",
82
+ "group_path": "DMS > DMS Recommended Standard"
83
+ }
84
+ },
85
+ {
86
+ "id": 6,
87
+ "name": "DMSLSD",
88
+ "dtype": "determine",
89
+ "related": [],
90
+ "isShown": true,
91
+ "type": "attribute",
92
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLSD | description: DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
93
+ "metadata": {
94
+ "leaf_id": "DMS > DMS Recommended Standard.DMSLSD",
95
+ "group_path": "DMS > DMS Recommended Standard"
96
+ }
97
+ },
98
+ {
99
+ "id": 7,
100
+ "name": "DMSLSSD",
101
+ "dtype": "determine",
102
+ "related": [],
103
+ "isShown": true,
104
+ "type": "attribute",
105
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLSSD | description: DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations. | Decimal Places: 4",
106
+ "metadata": {
107
+ "leaf_id": "DMS > DMS Recommended Standard.DMSLSSD",
108
+ "group_path": "DMS > DMS Recommended Standard"
109
+ }
110
+ },
111
+ {
112
+ "id": 8,
113
+ "name": "DMSMDL",
114
+ "dtype": "determine",
115
+ "related": [],
116
+ "isShown": true,
117
+ "type": "attribute",
118
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL | description: DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
119
+ "metadata": {
120
+ "leaf_id": "DMS > DMS Recommended Standard.DMSMDL",
121
+ "group_path": "DMS > DMS Recommended Standard"
122
+ }
123
+ },
124
+ {
125
+ "id": 9,
126
+ "name": "DMSMDL0",
127
+ "dtype": "determine",
128
+ "related": [],
129
+ "isShown": true,
130
+ "type": "attribute",
131
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL0 | description: DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
132
+ "metadata": {
133
+ "leaf_id": "DMS > DMS Recommended Standard.DMSMDL0",
134
+ "group_path": "DMS > DMS Recommended Standard"
135
+ }
136
+ },
137
+ {
138
+ "id": 10,
139
+ "name": "DMSMDL12",
140
+ "dtype": "determine",
141
+ "related": [],
142
+ "isShown": true,
143
+ "type": "attribute",
144
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL12 | description: DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
145
+ "metadata": {
146
+ "leaf_id": "DMS > DMS Recommended Standard.DMSMDL12",
147
+ "group_path": "DMS > DMS Recommended Standard"
148
+ }
149
+ },
150
+ {
151
+ "id": 11,
152
+ "name": "DMSMDL4",
153
+ "dtype": "determine",
154
+ "related": [],
155
+ "isShown": true,
156
+ "type": "attribute",
157
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL4 | description: DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
158
+ "metadata": {
159
+ "leaf_id": "DMS > DMS Recommended Standard.DMSMDL4",
160
+ "group_path": "DMS > DMS Recommended Standard"
161
+ }
162
+ },
163
+ {
164
+ "id": 12,
165
+ "name": "DMSMDLAD",
166
+ "dtype": "determine",
167
+ "related": [],
168
+ "isShown": true,
169
+ "type": "attribute",
170
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLAD | description: DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
171
+ "metadata": {
172
+ "leaf_id": "DMS > DMS Recommended Standard.DMSMDLAD",
173
+ "group_path": "DMS > DMS Recommended Standard"
174
+ }
175
+ },
176
+ {
177
+ "id": 13,
178
+ "name": "DMSMDLS",
179
+ "dtype": "determine",
180
+ "related": [],
181
+ "isShown": true,
182
+ "type": "attribute",
183
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLS | description: DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4",
184
+ "metadata": {
185
+ "leaf_id": "DMS > DMS Recommended Standard.DMSMDLS",
186
+ "group_path": "DMS > DMS Recommended Standard"
187
+ }
188
+ },
189
+ {
190
+ "id": 14,
191
+ "name": "DMSML",
192
+ "dtype": "determine",
193
+ "related": [],
194
+ "isShown": true,
195
+ "type": "attribute",
196
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML | description: DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
197
+ "metadata": {
198
+ "leaf_id": "DMS > DMS Recommended Standard.DMSML",
199
+ "group_path": "DMS > DMS Recommended Standard"
200
+ }
201
+ },
202
+ {
203
+ "id": 15,
204
+ "name": "DMSML0",
205
+ "dtype": "determine",
206
+ "related": [],
207
+ "isShown": true,
208
+ "type": "attribute",
209
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML0 | description: DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
210
+ "metadata": {
211
+ "leaf_id": "DMS > DMS Recommended Standard.DMSML0",
212
+ "group_path": "DMS > DMS Recommended Standard"
213
+ }
214
+ },
215
+ {
216
+ "id": 16,
217
+ "name": "DMSML12",
218
+ "dtype": "determine",
219
+ "related": [],
220
+ "isShown": true,
221
+ "type": "attribute",
222
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML12 | description: DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
223
+ "metadata": {
224
+ "leaf_id": "DMS > DMS Recommended Standard.DMSML12",
225
+ "group_path": "DMS > DMS Recommended Standard"
226
+ }
227
+ },
228
+ {
229
+ "id": 17,
230
+ "name": "DMSML4",
231
+ "dtype": "determine",
232
+ "related": [],
233
+ "isShown": true,
234
+ "type": "attribute",
235
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML4 | description: DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
236
+ "metadata": {
237
+ "leaf_id": "DMS > DMS Recommended Standard.DMSML4",
238
+ "group_path": "DMS > DMS Recommended Standard"
239
+ }
240
+ },
241
+ {
242
+ "id": 18,
243
+ "name": "DMSMLAD",
244
+ "dtype": "determine",
245
+ "related": [],
246
+ "isShown": true,
247
+ "type": "attribute",
248
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMLAD | description: DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
249
+ "metadata": {
250
+ "leaf_id": "DMS > DMS Recommended Standard.DMSMLAD",
251
+ "group_path": "DMS > DMS Recommended Standard"
252
+ }
253
+ },
254
+ {
255
+ "id": 19,
256
+ "name": "DMSMLS",
257
+ "dtype": "determine",
258
+ "related": [],
259
+ "isShown": true,
260
+ "type": "attribute",
261
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMLS | description: DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4",
262
+ "metadata": {
263
+ "leaf_id": "DMS > DMS Recommended Standard.DMSMLS",
264
+ "group_path": "DMS > DMS Recommended Standard"
265
+ }
266
+ },
267
+ {
268
+ "id": 20,
269
+ "name": "DMSPC",
270
+ "dtype": "determine",
271
+ "related": [],
272
+ "isShown": true,
273
+ "type": "attribute",
274
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC | description: DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0",
275
+ "metadata": {
276
+ "leaf_id": "DMS > DMS Recommended Standard.DMSPC",
277
+ "group_path": "DMS > DMS Recommended Standard"
278
+ }
279
+ },
280
+ {
281
+ "id": 21,
282
+ "name": "DMSPC0",
283
+ "dtype": "determine",
284
+ "related": [],
285
+ "isShown": true,
286
+ "type": "attribute",
287
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC0 | description: KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 0",
288
+ "metadata": {
289
+ "leaf_id": "DMS > DMS Recommended Standard.DMSPC0",
290
+ "group_path": "DMS > DMS Recommended Standard"
291
+ }
292
+ },
293
+ {
294
+ "id": 22,
295
+ "name": "DMSPC12",
296
+ "dtype": "determine",
297
+ "related": [],
298
+ "isShown": true,
299
+ "type": "attribute",
300
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC12 | description: KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 0",
301
+ "metadata": {
302
+ "leaf_id": "DMS > DMS Recommended Standard.DMSPC12",
303
+ "group_path": "DMS > DMS Recommended Standard"
304
+ }
305
+ },
306
+ {
307
+ "id": 23,
308
+ "name": "DMSPC4",
309
+ "dtype": "determine",
310
+ "related": [],
311
+ "isShown": true,
312
+ "type": "attribute",
313
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC4 | description: KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay. | Decimal Places: 0",
314
+ "metadata": {
315
+ "leaf_id": "DMS > DMS Recommended Standard.DMSPC4",
316
+ "group_path": "DMS > DMS Recommended Standard"
317
+ }
318
+ },
319
+ {
320
+ "id": 24,
321
+ "name": "DMSPCAD",
322
+ "dtype": "determine",
323
+ "related": [],
324
+ "isShown": true,
325
+ "type": "attribute",
326
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPCAD | description: KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay. | Decimal Places: 0",
327
+ "metadata": {
328
+ "leaf_id": "DMS > DMS Recommended Standard.DMSPCAD",
329
+ "group_path": "DMS > DMS Recommended Standard"
330
+ }
331
+ },
332
+ {
333
+ "id": 25,
334
+ "name": "DMSPCS",
335
+ "dtype": "determine",
336
+ "related": [],
337
+ "isShown": true,
338
+ "type": "attribute",
339
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPCS | description: KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli. | Decimal Places: 0",
340
+ "metadata": {
341
+ "leaf_id": "DMS > DMS Recommended Standard.DMSPCS",
342
+ "group_path": "DMS > DMS Recommended Standard"
343
+ }
344
+ },
345
+ {
346
+ "id": 26,
347
+ "name": "DMSPEGC",
348
+ "dtype": "determine",
349
+ "related": [],
350
+ "isShown": true,
351
+ "type": "attribute",
352
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGC | description: DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4",
353
+ "metadata": {
354
+ "leaf_id": "DMS > DMS Recommended Standard.DMSPEGC",
355
+ "group_path": "DMS > DMS Recommended Standard"
356
+ }
357
+ },
358
+ {
359
+ "id": 27,
360
+ "name": "DMSPEGE",
361
+ "dtype": "determine",
362
+ "related": [],
363
+ "isShown": true,
364
+ "type": "attribute",
365
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGE | description: KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4",
366
+ "metadata": {
367
+ "leaf_id": "DMS > DMS Recommended Standard.DMSPEGE",
368
+ "group_path": "DMS > DMS Recommended Standard"
369
+ }
370
+ },
371
+ {
372
+ "id": 28,
373
+ "name": "DMSTC",
374
+ "dtype": "determine",
375
+ "related": [],
376
+ "isShown": true,
377
+ "type": "attribute",
378
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC | description: DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0",
379
+ "metadata": {
380
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTC",
381
+ "group_path": "DMS > DMS Recommended Standard"
382
+ }
383
+ },
384
+ {
385
+ "id": 29,
386
+ "name": "DMSTC0",
387
+ "dtype": "determine",
388
+ "related": [],
389
+ "isShown": true,
390
+ "type": "attribute",
391
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC0 | description: DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds. | Decimal Places: 0",
392
+ "metadata": {
393
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTC0",
394
+ "group_path": "DMS > DMS Recommended Standard"
395
+ }
396
+ },
397
+ {
398
+ "id": 30,
399
+ "name": "DMSTC12",
400
+ "dtype": "determine",
401
+ "related": [],
402
+ "isShown": true,
403
+ "type": "attribute",
404
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC12 | description: DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds. | Decimal Places: 0",
405
+ "metadata": {
406
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTC12",
407
+ "group_path": "DMS > DMS Recommended Standard"
408
+ }
409
+ },
410
+ {
411
+ "id": 31,
412
+ "name": "DMSTC4",
413
+ "dtype": "determine",
414
+ "related": [],
415
+ "isShown": true,
416
+ "type": "attribute",
417
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC4 | description: DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds. | Decimal Places: 0",
418
+ "metadata": {
419
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTC4",
420
+ "group_path": "DMS > DMS Recommended Standard"
421
+ }
422
+ },
423
+ {
424
+ "id": 32,
425
+ "name": "DMSTCAD",
426
+ "dtype": "determine",
427
+ "related": [],
428
+ "isShown": true,
429
+ "type": "attribute",
430
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTCAD | description: DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay. | Decimal Places: 0",
431
+ "metadata": {
432
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTCAD",
433
+ "group_path": "DMS > DMS Recommended Standard"
434
+ }
435
+ },
436
+ {
437
+ "id": 33,
438
+ "name": "DMSTCS",
439
+ "dtype": "determine",
440
+ "related": [],
441
+ "isShown": true,
442
+ "type": "attribute",
443
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTCS | description: DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli. | Decimal Places: 0",
444
+ "metadata": {
445
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTCS",
446
+ "group_path": "DMS > DMS Recommended Standard"
447
+ }
448
+ },
449
+ {
450
+ "id": 34,
451
+ "name": "DMSTE",
452
+ "dtype": "determine",
453
+ "related": [],
454
+ "isShown": true,
455
+ "type": "attribute",
456
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTE | description: DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen. | Decimal Places: 0",
457
+ "metadata": {
458
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTE",
459
+ "group_path": "DMS > DMS Recommended Standard"
460
+ }
461
+ },
462
+ {
463
+ "id": 35,
464
+ "name": "DMSTEAD",
465
+ "dtype": "determine",
466
+ "related": [],
467
+ "isShown": true,
468
+ "type": "attribute",
469
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEAD | description: DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component. | Decimal Places: 0",
470
+ "metadata": {
471
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTEAD",
472
+ "group_path": "DMS > DMS Recommended Standard"
473
+ }
474
+ },
475
+ {
476
+ "id": 36,
477
+ "name": "DMSTEC",
478
+ "dtype": "determine",
479
+ "related": [],
480
+ "isShown": true,
481
+ "type": "attribute",
482
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEC | description: DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
483
+ "metadata": {
484
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTEC",
485
+ "group_path": "DMS > DMS Recommended Standard"
486
+ }
487
+ },
488
+ {
489
+ "id": 37,
490
+ "name": "DMSTECAD",
491
+ "dtype": "determine",
492
+ "related": [],
493
+ "isShown": true,
494
+ "type": "attribute",
495
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTECAD | description: DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
496
+ "metadata": {
497
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTECAD",
498
+ "group_path": "DMS > DMS Recommended Standard"
499
+ }
500
+ },
501
+ {
502
+ "id": 38,
503
+ "name": "DMSTED",
504
+ "dtype": "determine",
505
+ "related": [],
506
+ "isShown": true,
507
+ "type": "attribute",
508
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTED | description: DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
509
+ "metadata": {
510
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTED",
511
+ "group_path": "DMS > DMS Recommended Standard"
512
+ }
513
+ },
514
+ {
515
+ "id": 39,
516
+ "name": "DMSTEDAD",
517
+ "dtype": "determine",
518
+ "related": [],
519
+ "isShown": true,
520
+ "type": "attribute",
521
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEDAD | description: DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
522
+ "metadata": {
523
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTEDAD",
524
+ "group_path": "DMS > DMS Recommended Standard"
525
+ }
526
+ },
527
+ {
528
+ "id": 40,
529
+ "name": "DMSTEP",
530
+ "dtype": "determine",
531
+ "related": [],
532
+ "isShown": true,
533
+ "type": "attribute",
534
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEP | description: DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
535
+ "metadata": {
536
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTEP",
537
+ "group_path": "DMS > DMS Recommended Standard"
538
+ }
539
+ },
540
+ {
541
+ "id": 41,
542
+ "name": "DMSTEPAD",
543
+ "dtype": "determine",
544
+ "related": [],
545
+ "isShown": true,
546
+ "type": "attribute",
547
+ "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEPAD | description: DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
548
+ "metadata": {
549
+ "leaf_id": "DMS > DMS Recommended Standard.DMSTEPAD",
550
+ "group_path": "DMS > DMS Recommended Standard"
551
+ }
552
+ },
553
+ {
554
+ "id": 42,
555
+ "name": "MOTML",
556
+ "dtype": "determine",
557
+ "related": [],
558
+ "isShown": true,
559
+ "type": "attribute",
560
+ "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTML | description: The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 1",
561
+ "metadata": {
562
+ "leaf_id": "MOT > MOT Tone 2.0.MOTML",
563
+ "group_path": "MOT > MOT Tone 2.0"
564
+ }
565
+ },
566
+ {
567
+ "id": 43,
568
+ "name": "MOTSDL",
569
+ "dtype": "determine",
570
+ "related": [],
571
+ "isShown": true,
572
+ "type": "attribute",
573
+ "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTSDL | description: This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 2",
574
+ "metadata": {
575
+ "leaf_id": "MOT > MOT Tone 2.0.MOTSDL",
576
+ "group_path": "MOT > MOT Tone 2.0"
577
+ }
578
+ },
579
+ {
580
+ "id": 44,
581
+ "name": "MOTTC",
582
+ "dtype": "determine",
583
+ "related": [],
584
+ "isShown": true,
585
+ "type": "attribute",
586
+ "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTTC | description: The total number of assessment trials on which the subject made a correct response. | Decimal Places: 0",
587
+ "metadata": {
588
+ "leaf_id": "MOT > MOT Tone 2.0.MOTTC",
589
+ "group_path": "MOT > MOT Tone 2.0"
590
+ }
591
+ },
592
+ {
593
+ "id": 45,
594
+ "name": "MOTTE",
595
+ "dtype": "determine",
596
+ "related": [],
597
+ "isShown": true,
598
+ "type": "attribute",
599
+ "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTTE | description: The total number of assessment trials on which the subject failed to make a correct response. | Decimal Places: 0",
600
+ "metadata": {
601
+ "leaf_id": "MOT > MOT Tone 2.0.MOTTE",
602
+ "group_path": "MOT > MOT Tone 2.0"
603
+ }
604
+ },
605
+ {
606
+ "id": 46,
607
+ "name": "PALFAMS28",
608
+ "dtype": "determine",
609
+ "related": [],
610
+ "isShown": true,
611
+ "type": "attribute",
612
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALFAMS28 | description: KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
613
+ "metadata": {
614
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALFAMS28",
615
+ "group_path": "PAL > PAL Recommended Standard Extended"
616
+ }
617
+ },
618
+ {
619
+ "id": 47,
620
+ "name": "PALMETS28",
621
+ "dtype": "determine",
622
+ "related": [],
623
+ "isShown": true,
624
+ "type": "attribute",
625
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALMETS28 | description: PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
626
+ "metadata": {
627
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALMETS28",
628
+ "group_path": "PAL > PAL Recommended Standard Extended"
629
+ }
630
+ },
631
+ {
632
+ "id": 48,
633
+ "name": "PALNPR28",
634
+ "dtype": "determine",
635
+ "related": [],
636
+ "isShown": true,
637
+ "type": "attribute",
638
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALNPR28 | description: PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached. | Decimal Places: 0",
639
+ "metadata": {
640
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALNPR28",
641
+ "group_path": "PAL > PAL Recommended Standard Extended"
642
+ }
643
+ },
644
+ {
645
+ "id": 49,
646
+ "name": "PALTA12",
647
+ "dtype": "determine",
648
+ "related": [],
649
+ "isShown": true,
650
+ "type": "attribute",
651
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA12 | description: PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall. | Decimal Places: 0",
652
+ "metadata": {
653
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA12",
654
+ "group_path": "PAL > PAL Recommended Standard Extended"
655
+ }
656
+ },
657
+ {
658
+ "id": 50,
659
+ "name": "PALTA2",
660
+ "dtype": "determine",
661
+ "related": [],
662
+ "isShown": true,
663
+ "type": "attribute",
664
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA2 | description: PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall. | Decimal Places: 0",
665
+ "metadata": {
666
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA2",
667
+ "group_path": "PAL > PAL Recommended Standard Extended"
668
+ }
669
+ },
670
+ {
671
+ "id": 51,
672
+ "name": "PALTA28",
673
+ "dtype": "determine",
674
+ "related": [],
675
+ "isShown": true,
676
+ "type": "attribute",
677
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA28 | description: PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
678
+ "metadata": {
679
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA28",
680
+ "group_path": "PAL > PAL Recommended Standard Extended"
681
+ }
682
+ },
683
+ {
684
+ "id": 52,
685
+ "name": "PALTA4",
686
+ "dtype": "determine",
687
+ "related": [],
688
+ "isShown": true,
689
+ "type": "attribute",
690
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA4 | description: PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall. | Decimal Places: 0",
691
+ "metadata": {
692
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA4",
693
+ "group_path": "PAL > PAL Recommended Standard Extended"
694
+ }
695
+ },
696
+ {
697
+ "id": 53,
698
+ "name": "PALTA6",
699
+ "dtype": "determine",
700
+ "related": [],
701
+ "isShown": true,
702
+ "type": "attribute",
703
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA6 | description: PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall. | Decimal Places: 0",
704
+ "metadata": {
705
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA6",
706
+ "group_path": "PAL > PAL Recommended Standard Extended"
707
+ }
708
+ },
709
+ {
710
+ "id": 54,
711
+ "name": "PALTA8",
712
+ "dtype": "determine",
713
+ "related": [],
714
+ "isShown": true,
715
+ "type": "attribute",
716
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA8 | description: PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall. | Decimal Places: 0",
717
+ "metadata": {
718
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA8",
719
+ "group_path": "PAL > PAL Recommended Standard Extended"
720
+ }
721
+ },
722
+ {
723
+ "id": 55,
724
+ "name": "PALTE12",
725
+ "dtype": "determine",
726
+ "related": [],
727
+ "isShown": true,
728
+ "type": "attribute",
729
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE12 | description: PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials. | Decimal Places: 0",
730
+ "metadata": {
731
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE12",
732
+ "group_path": "PAL > PAL Recommended Standard Extended"
733
+ }
734
+ },
735
+ {
736
+ "id": 56,
737
+ "name": "PALTE2",
738
+ "dtype": "determine",
739
+ "related": [],
740
+ "isShown": true,
741
+ "type": "attribute",
742
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE2 | description: PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials. | Decimal Places: 0",
743
+ "metadata": {
744
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE2",
745
+ "group_path": "PAL > PAL Recommended Standard Extended"
746
+ }
747
+ },
748
+ {
749
+ "id": 57,
750
+ "name": "PALTE28",
751
+ "dtype": "determine",
752
+ "related": [],
753
+ "isShown": true,
754
+ "type": "attribute",
755
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE28 | description: PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials. Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
756
+ "metadata": {
757
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE28",
758
+ "group_path": "PAL > PAL Recommended Standard Extended"
759
+ }
760
+ },
761
+ {
762
+ "id": 58,
763
+ "name": "PALTE4",
764
+ "dtype": "determine",
765
+ "related": [],
766
+ "isShown": true,
767
+ "type": "attribute",
768
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE4 | description: PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials. | Decimal Places: 0",
769
+ "metadata": {
770
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE4",
771
+ "group_path": "PAL > PAL Recommended Standard Extended"
772
+ }
773
+ },
774
+ {
775
+ "id": 59,
776
+ "name": "PALTE6",
777
+ "dtype": "determine",
778
+ "related": [],
779
+ "isShown": true,
780
+ "type": "attribute",
781
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE6 | description: PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials. | Decimal Places: 0",
782
+ "metadata": {
783
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE6",
784
+ "group_path": "PAL > PAL Recommended Standard Extended"
785
+ }
786
+ },
787
+ {
788
+ "id": 60,
789
+ "name": "PALTE8",
790
+ "dtype": "determine",
791
+ "related": [],
792
+ "isShown": true,
793
+ "type": "attribute",
794
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE8 | description: PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials. | Decimal Places: 0",
795
+ "metadata": {
796
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE8",
797
+ "group_path": "PAL > PAL Recommended Standard Extended"
798
+ }
799
+ },
800
+ {
801
+ "id": 61,
802
+ "name": "PALTEA12",
803
+ "dtype": "determine",
804
+ "related": [],
805
+ "isShown": true,
806
+ "type": "attribute",
807
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA12 | description: PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
808
+ "metadata": {
809
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA12",
810
+ "group_path": "PAL > PAL Recommended Standard Extended"
811
+ }
812
+ },
813
+ {
814
+ "id": 62,
815
+ "name": "PALTEA2",
816
+ "dtype": "determine",
817
+ "related": [],
818
+ "isShown": true,
819
+ "type": "attribute",
820
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA2 | description: PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
821
+ "metadata": {
822
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA2",
823
+ "group_path": "PAL > PAL Recommended Standard Extended"
824
+ }
825
+ },
826
+ {
827
+ "id": 63,
828
+ "name": "PALTEA28",
829
+ "dtype": "determine",
830
+ "related": [],
831
+ "isShown": true,
832
+ "type": "attribute",
833
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA28 | description: KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task. In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
834
+ "metadata": {
835
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA28",
836
+ "group_path": "PAL > PAL Recommended Standard Extended"
837
+ }
838
+ },
839
+ {
840
+ "id": 64,
841
+ "name": "PALTEA4",
842
+ "dtype": "determine",
843
+ "related": [],
844
+ "isShown": true,
845
+ "type": "attribute",
846
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA4 | description: PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
847
+ "metadata": {
848
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA4",
849
+ "group_path": "PAL > PAL Recommended Standard Extended"
850
+ }
851
+ },
852
+ {
853
+ "id": 65,
854
+ "name": "PALTEA6",
855
+ "dtype": "determine",
856
+ "related": [],
857
+ "isShown": true,
858
+ "type": "attribute",
859
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA6 | description: PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
860
+ "metadata": {
861
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA6",
862
+ "group_path": "PAL > PAL Recommended Standard Extended"
863
+ }
864
+ },
865
+ {
866
+ "id": 66,
867
+ "name": "PALTEA8",
868
+ "dtype": "determine",
869
+ "related": [],
870
+ "isShown": true,
871
+ "type": "attribute",
872
+ "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA8 | description: PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
873
+ "metadata": {
874
+ "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA8",
875
+ "group_path": "PAL > PAL Recommended Standard Extended"
876
+ }
877
+ },
878
+ {
879
+ "id": 67,
880
+ "name": "PRMCLSDD",
881
+ "dtype": "determine",
882
+ "related": [],
883
+ "isShown": true,
884
+ "type": "attribute",
885
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDD | description: PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
886
+ "metadata": {
887
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMCLSDD",
888
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
889
+ }
890
+ },
891
+ {
892
+ "id": 68,
893
+ "name": "PRMCLSDI",
894
+ "dtype": "determine",
895
+ "related": [],
896
+ "isShown": true,
897
+ "type": "attribute",
898
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDI | description: PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
899
+ "metadata": {
900
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMCLSDI",
901
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
902
+ }
903
+ },
904
+ {
905
+ "id": 69,
906
+ "name": "PRMMCLD",
907
+ "dtype": "determine",
908
+ "related": [],
909
+ "isShown": true,
910
+ "type": "attribute",
911
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLD | description: PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
912
+ "metadata": {
913
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMMCLD",
914
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
915
+ }
916
+ },
917
+ {
918
+ "id": 70,
919
+ "name": "PRMMCLI",
920
+ "dtype": "determine",
921
+ "related": [],
922
+ "isShown": true,
923
+ "type": "attribute",
924
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLI | description: PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
925
+ "metadata": {
926
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMMCLI",
927
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
928
+ }
929
+ },
930
+ {
931
+ "id": 71,
932
+ "name": "PRMMDCLD",
933
+ "dtype": "determine",
934
+ "related": [],
935
+ "isShown": true,
936
+ "type": "attribute",
937
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLD | description: PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
938
+ "metadata": {
939
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMMDCLD",
940
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
941
+ }
942
+ },
943
+ {
944
+ "id": 72,
945
+ "name": "PRMMDCLI",
946
+ "dtype": "determine",
947
+ "related": [],
948
+ "isShown": true,
949
+ "type": "attribute",
950
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLI | description: PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
951
+ "metadata": {
952
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMMDCLI",
953
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
954
+ }
955
+ },
956
+ {
957
+ "id": 73,
958
+ "name": "PRMPCD",
959
+ "dtype": "determine",
960
+ "related": [],
961
+ "isShown": true,
962
+ "type": "attribute",
963
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCD | description: KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage. | Decimal Places: 2",
964
+ "metadata": {
965
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMPCD",
966
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
967
+ }
968
+ },
969
+ {
970
+ "id": 74,
971
+ "name": "PRMPCI",
972
+ "dtype": "determine",
973
+ "related": [],
974
+ "isShown": true,
975
+ "type": "attribute",
976
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCI | description: KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage. | Decimal Places: 2",
977
+ "metadata": {
978
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMPCI",
979
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
980
+ }
981
+ },
982
+ {
983
+ "id": 75,
984
+ "name": "PRMTSDSP",
985
+ "dtype": "determine",
986
+ "related": [],
987
+ "isShown": true,
988
+ "type": "attribute",
989
+ "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMTSDSP | description: PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition. | Decimal Places: 2",
990
+ "metadata": {
991
+ "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMTSDSP",
992
+ "group_path": "PRM > PRM Recommended Standard 18 Extended"
993
+ }
994
+ },
995
+ {
996
+ "id": 76,
997
+ "name": "RVPA",
998
+ "dtype": "determine",
999
+ "related": [],
1000
+ "isShown": true,
1001
+ "type": "attribute",
1002
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPA | description: KEY: RVP A′: A′ (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences. | Decimal Places: 4",
1003
+ "metadata": {
1004
+ "leaf_id": "RVP > RVP 3 Targets.RVPA",
1005
+ "group_path": "RVP > RVP 3 Targets"
1006
+ }
1007
+ },
1008
+ {
1009
+ "id": 77,
1010
+ "name": "RVPLSD",
1011
+ "dtype": "determine",
1012
+ "related": [],
1013
+ "isShown": true,
1014
+ "type": "attribute",
1015
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPLSD | description: RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
1016
+ "metadata": {
1017
+ "leaf_id": "RVP > RVP 3 Targets.RVPLSD",
1018
+ "group_path": "RVP > RVP 3 Targets"
1019
+ }
1020
+ },
1021
+ {
1022
+ "id": 78,
1023
+ "name": "RVPMDL",
1024
+ "dtype": "determine",
1025
+ "related": [],
1026
+ "isShown": true,
1027
+ "type": "attribute",
1028
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPMDL | description: KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
1029
+ "metadata": {
1030
+ "leaf_id": "RVP > RVP 3 Targets.RVPMDL",
1031
+ "group_path": "RVP > RVP 3 Targets"
1032
+ }
1033
+ },
1034
+ {
1035
+ "id": 79,
1036
+ "name": "RVPML",
1037
+ "dtype": "determine",
1038
+ "related": [],
1039
+ "isShown": true,
1040
+ "type": "attribute",
1041
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPML | description: RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
1042
+ "metadata": {
1043
+ "leaf_id": "RVP > RVP 3 Targets.RVPML",
1044
+ "group_path": "RVP > RVP 3 Targets"
1045
+ }
1046
+ },
1047
+ {
1048
+ "id": 80,
1049
+ "name": "RVPPFA",
1050
+ "dtype": "determine",
1051
+ "related": [],
1052
+ "isShown": true,
1053
+ "type": "attribute",
1054
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPPFA | description: KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms ÷ (False Alarms + Correct Rejections)) | Decimal Places: 4",
1055
+ "metadata": {
1056
+ "leaf_id": "RVP > RVP 3 Targets.RVPPFA",
1057
+ "group_path": "RVP > RVP 3 Targets"
1058
+ }
1059
+ },
1060
+ {
1061
+ "id": 81,
1062
+ "name": "RVPPH",
1063
+ "dtype": "determine",
1064
+ "related": [],
1065
+ "isShown": true,
1066
+ "type": "attribute",
1067
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPPH | description: RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits ÷ total number of sequences) | Decimal Places: 4",
1068
+ "metadata": {
1069
+ "leaf_id": "RVP > RVP 3 Targets.RVPPH",
1070
+ "group_path": "RVP > RVP 3 Targets"
1071
+ }
1072
+ },
1073
+ {
1074
+ "id": 82,
1075
+ "name": "RVPTFA",
1076
+ "dtype": "determine",
1077
+ "related": [],
1078
+ "isShown": true,
1079
+ "type": "attribute",
1080
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTFA | description: RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms. | Decimal Places: 0",
1081
+ "metadata": {
1082
+ "leaf_id": "RVP > RVP 3 Targets.RVPTFA",
1083
+ "group_path": "RVP > RVP 3 Targets"
1084
+ }
1085
+ },
1086
+ {
1087
+ "id": 83,
1088
+ "name": "RVPTH",
1089
+ "dtype": "determine",
1090
+ "related": [],
1091
+ "isShown": true,
1092
+ "type": "attribute",
1093
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTH | description: RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks. | Decimal Places: 0",
1094
+ "metadata": {
1095
+ "leaf_id": "RVP > RVP 3 Targets.RVPTH",
1096
+ "group_path": "RVP > RVP 3 Targets"
1097
+ }
1098
+ },
1099
+ {
1100
+ "id": 84,
1101
+ "name": "RVPTM",
1102
+ "dtype": "determine",
1103
+ "related": [],
1104
+ "isShown": true,
1105
+ "type": "attribute",
1106
+ "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTM | description: RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks. | Decimal Places: 0",
1107
+ "metadata": {
1108
+ "leaf_id": "RVP > RVP 3 Targets.RVPTM",
1109
+ "group_path": "RVP > RVP 3 Targets"
1110
+ }
1111
+ },
1112
+ {
1113
+ "id": 85,
1114
+ "name": "SWMBE12",
1115
+ "dtype": "determine",
1116
+ "related": [],
1117
+ "isShown": true,
1118
+ "type": "attribute",
1119
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE12 | description: KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
1120
+ "metadata": {
1121
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE12",
1122
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1123
+ }
1124
+ },
1125
+ {
1126
+ "id": 86,
1127
+ "name": "SWMBE4",
1128
+ "dtype": "determine",
1129
+ "related": [],
1130
+ "isShown": true,
1131
+ "type": "attribute",
1132
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE4 | description: KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
1133
+ "metadata": {
1134
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE4",
1135
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1136
+ }
1137
+ },
1138
+ {
1139
+ "id": 87,
1140
+ "name": "SWMBE468",
1141
+ "dtype": "determine",
1142
+ "related": [],
1143
+ "isShown": true,
1144
+ "type": "attribute",
1145
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE468 | description: KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
1146
+ "metadata": {
1147
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE468",
1148
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1149
+ }
1150
+ },
1151
+ {
1152
+ "id": 88,
1153
+ "name": "SWMBE6",
1154
+ "dtype": "determine",
1155
+ "related": [],
1156
+ "isShown": true,
1157
+ "type": "attribute",
1158
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE6 | description: KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
1159
+ "metadata": {
1160
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE6",
1161
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1162
+ }
1163
+ },
1164
+ {
1165
+ "id": 89,
1166
+ "name": "SWMBE8",
1167
+ "dtype": "determine",
1168
+ "related": [],
1169
+ "isShown": true,
1170
+ "type": "attribute",
1171
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE8 | description: KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
1172
+ "metadata": {
1173
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE8",
1174
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1175
+ }
1176
+ },
1177
+ {
1178
+ "id": 90,
1179
+ "name": "SWMDE12",
1180
+ "dtype": "determine",
1181
+ "related": [],
1182
+ "isShown": true,
1183
+ "type": "attribute",
1184
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE12 | description: SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
1185
+ "metadata": {
1186
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE12",
1187
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1188
+ }
1189
+ },
1190
+ {
1191
+ "id": 91,
1192
+ "name": "SWMDE4",
1193
+ "dtype": "determine",
1194
+ "related": [],
1195
+ "isShown": true,
1196
+ "type": "attribute",
1197
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE4 | description: SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
1198
+ "metadata": {
1199
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE4",
1200
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1201
+ }
1202
+ },
1203
+ {
1204
+ "id": 92,
1205
+ "name": "SWMDE468",
1206
+ "dtype": "determine",
1207
+ "related": [],
1208
+ "isShown": true,
1209
+ "type": "attribute",
1210
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE468 | description: SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
1211
+ "metadata": {
1212
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE468",
1213
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1214
+ }
1215
+ },
1216
+ {
1217
+ "id": 93,
1218
+ "name": "SWMDE6",
1219
+ "dtype": "determine",
1220
+ "related": [],
1221
+ "isShown": true,
1222
+ "type": "attribute",
1223
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE6 | description: SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
1224
+ "metadata": {
1225
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE6",
1226
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1227
+ }
1228
+ },
1229
+ {
1230
+ "id": 94,
1231
+ "name": "SWMDE8",
1232
+ "dtype": "determine",
1233
+ "related": [],
1234
+ "isShown": true,
1235
+ "type": "attribute",
1236
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE8 | description: SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
1237
+ "metadata": {
1238
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE8",
1239
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1240
+ }
1241
+ },
1242
+ {
1243
+ "id": 95,
1244
+ "name": "SWMPR",
1245
+ "dtype": "determine",
1246
+ "related": [],
1247
+ "isShown": true,
1248
+ "type": "attribute",
1249
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMPR | description: SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete. | Decimal Places: 0",
1250
+ "metadata": {
1251
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMPR",
1252
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1253
+ }
1254
+ },
1255
+ {
1256
+ "id": 96,
1257
+ "name": "SWMS",
1258
+ "dtype": "determine",
1259
+ "related": [],
1260
+ "isShown": true,
1261
+ "type": "attribute",
1262
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS | description: KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens. | Decimal Places: 0",
1263
+ "metadata": {
1264
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMS",
1265
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1266
+ }
1267
+ },
1268
+ {
1269
+ "id": 97,
1270
+ "name": "SWMS6",
1271
+ "dtype": "determine",
1272
+ "related": [],
1273
+ "isShown": true,
1274
+ "type": "attribute",
1275
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS6 | description: SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. | Decimal Places: 0",
1276
+ "metadata": {
1277
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMS6",
1278
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1279
+ }
1280
+ },
1281
+ {
1282
+ "id": 98,
1283
+ "name": "SWMSX",
1284
+ "dtype": "determine",
1285
+ "related": [],
1286
+ "isShown": true,
1287
+ "type": "attribute",
1288
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMSX | description: SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more. | Decimal Places: 0",
1289
+ "metadata": {
1290
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMSX",
1291
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1292
+ }
1293
+ },
1294
+ {
1295
+ "id": 99,
1296
+ "name": "SWMTE12",
1297
+ "dtype": "determine",
1298
+ "related": [],
1299
+ "isShown": true,
1300
+ "type": "attribute",
1301
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE12 | description: SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
1302
+ "metadata": {
1303
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE12",
1304
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1305
+ }
1306
+ },
1307
+ {
1308
+ "id": 100,
1309
+ "name": "SWMTE4",
1310
+ "dtype": "determine",
1311
+ "related": [],
1312
+ "isShown": true,
1313
+ "type": "attribute",
1314
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE4 | description: SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
1315
+ "metadata": {
1316
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE4",
1317
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1318
+ }
1319
+ },
1320
+ {
1321
+ "id": 101,
1322
+ "name": "SWMTE468",
1323
+ "dtype": "determine",
1324
+ "related": [],
1325
+ "isShown": true,
1326
+ "type": "attribute",
1327
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE468 | description: SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
1328
+ "metadata": {
1329
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE468",
1330
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1331
+ }
1332
+ },
1333
+ {
1334
+ "id": 102,
1335
+ "name": "SWMTE6",
1336
+ "dtype": "determine",
1337
+ "related": [],
1338
+ "isShown": true,
1339
+ "type": "attribute",
1340
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE6 | description: SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
1341
+ "metadata": {
1342
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE6",
1343
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1344
+ }
1345
+ },
1346
+ {
1347
+ "id": 103,
1348
+ "name": "SWMTE8",
1349
+ "dtype": "determine",
1350
+ "related": [],
1351
+ "isShown": true,
1352
+ "type": "attribute",
1353
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE8 | description: SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
1354
+ "metadata": {
1355
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE8",
1356
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1357
+ }
1358
+ },
1359
+ {
1360
+ "id": 104,
1361
+ "name": "SWMWE12",
1362
+ "dtype": "determine",
1363
+ "related": [],
1364
+ "isShown": true,
1365
+ "type": "attribute",
1366
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE12 | description: SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
1367
+ "metadata": {
1368
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE12",
1369
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1370
+ }
1371
+ },
1372
+ {
1373
+ "id": 105,
1374
+ "name": "SWMWE4",
1375
+ "dtype": "determine",
1376
+ "related": [],
1377
+ "isShown": true,
1378
+ "type": "attribute",
1379
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE4 | description: SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
1380
+ "metadata": {
1381
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE4",
1382
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1383
+ }
1384
+ },
1385
+ {
1386
+ "id": 106,
1387
+ "name": "SWMWE468",
1388
+ "dtype": "determine",
1389
+ "related": [],
1390
+ "isShown": true,
1391
+ "type": "attribute",
1392
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE468 | description: SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
1393
+ "metadata": {
1394
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE468",
1395
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1396
+ }
1397
+ },
1398
+ {
1399
+ "id": 107,
1400
+ "name": "SWMWE6",
1401
+ "dtype": "determine",
1402
+ "related": [],
1403
+ "isShown": true,
1404
+ "type": "attribute",
1405
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE6 | description: SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
1406
+ "metadata": {
1407
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE6",
1408
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1409
+ }
1410
+ },
1411
+ {
1412
+ "id": 108,
1413
+ "name": "SWMWE8",
1414
+ "dtype": "determine",
1415
+ "related": [],
1416
+ "isShown": true,
1417
+ "type": "attribute",
1418
+ "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE8 | description: SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
1419
+ "metadata": {
1420
+ "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE8",
1421
+ "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
1422
+ }
1423
+ },
1424
+ {
1425
+ "id": 109,
1426
+ "name": "Pal / Total",
1427
+ "related": [
1428
+ 110,
1429
+ 111,
1430
+ 112,
1431
+ 113,
1432
+ 46,
1433
+ 47,
1434
+ 48,
1435
+ 51
1436
+ ],
1437
+ "type": "aggregation",
1438
+ "isShown": true,
1439
+ "desc": "Cluster of 21 variables — label terms: Pal / Total",
1440
+ "dtype": "determine"
1441
+ },
1442
+ {
1443
+ "id": 110,
1444
+ "name": "Errors / Adjusted",
1445
+ "related": [
1446
+ 57,
1447
+ 63
1448
+ ],
1449
+ "type": "aggregation",
1450
+ "isShown": true,
1451
+ "desc": "Cluster of 2 variables — label terms: Errors / Adjusted",
1452
+ "dtype": "determine"
1453
+ },
1454
+ {
1455
+ "id": 111,
1456
+ "name": "Shapes / Adjusted",
1457
+ "related": [
1458
+ 61,
1459
+ 62,
1460
+ 64,
1461
+ 65,
1462
+ 66
1463
+ ],
1464
+ "type": "aggregation",
1465
+ "isShown": true,
1466
+ "desc": "Cluster of 5 variables — label terms: Shapes / Adjusted",
1467
+ "dtype": "determine"
1468
+ },
1469
+ {
1470
+ "id": 112,
1471
+ "name": "Patterns / Errors",
1472
+ "related": [
1473
+ 55,
1474
+ 56,
1475
+ 58,
1476
+ 59,
1477
+ 60
1478
+ ],
1479
+ "type": "aggregation",
1480
+ "isShown": true,
1481
+ "desc": "Cluster of 5 variables — label terms: Patterns / Errors",
1482
+ "dtype": "determine"
1483
+ },
1484
+ {
1485
+ "id": 113,
1486
+ "name": "Attempts / Patterns",
1487
+ "related": [
1488
+ 49,
1489
+ 50,
1490
+ 52,
1491
+ 53,
1492
+ 54
1493
+ ],
1494
+ "type": "aggregation",
1495
+ "isShown": true,
1496
+ "desc": "Cluster of 5 variables — label terms: Attempts / Patterns",
1497
+ "dtype": "determine"
1498
+ },
1499
+ {
1500
+ "id": 114,
1501
+ "name": "Rvp / False",
1502
+ "related": [
1503
+ 80,
1504
+ 81,
1505
+ 82,
1506
+ 83,
1507
+ 84
1508
+ ],
1509
+ "type": "aggregation",
1510
+ "isShown": true,
1511
+ "desc": "Cluster of 5 variables — label terms: Rvp / False",
1512
+ "dtype": "determine"
1513
+ },
1514
+ {
1515
+ "id": 115,
1516
+ "name": "Latency / Correct",
1517
+ "related": [
1518
+ 116,
1519
+ 117,
1520
+ 118,
1521
+ 119,
1522
+ 120,
1523
+ 121
1524
+ ],
1525
+ "type": "aggregation",
1526
+ "isShown": true,
1527
+ "desc": "Cluster of 28 variables — label terms: Latency / Correct",
1528
+ "dtype": "determine"
1529
+ },
1530
+ {
1531
+ "id": 116,
1532
+ "name": "Mean / Dms",
1533
+ "related": [
1534
+ 1,
1535
+ 14,
1536
+ 18,
1537
+ 19
1538
+ ],
1539
+ "type": "aggregation",
1540
+ "isShown": true,
1541
+ "desc": "Cluster of 4 variables — label terms: Mean / Dms",
1542
+ "dtype": "determine"
1543
+ },
1544
+ {
1545
+ "id": 117,
1546
+ "name": "Median / Prm",
1547
+ "related": [
1548
+ 8,
1549
+ 12,
1550
+ 13,
1551
+ 71,
1552
+ 72,
1553
+ 78
1554
+ ],
1555
+ "type": "aggregation",
1556
+ "isShown": true,
1557
+ "desc": "Cluster of 6 variables — label terms: Median / Prm",
1558
+ "dtype": "determine"
1559
+ },
1560
+ {
1561
+ "id": 118,
1562
+ "name": "Prm / Delayed",
1563
+ "related": [
1564
+ 67,
1565
+ 68,
1566
+ 69,
1567
+ 70
1568
+ ],
1569
+ "type": "aggregation",
1570
+ "isShown": true,
1571
+ "desc": "Cluster of 4 variables — label terms: Prm / Delayed",
1572
+ "dtype": "determine"
1573
+ },
1574
+ {
1575
+ "id": 119,
1576
+ "name": "Seconds / Delay",
1577
+ "related": [
1578
+ 9,
1579
+ 10,
1580
+ 11,
1581
+ 15,
1582
+ 16,
1583
+ 17
1584
+ ],
1585
+ "type": "aggregation",
1586
+ "isShown": true,
1587
+ "desc": "Cluster of 6 variables — label terms: Seconds / Delay",
1588
+ "dtype": "determine"
1589
+ },
1590
+ {
1591
+ "id": 120,
1592
+ "name": "Standard / Deviation",
1593
+ "related": [
1594
+ 2,
1595
+ 3,
1596
+ 4,
1597
+ 5,
1598
+ 6,
1599
+ 7
1600
+ ],
1601
+ "type": "aggregation",
1602
+ "isShown": true,
1603
+ "desc": "Cluster of 6 variables — label terms: Standard / Deviation",
1604
+ "dtype": "determine"
1605
+ },
1606
+ {
1607
+ "id": 121,
1608
+ "name": "Rvp / Response",
1609
+ "related": [
1610
+ 77,
1611
+ 79
1612
+ ],
1613
+ "type": "aggregation",
1614
+ "isShown": true,
1615
+ "desc": "Cluster of 2 variables — label terms: Rvp / Response",
1616
+ "dtype": "determine"
1617
+ },
1618
+ {
1619
+ "id": 122,
1620
+ "name": "Swm / Errors",
1621
+ "related": [
1622
+ 123,
1623
+ 124,
1624
+ 125,
1625
+ 95,
1626
+ 126,
1627
+ 127,
1628
+ 101,
1629
+ 92
1630
+ ],
1631
+ "type": "aggregation",
1632
+ "isShown": true,
1633
+ "desc": "Cluster of 21 variables — label terms: Swm / Errors",
1634
+ "dtype": "determine"
1635
+ },
1636
+ {
1637
+ "id": 123,
1638
+ "name": "Between / Within",
1639
+ "related": [
1640
+ 87,
1641
+ 106
1642
+ ],
1643
+ "type": "aggregation",
1644
+ "isShown": true,
1645
+ "desc": "Cluster of 2 variables — label terms: Between / Within",
1646
+ "dtype": "determine"
1647
+ },
1648
+ {
1649
+ "id": 124,
1650
+ "name": "Total / Boxes",
1651
+ "related": [
1652
+ 99,
1653
+ 100,
1654
+ 102,
1655
+ 103
1656
+ ],
1657
+ "type": "aggregation",
1658
+ "isShown": true,
1659
+ "desc": "Cluster of 4 variables — label terms: Total / Boxes",
1660
+ "dtype": "determine"
1661
+ },
1662
+ {
1663
+ "id": 125,
1664
+ "name": "Double / Boxes",
1665
+ "related": [
1666
+ 90,
1667
+ 91,
1668
+ 93,
1669
+ 94
1670
+ ],
1671
+ "type": "aggregation",
1672
+ "isShown": true,
1673
+ "desc": "Cluster of 4 variables — label terms: Double / Boxes",
1674
+ "dtype": "determine"
1675
+ },
1676
+ {
1677
+ "id": 126,
1678
+ "name": "Within / Boxes",
1679
+ "related": [
1680
+ 104,
1681
+ 105,
1682
+ 107,
1683
+ 108
1684
+ ],
1685
+ "type": "aggregation",
1686
+ "isShown": true,
1687
+ "desc": "Cluster of 4 variables — label terms: Within / Boxes",
1688
+ "dtype": "determine"
1689
+ },
1690
+ {
1691
+ "id": 127,
1692
+ "name": "Between / Boxes",
1693
+ "related": [
1694
+ 85,
1695
+ 86,
1696
+ 88,
1697
+ 89
1698
+ ],
1699
+ "type": "aggregation",
1700
+ "isShown": true,
1701
+ "desc": "Cluster of 4 variables — label terms: Between / Boxes",
1702
+ "dtype": "determine"
1703
+ },
1704
+ {
1705
+ "id": 128,
1706
+ "name": "Error / Dms",
1707
+ "related": [
1708
+ 129,
1709
+ 130,
1710
+ 131
1711
+ ],
1712
+ "type": "aggregation",
1713
+ "isShown": true,
1714
+ "desc": "Cluster of 8 variables — label terms: Error / Dms",
1715
+ "dtype": "determine"
1716
+ },
1717
+ {
1718
+ "id": 129,
1719
+ "name": "Distractor / Delays",
1720
+ "related": [
1721
+ 38,
1722
+ 39
1723
+ ],
1724
+ "type": "aggregation",
1725
+ "isShown": true,
1726
+ "desc": "Cluster of 2 variables — label terms: Distractor / Delays",
1727
+ "dtype": "determine"
1728
+ },
1729
+ {
1730
+ "id": 130,
1731
+ "name": "Incorrect / Colour",
1732
+ "related": [
1733
+ 36,
1734
+ 37,
1735
+ 40,
1736
+ 41
1737
+ ],
1738
+ "type": "aggregation",
1739
+ "isShown": true,
1740
+ "desc": "Cluster of 4 variables — label terms: Incorrect / Colour",
1741
+ "dtype": "determine"
1742
+ },
1743
+ {
1744
+ "id": 131,
1745
+ "name": "Probability / Given",
1746
+ "related": [
1747
+ 26,
1748
+ 27
1749
+ ],
1750
+ "type": "aggregation",
1751
+ "isShown": true,
1752
+ "desc": "Cluster of 2 variables — label terms: Probability / Given",
1753
+ "dtype": "determine"
1754
+ },
1755
+ {
1756
+ "id": 132,
1757
+ "name": "Dms / Correct",
1758
+ "related": [
1759
+ 133,
1760
+ 134,
1761
+ 135,
1762
+ 136,
1763
+ 137,
1764
+ 138
1765
+ ],
1766
+ "type": "aggregation",
1767
+ "isShown": true,
1768
+ "desc": "Cluster of 16 variables — label terms: Dms / Correct",
1769
+ "dtype": "determine"
1770
+ },
1771
+ {
1772
+ "id": 133,
1773
+ "name": "Percent / Dms",
1774
+ "related": [
1775
+ 20,
1776
+ 24,
1777
+ 25
1778
+ ],
1779
+ "type": "aggregation",
1780
+ "isShown": true,
1781
+ "desc": "Cluster of 3 variables — label terms: Percent / Dms",
1782
+ "dtype": "determine"
1783
+ },
1784
+ {
1785
+ "id": 134,
1786
+ "name": "Errors / Total",
1787
+ "related": [
1788
+ 32,
1789
+ 34,
1790
+ 35
1791
+ ],
1792
+ "type": "aggregation",
1793
+ "isShown": true,
1794
+ "desc": "Cluster of 3 variables — label terms: Errors / Total",
1795
+ "dtype": "determine"
1796
+ },
1797
+ {
1798
+ "id": 135,
1799
+ "name": "Delay / Percent",
1800
+ "related": [
1801
+ 21,
1802
+ 22,
1803
+ 23
1804
+ ],
1805
+ "type": "aggregation",
1806
+ "isShown": true,
1807
+ "desc": "Cluster of 3 variables — label terms: Delay / Percent",
1808
+ "dtype": "determine"
1809
+ },
1810
+ {
1811
+ "id": 136,
1812
+ "name": "Total / Simultaneous",
1813
+ "related": [
1814
+ 28,
1815
+ 33
1816
+ ],
1817
+ "type": "aggregation",
1818
+ "isShown": true,
1819
+ "desc": "Cluster of 2 variables — label terms: Total / Simultaneous",
1820
+ "dtype": "determine"
1821
+ },
1822
+ {
1823
+ "id": 137,
1824
+ "name": "Prm / Percent",
1825
+ "related": [
1826
+ 73,
1827
+ 74
1828
+ ],
1829
+ "type": "aggregation",
1830
+ "isShown": true,
1831
+ "desc": "Cluster of 2 variables — label terms: Prm / Percent",
1832
+ "dtype": "determine"
1833
+ },
1834
+ {
1835
+ "id": 138,
1836
+ "name": "Second / Delay",
1837
+ "related": [
1838
+ 29,
1839
+ 30,
1840
+ 31
1841
+ ],
1842
+ "type": "aggregation",
1843
+ "isShown": true,
1844
+ "desc": "Cluster of 3 variables — label terms: Second / Delay",
1845
+ "dtype": "determine"
1846
+ },
1847
+ {
1848
+ "id": 139,
1849
+ "name": "Strategy / Swm",
1850
+ "related": [
1851
+ 96,
1852
+ 97,
1853
+ 98
1854
+ ],
1855
+ "type": "aggregation",
1856
+ "isShown": true,
1857
+ "desc": "Cluster of 3 variables — label terms: Strategy / Swm",
1858
+ "dtype": "determine"
1859
+ },
1860
+ {
1861
+ "id": 140,
1862
+ "name": "Response / Trials",
1863
+ "related": [
1864
+ 42,
1865
+ 43,
1866
+ 44,
1867
+ 45,
1868
+ 75,
1869
+ 76
1870
+ ],
1871
+ "type": "aggregation",
1872
+ "isShown": true,
1873
+ "desc": "Cluster of 6 variables — label terms: Response / Trials",
1874
+ "dtype": "determine"
1875
+ }
1876
+ ]
version2/requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.43
2
+ pandas>=2.0
3
+ numpy>=1.24
4
+ scikit-learn>=1.3
5
+ plotly>=5.18
6
+ sentence-transformers>=2.5
7
+ requests>=2.31
8
+ openpyxl>=3.1
9
+
10
+ # Approach 1 — WordNet concept lookups (optional, auto-downloads corpus at runtime)
11
+ nltk>=3.8
12
+
13
+ # Approach 2 — semantic aspect discovery (optional; torch already pulled by sentence-transformers)
14
+ fastopic>=0.0.5
15
+
16
+ # Approach 2 — OpenAI-compatible client for optional local-LLM label refinement
17
+ openai>=1.30
version2/views/methods.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ methods.py — single source of truth for method naming, descriptions and display
3
+ config, shared by the Demo View (viewer.py) and the Build pages (run_*.py).
4
+
5
+ Metadata Hierarchy Explorer — TFM 2026.
6
+
7
+ The internal keys ("Baseline" / "Approach 1" / "Approach 2") are kept stable on
8
+ purpose: the pre-built output filenames and the thesis cross-references depend on
9
+ them. The user-facing *title* is what gets shown in the app.
10
+ """
11
+ from __future__ import annotations
12
+
13
# Internal method keys in the order they are offered to the user.
METHOD_ORDER = ["Baseline", "Approach 1", "Approach 2"]

# Per-method display metadata, keyed by the stable internal key.
#   title / tag / about  -> user-facing text (returned by title(), tag(), about())
#   color / compress / node_link -> display options; presumably consumed by the
#   viewer and build pages — confirm their exact meaning against those callers.
METHODS: dict[str, dict] = {
    "Baseline": {
        "title": "Baseline: Taxonomizer Semantic Space Hierarchy",
        "tag": "Baseline · Word2Vec semantic space + agglomerative clustering "
               "(Mahmood & Mueller, IEEE TVCG 2019)",
        "color": "Greens",
        "compress": False,
        "node_link": True,
        "about": (
            "Classical clustering baseline. Word2Vec skip-gram embeddings of the "
            "attribute names build a cosine semantic space, then balanced Ward "
            "agglomerative clustering produces the tree; node labels are the most "
            "discriminative terms per cluster. No external knowledge bases and no "
            "neural language models — a deliberately simple reference point."
        ),
    },
    "Approach 1": {
        "title": "Approach 1: External Concept Alignment Hierarchy",
        "tag": "Approach 1 · SBERT + Gonçalves N×M alignment + HiExpan + Castanet facets",
        "color": "Blues",
        "compress": False,
        "node_link": True,
        "about": (
            "Aligns each variable to concepts drawn from external knowledge bases. "
            "SBERT embeddings and an N×M concept-similarity matrix (Gonçalves 2019) "
            "match variables to candidate concepts retrieved from Wikidata, Wikipedia, "
            "WordNet and BioPortal; HiExpan refines the tree and Castanet builds "
            "parallel facets. External enrichment activates automatically for "
            "biomedical, cognitive and neurological domains."
        ),
    },
    "Approach 2": {
        "title": "Approach 2: Dataset Constrained Multi Aspect Hierarchy",
        "tag": "Approach 2 · FASTopic + phrase-slot mining (Wu et al. NeurIPS 2024)",
        "color": "Viridis",
        "compress": True,
        "node_link": True,
        "about": (
            "Builds the hierarchy using only evidence inside the dataset — no external "
            "knowledge. Group structure anchors the top levels, phrase-slot mining and "
            "FASTopic (Wu et al. 2024) discover semantic aspects, and per-aspect "
            "clustering forms the branches. Labels are generated deterministically and "
            "are fully auditable; an optional local LLM may re-phrase them under a "
            "strict grounding check."
        ),
    },
}

# Reverse lookup: display title -> internal key.
TITLE_TO_KEY = {m["title"]: k for k, m in METHODS.items()}
# Display titles in METHOD_ORDER order.
TITLES = [METHODS[k]["title"] for k in METHOD_ORDER]
66
+
67
+
68
def title(key: str) -> str:
    """Return the user-facing display title registered for method *key*.

    Raises KeyError when *key* is not one of the internal method keys.
    """
    entry = METHODS[key]
    return entry["title"]
70
+
71
+
72
def tag(key: str) -> str:
    """Return the short technical tag line registered for method *key*.

    Raises KeyError when *key* is not one of the internal method keys.
    """
    entry = METHODS[key]
    return entry["tag"]
74
+
75
+
76
def about(key: str) -> str:
    """Return the longer descriptive paragraph registered for method *key*.

    Raises KeyError when *key* is not one of the internal method keys.
    """
    entry = METHODS[key]
    return entry["about"]
version2/views/run_approach_1.py ADDED
The diff for this file is too large to render. See raw diff
 
version2/views/run_approach_2.py ADDED
The diff for this file is too large to render. See raw diff
 
version2/views/run_baseline.py ADDED
@@ -0,0 +1,1091 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # baseline.py — Metadata Hierarchy Builder — Baseline (Taxonomizer)
2
+ #
3
+ # Baseline = Taxonomizer (Mahmood & Mueller, IEEE TVCG 2019), semantic-space
4
+ # pipeline, adapted to a metadata-only setting. No hardcoded domain patterns.
5
+ #
6
+ # Pipeline:
7
+ # 1. Load metadata file (CSV / TSV / XLSX / JSON)
8
+ # 2. Detect column roles (leaf / context / text / meta) β€” same as Approach 1 / 2
9
+ # 3. Build canonical schema (incl. _semantic_text = description values only)
10
+ # 4. Embed each variable's short attribute name (the description's name clause) with pre-trained word vectors and build
11
+ # the cosine-distance semantic space [TAX §3.2]
12
+ # 5. Recursively cluster (agglomerative, cosine) into the dendrogram taxonomy;
13
+ # internal-node labels = data-driven contrastive terms of each cluster
14
+ # 6. Visualise (Sunburst / Treemap / Node-link)
15
+ # 7. Export visualization-ready JSON + canonical CSV
16
+ #
17
+ # Paper & justified adaptations (metadata/schema setting, fully automatic):
18
+ # [TAX] Mahmood & Mueller β€” Taxonomizer, IEEE TVCG 2019.
19
+ # Builds a SEMANTIC space (cosine over word2vec skip-gram embeddings of
20
+ # attribute names; gensim, Wikipedia, window=5, dim=128) merged with a
21
+ # DATA space (correlation over raw values), clustered into a dendrogram;
22
+ # inner nodes labelled semi-automatically by distributional degree-of-
23
+ # entailment + WordNet synonyms.
24
+ # Adaptations (all documented):
25
+ # 1. No DATA space β€” a schema/dictionary has no raw values, so we use the
26
+ # semantic space alone (Taxonomizer with semantic weight = 1.0).
27
+ # 2. Embed the attribute's short NAME (the description's name clause), since
28
+ # the bare code goes out-of-vocabulary (a limitation the paper flags,
29
+ # e.g. "BP"). Taxonomizer embeds the NAME ("a few words"), not a
30
+ # paragraph; using the short name (not the full description prose) keeps
31
+ # domain-specific words from being diluted by shared explanatory text.
32
+ # 3. Fully-automatic labels β€” the paper's labelling is semi-automatic
33
+ # (human picks from suggestions); a baseline must be non-interactive, so
34
+ # we use data-driven contrastive terms from each cluster's members.
35
+ #
36
+ # Dependencies: gensim
37
+ # pip install gensim
38
+
39
+ from __future__ import annotations
40
+ import csv, json, re, warnings
41
+ from collections import Counter, defaultdict
42
+ from pathlib import Path
43
+ import tempfile
44
+
45
+ import numpy as np
46
+ import pandas as pd
47
+ import plotly.graph_objects as go
48
+ import streamlit as st
49
+ from sklearn.cluster import AgglomerativeClustering
50
+ from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, silhouette_score
51
+ from sklearn.preprocessing import LabelEncoder
52
+
53
# Silence all warnings raised by the libraries used on this page.
warnings.filterwarnings('ignore')

# set_page_config is handled by the navigation router (demo.py), so this view
# only renders its own content.
import sys

# Put this file's directory first on the import path so the sibling `methods`
# module resolves.
sys.path.insert(0, str(Path(__file__).resolve().parent))
import methods  # shared method names

st.title(methods.title('Baseline'))
st.caption(
    'Upload a metadata file (CSV / TSV / XLSX / JSON), confirm the column '
    'roles, then build. Semantic space only (no raw data values); no '
    'hardcoded patterns, no external APIs.'
)
64
+
65
+ # ─────────────────────────────────────────────────────────────────────────────
66
+ # CONSTANTS
67
+ # ─────────────────────────────────────────────────────────────────────────────
68
+ LEAF_KEYS = 'variable var field column attribute name code id item indicator question measure concept'.split()
69
+ GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split()
70
+ TEXT_KEYS = 'description definition desc label title question meaning note notes text display full details explanation comment'.split()
71
+ META_KEYS = 'type dtype data_type datatype unit units format decimal precision values value coding codebook range min max scale'.split()
72
+
73
+ # URL pattern β€” strip embedded links (e.g. HCP FreeSurfer NeuroLex URLs) so web
74
+ # tokens cannot dominate the embedding or the cluster label. [shared with A1]
75
+ _URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
76
+ re.IGNORECASE)
77
+
78
+ # ─────────────────────────────────────────────────────────────────────────────
79
+ # FILE LOADING
80
+ # ─────────────────────────────────────────────────────────────────────────────
81
def safe_name(name: str) -> str:
    """Return *name* with every character that is not alphanumeric or one of
    '-', '_', '.' replaced by an underscore."""
    allowed_punct = '-_.'
    cleaned = []
    for ch in name:
        keep = ch.isalnum() or ch in allowed_punct
        cleaned.append(ch if keep else '_')
    return ''.join(cleaned)
83
+
84
def try_read_csv(path: Path) -> pd.DataFrame:
    """Read a delimited text file, trying several encodings and separators.

    Every (encoding, separator) pair is attempted; the parse with the most
    columns (ties broken by the lowest share of missing cells, then by first
    attempt) wins. Header names are stripped and cleared of ';'. If the result
    looks "comma-packed" — most first-column cells contain a comma while the
    remaining columns are mostly empty (AI-Mind format) — the file is re-split
    line by line on commas.

    Raises:
        ValueError: if no attempt could parse the file.
    """
    def _tidy(columns) -> list:
        return [str(col).strip().replace(';', '') for col in columns]

    attempts = [(enc, sep)
                for enc in ('utf-8-sig', 'utf-8', 'latin1')
                for sep in (None, ',', '\t', ';', '|')]
    best, best_score = None, -1
    for enc, sep in attempts:
        try:
            candidate = pd.read_csv(path, sep=sep, engine='python', encoding=enc)
            quality = candidate.shape[1] * 10 - float(candidate.isna().mean().mean())
        except Exception:
            # best-effort probing: a failed combination is simply skipped
            continue
        if quality > best_score:
            best, best_score = candidate, quality
    if best is None:
        raise ValueError(f'Could not read {path.name}')
    best.columns = _tidy(best.columns)

    # Repair comma-packed rows (AI-Mind format)
    if len(best) == 0:
        return best
    first_col = best.iloc[:, 0].astype(str)
    rest_missing = best.iloc[:, 1:].isna().mean().mean() if best.shape[1] > 1 else 1.0
    looks_packed = first_col.str.contains(',').mean() > 0.50 and rest_missing > 0.70
    if not looks_packed:
        return best

    raw_lines = path.read_text(encoding='utf-8-sig', errors='replace').splitlines()
    if not raw_lines:
        return best
    header = [h.strip().replace(';', '') for h in raw_lines[0].split(',')]
    rows = []
    for raw in raw_lines[1:]:
        raw = raw.strip().rstrip(';')
        if not raw:
            continue
        # a whole row wrapped in one pair of quotes: unwrap before splitting
        if raw.startswith('"') and raw.endswith('"'):
            raw = raw[1:-1]
        try:
            fields = next(csv.reader([raw], quotechar='"'))
        except Exception:
            continue
        # rows shorter than the header are dropped; longer ones are truncated
        if len(fields) >= len(header):
            rows.append(fields[:len(header)])
    if rows:
        best = pd.DataFrame(rows, columns=header)
        best.columns = _tidy(best.columns)
    return best
123
+
124
def load_any(path: Path) -> pd.DataFrame:
    """Load a metadata file into a DataFrame, dispatching on the file suffix.

    * .csv / .tsv / .txt -> try_read_csv
    * .xlsx / .xls       -> pandas.read_excel
    * .json              -> a top-level list of records, or the first
                            list-valued entry of a top-level object

    Raises:
        ValueError: for an unsupported suffix, or for JSON that holds no list
            of records (malformed JSON raises json.JSONDecodeError, itself a
            ValueError subclass).
    """
    s = path.suffix.lower()
    if s in ('.csv', '.tsv', '.txt'):
        return try_read_csv(path)
    if s in ('.xlsx', '.xls'):
        return pd.read_excel(path)
    if s == '.json':
        # utf-8-sig drops a leading BOM, which json.loads otherwise rejects.
        obj = json.loads(path.read_text(encoding='utf-8-sig', errors='replace'))
        if isinstance(obj, list):
            return pd.json_normalize(obj)
        if isinstance(obj, dict):
            for v in obj.values():
                if isinstance(v, list):
                    return pd.json_normalize(v)
        # The suffix is supported; it is the content that has no record list.
        raise ValueError(
            f'{path.name}: JSON must be a list of records or an object '
            'containing a list of records'
        )
    raise ValueError(f'Unsupported file type: {s}')
139
+
140
def save_upload(f) -> Path:
    """Write an uploaded file object to a fresh temporary directory.

    *f* must expose `.name` and `.getbuffer()` (presumably a Streamlit
    UploadedFile — confirm against the caller). Returns the written path.
    """
    staging_dir = Path(tempfile.mkdtemp(prefix='baseline_'))
    target = staging_dir / safe_name(f.name)
    target.write_bytes(f.getbuffer())
    return target
145
+
146
+ # ─────────────────────────────────────────────────────────────────────────────
147
+ # ROLE DETECTION [GON]
148
+ # ─────────────────────────────────────────────────────────────────────────────
149
def norm(c: str) -> str:
    """Normalise a column name: lower-case it, collapse every run of
    non-alphanumeric characters to '_' and trim leading/trailing '_'."""
    lowered = str(c).strip().lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered)
    return collapsed.strip('_')
151
+
152
def kscore(c: str, keys: list) -> int:
    """Count how many of *keys* occur as substrings of the normalised column
    name *c*."""
    normalised = norm(c)
    hits = [key for key in keys if key in normalised]
    return len(hits)
155
+
156
def profile_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Profile every column of *df* and score it for each candidate role.

    Returns one row per column with basic statistics (non-null share, number
    and ratio of unique values, mean string length) plus heuristic
    leaf / group / text / metadata scores that combine keyword hits in the
    column name with those statistics.
    """
    n = max(len(df), 1)  # avoid dividing by zero on an empty frame

    def _profile(col) -> dict:
        series = df[col]
        filled = float(series.notna().mean())
        distinct = int(series.nunique(dropna=True))
        ratio = distinct / n
        if series.notna().any():
            mean_len = float(series.dropna().astype(str).map(len).mean())
        else:
            mean_len = 0

        leaf = 4 * kscore(col, LEAF_KEYS)
        if 0.5 <= ratio <= 1:
            leaf += 3
        if mean_len < 80:
            leaf += 1

        group = 4 * kscore(col, GROUP_KEYS)
        if 1 < distinct < min(n * 0.5, 80):
            group += 3
        if mean_len < 60:
            group += 1

        text = 5 * kscore(col, TEXT_KEYS)
        if mean_len > 50:
            text += 4
        if filled > 0.5:
            text += 1

        meta = 4 * kscore(col, META_KEYS)
        if 1 < distinct < min(n * 0.8, 100):
            meta += 2

        return {
            'column': str(col),
            'non_null': round(filled, 3),
            'unique_values': distinct,
            'unique_ratio': round(ratio, 3),
            'avg_length': round(mean_len, 1),
            'leaf_score': leaf,
            'group_score': group,
            'text_score': text,
            'metadata_score': meta,
        }

    return pd.DataFrame([_profile(col) for col in df.columns])
177
+
178
def detect_roles(df: pd.DataFrame) -> tuple:
    """Auto-detect column roles from the column profile.

    Returns (roles, profile) where roles maps 'leaf_cols', 'group_cols',
    'text_cols' and 'metadata_cols' to lists of column names and profile is
    the DataFrame from profile_columns. The logic is kept identical to
    Approach 1 / 2 so preprocessing up to the canonical table stays comparable
    across all apps.
    """
    prof = profile_columns(df)

    # Leaf: the single best-scoring column (ties broken by unique ratio).
    leaf = (prof.sort_values(['leaf_score', 'unique_ratio'], ascending=False)
                .head(1)['column'].tolist())

    # Text: every column that scores as text or is long; falls back to the leaf.
    texty = prof[(prof.text_score >= 4) | (prof.avg_length > 80)]
    text = texty.sort_values('text_score', ascending=False)['column'].tolist()
    if not text:
        text = leaf.copy()

    group_mask = ((prof.group_score >= 4)
                  & (~prof.column.isin(leaf))
                  & (prof.unique_values > 1))
    group = (prof[group_mask]
             .sort_values('group_score', ascending=False)['column']
             .head(3).tolist())

    meta_mask = (prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))
    meta = (prof[meta_mask]
            .sort_values('metadata_score', ascending=False)['column']
            .head(5).tolist())

    # Representation columns (decimal/precision/unit/type/format/…) must never
    # become structural levels; prefer them as metadata. [GON][TAX]
    repr_markers = {
        'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
        'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
    }

    def _is_repr(col_name) -> bool:
        squashed = re.sub(r'[^a-z0-9]', '', str(col_name).lower())
        return any(marker in squashed for marker in repr_markers)

    already_assigned = text + leaf + meta
    meta_extra = [c for c in prof['column'].tolist()
                  if _is_repr(c) and c not in already_assigned]
    group = [c for c in group if not _is_repr(c)]
    meta = list(dict.fromkeys(meta + meta_extra))[:8]

    roles = {'leaf_cols': leaf, 'group_cols': group,
             'text_cols': text, 'metadata_cols': meta}
    return roles, prof
203
+
204
+ # ─────────────────────────────────────────────────────────────────────────────
205
+ # CANONICAL SCHEMA [GON]
206
+ # ─────────────────────────────────────────────────────────────────────────────
207
def sv(x) -> str:
    """Return *x* as a stripped string, or '' when pandas treats it as missing."""
    if pd.isna(x):
        return ''
    return str(x).strip()
209
+
210
def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
    """Build the canonical one-row-per-variable table from a raw metadata frame.

    Args:
        df: raw metadata table (one row per variable).
        cfg: role configuration with optional 'leaf_cols', 'group_cols',
            'text_cols' and 'metadata_cols' lists of column names.
        source: value stored in the '_source_file' column.

    Returns:
        DataFrame with the columns _source_file, _row_index, _leaf_label,
        _leaf_id, _group_path, _text and _semantic_text. Duplicate _leaf_id
        values get a '__<n>' suffix from the second occurrence on. An input
        with no rows yields an empty frame that still has all columns.
    """
    leaf_cols = cfg.get('leaf_cols', [])
    group_cols = cfg.get('group_cols', [])
    text_cols = cfg.get('text_cols', [])
    meta_cols = cfg.get('metadata_cols', [])
    # Explicit column list: keeps the schema stable and guarantees the columns
    # exist when df has no rows (otherwise can['_leaf_id'] raises KeyError).
    columns = ['_source_file', '_row_index', '_leaf_label', '_leaf_id',
               '_group_path', '_text', '_semantic_text']
    # Every configured column once, in group -> leaf -> text -> meta order
    # (loop-invariant, so computed once).
    all_cols = list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols))
    rows = []
    for i, row in df.iterrows():
        leaf_parts = [sv(row.get(c, '')) for c in leaf_cols]
        leaf_parts = [p for p in leaf_parts if p]
        label = ' / '.join(leaf_parts) if leaf_parts else f'variable_{i+1}'
        group_parts = [sv(row.get(c, '')) for c in group_cols]
        group_parts = [p for p in group_parts if p and p.lower() not in ['nan', 'none']]
        gpath = ' > '.join(group_parts) if group_parts else 'Ungrouped'
        parts = []
        for c in all_cols:
            v = sv(row.get(c, ''))
            if v:
                parts.append(f'{c}: {v}')
        text = ' | '.join(parts) if parts else label
        # _semantic_text: description VALUES only — no "fieldname:" prefixes, no
        # other fields, URLs stripped. Falls back to the leaf values, then to
        # the label, when no text column has content.
        sem_parts = [sv(row.get(c, '')) for c in text_cols]
        sem_parts = [p for p in sem_parts if p]
        if not sem_parts:
            sem_parts = list(leaf_parts)
        semantic = _URL_RE.sub(' ', ' '.join(sem_parts)) if sem_parts else label
        rows.append({
            '_source_file': source,
            '_row_index': int(i),
            '_leaf_label': label,
            '_leaf_id': f'{gpath}.{label}' if gpath != 'Ungrouped' else label,
            '_group_path': gpath,
            '_text': text,
            '_semantic_text': semantic,
        })
    can = pd.DataFrame(rows, columns=columns)
    if can['_leaf_id'].duplicated().any():
        # Keep the first occurrence as-is; suffix later ones with their count.
        cnt: dict = defaultdict(int)
        ids = []
        for lid in can['_leaf_id']:
            cnt[lid] += 1
            ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
        can['_leaf_id'] = ids
    return can
255
+
256
+ # ─────────────────────────────────────────────────────────────────────────────
257
+ # TAXONOMIZER CORE [TAX β€” Mahmood & Mueller, IEEE TVCG 2019]
258
+ #
259
+ # Taxonomizer builds the taxonomy from a SEMANTIC SPACE (cosine distance between
260
+ # word2vec skip-gram embeddings of attribute names) merged with a DATA SPACE
261
+ # (correlation over the raw values). In a metadata/schema setting we have no
262
+ # raw data values, so we use the semantic space alone (= Taxonomizer with
263
+ # semantic weight 1.0). Because attribute *names* here are opaque codes that go
264
+ # out-of-vocabulary — a limitation the paper explicitly flags (e.g. "BP") — we
265
+ # embed the short attribute name taken from the description so real words carry the meaning (OOV tokens are
266
+ # skipped during averaging). Internal-node labels: the paper uses semi-automatic
267
+ # distributional degree-of-entailment + WordNet synonyms; a baseline must be
268
+ # fully automatic, so we use data-driven contrastive terms drawn from the data.
269
+ # ─────────────────────────────────────────────────────────────────────────────
270
+
271
+ _W2V_STOP = frozenset(
272
+ 'a an the and or but if in on at to of for with by is are was were be '
273
+ 'been being have has had do does did will would could should may might '
274
+ 'shall can this that these those i you he she it we they me him her us '
275
+ 'them my your his her its our their what which who whom when where why '
276
+ 'how all each every few more most other some such no not only same so '
277
+ 'than too very just because as until while'.split()
278
+ )
279
+
280
@st.cache_resource(show_spinner=False)
def _load_w2v():
    """Load the pre-trained word-vector model through gensim's downloader.

    Loads 'glove-wiki-gigaword-100': 100-dimensional GloVe vectors, used as a
    readily downloadable stand-in for Taxonomizer's word2vec-Wikipedia-dim128
    model. GloVe is not a skip-gram model.

    Returns:
        The loaded gensim model, or None after showing an st.error message when
        gensim cannot be imported or the model cannot be loaded.

    NOTE(review): the result is wrapped in st.cache_resource, so a None
    returned after a transient failure is presumably cached as well — confirm
    whether a retry path is needed.
    """
    try:
        import gensim.downloader as api
    except Exception as e:
        # Broad on purpose: a broken install can fail with more than ImportError.
        st.error(
            f'Could not load Word2Vec model: {e}\n\n'
            'Run: pip install gensim and restart the app.'
        )
        return None
    try:
        return api.load('glove-wiki-gigaword-100')
    except Exception as e:
        # gensim imported fine, so this is a download/cache problem, not a
        # missing package.
        st.error(
            f'Could not load Word2Vec model: {e}\n\n'
            'The model (~128 MB) is downloaded automatically on first use; '
            'check the network connection and the gensim-data cache, then '
            'restart the app.'
        )
        return None
298
+
299
def _tokenize(label: str) -> list[str]:
    """Split *label* into lower-case alphabetic tokens, dropping tokens of one
    or two letters and stop words."""
    letters_only = re.sub(r'[^a-zA-Z]+', ' ', label)
    kept = []
    for tok in letters_only.lower().split():
        if len(tok) <= 2 or tok in _W2V_STOP:
            continue
        kept.append(tok)
    return kept
302
+
303
def attribute_name(text: str) -> str:
    """Return the attribute's short NAME — what Taxonomizer embeds [TAX §3.2].

    Descriptions are expected to look like '<name>: <full sentence>', sometimes
    prefixed by a marker such as 'KEY: <name>: …'. The text is split on ':' and
    newlines and the first non-empty clause that is not a pure marker (every
    word 2–6 characters long and upper-case) is returned. If every clause is
    empty or a marker, the whole stripped text is returned instead.
    """
    def _is_marker(clause: str) -> bool:
        return all(2 <= len(word) <= 6 and word.isupper() for word in clause.split())

    text = str(text)
    clauses = (part.strip() for part in re.split(r'[:\n]', text))
    first_real = next((c for c in clauses if c and not _is_marker(c)), None)
    return text.strip() if first_real is None else first_real
320
+
321
def embed_labels_w2v(labels: list[str], model) -> np.ndarray:
    """Embed each label as the mean word vector of its in-vocabulary tokens
    [TAX §4.1].

    *model* must support `token in model`, `model[token]` and `.vector_size`.
    Labels with no in-vocabulary token keep an all-zero row. Non-zero rows are
    L2-normalised, so cosine distance equals 1 - dot product.
    """
    dim = model.vector_size
    matrix = np.zeros((len(labels), dim), dtype=np.float32)
    for row, label in enumerate(labels):
        known = [model[tok] for tok in _tokenize(label) if tok in model]
        if not known:
            continue
        matrix[row] = np.mean(known, axis=0)
    lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
    lengths[lengths == 0] = 1.0  # leave all-zero rows untouched by the division
    return matrix / lengths
338
+
339
def _cluster(X: np.ndarray, k: int) -> np.ndarray:
    """Cut a Ward-linkage agglomerative clustering of *X* into *k* clusters.

    Ward linkage minimises within-cluster variance, which favours balanced
    clusters and avoids the chaining seen with average/single linkage (tiny
    clusters peeled off one giant residual). On L2-normalised vectors the
    Euclidean distance is proportional to the square root of cosine distance.
    Returns one cluster label per row of *X*.
    """
    model = AgglomerativeClustering(n_clusters=k, linkage='ward')
    return model.fit_predict(X)
349
+
350
def best_k(X: np.ndarray, n: int, k_min: int = 2, k_max: int = 8) -> int:
    """Pick the cluster count in [k_min, min(k_max, n - 1)] with the highest
    silhouette score.

    Returns 1 when that range is empty (the node is too small to split) or
    when no candidate produced a usable silhouette score.
    """
    upper = min(k_max, n - 1)
    if upper < k_min:
        return 1
    winner, winner_score = 1, -1.0
    for k in range(k_min, upper + 1):
        assignment = _cluster(X, k)
        if len(set(assignment)) < 2:
            continue
        try:
            score = silhouette_score(X, assignment)
        except Exception:
            # a k that cannot be scored is skipped
            continue
        if score > winner_score:
            winner, winner_score = k, score
    return winner
371
+
372
+ def _doc_freq(texts: list[str]) -> Counter:
373
+ """Document frequency: how many member texts each content word appears in."""
374
+ c: Counter = Counter()
375
+ for t in texts:
376
+ for w in set(_tokenize(t)):
377
+ c[w] += 1
378
+ return c
379
+
380
def cluster_term_label(member_texts: list[str], sibling_texts: list[str],
                       used: set, vocab=None, top_n: int = 2) -> str:
    """Label a cluster with the content words most characteristic of it.

    Each candidate word is scored by contrastive document frequency: its share
    of member texts minus its share of sibling texts. The top_n positively
    scored words are title-cased and joined with ' / '.

    Args:
        member_texts: texts of the cluster's own members.
        sibling_texts: texts of the other members under the same parent.
        used: lower-case terms that must not be reused.
        vocab: optional container of known words; when given, only words
            found in it are eligible (opaque codes are filtered out), except
            in the last-resort fallback.
        top_n: maximum number of terms in the label.

    Returns:
        The label, or 'Group' when no term qualifies. If no word scores
        positively, the most frequent unused member word is used — first
        restricted to vocab, then unrestricted.
    """
    def is_real_word(word: str) -> bool:
        return vocab is None or word in vocab

    member_total = max(len(member_texts), 1)
    sibling_total = max(len(sibling_texts), 1)
    inside = _doc_freq(member_texts)
    outside = _doc_freq(sibling_texts)

    contrast: dict[str, float] = {}
    for word, hits in inside.items():
        eligible = word not in used and len(word) > 2 and is_real_word(word)
        if not eligible:
            continue
        share_in = hits / member_total
        # ignore single-occurrence noise unless the term is widely shared
        if hits < 2 and share_in < 0.5:
            continue
        contrast[word] = share_in - outside.get(word, 0) / sibling_total

    ranked = sorted(contrast, key=lambda w: -contrast[w])[:top_n]
    picks = [w for w in ranked if contrast[w] > 0]

    if not picks:
        # fallback: most frequent shared real word, then any shared token
        for must_be_word in (True, False):
            fallback = next(
                (w for w, _ in inside.most_common()
                 if w not in used and len(w) > 2
                 and (not must_be_word or is_real_word(w))),
                None,
            )
            if fallback is not None:
                picks = [fallback]
                break

    if not picks:
        return 'Group'
    return ' / '.join(p.title() for p in picks)
426
+
427
+ # ─────────────────────────────────────────────────────────────────────────────
428
+ # HIERARCHY CONSTRUCTION [TAX + GON]
429
+ # ─────────────────────────────────────────────────────────────────────────────
430
+ def _nmap(nodes: list) -> dict:
431
+ return {int(n['id']): n for n in nodes}
432
+
433
+ def _next_id(nodes: list) -> int:
434
+ return max((int(n['id']) for n in nodes), default=0) + 1
435
+
436
def _add_child(nodes: list, parent_id: int, child_id: int):
    """Append *child_id* to the parent's 'related' list unless already there.

    Does nothing when no node has id *parent_id*. The parent's 'related' entry
    is replaced by a new list.
    """
    parent = _nmap(nodes).get(int(parent_id))
    if parent is None:
        return
    child = int(child_id)
    children = list(parent.get('related', []))
    if child not in children:
        children.append(child)
    parent['related'] = children
445
+
446
+ def _make_agg(nid: int, name: str, desc: str = '') -> dict:
447
+ return {'id': int(nid), 'name': str(name), 'related': [],
448
+ 'type': 'aggregation', 'isShown': True, 'desc': desc, 'dtype': 'determine'}
449
+
450
def _leaf_ids(nodes: list, nid: int) -> list:
    """Return the ids of all 'attribute' nodes reachable from node *nid*.

    Follows 'related' links depth-first; ids appear in first-visit order with
    duplicates removed. Unknown ids contribute nothing.
    """
    by_id = _nmap(nodes)

    def walk(node_id):
        node = by_id.get(int(node_id))
        if not node:
            return
        if node.get('type') == 'attribute':
            yield int(node_id)
            return
        for child in node.get('related', []):
            yield from walk(int(child))

    return list(dict.fromkeys(walk(nid)))
464
+
465
def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
                    max_depth: int = 3, min_cluster_size: int = 6,
                    branch_max: int = 8) -> list:
    """Taxonomizer semantic-space construction [TAX].

    Embeds each variable from its short attribute NAME (mean word vector of the
    description's name clause), recursively clusters the embeddings with Ward
    linkage, and labels each internal node with the contrastive content terms
    of its members (data-driven, fully automatic).

    Args:
        can: canonical table with _leaf_label, _leaf_id, _group_path, _text
            and (optionally) _semantic_text columns.
        w2v_model: word-vector model passed to embed_labels_w2v.
        project: name of the root node.
        max_depth: maximum number of aggregation levels below the root.
        min_cluster_size: nodes with at most this many leaves are not split.
        branch_max: upper bound on the number of children per split.

    Returns:
        List of node dicts: the root (id 0), one 'attribute' node per row
        (ids 1..N), then the 'aggregation' nodes. Children are listed by id in
        each node's 'related'.
    """
    # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
    nodes: list = [{'id': 0, 'name': project, 'type': 'root',
                    'dtype': 'root', 'isShown': True, 'related': [], 'desc': 'Root node'}]
    row_to_node: list = []
    embed_list: list[str] = []   # short attribute name → embedding input + labels
    for i, (_, r) in enumerate(can.iterrows(), start=1):
        sem = str(r.get('_semantic_text', '') or r['_leaf_label'])
        name = attribute_name(sem) or str(r['_leaf_label'])
        nodes.append({'id': i, 'name': r['_leaf_label'], 'dtype': 'determine',
                      'related': [], 'isShown': True, 'type': 'attribute',
                      'desc': r['_text'],
                      'metadata': {'leaf_id': r['_leaf_id'], 'group_path': r['_group_path']}})
        row_to_node.append(i)
        embed_list.append(name)
    label_list = embed_list  # the same short names feed the label terms
    row_to_node = np.array(row_to_node)

    # ── semantic-space embeddings [TAX §3.2] ──────────────────────────────────
    emb = embed_labels_w2v(embed_list, w2v_model)   # (N, dim), L2-normalised

    # ── recursive clustering down the Ward dendrogram ─────────────────────────
    def attach_leaves(parent_id: int, idx: np.ndarray):
        for i in idx:
            _add_child(nodes, parent_id, int(row_to_node[i]))

    def recurse(parent_id: int, idx: np.ndarray, depth: int, used: set):
        n = len(idx)
        if n <= min_cluster_size or depth >= max_depth:
            attach_leaves(parent_id, idx)
            return

        sub = emb[idx]
        k_cap = min(branch_max, n - 1)
        # Branching floor: a node with n leaves and `remaining` levels left must
        # fan out enough to fit all its leaves into buckets of ~min_cluster_size
        # by the depth cap, i.e. k >= (n / min_cluster_size) ** (1/remaining).
        # Without this, silhouette keeps picking k=2 on overlapping data (e.g.
        # HCP), giving a near-binary tree that dumps ~100 leaves per bottom node.
        remaining = max(1, max_depth - depth)
        k_floor = int(np.ceil((n / max(min_cluster_size, 1)) ** (1.0 / remaining)))
        k_floor = max(2, min(k_floor, k_cap))
        k = best_k(sub, n, k_min=k_floor, k_max=k_cap)
        if k <= 1:
            k = min(k_floor, k_cap) if n > min_cluster_size else 1
        if k <= 1:
            attach_leaves(parent_id, idx)
            return

        cluster_labels = _cluster(sub, k)
        for c in range(k):
            mask = cluster_labels == c
            members = idx[mask]
            if len(members) == 0:
                continue
            if len(members) == 1:        # don't create singleton internal nodes
                _add_child(nodes, parent_id, int(row_to_node[members[0]]))
                continue
            mset = set(members.tolist())
            member_texts = [label_list[i] for i in members]
            sibling_texts = [label_list[i] for i in idx if i not in mset]
            # data-driven contrastive-term labelling
            # NOTE(review): vocab is not passed, so opaque codes stay eligible
            # as label terms — confirm this is intended.
            label = cluster_term_label(member_texts, sibling_texts, used)
            nid = _next_id(nodes)
            nodes.append(_make_agg(nid, label,
                                   desc=f'Cluster of {len(members)} variables — '
                                        f'label terms: {label}'))
            _add_child(nodes, parent_id, nid)
            # cluster_term_label tests single lower-case tokens against `used`,
            # so record each term of a multi-term label ("A / B") separately;
            # adding the joined string would never match anything.
            label_terms = {t.strip().lower() for t in label.split('/') if t.strip()}
            recurse(nid, members, depth + 1, used | label_terms)

    recurse(0, np.arange(len(can)), 0, set())

    # Normalise child lists: integer ids, duplicates removed, order kept.
    for n in nodes:
        n['related'] = list(dict.fromkeys(int(x) for x in n.get('related', [])))
    return nodes
550
+
551
+ # ─────────────────────────────────────────────────────────────────────────────
552
+ # VISUALISATION
553
+ # ─────────────────────────────────────────────────────────────────────────────
554
+ def _parent_map(nodes: list) -> dict:
555
+ pm: dict = {}
556
+ for n in nodes:
557
+ for c in n.get('related', []):
558
+ if int(c) not in pm:
559
+ pm[int(c)] = int(n['id'])
560
+ return pm
561
+
562
+ # ─────────────────────────────────────────────────────────────────────────────
563
+ # EVALUATION HELPERS
564
+ # ─────────────────────────────────────────────────────────────────────────────
565
def _eval_cluster_assignments(nodes: list, can: pd.DataFrame) -> list[int]:
    """Return, for each row of *can*, the id of the node directly under the
    root (id 0) that contains the row's leaf.

    A leaf attached straight to the root, or one with no parent, maps to its
    own id. Rows whose _leaf_id matches no attribute node map to -1.
    """
    parent_of = _parent_map(nodes)

    def top_level_ancestor(node_id: int) -> int:
        # Climb until the parent is the root (0) or missing (-1).
        parent = parent_of.get(node_id, -1)
        while parent not in (-1, 0):
            node_id = parent
            parent = parent_of.get(node_id, -1)
        return node_id

    node_for_leaf = {}
    for node in nodes:
        if node.get('type') == 'attribute' and 'metadata' in node:
            node_for_leaf[node['metadata']['leaf_id']] = int(node['id'])

    return [top_level_ancestor(node_for_leaf[lid]) if lid in node_for_leaf else -1
            for lid in can['_leaf_id']]
577
+
578
+ def _purity(y_true, y_pred) -> float:
579
+ from collections import Counter
580
+ clusters: dict = {}
581
+ for t, p in zip(y_true, y_pred):
582
+ clusters.setdefault(p, []).append(t)
583
+ correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values())
584
+ return correct / max(len(y_true), 1)
585
+
586
def _structural_stats(nodes: list) -> dict:
    """Summarise the shape of the hierarchy.

    Returns a dict with the number of aggregation nodes, the maximum and mean
    depth of attribute (leaf) nodes, the mean number of children per
    aggregation node, and the percentage of aggregation nodes that have
    exactly one child.
    """
    parent_of = _parent_map(nodes)

    def depth_of(node_id: int) -> int:
        # Number of parent hops until a node with no recorded parent is reached.
        hops = 0
        while node_id in parent_of:
            node_id = parent_of[node_id]
            hops += 1
        return hops

    aggregations = [n for n in nodes if n.get('type') == 'aggregation']
    leaf_depths = [depth_of(int(n['id']))
                   for n in nodes if n.get('type') == 'attribute']
    fanouts = [len(n.get('related', [])) for n in aggregations]
    single_child = fanouts.count(1)

    mean_depth = round(float(np.mean(leaf_depths)), 2) if leaf_depths else 0.0
    mean_fanout = round(float(np.mean(fanouts)), 2) if fanouts else 0.0
    return {
        'n_aggregation_nodes': len(aggregations),
        'max_depth': int(max(leaf_depths, default=0)),
        'avg_leaf_depth': mean_depth,
        'avg_branching_factor': mean_fanout,
        'singleton_nodes_%': round(100.0 * single_child / max(len(aggregations), 1), 1),
    }
605
+
606
+ def _wrap(text: str, width: int = 70) -> str:
607
+ """Wrap long hover text onto multiple <br> lines so it never runs off-screen."""
608
+ import textwrap
609
+ text = str(text).replace('<', '&lt;')
610
+ lines: list = []
611
+ for para in text.split('\n'):
612
+ wrapped = textwrap.wrap(para, width=width) or ['']
613
+ lines.extend(wrapped)
614
+ return '<br>'.join(lines)
615
+
616
def plot_sunburst(nodes: list, max_depth: int = 4) -> go.Figure:
    """Build a Plotly sunburst of the hierarchy.

    Each node becomes one sector.  Its parent comes from the first-parent map
    (node 0 is the root; nodes without a recorded parent are attached to 0),
    and its value is the number of attribute leaves below it, floored at 1.
    ``max_depth`` limits how many rings are shown at once.

    The figure title uses a real em dash; the previous text contained a
    mis-encoded character sequence.
    """
    pm = _parent_map(nodes)
    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n['id'])
        lc = len(_leaf_ids(nodes, nid))
        ids.append(str(nid))
        labels.append(str(n.get('name', ''))[:40])  # keep sector labels short
        parents.append('' if nid == 0 else str(pm.get(nid, 0)))
        values.append(max(1, lc))
        desc = _wrap(n.get('desc', ''))
        hover.append(f'<b>{_wrap(n.get("name",""))}</b><br>Type: {n.get("type","")}'
                     f'<br>Variables: {lc}<br><br>{desc}')
    fig = go.Figure(go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        maxdepth=max_depth, insidetextorientation='radial',
        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
    ))
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
                      title='Click a sector to drill down — click centre to go back')
    return fig
638
+
639
def plot_treemap(nodes: list) -> go.Figure:
    """Build a Plotly treemap of the hierarchy.

    One tile per node; the tile value is the number of attribute leaves below
    the node (at least 1) and the parent comes from the first-parent map, with
    node 0 treated as the root.
    """
    parent_of = _parent_map(nodes)
    ids, labels, parents, values, hover = [], [], [], [], []
    for node in nodes:
        node_id = int(node['id'])
        n_vars = len(_leaf_ids(nodes, node_id))
        parent_ref = str(parent_of.get(node_id, 0)) if node_id != 0 else ''
        description = _wrap(node.get('desc', ''))
        heading = _wrap(node.get("name", ""))

        ids.append(str(node_id))
        labels.append(str(node.get('name', ''))[:40])
        parents.append(parent_ref)
        values.append(max(1, n_vars))
        hover.append(f'<b>{heading}</b><br>Variables: {n_vars}<br>{description}')

    tile_marker = dict(colorscale='Greens', line=dict(width=1, color='white'))
    trace = go.Treemap(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues='total', hovertext=hover, hoverinfo='text',
        textinfo='label+value',
        marker=tile_marker,
    )
    fig = go.Figure(trace)
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
    return fig
659
+
660
+ # ─────────────────────────────────────────────────────────────────────────────
661
+ # NODE-LINK TREE (Reingold–Tilford layout — matches Approach 1 / 2 interface)
662
+ # ─────────────────────────────────────────────────────────────────────────────
663
+ def _bl_node_color(n: dict) -> str:
664
+ t = n.get('type', '')
665
+ if t == 'root': return '#2a7d2a'
666
+ if t == 'attribute': return '#74c476'
667
+ if t == 'collapsed': return '#bbbbbb'
668
+ return '#238b45'
669
+
670
def _display_graph(nodes: list, max_depth: int = 4):
    """Walk the tree from node 0 down to ``max_depth`` and return what to draw.

    Branches that still have children at ``max_depth`` are cut off and replaced
    by a single synthetic ``'collapsed'`` placeholder child that reports how
    many variables are hidden below (the Level-of-Detail control).

    Returns ``(display_nodes, edges)`` where ``edges`` is a list of
    ``(parent_id, child_id)`` tuples.

    The placeholder label uses a real ellipsis character; the previous text
    contained a mis-encoded character sequence.
    """
    m = _nmap(nodes)  # presumably an id -> node lookup; see _nmap
    dnodes: dict = {}
    edges: list = []
    # Placeholder ids count up from 10**9, assumed to be above any real node id.
    counter = 10 ** 9

    def rec(nid, depth):
        nonlocal counter
        n = m.get(int(nid))
        if not n:
            return
        dnodes[int(nid)] = n
        if depth >= max_depth and n.get('related'):
            # Depth limit reached: summarise the whole subtree in one placeholder.
            counter += 1
            cid = counter
            n_leaves = len(_leaf_ids(nodes, nid))
            dnodes[cid] = {'id': cid, 'name': f'… {n_leaves} variables',
                           'type': 'collapsed', 'related': [],
                           'desc': f"Collapsed: {n.get('name')}", 'isShown': True}
            edges.append((int(nid), cid))
            return
        for c in n.get('related', []):
            if int(c) not in m:
                continue  # dangling reference: no such node
            edges.append((int(nid), int(c)))
            rec(int(c), depth + 1)

    rec(0, 0)
    return list(dnodes.values()), edges
701
+
702
+ def _positions(edges: list):
703
+ """Reingold–Tilford style positions: x = depth, y = subtree-aware vertical."""
704
+ H_SCALE, V_SPACE = 3.0, 1.8
705
+ children: dict = defaultdict(list)
706
+ for p, c in edges:
707
+ children[p].append(c)
708
+ pos: dict = {}
709
+ counter = {'v': 0}
710
+
711
+ def rec(nid, depth):
712
+ ch = children.get(nid, [])
713
+ if not ch:
714
+ y = counter['v'] * V_SPACE
715
+ counter['v'] += 1
716
+ pos[nid] = (depth * H_SCALE, y)
717
+ return y
718
+ y = float(np.mean([rec(c, depth + 1) for c in ch]))
719
+ pos[nid] = (depth * H_SCALE, y)
720
+ return y
721
+
722
+ rec(0, 0)
723
+ return pos
724
+
725
def plot_node_link(nodes: list, max_depth: int = 4, show_leaf_labels: bool = False) -> go.Figure:
    """Draw the hierarchy as a left-to-right node-link tree with elbow edges.

    The tree is cut at ``max_depth`` (deeper branches appear as collapsed
    placeholders).  Root, aggregation and collapsed nodes are drawn as large
    labelled markers; all other nodes are small markers that are labelled only
    when ``show_leaf_labels`` is true.  Figure height grows with the number of
    small (leaf) markers, between 700 and 4000 px.
    """
    shown, links = _display_graph(nodes, max_depth)
    coords = _positions(links)

    # One polyline per edge: horizontal, vertical, horizontal; None breaks the line.
    edge_x, edge_y = [], []
    for parent, child in links:
        if parent in coords and child in coords:
            (px, py), (cx, cy) = coords[parent], coords[child]
            mid_x = (px + cx) / 2
            edge_x.extend([px, mid_x, mid_x, cx, None])
            edge_y.extend([py, py, cy, cy, None])
    traces = [go.Scatter(x=edge_x, y=edge_y, mode='lines',
                         line=dict(width=1, color='#c8c8c8'),
                         hoverinfo='skip', showlegend=False)]

    internal = {'x': [], 'y': [], 'text': [], 'color': [], 'hover': []}
    leaves = {'x': [], 'y': [], 'text': [], 'color': [], 'hover': []}
    for node in shown:
        node_id = int(node['id'])
        if node_id not in coords:
            continue
        x, y = coords[node_id]
        n_vars = len(_leaf_ids(nodes, node_id))
        caption = str(node.get('name', node_id))
        tooltip = (f"<b>{_wrap(node.get('name',''))}</b><br>Type: {node.get('type','')}"
                   f"<br>Variables: {n_vars}<br><br>{_wrap(node.get('desc',''))}")
        colour = _bl_node_color(node)
        if node.get('type') in ('root', 'aggregation', 'collapsed'):
            bucket = internal
            text = (caption + (f' ({n_vars})' if n_vars else ''))[:50]
        else:
            bucket = leaves
            text = caption[:40] if show_leaf_labels else ''
        bucket['x'].append(x)
        bucket['y'].append(y)
        bucket['text'].append(text)
        bucket['color'].append(colour)
        bucket['hover'].append(tooltip)

    if internal['x']:
        traces.append(go.Scatter(
            x=internal['x'], y=internal['y'], mode='markers+text',
            text=internal['text'],
            textposition='middle right', hovertext=internal['hover'],
            hoverinfo='text',
            marker=dict(size=16, color=internal['color'],
                        line=dict(color='white', width=2)),
            showlegend=False))
    if leaves['x']:
        traces.append(go.Scatter(
            x=leaves['x'], y=leaves['y'], mode='markers+text',
            text=leaves['text'],
            textposition='middle right', hovertext=leaves['hover'],
            hoverinfo='text',
            marker=dict(size=7, color=leaves['color'], symbol='circle', opacity=0.75,
                        line=dict(color='white', width=1)),
            showlegend=False))

    row_count = max(12, len(leaves['x']))
    fig = go.Figure(traces)
    fig.update_layout(
        height=max(700, min(4000, int(row_count * 32))),
        margin=dict(l=20, r=220, t=30, b=20),
        plot_bgcolor='white', paper_bgcolor='white',
        xaxis=dict(visible=False, fixedrange=False),
        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
        dragmode='pan')
    return fig
788
+
789
+ # ─────────────────────────────────────────────────────────────────────────────
790
+ # INPUT + CONFIGURATION (main area — UX v2: configuration is front-and-centre,
791
+ # not buried in the sidebar; expert knobs live under "Advanced settings")
792
+ # ─────────────────────────────────────────────────────────────────────────────
793
# Upload widget and tuning controls live in the main page area; the expert
# knobs are tucked into a collapsed expander.
st.subheader('Upload metadata')
uploaded = st.file_uploader(
    'Upload a metadata file',
    type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
    accept_multiple_files=False,
)

with st.expander('Advanced settings', expanded=False):
    gc1, gc2 = st.columns(2)

    # Left column: parameters handed to the hierarchy builder.
    with gc1:
        st.markdown('**Taxonomizer**')
        tx_max_depth = st.slider(
            'Max taxonomy depth', min_value=2, max_value=6, value=3, step=1,
            help='How many abstract-to-concrete levels to build')
        tx_min_size = st.slider(
            'Min cluster size', min_value=3, max_value=20, value=6, step=1,
            help='Clusters smaller than this stop splitting (leaves attach directly)')
        tx_branch = st.slider(
            'Max branches per node', min_value=3, max_value=12, value=8, step=1,
            help='Upper bound on clusters per split; the actual number is chosen by silhouette')

    # Right column: how much of the input is used.
    with gc2:
        st.markdown('**Display**')
        max_items = st.slider(
            'Maximum variables', min_value=25, max_value=1200, value=900, step=25,
            help='Cap on variables included (lower only to speed up very large files). '
                 'Default keeps full datasets like HCP (813).')
        group_filter = st.text_input(
            'Row filter (optional)', value='',
            help='Filter rows by contextual path text before building')
817
+
818
+ # ─────────────────────────────────────────────────────────────────────────────
819
+ # MAIN
820
+ # ─────────────────────────────────────────────────────────────────────────────
821
# Landing state: with no file uploaded, explain the method and halt the script
# run here.  The markdown text uses real em dashes, section signs and arrows;
# the previous text contained mis-encoded character sequences.
if not uploaded:
    st.info('Upload a metadata CSV / XLSX / JSON file to begin.')
    st.markdown("""
### Baseline algorithm — Taxonomizer (semantic space)

Based on **Mahmood & Mueller, IEEE TVCG 2019** (Taxonomizer), adapted to a
metadata-only setting. No hardcoded domain patterns, no external APIs.

| Step | Method | Paper |
|------|--------|-------|
| Variable representation | **short attribute name** (description's name clause; codes are OOV) | Taxonomizer §3.2 / §4.1 |
| Embedding | Word2Vec skip-gram — average of word vectors (`glove-wiki-gigaword-100`) | Taxonomizer §3.2 |
| Semantic space | Cosine-distance matrix (no data space — schema has no raw values) | Taxonomizer §3.2 *(adapted)* |
| Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
| Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* |

This page is the pure Taxonomizer-style semantic-space reference method:
variable meanings are embedded and recursively clustered into a hierarchy,
with node labels generated from contrastive terms.

**Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.

**Approach 2** adds NMF/FASTopic aspect discovery + GMM clustering + optional LLM labels.
""")
    st.stop()
846
+
847
path = save_upload(uploaded)


@st.cache_data(show_spinner=False)
def _load_profile(path_str: str):
    """Load the file at ``path_str`` and auto-detect column roles.

    Returns ``(df, cfg, prof)``.  Results are cached by ``st.cache_data`` keyed
    on the path string.  NOTE(review): a re-upload saved under the same path
    would presumably be served from cache — confirm against ``save_upload``.
    """
    df = load_any(Path(path_str))
    cfg, prof = detect_roles(df)
    return df, cfg, prof


with st.spinner('Loading file…'):
    df, auto_cfg, prof = _load_profile(str(path))

st.subheader('File preview')
with st.expander(f'{uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
                 expanded=False):
    st.dataframe(df.head(10), width='stretch')
    score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
                  if c in prof.columns]
    _scores = prof[score_cols]
    # The column list above is filtered by presence, so only sort when the
    # sort key actually made it through (avoids a KeyError on odd profiles).
    if 'leaf_score' in score_cols:
        _scores = _scores.sort_values('leaf_score', ascending=False)
    st.dataframe(_scores, width='stretch')

st.subheader('Confirm column roles')
cols = list(df.columns)
# Scope widget keys to the uploaded file so a NEW file always shows its own
# auto-detected defaults (Streamlit otherwise keeps the previous file's
# selections under a fixed key, which silently overrides the new defaults).
_fk = safe_name(uploaded.name)
with st.expander('Column configuration', expanded=True):
    left, right = st.columns(2)
    with left:
        leaf_cols = st.multiselect(
            'Leaf variable column(s)', cols,
            default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols],
            key=f'leaf_{_fk}')
        group_cols = st.multiselect(
            'Context column(s) (optional)', cols,
            default=[c for c in auto_cfg.get('group_cols', []) if c in cols],
            key=f'group_{_fk}',
            help='Optional contextual columns for display/filtering.')
    with right:
        text_cols = st.multiselect(
            'Text/description column(s)', cols,
            default=[c for c in auto_cfg.get('text_cols', []) if c in cols],
            key=f'text_{_fk}')
        meta_cols = st.multiselect(
            'Metadata/type column(s)', cols,
            default=[c for c in auto_cfg.get('metadata_cols', []) if c in cols],
            key=f'meta_{_fk}')

# Nothing can be built without at least one leaf column.
if not leaf_cols:
    st.error('Choose at least one leaf variable column.')
    st.stop()

cfg = {'leaf_cols': leaf_cols, 'group_cols': group_cols,
       'text_cols': text_cols, 'metadata_cols': meta_cols}
893
+
894
if st.button('Build baseline hierarchy', type='primary'):
    # Load the Word2Vec model (cached after first call).
    with st.spinner('Loading Word2Vec model (first run downloads ~66 MB)…'):
        _w2v = _load_w2v()
    if _w2v is None:
        st.stop()

    with st.spinner('Building hierarchy…'):
        _can = build_canonical(df, cfg, source=Path(uploaded.name).stem)

        if group_filter.strip():
            # regex=False: the filter is free text typed by the user, so
            # characters such as "(" or "[" must match literally instead of
            # raising a regular-expression error.
            _can = _can[_can['_group_path'].str.contains(
                group_filter.strip(), case=False, na=False, regex=False)].copy()

        # Keep only the first max_items rows when the file is larger than the cap.
        if len(_can) > max_items:
            _can = _can.head(max_items).copy()

        _can = _can.reset_index(drop=True)

        if len(_can) < 2:
            st.error('Need at least 2 variables after filtering.')
            st.stop()

        _pname = Path(uploaded.name).stem
        _nodes = build_hierarchy(_can, _w2v, project=_pname,
                                 max_depth=tx_max_depth,
                                 min_cluster_size=tx_min_size,
                                 branch_max=tx_branch)

    # Persist the result so it survives the reruns triggered by later widgets.
    st.session_state['_bl_nodes'] = _nodes
    st.session_state['_bl_can'] = _can
    st.session_state['_bl_project'] = _pname

if '_bl_nodes' not in st.session_state:
    st.info('Configure columns above then click **Build baseline hierarchy**.')
    st.stop()

nodes = st.session_state['_bl_nodes']
can = st.session_state['_bl_can']
project_name = st.session_state['_bl_project']

# Headline numbers shown above the tabs.
_sm = _structural_stats(nodes)
n_leaves = len([n for n in nodes if n['type'] == 'attribute'])
n_internal = len([n for n in nodes if n['type'] == 'aggregation'])

st.divider()
c1, c2, c3, c4 = st.columns(4)
c1.metric('Variables', n_leaves)
c2.metric('Aggregation nodes', n_internal)
c3.metric('Max depth', _sm['max_depth'])
c4.metric('Avg branching', _sm['avg_branching_factor'])
945
+
946
tabs = st.tabs(['Visualization', 'Node detail', 'Canonical table', 'Export', 'Evaluation'])

with tabs[0]:
    # Visualization controls sit above the chart (matches Approach 1 / 2).
    vc1, vc2, vc3 = st.columns([3, 2, 1])
    with vc1:
        viz_mode = st.radio(
            'View mode',
            ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
            horizontal=True, index=0,
            help='Sunburst best for large hierarchies [Taxonomizer]. '
                 'Node-link best for inspecting structure at moderate depth.')
    with vc2:
        display_depth = st.slider('Depth (Level of Detail)', 1, 8, 4, 1,
                                  help='How many levels to reveal at once.')
    with vc3:
        show_leaf_labels = st.checkbox('Leaf labels', value=False,
                                       help='Show variable names on the node-link tree.')
    st.divider()

    if viz_mode == 'Sunburst (drill-down)':
        st.plotly_chart(plot_sunburst(nodes, max_depth=display_depth),
                        width='stretch')
        st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')
    elif viz_mode == 'Treemap':
        st.plotly_chart(plot_treemap(nodes), width='stretch')
    else:
        st.plotly_chart(plot_node_link(nodes, max_depth=display_depth,
                                       show_leaf_labels=show_leaf_labels),
                        width='stretch')

with tabs[1]:
    nm = _nmap(nodes)
    agg_nodes = [n for n in nodes if n['type'] in ('aggregation', 'root')]
    options = [f'{n["name"]} [{len(_leaf_ids(nodes, int(n["id"])))} vars]'
               for n in agg_nodes]
    if options:
        # Select by position rather than by parsing the display label back into
        # a name: two nodes may share a name, and a name may itself contain
        # " [", either of which made the old lookup pick the wrong node.
        sel_idx = st.selectbox('Select a node', range(len(agg_nodes)),
                               format_func=lambda i: options[i])
        sel_node = agg_nodes[sel_idx]
        lids = _leaf_ids(nodes, int(sel_node['id']))
        # Translate node ids into the canonical table's leaf ids.
        leaf_ids_set = {nm[i]['metadata']['leaf_id']
                        for i in lids if i in nm and 'metadata' in nm[i]}
        sub = can[can['_leaf_id'].isin(leaf_ids_set)]
        st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
        st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
                     width='stretch')

with tabs[2]:
    st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), width='stretch')
997
+
998
with tabs[3]:
    _base = safe_name(project_name)
    col1, col2 = st.columns(2)
    with col1:
        st.download_button(
            'Hierarchy JSON',
            data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
            file_name=f'{_base}_baseline_hierarchy.json',
            mime='application/json',
            width='stretch',
        )
    with col2:
        st.download_button(
            'Canonical CSV',
            data=can.to_csv(index=False).encode('utf-8'),
            file_name=f'{_base}_baseline_canonical.csv',
            mime='text/csv',
            width='stretch',
        )

    st.divider()
    # Save directly into outputs/baseline/, two directory levels above this file.
    _out_dir = Path(__file__).resolve().parent.parent / 'outputs' / 'baseline'
    st.markdown('### Save to project folder')
    # The caption uses a real em dash; the previous text contained a
    # mis-encoded character sequence.
    st.caption(
        "The download buttons above go to your browser's Downloads folder (a browser "
        f'restriction). This button instead writes the files into `{_out_dir}` with the '
        'dataset name — convenient for `evaluate_all.py`.'
    )
    if st.button('Save all to outputs/baseline/', type='primary',
                 width='stretch'):
        try:
            _out_dir.mkdir(parents=True, exist_ok=True)
            (_out_dir / f'{_base}_baseline_hierarchy.json').write_text(
                json.dumps(nodes, indent=2, ensure_ascii=False), encoding='utf-8')
            can.to_csv(_out_dir / f'{_base}_baseline_canonical.csv', index=False)
            st.success(f'Saved to `{_out_dir}`:\n\n'
                       f'- {_base}_baseline_hierarchy.json\n'
                       f'- {_base}_baseline_canonical.csv')
        except Exception as _e:
            # UI boundary: report any failure to the user instead of crashing the page.
            st.error(f'Could not save: {_e}')
1039
+
1040
with tabs[4]:
    import hierarchy_eval as he

    # User-facing strings below use real dashes and math symbols; the previous
    # text contained mis-encoded character sequences.
    st.subheader('Hierarchy Quality Evaluation')
    st.caption(
        'No manually curated reference taxonomy is available for this experiment. '
        'The metrics below are reference-free: they assess hierarchy structure, '
        'label coherence and interpretability directly.'
    )

    with st.spinner('Computing reference-free metrics…'):
        tm = he.traco_metrics(nodes)
        npmi = he.npmi_coherence(nodes, can['_text'].tolist())

    # PRIMARY: reference-free hierarchy quality
    st.markdown('#### Primary — reference-free hierarchy quality')
    p1, p2, p3 = st.columns(3)
    p1.metric('Parent–child coherence', tm['pc_coherence'],
              help='TraCo (Wu et al., AAAI 2024). Mean similarity of each node to its parent. '
                   'Higher = children correctly nest under their parent theme.')
    p2.metric('Sibling diversity', tm['sibling_diversity'],
              help='TraCo (Wu et al., AAAI 2024). Mean distance between sibling nodes. '
                   'Higher = siblings are distinct (LOW = redundant/repeated siblings).')
    p3.metric('NPMI label coherence', npmi,
              help='Lau et al., EACL 2014. Whether node-label terms genuinely co-occur in the '
                   'data. Higher = meaningful labels, not arbitrary term salads.')
    st.caption(f'Embedding backend: **{tm["encoder"]}**. '
               'Coherence & diversity ∈ [−1, 1]; NPMI ∈ ≈[−1, 1].')

    # Label-quality proxies (interpretability)
    st.markdown('#### Label quality *(interpretability — reference-free)*')
    lq = he.label_quality(nodes)
    l1, l2, l3 = st.columns(3)
    l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%",
              help='% of internal labels that read as a real concept (short noun '
                   'phrase, WordNet head) rather than a "/"-joined term fragment.')
    l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%",
              help='% of internal labels duplicating a sibling label (lower is better).')
    l3.metric('Avg label words', lq['avg_label_words'],
              help='Mean label length in words (shorter = more name-like).')

    # Structural metrics
    st.markdown('#### Structural statistics')
    sm = he.structural_stats(nodes)
    s1, s2, s3, s4, s5 = st.columns(5)
    s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
    s2.metric('Max leaf depth', sm['max_depth'])
    s3.metric('Avg leaf depth', sm['avg_leaf_depth'])
    s4.metric('Avg branching', sm['avg_branching_factor'])
    s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%",
              help='Aggregation nodes with a single child (sparse-hierarchy indicator)')
1091
+
version2/views/viewer.py ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Metadata Hierarchy Explorer — TFM 2026
3
+ Pre-built results viewer for Baseline, Approach 1, and Approach 2.
4
+
5
+ Rendering faithfully replicates each app's display pipeline:
6
+ - Baseline : raw tree, Greens, Sunburst + Treemap
7
+ - Approach 1 : raw tree, Blues, Sunburst + Treemap + Node-link + Facets
8
+ - Approach 2 : compress one-child chains, Viridis, Sunburst + Treemap + Node-link
9
+
10
+ Level-of-Detail controls (depth, leaf labels, hidden nodes, compress chains)
11
+ match the controls in the individual apps.
12
+ """
13
+ from __future__ import annotations
14
+ import json
15
+ from collections import Counter, defaultdict
16
+ from pathlib import Path
17
+
18
+ import numpy as np
19
+ import plotly.graph_objects as go
20
+ import streamlit as st
21
+
22
+ # Shared method names / descriptions / display config (single source of truth).
23
+ import sys
24
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
25
+ import methods # noqa: E402
26
+
27
+ # Page config is set by the navigation router (demo.py).
28
# Directory holding every saved build: <two levels above this file>/outputs.
ROOT = Path(__file__).resolve().parents[1] / "outputs"

# Default number of levels revealed by the depth controls.
DEFAULT_DEPTH = 7

# ─────────────────────────────────────────────────────────────────────────────
# PRE-BUILT OUTPUT PATHS
# ─────────────────────────────────────────────────────────────────────────────
_BASELINE_DIR = ROOT / "baseline"
_APPROACH_1_DIR = ROOT / "approach_1"
_APPROACH_2_DIR = ROOT / "approach_2"

# method name -> dataset name -> {artifact kind: file path}
PREBUILT = {
    "Baseline": {
        "AI-MIND": {
            "hierarchy": _BASELINE_DIR / "ai-mind-variable-descriptions_in__baseline_hierarchy.json",
        },
        "HCP": {
            "hierarchy": _BASELINE_DIR / "HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json",
        },
    },
    "Approach 1": {
        "AI-MIND": {
            "hierarchy": _APPROACH_1_DIR / "ai-mind-variable-descriptions_in__approach1_hierarchy.json",
            "facets": _APPROACH_1_DIR / "ai-mind-variable-descriptions_in__approach1_facets.json",
        },
        "HCP": {
            "hierarchy": _APPROACH_1_DIR / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
            "facets": _APPROACH_1_DIR / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
        },
    },
    "Approach 2": {
        "AI-MIND": {
            "hierarchy": _APPROACH_2_DIR / "ai-mind-variable-descriptions_in__approach2_lod.json",
        },
        "HCP": {
            "hierarchy": _APPROACH_2_DIR / "HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json",
        },
    },
}
55
+
56
+ # Per-approach rendering config + descriptions now live in methods.py
57
+ # (methods.METHODS[key] carries color / compress / node_link / title / tag / about).
58
+
59
+ # ─────────────────────────────────────────────────────────────────────────────
60
+ # TREE TRANSFORMS (copied from approach_2.py β€” display-only, exact behaviour)
61
+ # ─────────────────────────────────────────────────────────────────────────────
62
+ def _filter_dissolved(nodes: list) -> list:
63
+ drop_ids = {int(n["id"]) for n in nodes
64
+ if n.get("type") == "dissolved" or n.get("isShown") is False}
65
+ if not drop_ids:
66
+ return nodes
67
+ out = []
68
+ for n in nodes:
69
+ if int(n["id"]) in drop_ids:
70
+ continue
71
+ m = dict(n)
72
+ m["related"] = [int(c) for c in n.get("related", []) if int(c) not in drop_ids]
73
+ out.append(m)
74
+ return out
75
+
76
def compress_one_child_chains(nodes: list) -> list:
    """Merge every aggregation node with its only child when that child is
    also an aggregation node.

    Example: 'DMS → DMS Recommended Standard' becomes the single node
    'DMS / DMS Recommended Standard'.  The merged node keeps the parent's id,
    takes all other fields from the child, and joins the two names with
    `` / `` and the two descriptions with `` | ``.  Hidden nodes are filtered
    out first, and merging repeats until no such chain remains.
    """
    visible = _filter_dissolved(nodes)
    by_id = {int(node["id"]): dict(node) for node in visible}

    def sole_aggregation_child(node):
        """Return the id of the node's single aggregation child, else None."""
        if node.get("type") != "aggregation":
            return None
        kids = node.get("related", [])
        if len(kids) != 1:
            return None
        only = int(kids[0])
        if by_id.get(only, {}).get("type") != "aggregation":
            return None
        return only

    merged_something = True
    while merged_something:
        merged_something = False
        # Iterate over a snapshot: the mapping is modified on the first merge,
        # after which the scan restarts from scratch.
        for node_id, node in list(by_id.items()):
            child_id = sole_aggregation_child(node)
            if child_id is None:
                continue
            child = by_id[child_id]
            fused = dict(child)
            fused["id"] = node_id
            fused["name"] = f"{node['name']} / {child['name']}"
            fused["desc"] = f"{node.get('desc', '')} | {child.get('desc', '')}"
            by_id[node_id] = fused
            by_id.pop(child_id, None)
            # Re-point any remaining reference to the absorbed child.
            for other in by_id.values():
                other["related"] = [node_id if int(c) == child_id else int(c)
                                    for c in other.get("related", [])]
            merged_something = True
            break
    return list(by_id.values())
109
+
110
+ # ─────────────────────────────────────────────────────────────────────────────
111
+ # RENDER HELPERS (DAG-safe value map — copied from approach_2.py)
112
+ # ─────────────────────────────────────────────────────────────────────────────
113
+ def _leaf_ids(nodes: list, nid: int) -> list:
114
+ m = {int(n["id"]): n for n in nodes}
115
+ out = []
116
+ def rec(x):
117
+ n = m.get(int(x))
118
+ if not n:
119
+ return
120
+ if n.get("type") == "attribute":
121
+ out.append(int(x)); return
122
+ for c in n.get("related", []):
123
+ rec(int(c))
124
+ rec(nid)
125
+ return list(dict.fromkeys(out))
126
+
127
+ def _parent_map(nodes: list) -> dict:
128
+ pm = {}
129
+ for n in nodes:
130
+ for c in n.get("related", []):
131
+ if int(c) not in pm:
132
+ pm[int(c)] = int(n["id"])
133
+ return pm
134
+
135
+ def _tree_value_map(nodes: list, pm: dict) -> dict:
136
+ kids = {}
137
+ for child, par in pm.items():
138
+ kids.setdefault(int(par), []).append(int(child))
139
+ nodemap = {int(n["id"]): n for n in nodes}
140
+ memo = {}
141
+ def count(nid: int) -> int:
142
+ if nid in memo:
143
+ return memo[nid]
144
+ memo[nid] = 1
145
+ n = nodemap.get(nid)
146
+ if n is not None and n.get("type") == "attribute":
147
+ memo[nid] = 1
148
+ return 1
149
+ ch = kids.get(nid, [])
150
+ v = sum(count(c) for c in ch) if ch else 1
151
+ memo[nid] = max(1, v)
152
+ return memo[nid]
153
+ return {nid: count(nid) for nid in nodemap}
154
+
155
+ def _wrap_hover(text: str, width: int = 80) -> str:
156
+ import textwrap as _tw
157
+ s = str(text or "")
158
+ if not s:
159
+ return ""
160
+ lines = []
161
+ for raw_line in s.split("\n"):
162
+ lines.extend(_tw.wrap(raw_line, width=width) or [""])
163
+ return "<br>".join(lines)
164
+
165
def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
    """Build a Plotly sunburst of the hierarchy using colour scale ``color``.

    Hidden nodes are filtered out first.  Sector sizes come from the
    single-parent value map, so each node's size is the total of its children
    under that parent assignment; the hover text reports the number of
    attribute leaves reachable from the node.  ``max_depth`` limits how many
    rings are shown at once.

    The figure title uses a real em dash; the previous text contained a
    mis-encoded character sequence.
    """
    nodes = _filter_dissolved(nodes)
    pm = _parent_map(nodes)
    vm = _tree_value_map(nodes, pm)
    ids, labels, parents, values, hover = [], [], [], [], []
    for n in nodes:
        nid = int(n["id"])
        lc = len(_leaf_ids(nodes, nid))
        ids.append(str(nid))
        labels.append(str(n.get("name", ""))[:40])  # keep sector labels short
        parents.append("" if nid == 0 else str(pm.get(nid, 0)))
        values.append(vm.get(nid, 1))
        hover.append(f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
                     f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc', ''))}")
    fig = go.Figure(go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues="total", hovertext=hover, hoverinfo="text",
        maxdepth=max_depth, insidetextorientation="radial",
        marker=dict(colorscale=color, line=dict(width=1, color="white"))))
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
                      title=dict(text="Click sector to drill down — click centre to go back",
                                 font=dict(size=13), x=0.5))
    return fig
188
+
189
def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
    """Build a Plotly treemap of the hierarchy using colour scale ``color``.

    Hidden nodes are removed first; tile sizes come from the single-parent
    value map and the hover text shows the number of attribute leaves
    reachable from each node.  ``max_depth`` limits the levels drawn at once.
    """
    visible = _filter_dissolved(nodes)
    parent_of = _parent_map(visible)
    sizes = _tree_value_map(visible, parent_of)

    ids, labels, parents, values, hover = [], [], [], [], []
    for node in visible:
        node_id = int(node["id"])
        n_vars = len(_leaf_ids(visible, node_id))
        parent_ref = str(parent_of.get(node_id, 0)) if node_id != 0 else ""

        ids.append(str(node_id))
        labels.append(str(node.get("name", ""))[:40])
        parents.append(parent_ref)
        values.append(sizes.get(node_id, 1))
        hover.append(f"<b>{node.get('name', '')}</b><br>Variables: {n_vars}<br>"
                     f"{_wrap_hover(node.get('desc', ''))}")

    tile_marker = dict(colorscale=color, line=dict(width=1, color="white"))
    trace = go.Treemap(
        ids=ids, labels=labels, parents=parents, values=values,
        branchvalues="total", hovertext=hover, hoverinfo="text",
        textinfo="label+value", maxdepth=max_depth,
        marker=tile_marker)
    fig = go.Figure(trace)
    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
    return fig
210
+
211
+ # ─────────────────────────────────────────────────────────────────────────────
212
+ # NODE-LINK TREE (Reingold-Tilford layout — copied from approach_2.py)
213
+ # ─────────────────────────────────────────────────────────────────────────────
214
def _node_color(n: dict) -> str:
    """Return the hex marker colour for a node, chosen by its ``type`` field.

    ``root``, ``attribute`` and ``collapsed`` each have a dedicated colour;
    any other (or missing) type gets the neutral grey.
    """
    node_type = n.get("type", "")
    palette = (
        ("root", "#c44e52"),
        ("attribute", "#4C72B0"),
        ("collapsed", "#bbbbbb"),
    )
    return next((hex_code for kind, hex_code in palette if node_type == kind),
                "#8C8C8C")
220
+
221
def _display_graph(nodes: list, max_depth: int, show_hidden: bool):
    """Select the nodes and edges to draw in the node-link view.

    Walks the tree from node id 0 following each node's ``related`` list.
    Nodes flagged ``isShown is False`` are skipped unless ``show_hidden`` is
    set (the root is always kept). A node reached at ``max_depth`` that still
    has children is not expanded; a synthetic ``collapsed`` placeholder child
    summarising its variable count is attached instead.

    Returns:
        ``(display_nodes, edges)`` where ``display_nodes`` is a list of node
        dicts (including placeholders) and ``edges`` is a list of
        ``(parent_id, child_id)`` tuples.
    """
    by_id = {int(node["id"]): node for node in nodes}
    shown: dict = {}
    links: list = []
    # Placeholder ids start above 10**9 so they sit far from the real ids.
    synthetic_id = 10 ** 9

    def suppressed(node) -> bool:
        return (not show_hidden) and node.get("isShown") is False

    def walk(node_id, level):
        nonlocal synthetic_id
        node = by_id.get(int(node_id))
        if not node:
            return
        if level > 0 and suppressed(node):
            return
        shown[int(node_id)] = node

        if level >= max_depth and node.get("related"):
            synthetic_id += 1
            stub_id = synthetic_id
            leaf_total = len(_leaf_ids(nodes, node_id))
            shown[stub_id] = {"id": stub_id, "name": f"… {leaf_total} variables",
                              "type": "collapsed", "related": [],
                              "desc": f"Collapsed: {node.get('name')}"}
            links.append((int(node_id), stub_id))
            return

        for child_ref in node.get("related", []):
            child = by_id.get(int(child_ref))
            if not child or suppressed(child):
                continue
            links.append((int(node_id), int(child_ref)))
            walk(int(child_ref), level + 1)

    walk(0, 0)
    return list(shown.values()), links
255
+
256
def _positions(edges: list):
    """Compute ``{node_id: (x, y)}`` layout coordinates for a tree.

    ``edges`` is a list of ``(parent, child)`` pairs; the walk starts at node 0.
    x is the node's depth times a fixed horizontal scale. Leaves take
    consecutive evenly spaced y slots in traversal order, and every internal
    node is placed at the mean y of its children.
    """
    H_SCALE, V_SPACE = 3.0, 1.8
    kids: dict = defaultdict(list)
    for parent, child in edges:
        kids[parent].append(child)

    coords: dict = {}
    next_row = 0  # index of the next free leaf slot

    def place(node_id, level):
        nonlocal next_row
        below = kids.get(node_id, [])
        if below:
            y = float(np.mean([place(child, level + 1) for child in below]))
        else:
            y = next_row * V_SPACE
            next_row += 1
        coords[node_id] = (level * H_SCALE, y)
        return y

    place(0, 0)
    return coords
278
+
279
def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_labels: bool):
    """Build a left-to-right node-link tree figure.

    Args:
        nodes: Hierarchy node dicts; filtered with ``_filter_dissolved`` first.
        max_depth: Depth at which subtrees are replaced by a collapsed stub
            (see ``_display_graph``).
        show_hidden: Include nodes flagged ``isShown is False``.
        show_leaf_labels: Draw text labels next to ``attribute`` nodes.

    Returns:
        A ``go.Figure`` with three scatter traces: connectors, leaf markers
        and non-leaf markers.
    """
    nodes = _filter_dissolved(nodes)
    drawn_nodes, edges = _display_graph(nodes, max_depth, show_hidden)
    pos = _positions(edges)

    # Elbow connectors: horizontal to the midpoint, vertical, horizontal again.
    # The trailing None breaks the polyline between consecutive edges.
    edge_x, edge_y = [], []
    for parent, child in edges:
        if parent in pos and child in pos:
            (x0, y0), (x1, y1) = pos[parent], pos[child]
            mid_x = (x0 + x1) / 2
            edge_x.extend((x0, mid_x, mid_x, x1, None))
            edge_y.extend((y0, y0, y1, y1, None))

    def new_bucket():
        return {"x": [], "y": [], "text": [], "color": [], "hover": []}

    leaf_pts, inner_pts = new_bucket(), new_bucket()
    for node in drawn_nodes:
        node_id = int(node["id"])
        if node_id not in pos:
            continue
        x, y = pos[node_id]
        var_count = len(_leaf_ids(nodes, node_id))
        short_name = str(node.get("name", ""))[:32]
        tooltip = (f"<b>{node.get('name', '')}</b><br>Type: {node.get('type', '')}<br>"
                   f"Variables: {var_count}")
        if node.get("type") == "attribute":
            bucket = leaf_pts
            text = short_name if show_leaf_labels else ""
        else:
            bucket = inner_pts
            text = short_name
        bucket["x"].append(x)
        bucket["y"].append(y)
        bucket["color"].append(_node_color(node))
        bucket["text"].append(text)
        bucket["hover"].append(tooltip)

    traces = [
        go.Scatter(x=edge_x, y=edge_y, mode="lines",
                   line=dict(width=1, color="#c8c8c8"),
                   hoverinfo="skip", showlegend=False),
        go.Scatter(
            x=leaf_pts["x"], y=leaf_pts["y"],
            mode="markers+text" if show_leaf_labels else "markers",
            text=leaf_pts["text"], textposition="middle right", textfont=dict(size=9),
            marker=dict(size=7, color=leaf_pts["color"], line=dict(width=0.5, color="white")),
            hovertext=leaf_pts["hover"], hoverinfo="text", showlegend=False),
        go.Scatter(
            x=inner_pts["x"], y=inner_pts["y"], mode="markers+text", text=inner_pts["text"],
            textposition="middle right", textfont=dict(size=10),
            marker=dict(size=13, color=inner_pts["color"], line=dict(width=1, color="white")),
            hovertext=inner_pts["hover"], hoverinfo="text", showlegend=False),
    ]

    # Figure height grows with the larger of the two marker groups (16 px each).
    row_count = max(len(leaf_pts["y"]), len(inner_pts["y"]), 1)
    fig = go.Figure(traces)
    fig.update_layout(
        height=max(600, row_count * 16),
        margin=dict(l=10, r=140, t=10, b=10),
        xaxis=dict(visible=False), yaxis=dict(visible=False),
        plot_bgcolor="white",
    )
    return fig
336
+
337
+ # ─────────────────────────────────────────────────────────────────────────────
338
+ # STATS / SAFE RENDERING
339
+ # ─────────────────────────────────────────────────────────────────────────────
340
def _tree_depth(nodes: list) -> int:
    """Return the maximum depth reachable from node 0 (the root is depth 0).

    Dissolved nodes are dropped first; children listed in ``related`` that are
    not present after filtering are ignored.
    """
    by_id = {int(node["id"]): node for node in _filter_dissolved(nodes)}

    def deepest_from(node_id, level):
        deepest = level
        for child in by_id.get(int(node_id), {}).get("related", []):
            if int(child) in by_id:
                deepest = max(deepest, deepest_from(int(child), level + 1))
        return deepest

    return deepest_from(0, 0)
352
+
353
def safe_render_depth(nodes: list, requested: int) -> int:
    """Cap the initial sunburst/treemap render depth for large hierarchies.

    Plotly sunburst/treemap charts silently render blank when asked to draw
    too many sectors at once (large hierarchies like HCP). Only the *initial*
    depth is limited; the chart stays drillable by clicking, so no data is
    lost.

    Returns ``requested`` capped at 3 when more than 400 nodes remain after
    ``_filter_dissolved``, capped at 4 when more than 150 remain, and
    unchanged otherwise.
    """
    node_total = len(_filter_dissolved(nodes))
    # Thresholds are checked from largest to smallest; the first match wins.
    for threshold, depth_cap in ((400, 3), (150, 4)):
        if node_total > threshold:
            return min(requested, depth_cap)
    return requested
363
+
364
+ # ─────────────────────────────────────────────────────────────────────────────
365
+ # IO
366
+ # ─────────────────────────────────────────────────────────────────────────────
367
@st.cache_data(show_spinner=False)
def _load_json(path_str: str):
    """Read and parse the UTF-8 JSON file at ``path_str`` (cached by Streamlit)."""
    with open(path_str, encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload
371
+
372
def _read_bytes(path_str: str) -> bytes:
    """Return the raw contents of the file at ``path_str``."""
    return Path(path_str).read_bytes()
375
+
376
@st.cache_data(show_spinner=False)
def _outputs_zip(root_str: str) -> bytes:
    """Return an in-memory ZIP of every file under ``root_str``.

    Entries are added in sorted path order with DEFLATE compression. Archive
    names are relative to the *parent* of the root, so the root folder's own
    name is the top-level directory inside the archive.
    """
    import io
    import zipfile

    base = Path(root_str)
    stream = io.BytesIO()
    with zipfile.ZipFile(stream, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in sorted(base.rglob("*")):
            if not entry.is_file():
                continue
            archive.write(entry, arcname=entry.relative_to(base.parent).as_posix())
    return stream.getvalue()
387
+
388
def count_nodes(nodes: list) -> tuple[int, int]:
    """Return ``(leaf_count, aggregation_count)`` after dropping dissolved nodes.

    Leaves are nodes whose ``type`` is ``"attribute"``; aggregations are nodes
    whose ``type`` is ``"aggregation"``. Other types are not counted.
    """
    kinds = [node.get("type") for node in _filter_dissolved(nodes)]
    return kinds.count("attribute"), kinds.count("aggregation")
393
+
394
def concept_aligned_pct(nodes: list) -> float | None:
    """Percentage of aggregation nodes carrying any concept/provenance field.

    A node counts as aligned when at least one of the provenance-style keys
    below holds a truthy value. Returns ``None`` both when there are no
    aggregation nodes and when none of them is aligned.
    """
    provenance_keys = (
        "concept_provenance",
        "label_provenance",
        "structure_provenance",
        "provenance",
        "concept",
        "source_evidence",
    )
    aggregations = [node for node in _filter_dissolved(nodes)
                    if node.get("type") == "aggregation"]
    if not aggregations:
        return None
    aligned = sum(1 for node in aggregations
                  if any(node.get(key) for key in provenance_keys))
    if not aligned:
        return None
    return 100.0 * aligned / len(aggregations)
407
+
408
def label_source_counts(nodes: list) -> Counter:
    """Tally aggregation nodes by the recorded source of their label.

    The source is read from ``label_provenance`` (falling back to
    ``concept_provenance``) under ``label_source`` or ``source``. Nodes with
    no provenance mapping are counted as ``"not recorded"``; nodes with a
    mapping but neither key are counted as ``"other"``.
    """
    tally = Counter()
    aggregations = (node for node in _filter_dissolved(nodes)
                    if node.get("type") == "aggregation")
    for node in aggregations:
        prov = node.get("label_provenance") or node.get("concept_provenance") or {}
        source = prov.get("label_source") or prov.get("source")
        if not source:
            source = "other" if prov else "not recorded"
        tally[str(source)] += 1
    return tally
417
+
418
def structure_route_counts(nodes: list) -> Counter:
    """Tally aggregation nodes by the recorded route that produced the branch.

    The route is read from ``structure_provenance`` under ``route``,
    ``route_used`` or ``aspect_method`` (first truthy value wins). Nodes
    without structure provenance are counted as ``"not recorded"``; nodes with
    provenance but no route key are counted as ``"other"``.
    """
    tally = Counter()
    aggregations = (node for node in _filter_dissolved(nodes)
                    if node.get("type") == "aggregation")
    for node in aggregations:
        prov = node.get("structure_provenance") or {}
        route = prov.get("route") or prov.get("route_used") or prov.get("aspect_method")
        if not route:
            route = "other" if prov else "not recorded"
        tally[str(route)] += 1
    return tally
427
+
428
@st.cache_data(show_spinner=False)
def _leaf_counts_by_dataset(dataset_name: str) -> dict:
    """Return ``{method_key: leaf_count}`` for one dataset across all methods.

    Methods in ``PREBUILT`` with no hierarchy path for the dataset, or whose
    file does not exist, are omitted from the result.
    """
    totals = {}
    for method_key, datasets in PREBUILT.items():
        hierarchy_path = datasets.get(dataset_name, {}).get("hierarchy")
        if not hierarchy_path or not hierarchy_path.exists():
            continue
        kept = _filter_dissolved(_load_json(str(hierarchy_path)))
        totals[method_key] = len([node for node in kept
                                  if node.get("type") == "attribute"])
    return totals
438
+
439
def output_manifest(paths: dict, nodes: list) -> list[dict]:
    """Describe which artifacts and provenance fields an output contains.

    Args:
        paths: Path mapping for the selected output; only the truthiness of
            ``paths.get("facets")`` is used.
        nodes: Raw hierarchy node dicts (not filtered here).

    Returns:
        A list of rows, each a dict with ``Artifact``, ``Status`` and
        ``Purpose`` keys, in a fixed order.
    """
    def meta(node) -> dict:
        return node.get("metadata") or {}

    leaf_nodes = [node for node in nodes if node.get("type") == "attribute"]
    agg_nodes = [node for node in nodes if node.get("type") == "aggregation"]

    with_leaf_id = len([node for node in leaf_nodes if meta(node).get("leaf_id")])
    with_row_index = len([node for node in leaf_nodes
                          if meta(node).get("row_index") is not None])

    # Stable leaf ids take precedence; row indices are only a fallback signal.
    if with_leaf_id:
        leaf_status = f"present for {with_leaf_id}/{len(leaf_nodes)} leaves"
    elif with_row_index:
        leaf_status = f"row-index only for {with_row_index}/{len(leaf_nodes)} leaves"
    else:
        leaf_status = "not recorded"

    label_prov = any(node.get("label_provenance") or node.get("concept_provenance")
                     for node in agg_nodes)
    struct_prov = any(node.get("structure_provenance") for node in agg_nodes)

    rows = (
        ("Hierarchy JSON", "present",
         "Tree topology, labels, leaf metadata"),
        ("Stable leaf IDs", leaf_status,
         "Cross-method matching and downstream evaluation scripts"),
        ("Facet JSON", "present" if paths.get("facets") else "not applicable",
         "Approach 1 parallel Castanet views"),
        ("Label provenance", "present" if label_prov else "not recorded",
         "Audit how internal labels were produced"),
        ("Structure provenance", "present" if struct_prov else "not recorded",
         "Audit how internal branches were produced"),
        ("Canonical CSV", "not bundled in demo outputs",
         "Available by rerunning a builder and exporting"),
        ("Manual reference taxonomy", "not available",
         "No accuracy claim is made"),
    )
    return [{"Artifact": artifact, "Status": status, "Purpose": purpose}
            for artifact, status, purpose in rows]
469
+
470
+ # ─────────────────────────────────────────────────────────────────────────────
471
+ # SIDEBAR
472
+ # ─────────────────────────────────────────────────────────────────────────────
473
# Sidebar holds only context + repo link — selection lives in the main area.
st.sidebar.caption("Results are pre-built from the thesis experiments. To run on your "
                   "own data, open a Build page and upload a CSV.")
st.sidebar.markdown("[GitHub Repository]"
                    "(https://github.com/RoophaSharon/tfm_metadata_hierarchy_2026)")
479
+
480
+ # ─────────────────────────────────────────────────────────────────────────────
481
+ # MAIN
482
+ # ─────────────────────────────────────────────────────────────────────────────
483
# Method + dataset selection sit in the MAIN area (professor's UX note:
# configuration belongs front-and-centre, not buried in the sidebar).
title_options = [methods.title(k) for k in methods.METHOD_ORDER]
sc1, sc2 = st.columns([3, 2])
with sc1:
    selected_title = st.selectbox("Select method", title_options, index=0)
with sc2:
    dataset = st.radio("Select dataset", ["AI-MIND", "HCP"], index=0, horizontal=True)
# The selectbox shows descriptive titles; map the choice back to the method key.
approach = methods.TITLE_TO_KEY[selected_title]

cfg = methods.METHODS[approach]
color = cfg["color"]

st.title(f"{cfg['title']} — {dataset}")

# Look the output up defensively: a method/dataset combination that is not
# bundled should reach the "not found" message below, not raise KeyError.
paths = PREBUILT.get(approach, {}).get(dataset, {})
hier_path = paths.get("hierarchy")
if hier_path is None or not hier_path.exists():
    st.error(f"Pre-built result not found: `{hier_path}`")
    st.stop()

raw_nodes = _load_json(str(hier_path))

# Headline counts for the selected output.
leaves, aggs = count_nodes(raw_nodes)
c1, c2, c3 = st.columns(3)
c1.metric("Leaf Variables", leaves)
c2.metric("Aggregation Nodes", aggs)
c3.metric("Total Nodes", leaves + aggs)

# Warn when this output covers fewer variables than the largest pre-built
# output for the same dataset, so comparisons are read with coverage in mind.
dataset_counts = _leaf_counts_by_dataset(dataset)
max_leaves = max(dataset_counts.values(), default=leaves)
if leaves < max_leaves:
    st.warning(
        f"This pre-built {cfg['title']} result contains {leaves}/{max_leaves} "
        f"{dataset} variables. Treat cross-method comparisons for this dataset "
        "as coverage-aware unless the output is regenerated with the same row cap."
    )
520
+
521
+ # ── Build summary (collapsed) ────────────────────────────────────────────────
522
+ facet_path = paths.get("facets")
523
+ n_facets = None
524
+ if facet_path is not None and facet_path.exists():
525
+ try:
526
+ n_facets = len(_load_json(str(facet_path)))
527
+ except Exception:
528
+ n_facets = None
529
+
530
+ with st.expander("Build summary", expanded=False):
531
+ bs1, bs2, bs3, bs4 = st.columns(4)
532
+ bs1.metric("Variables", leaves)
533
+ bs2.metric("Internal nodes", aggs)
534
+ bs3.metric("Tree depth", _tree_depth(raw_nodes))
535
+ bs4.metric("Facets", n_facets if n_facets is not None else "β€”")
536
+ pct = concept_aligned_pct(raw_nodes)
537
+ if pct is not None:
538
+ st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
539
+ st.caption(
540
+ f"Source file: `{hier_path.name}` Β· "
541
+ f"Approach: **{approach}** Β· Dataset: **{dataset}**. "
542
+ "Tree topology and labels are reproduced exactly from the pre-built "
543
+ "thesis output (the algorithms are not re-run in this viewer)."
544
+ )
545
+ st.dataframe(output_manifest(paths, raw_nodes), width="stretch", hide_index=True)
546
+
547
+ # ── Provenance / traceability (method-aware) ─────────────────────────────────
548
+ # Approach 2 records the richest provenance; Approach 1 records concept-alignment
549
+ # coverage; the Baseline records none β€” and that contrast is itself a finding.
550
+ with st.expander("Label & structure provenance", expanded=False):
551
+ if approach == "Baseline":
552
+ st.caption(
553
+ "The baseline records **no provenance**: node labels are unsupervised "
554
+ "contrastive terms derived from each cluster, not traceable to a concept "
555
+ "source or a generation route. Traceability increases across the methods "
556
+ "(Baseline β†’ Approach 1 β†’ Approach 2) β€” itself a comparison point."
557
+ )
558
+ else:
559
+ pct = concept_aligned_pct(raw_nodes)
560
+ if pct is not None:
561
+ st.caption(f"Aggregation nodes carrying a concept / label source: **{pct:.1f}%**")
562
+ label_counts = label_source_counts(raw_nodes)
563
+ route_counts = structure_route_counts(raw_nodes)
564
+ has_detail = (any(k != "not recorded" for k in label_counts)
565
+ or any(k != "not recorded" for k in route_counts))
566
+ if has_detail:
567
+ pc1, pc2 = st.columns(2)
568
+ with pc1:
569
+ st.markdown("**Label sources**")
570
+ st.dataframe(
571
+ [{"Source": k, "Nodes": v} for k, v in label_counts.most_common()],
572
+ width="stretch", hide_index=True,
573
+ )
574
+ with pc2:
575
+ st.markdown("**Structure routes**")
576
+ st.dataframe(
577
+ [{"Route": k, "Nodes": v} for k, v in route_counts.most_common()],
578
+ width="stretch", hide_index=True,
579
+ )
580
+ elif pct is None:
581
+ st.info("No provenance fields were recorded in this output JSON.")
582
+
583
+ # ── Downloads ────────────────────────────────────────────────────────────────
584
+ d1, d2 = st.columns(2)
585
+ with d1:
586
+ st.download_button("Hierarchy JSON", data=_read_bytes(str(hier_path)),
587
+ file_name=hier_path.name, mime="application/json",
588
+ width='stretch')
589
+ with d2:
590
+ st.download_button("All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
591
+ file_name="metadata_hierarchy_outputs.zip",
592
+ mime="application/zip", width='stretch')
593
+
594
+ st.markdown("---")
595
+
596
+ # ── Level-of-Detail controls (above chart β€” matches the apps) ────────────────
597
+ view_options = ["Sunburst (drill-down)", "Treemap"]
598
+ if cfg["node_link"]:
599
+ view_options.append("Node-link tree")
600
+
601
+ if cfg["compress"]:
602
+ vc1, vc2, vc3, vc4, vc5 = st.columns([2.4, 2, 1, 1, 1.2])
603
+ else:
604
+ vc1, vc2, vc3, vc4 = st.columns([2.4, 2, 1, 1])
605
+ vc5 = None
606
+
607
+ with vc1:
608
+ viz_mode = st.radio("View mode", view_options, horizontal=True, index=0,
609
+ help="Sunburst best for large hierarchies [Taxonomizer]. "
610
+ "Node-link best for moderate-depth structure inspection.")
611
+ with vc2:
612
+ depth = st.slider("Depth (Level of Detail)", 1, 9, DEFAULT_DEPTH, 1,
613
+ help="Maximum tree levels shown. Set high to see the whole "
614
+ "hierarchy, lower to peel back to the interior.")
615
+ with vc3:
616
+ show_leaf_labels = st.checkbox("Leaf labels", value=False)
617
+ with vc4:
618
+ show_hidden = st.checkbox("Hidden nodes", value=False)
619
+ if vc5 is not None:
620
+ with vc5:
621
+ compress_chains = st.checkbox("Compress chains", value=True,
622
+ help="Merge one-child aggregation chains "
623
+ '(e.g. "DMS β†’ DMS Recommended Standard") for '
624
+ "display. Export JSON keeps original structure.")
625
+ else:
626
+ compress_chains = False
627
+
628
+ st.divider()
629
+
630
# Optionally merge one-child chains for display only; raw_nodes stays untouched.
if compress_chains:
    display_nodes = compress_one_child_chains(raw_nodes)
else:
    display_nodes = raw_nodes

# Sunburst/treemap may start at a reduced depth on large hierarchies.
render_depth = safe_render_depth(display_nodes, depth)
if viz_mode in {"Sunburst (drill-down)", "Treemap"} and render_depth < depth:
    st.caption(
        f"Initial render capped at depth {render_depth} for performance; "
        "the chart remains drillable."
    )

if viz_mode == "Sunburst (drill-down)":
    main_fig = plot_sunburst(display_nodes, color, render_depth)
elif viz_mode == "Treemap":
    main_fig = plot_treemap(display_nodes, color, render_depth)
else:
    # The node-link view uses the requested depth directly (no cap).
    main_fig = plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels)
st.plotly_chart(main_fig, width='stretch')
645
+
646
+ # ── Facets (Approach 1 only) ─────────────────────────────────────────────────
647
+ if facet_path is not None and facet_path.exists():
648
+ st.markdown("---")
649
+ st.subheader("Parallel facets")
650
+ facets = _load_json(str(facet_path))
651
+ names = list(facets.keys())
652
+ if not names:
653
+ st.info("No facets available for this dataset.")
654
+ else:
655
+ sel = st.selectbox("Select facet", names)
656
+ fnodes = facets[sel]
657
+ ft1, ft2 = st.tabs(["Sunburst", "Treemap"])
658
+ with ft1:
659
+ st.plotly_chart(plot_sunburst(fnodes, color, depth), width='stretch')
660
+ with ft2:
661
+ st.plotly_chart(plot_treemap(fnodes, color), width='stretch')
views/run_baseline.py CHANGED
@@ -5,7 +5,7 @@
5
  #
6
  # Pipeline:
7
  # 1. Load metadata file (CSV / TSV / XLSX / JSON)
8
- # 2. Detect column roles (leaf / group / text / meta) β€” same as Approach 1 / 2
9
  # 3. Build canonical schema (incl. _semantic_text = description values only)
10
  # 4. Embed each variable (code + description) via Word2Vec skip-gram and build
11
  # the cosine-distance semantic space [TAX Β§3.2]
@@ -28,7 +28,7 @@
28
  # the bare code goes out-of-vocabulary (a limitation the paper flags,
29
  # e.g. "BP"). Taxonomizer embeds the NAME ("a few words"), not a
30
  # paragraph; using the short name (not the full description prose) keeps
31
- # task-distinctive words from being diluted by shared explanatory text.
32
  # 3. Fully-automatic labels β€” the paper's labelling is semi-automatic
33
  # (human picks from suggestions); a baseline must be non-interactive, so
34
  # we use data-driven contrastive terms from each cluster's members.
@@ -186,7 +186,7 @@ def detect_roles(df: pd.DataFrame) -> tuple:
186
  meta = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
187
  .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
188
  # Representation columns (decimal/precision/unit/type/format/…) must never
189
- # become structural levels β€” force them out of group and into metadata. [GON][TAX]
190
  _META_SUBSTR_BLOCK = {
191
  'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
192
  'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
@@ -306,9 +306,9 @@ def attribute_name(text: str) -> str:
306
  paragraph. Descriptions here are formatted '<name>: <full sentence>' (some
307
  prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
308
  that is not a pure all-caps marker. Embedding this short name β€” rather than
309
- the full description prose β€” keeps the task-distinctive words from being
310
- diluted by shared explanatory text, so the taxonomy groups far more by theme
311
- (e.g. DMS / PAL / SWM) without ever touching the group column.
312
  """
313
  text = str(text)
314
  for clause in re.split(r'[:\n]', text):
@@ -470,7 +470,7 @@ def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
470
  average) β€” the name clause of the description, as Taxonomizer specifies.
471
  Recursively clusters via balanced Ward linkage β€” the semantic-space
472
  dendrogram. Labels each internal node with the contrastive content terms of
473
- its members (data-driven, fully automatic). No group column, no hardcoding.
474
  """
475
  # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
476
  nodes: list = [{'id': 0, 'name': project, 'type': 'root',
@@ -807,8 +807,8 @@ with st.sidebar:
807
  max_items = st.slider('Maximum variables', 25, 1200, 900, 25,
808
  help='Cap on variables included (lower only to speed up very large files). '
809
  'Default keeps full datasets like HCP (813).')
810
- group_filter = st.text_input('Group filter (optional)', value='',
811
- help='Filter rows whose group path contains this text')
812
 
813
  # ─────────────────────────────────────────────────────────────────────────────
814
  # MAIN
@@ -829,8 +829,9 @@ if not uploaded:
829
  | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette β†’ dendrogram | Taxonomizer Β§4.2 |
830
  | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer Β§4.3 *(adapted)* |
831
 
832
- The group column is **not** used for construction, so the recovered taxonomy
833
- can be fairly evaluated against it (NMI / ARI / Purity in the Evaluation tab).
 
834
 
835
  **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
836
 
@@ -853,7 +854,7 @@ st.subheader('Step 1 β€” File preview')
853
  with st.expander(f'{uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
854
  expanded=False):
855
  st.dataframe(df.head(10), width='stretch')
856
- score_cols = [c for c in ['column', 'leaf_score', 'group_score', 'text_score', 'metadata_score']
857
  if c in prof.columns]
858
  st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
859
  width='stretch')
@@ -869,8 +870,9 @@ with st.expander('Column configuration', expanded=True):
869
  with left:
870
  leaf_cols = st.multiselect('Leaf variable column(s)', cols,
871
  default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
872
- group_cols = st.multiselect('Group/task column(s)', cols,
873
- default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}')
 
874
  with right:
875
  text_cols = st.multiselect('Text/description column(s)', cols,
876
  default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
@@ -982,11 +984,11 @@ with tabs[1]:
982
  for i in lids if i in nm and 'metadata' in nm[i]}
983
  sub = can[can['_leaf_id'].isin(leaf_ids_set)]
984
  st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
985
- st.dataframe(sub[['_leaf_label', '_group_path', '_text']].reset_index(drop=True),
986
  width='stretch')
987
 
988
  with tabs[2]:
989
- st.dataframe(can, width='stretch')
990
 
991
  with tabs[3]:
992
  _base = safe_name(project_name)
@@ -1035,9 +1037,9 @@ with tabs[4]:
1035
 
1036
  st.subheader('Hierarchy Quality Evaluation')
1037
  st.caption(
1038
- 'The group column is a *construction input* (GonΓ§alves text object), so it '
1039
- 'cannot serve as ground truth. The primary metrics below are **reference-free** '
1040
- 'β€” they assess the hierarchy itself, with no gold standard.'
1041
  )
1042
 
1043
  with st.spinner('Computing reference-free metrics…'):
@@ -1082,14 +1084,3 @@ with tabs[4]:
1082
  s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%",
1083
  help='Aggregation nodes with a single child (sparse-hierarchy indicator)')
1084
 
1085
- # ── Held-out group recovery (VALID β€” group column not used in construction) ─
1086
- st.markdown('#### Held-out group recovery *(valid β€” group column not used)*')
1087
- st.caption(
1088
- 'The baseline never uses the group column (it embeds only attribute '
1089
- 'names), so this is a **valid held-out** recovery score. ARI and AMI are '
1090
- 'chance-corrected; NMI and Purity are omitted as inflated by over-splitting.'
1091
- )
1092
- gp = he.group_preservation(nodes, can)
1093
- g1, g2 = st.columns(2)
1094
- g1.metric('ARI', gp['ARI'], help='Adjusted Rand Index (chance-corrected).')
1095
- g2.metric('AMI', gp['AMI'], help='Adjusted Mutual Information (chance-corrected).')
 
5
  #
6
  # Pipeline:
7
  # 1. Load metadata file (CSV / TSV / XLSX / JSON)
8
+ # 2. Detect column roles (leaf / context / text / meta) — same as Approach 1 / 2
9
  # 3. Build canonical schema (incl. _semantic_text = description values only)
10
  # 4. Embed each variable (code + description) via Word2Vec skip-gram and build
11
  # the cosine-distance semantic space [TAX Β§3.2]
 
28
  # the bare code goes out-of-vocabulary (a limitation the paper flags,
29
  # e.g. "BP"). Taxonomizer embeds the NAME ("a few words"), not a
30
  # paragraph; using the short name (not the full description prose) keeps
31
+ # domain-specific words from being diluted by shared explanatory text.
32
  # 3. Fully-automatic labels β€” the paper's labelling is semi-automatic
33
  # (human picks from suggestions); a baseline must be non-interactive, so
34
  # we use data-driven contrastive terms from each cluster's members.
 
186
  meta = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
187
  .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
188
  # Representation columns (decimal/precision/unit/type/format/…) must never
189
+ # become structural levels; prefer them as metadata. [GON][TAX]
190
  _META_SUBSTR_BLOCK = {
191
  'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
192
  'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
 
306
  paragraph. Descriptions here are formatted '<name>: <full sentence>' (some
307
  prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
308
  that is not a pure all-caps marker. Embedding this short name β€” rather than
309
+ the full description prose β€” keeps the domain-specific words from being
310
+ diluted by shared explanatory text, so the taxonomy clusters more by theme
311
+ (e.g. DMS / PAL / SWM).
312
  """
313
  text = str(text)
314
  for clause in re.split(r'[:\n]', text):
 
470
  average) β€” the name clause of the description, as Taxonomizer specifies.
471
  Recursively clusters via balanced Ward linkage β€” the semantic-space
472
  dendrogram. Labels each internal node with the contrastive content terms of
473
+ its members (data-driven, fully automatic). No hardcoding.
474
  """
475
  # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
476
  nodes: list = [{'id': 0, 'name': project, 'type': 'root',
 
807
  max_items = st.slider('Maximum variables', 25, 1200, 900, 25,
808
  help='Cap on variables included (lower only to speed up very large files). '
809
  'Default keeps full datasets like HCP (813).')
810
+ group_filter = st.text_input('Row filter (optional)', value='',
811
+ help='Filter rows by contextual path text before building')
812
 
813
  # ─────────────────────────────────────────────────────────────────────────────
814
  # MAIN
 
829
  | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette β†’ dendrogram | Taxonomizer Β§4.2 |
830
  | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer Β§4.3 *(adapted)* |
831
 
832
+ This page is the pure Taxonomizer-style semantic-space reference method:
833
+ variable meanings are embedded and recursively clustered into a hierarchy,
834
+ with node labels generated from contrastive terms.
835
 
836
  **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
837
 
 
854
  with st.expander(f'{uploaded.name} ({len(df):,} rows, {len(df.columns)} columns)',
855
  expanded=False):
856
  st.dataframe(df.head(10), width='stretch')
857
+ score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
858
  if c in prof.columns]
859
  st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
860
  width='stretch')
 
870
  with left:
871
  leaf_cols = st.multiselect('Leaf variable column(s)', cols,
872
  default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
873
+ group_cols = st.multiselect('Context column(s) (optional)', cols,
874
+ default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}',
875
+ help='Optional contextual columns for display/filtering.')
876
  with right:
877
  text_cols = st.multiselect('Text/description column(s)', cols,
878
  default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
 
984
  for i in lids if i in nm and 'metadata' in nm[i]}
985
  sub = can[can['_leaf_id'].isin(leaf_ids_set)]
986
  st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
987
+ st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
988
  width='stretch')
989
 
990
  with tabs[2]:
991
+ st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), width='stretch')
992
 
993
  with tabs[3]:
994
  _base = safe_name(project_name)
 
1037
 
1038
  st.subheader('Hierarchy Quality Evaluation')
1039
  st.caption(
1040
+ 'No manually curated reference taxonomy is available for this experiment. '
1041
+ 'The metrics below are reference-free: they assess hierarchy structure, '
1042
+ 'label coherence and interpretability directly.'
1043
  )
1044
 
1045
  with st.spinner('Computing reference-free metrics…'):
 
1084
  s5.metric('Singleton nodes', f"{sm['singleton_nodes_%']}%",
1085
  help='Aggregation nodes with a single child (sparse-hierarchy indicator)')
1086