Spaces:

rufasharon
/

metadata_hierarchy_tfm2026

Sleeping

RoophaSharon Claude Opus 4.8 committed 2 days ago

Commit

dd46f48

1 Parent(s): fef0152

UX v2: move configuration to main area, deploy version2 to the Space

Professor feedback was that approach configuration belongs in the main area,
not the sidebar. New version2/ app implements this across every page:

- Demo View: method/dataset selectors moved into the main area.
- Build pages: upload + config moved to main area; expert knobs in a
collapsed "Advanced settings" expander; sidebar is navigation-only.
- Descriptive method names via methods.py (single source of truth),
e.g. "Approach 1: External Concept Alignment Hierarchy".
- Removed emoji, numbered "1." / "Step N -" headers; method-aware
provenance panel; Baseline gets all three visualizations.
- Output save paths aligned to version2/outputs so saved builds appear
in the Demo View.
- Robustness: broaden torch import guards (OSError on broken installs).
- Mirror v1 fixes into version2 (Approach 1 max-vars default 3000,
Approach 2 prune_empty_aggregations).

Dockerfile now runs version2/demo.py so the live Space serves version 2.
Root v1 files are left untouched as a fallback.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (37) hide show

Dockerfile +1 -1
approach_1.py +4 -1
approach_2.py +34 -0
baseline.py +21 -30
version2/.gitignore +26 -0
version2/Dockerfile +32 -0
version2/LICENSE +21 -0
version2/README.md +205 -0
version2/approach_1.py +0 -0
version2/approach_2.py +0 -0
version2/baseline.py +1086 -0
version2/data/HCP_S1200_DataDictionary_Oct_30_2023.csv +0 -0
version2/data/ai-mind-variable-descriptions(in).csv +109 -0
version2/data/dictionary_harmonized_categories.csv +571 -0
version2/data/tidytuesday_json_val.json +1911 -0
version2/demo.py +47 -0
version2/hierarchy_eval.py +622 -0
version2/launcher.py +137 -0
version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_canonical.csv +0 -0
version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_concept_labels.csv +159 -0
version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json +0 -0
version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json +0 -0
version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_canonical.csv +109 -0
version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_concept_labels.csv +21 -0
version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json +0 -0
version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json +0 -0
version2/outputs/approach_2/HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json +0 -0
version2/outputs/approach_2/ai-mind-variable-descriptions_in__approach2_lod.json +2716 -0
version2/outputs/baseline/HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json +0 -0
version2/outputs/baseline/ai-mind-variable-descriptions_in__baseline_hierarchy.json +1876 -0
version2/requirements.txt +17 -0
version2/views/methods.py +77 -0
version2/views/run_approach_1.py +0 -0
version2/views/run_approach_2.py +0 -0
version2/views/run_baseline.py +1091 -0
version2/views/viewer.py +661 -0
views/run_baseline.py +21 -30

Dockerfile CHANGED Viewed

@@ -26,7 +26,7 @@ EXPOSE 7860
 # XSRF/CORS disabled so file uploads work behind the Hugging Face proxy/iframe
 # (otherwise the uploader returns "AxiosError: 403"). Standard for HF Spaces.
-CMD ["streamlit", "run", "demo.py", \
      "--server.port=7860", "--server.address=0.0.0.0", \
      "--server.enableXsrfProtection=false", \
      "--server.enableCORS=false"]

 # XSRF/CORS disabled so file uploads work behind the Hugging Face proxy/iframe
 # (otherwise the uploader returns "AxiosError: 403"). Standard for HF Spaces.
+CMD ["streamlit", "run", "version2/demo.py", \
      "--server.port=7860", "--server.address=0.0.0.0", \
      "--server.enableXsrfProtection=false", \
      "--server.enableCORS=false"]

approach_1.py CHANGED Viewed

@@ -3826,7 +3826,10 @@ with st.sidebar:
     st.header('2. Generation')
     project    = st.text_input('Root / project name', value='metadata_project')
-    max_rows   = st.slider('Max variables', 10, 3000, 600, 10)
     merge_files = st.checkbox('Merge uploaded files', value=True)
     n_clusters = st.slider('Max clusters per group', 2, 16, 8, 1,
                             help='Maximum number of concept sub-groups per top-level group.')

     st.header('2. Generation')
     project    = st.text_input('Root / project name', value='metadata_project')
+    max_rows   = st.slider('Max variables', 10, 3000, 3000, 10,
+                            help='Variables to build from (uses df.head). Default '
+                                 'covers full datasets like HCP (~813); lower it '
+                                 'only to sample a subset for speed.')
     merge_files = st.checkbox('Merge uploaded files', value=True)
     n_clusters = st.slider('Max clusters per group', 2, 16, 8, 1,
                             help='Maximum number of concept sub-groups per top-level group.')

approach_2.py CHANGED Viewed

@@ -1958,6 +1958,33 @@ def enforce_single_parent(nodes: list) -> int:
                 removed += 1
     return removed
 def mine_phrase_slots(texts: list,
                        text_col_names: Optional[list] = None,
                        min_phrase_count: int = 2,
@@ -3037,6 +3064,12 @@ def build_dynamic_lod_tree(can: pd.DataFrame,
     except Exception:
         n_reparented = 0
     # Annotate the root with post-build statistics
     if nodes and nodes[0].get('type') == 'root':
         nodes[0]['post_build_stats'] = {
@@ -3044,6 +3077,7 @@ def build_dynamic_lod_tree(can: pd.DataFrame,
             'low_quality_nodes_dissolved':   int(n_dissolved),
             'group_prefix_labels_stripped':  int(n_stripped),
             'dag_links_removed':             int(n_reparented),
         }
     # Deduplicate children

                 removed += 1
     return removed
+def prune_empty_aggregations(nodes: list) -> int:
+    """
+    POST-BUILD PASS 5 — drop aggregation nodes that ended up with no children.
+    `enforce_single_parent` can empty a shallow aggregation when all of its
+    variables were kept under a deeper/sibling parent (e.g. 'RVP Response
+    Latency' losing every leaf to a more specific group).  An empty category
+    node is export noise — it renders as a blank sector and has no members.
+    Iteratively removes childless aggregation nodes and detaches them from
+    their parents (removal can empty a parent in turn).  Root and attribute
+    nodes are never touched.  Returns the number of nodes removed.
+    """
+    removed = 0
+    while True:
+        node_map = {int(n['id']): n for n in nodes}
+        empties = {int(n['id']) for n in nodes
+                   if n.get('type') == 'aggregation' and not n.get('related')}
+        if not empties:
+            break
+        nodes[:] = [n for n in nodes if int(n['id']) not in empties]
+        for n in nodes:
+            if any(int(c) in empties for c in n.get('related', [])):
+                n['related'] = [int(c) for c in n['related'] if int(c) not in empties]
+        removed += len(empties)
+    return removed
 def mine_phrase_slots(texts: list,
                        text_col_names: Optional[list] = None,
                        min_phrase_count: int = 2,
     except Exception:
         n_reparented = 0
+    # ── POST-BUILD PASS 5 — drop aggregation nodes left childless by PASS 4 ───
+    try:
+        n_empty_pruned = prune_empty_aggregations(nodes)
+    except Exception:
+        n_empty_pruned = 0
     # Annotate the root with post-build statistics
     if nodes and nodes[0].get('type') == 'root':
         nodes[0]['post_build_stats'] = {
             'low_quality_nodes_dissolved':   int(n_dissolved),
             'group_prefix_labels_stripped':  int(n_stripped),
             'dag_links_removed':             int(n_reparented),
+            'empty_aggregations_pruned':     int(n_empty_pruned),
         }
     # Deduplicate children

baseline.py CHANGED Viewed

@@ -5,7 +5,7 @@
 #
 # Pipeline:
 #   1. Load metadata file (CSV / TSV / XLSX / JSON)
-#   2. Detect column roles (leaf / group / text / meta) — same as Approach 1 / 2
 #   3. Build canonical schema (incl. _semantic_text = description values only)
 #   4. Embed each variable (code + description) via Word2Vec skip-gram and build
 #      the cosine-distance semantic space [TAX §3.2]
@@ -28,7 +28,7 @@
 #        the bare code goes out-of-vocabulary (a limitation the paper flags,
 #        e.g. "BP").  Taxonomizer embeds the NAME ("a few words"), not a
 #        paragraph; using the short name (not the full description prose) keeps
-#        task-distinctive words from being diluted by shared explanatory text.
 #     3. Fully-automatic labels — the paper's labelling is semi-automatic
 #        (human picks from suggestions); a baseline must be non-interactive, so
 #        we use data-driven contrastive terms from each cluster's members.
@@ -186,7 +186,7 @@ def detect_roles(df: pd.DataFrame) -> tuple:
     meta  = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
              .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
     # Representation columns (decimal/precision/unit/type/format/…) must never
-    # become structural levels — force them out of group and into metadata. [GON][TAX]
     _META_SUBSTR_BLOCK = {
         'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
         'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
@@ -306,9 +306,9 @@ def attribute_name(text: str) -> str:
     paragraph.  Descriptions here are formatted '<name>: <full sentence>' (some
     prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
     that is not a pure all-caps marker.  Embedding this short name — rather than
-    the full description prose — keeps the task-distinctive words from being
-    diluted by shared explanatory text, so the taxonomy groups far more by theme
-    (e.g. DMS / PAL / SWM) without ever touching the group column.
     """
     text = str(text)
     for clause in re.split(r'[:\n]', text):
@@ -470,7 +470,7 @@ def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
     average) — the name clause of the description, as Taxonomizer specifies.
     Recursively clusters via balanced Ward linkage — the semantic-space
     dendrogram.  Labels each internal node with the contrastive content terms of
-    its members (data-driven, fully automatic).  No group column, no hardcoding.
     """
     # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
     nodes: list = [{'id': 0, 'name': project, 'type': 'root',
@@ -807,8 +807,8 @@ with st.sidebar:
     max_items     = st.slider('Maximum variables', 25, 1200, 900, 25,
                               help='Cap on variables included (lower only to speed up very large files). '
                                    'Default keeps full datasets like HCP (813).')
-    group_filter  = st.text_input('Group filter (optional)', value='',
-                                  help='Filter rows whose group path contains this text')
 # ─────────────────────────────────────────────────────────────────────────────
 # MAIN
@@ -829,8 +829,9 @@ if not uploaded:
     | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
     | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* |
-    The group column is **not** used for construction, so the recovered taxonomy
-    can be fairly evaluated against it (NMI / ARI / Purity in the Evaluation tab).
     **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
@@ -853,7 +854,7 @@ st.subheader('Step 1 — File preview')
 with st.expander(f'{uploaded.name}  ({len(df):,} rows, {len(df.columns)} columns)',
                  expanded=False):
     st.dataframe(df.head(10), use_container_width=True)
-    score_cols = [c for c in ['column', 'leaf_score', 'group_score', 'text_score', 'metadata_score']
                   if c in prof.columns]
     st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
                  use_container_width=True)
@@ -869,8 +870,9 @@ with st.expander('Column configuration', expanded=True):
     with left:
         leaf_cols = st.multiselect('Leaf variable column(s)', cols,
             default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
-        group_cols = st.multiselect('Group/task column(s)', cols,
-            default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}')
     with right:
         text_cols = st.multiselect('Text/description column(s)', cols,
             default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
@@ -982,11 +984,11 @@ with tabs[1]:
                             for i in lids if i in nm and 'metadata' in nm[i]}
             sub = can[can['_leaf_id'].isin(leaf_ids_set)]
             st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
-            st.dataframe(sub[['_leaf_label', '_group_path', '_text']].reset_index(drop=True),
                          use_container_width=True)
 with tabs[2]:
-    st.dataframe(can, use_container_width=True)
 with tabs[3]:
     _base = safe_name(project_name)
@@ -1035,9 +1037,9 @@ with tabs[4]:
     st.subheader('Hierarchy Quality Evaluation')
     st.caption(
-        'The group column is a *construction input* (Gonçalves text object), so it '
-        'cannot serve as ground truth. The primary metrics below are **reference-free** '
-        '— they assess the hierarchy itself, with no gold standard.'
     )
     with st.spinner('Computing reference-free metrics…'):
@@ -1082,14 +1084,3 @@ with tabs[4]:
     s5.metric('Singleton nodes',   f"{sm['singleton_nodes_%']}%",
               help='Aggregation nodes with a single child (sparse-hierarchy indicator)')
-    # ── Held-out group recovery (VALID — group column not used in construction) ─
-    st.markdown('#### Held-out group recovery *(valid — group column not used)*')
-    st.caption(
-        'The baseline never uses the group column (it embeds only attribute '
-        'names), so this is a **valid held-out** recovery score. ARI and AMI are '
-        'chance-corrected; NMI and Purity are omitted as inflated by over-splitting.'
-    )
-    gp = he.group_preservation(nodes, can)
-    g1, g2 = st.columns(2)
-    g1.metric('ARI', gp['ARI'], help='Adjusted Rand Index (chance-corrected).')
-    g2.metric('AMI', gp['AMI'], help='Adjusted Mutual Information (chance-corrected).')

 #
 # Pipeline:
 #   1. Load metadata file (CSV / TSV / XLSX / JSON)
+#   2. Detect column roles (leaf / context / text / meta) — same as Approach 1 / 2
 #   3. Build canonical schema (incl. _semantic_text = description values only)
 #   4. Embed each variable (code + description) via Word2Vec skip-gram and build
 #      the cosine-distance semantic space [TAX §3.2]
 #        the bare code goes out-of-vocabulary (a limitation the paper flags,
 #        e.g. "BP").  Taxonomizer embeds the NAME ("a few words"), not a
 #        paragraph; using the short name (not the full description prose) keeps
+#        domain-specific words from being diluted by shared explanatory text.
 #     3. Fully-automatic labels — the paper's labelling is semi-automatic
 #        (human picks from suggestions); a baseline must be non-interactive, so
 #        we use data-driven contrastive terms from each cluster's members.
     meta  = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
              .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
     # Representation columns (decimal/precision/unit/type/format/…) must never
+    # become structural levels; prefer them as metadata. [GON][TAX]
     _META_SUBSTR_BLOCK = {
         'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
         'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
     paragraph.  Descriptions here are formatted '<name>: <full sentence>' (some
     prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
     that is not a pure all-caps marker.  Embedding this short name — rather than
+    the full description prose — keeps the domain-specific words from being
+    diluted by shared explanatory text, so the taxonomy clusters more by theme
+    (e.g. DMS / PAL / SWM).
     """
     text = str(text)
     for clause in re.split(r'[:\n]', text):
     average) — the name clause of the description, as Taxonomizer specifies.
     Recursively clusters via balanced Ward linkage — the semantic-space
     dendrogram.  Labels each internal node with the contrastive content terms of
+    its members (data-driven, fully automatic). No hardcoding.
     """
     # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
     nodes: list = [{'id': 0, 'name': project, 'type': 'root',
     max_items     = st.slider('Maximum variables', 25, 1200, 900, 25,
                               help='Cap on variables included (lower only to speed up very large files). '
                                    'Default keeps full datasets like HCP (813).')
+    group_filter  = st.text_input('Row filter (optional)', value='',
+                                  help='Filter rows by contextual path text before building')
 # ─────────────────────────────────────────────────────────────────────────────
 # MAIN
     | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
     | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* |
+    This page is the pure Taxonomizer-style semantic-space reference method:
+    variable meanings are embedded and recursively clustered into a hierarchy,
+    with node labels generated from contrastive terms.
     **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
 with st.expander(f'{uploaded.name}  ({len(df):,} rows, {len(df.columns)} columns)',
                  expanded=False):
     st.dataframe(df.head(10), use_container_width=True)
+    score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
                   if c in prof.columns]
     st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
                  use_container_width=True)
     with left:
         leaf_cols = st.multiselect('Leaf variable column(s)', cols,
             default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
+        group_cols = st.multiselect('Context column(s) (optional)', cols,
+            default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}',
+            help='Optional contextual columns for display/filtering.')
     with right:
         text_cols = st.multiselect('Text/description column(s)', cols,
             default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
                             for i in lids if i in nm and 'metadata' in nm[i]}
             sub = can[can['_leaf_id'].isin(leaf_ids_set)]
             st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
+            st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
                          use_container_width=True)
 with tabs[2]:
+    st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), use_container_width=True)
 with tabs[3]:
     _base = safe_name(project_name)
     st.subheader('Hierarchy Quality Evaluation')
     st.caption(
+        'No manually curated reference taxonomy is available for this experiment. '
+        'The metrics below are reference-free: they assess hierarchy structure, '
+        'label coherence and interpretability directly.'
     )
     with st.spinner('Computing reference-free metrics…'):
     s5.metric('Singleton nodes',   f"{sm['singleton_nodes_%']}%",
               help='Aggregation nodes with a single child (sparse-hierarchy indicator)')

version2/.gitignore ADDED Viewed

	@@ -0,0 +1,26 @@

+# Python
+__pycache__/
+*.pyc
+*.pyo
+.venv/
+venv/
+# Streamlit
+.streamlit/secrets.toml
+# Jupyter
+.ipynb_checkpoints/
+# OS / editor
+.DS_Store
+Thumbs.db
+.vscode/
+.idea/
+# Anaconda envs
+*.conda
+*.egg-info/
+# Temp
+~WRL*.tmp
+*.tmp

version2/Dockerfile ADDED Viewed

	@@ -0,0 +1,32 @@

+FROM python:3.11-slim
+# System deps (build tools for some wheels, curl for healthcheck)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential curl \
+    && rm -rf /var/lib/apt/lists/*
+# Run as non-root user (Hugging Face Spaces convention: uid 1000)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Install Python dependencies first (better layer caching)
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the app
+COPY --chown=user . .
+# Hugging Face Spaces expects the app on port 7860
+EXPOSE 7860
+# XSRF/CORS disabled so file uploads work behind the Hugging Face proxy/iframe
+# (otherwise the uploader returns "AxiosError: 403"). Standard for HF Spaces.
+CMD ["streamlit", "run", "demo.py", \
+     "--server.port=7860", "--server.address=0.0.0.0", \
+     "--server.enableXsrfProtection=false", \
+     "--server.enableCORS=false"]

version2/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2026 RoophaSharon
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

version2/README.md ADDED Viewed

	@@ -0,0 +1,205 @@

+---
+title: Metadata Hierarchy Explorer
+colorFrom: green
+colorTo: blue
+sdk: docker
+app_port: 7860
+pinned: false
+license: mit
+---
+# Metadata Hierarchy Construction — TFM
+Master's thesis prototype: automatic hierarchy construction from data-dictionary metadata.
+Three algorithms are implemented for comparison.
+## Live demo
+The deployed app opens on a **pre-built results viewer** (`demo.py`) showing the
+AI-MIND and HCP hierarchies for all three approaches — no upload needed. Use the
+method/dataset selectors in the main area to switch approach/dataset and the Level-of-Detail controls to adjust depth.
+To **build a hierarchy from your own CSV**, open the **Baseline**, **Approach 1**, or
+**Approach 2** page from the left sidebar and upload a file. (Approach 2's optional
+local-LLM label refinement runs only on a local machine with Ollama; in the cloud it
+falls back to the deterministic pipeline automatically.)
+## Approaches
+- **Baseline** — Pure clustering baseline. Plain TF-IDF / Word2Vec embeddings + hierarchical
+  clustering. Documented in `README_baseline.md`.
+- **Approach 1** — Global embedding pipeline. Uses SBERT + N×M concept-table alignment
+  (Gonçalves 2019) + HiExpan refinement (Shen et al. KDD 2018) + Castanet parallel facets.
+  Optionally retrieves concept context from Wikidata / Wikipedia / WordNet / BioPortal.
+- **Approach 2** — Dataset-constrained multi-aspect hierarchy. Algorithmic pipeline with no
+  domain hardcoding:
+  1. Group-anchored L1/L2 (from detected metadata column structure — BISE 2026)
+  2. Phrase-slot mining (IE-style slot induction) for descriptions with regular structure
+  3. **FASTopic** semantic aspect discovery (Wu et al. NeurIPS 2024) — replaces NMF
+  4. NMF lexical fallback for small groups
+  5. GMM + BIC for small clusters, MiniBatchKMeans + silhouette for large ones
+  6. Deterministic 5-stage label generation (description prefix → group anchor → IDF filter
+     → bigram-preferred TF-IDF → optional LLM refinement)
+  7. **Optional local-LLM label refinement** via Ollama + Qwen 2.5 (TopicTag pattern, DocEng
+     2024). Strict grounding check rejects labels not derived from CSV evidence. Per-node
+     provenance recorded.
+  8. TraCo-inspired hierarchy diagnostics (AAAI 2024)
+  No facet trees — single coherent LoD tree.
+See each script's "Method" tab in the running app for the full algorithm and paper references.
+## Paper stack
+| Component | Paper |
+|---|---|
+| Multi-aspect taxonomy scaffold | Zhu et al. 2025, EMNLP |
+| Canonical metadata text objects | Gonçalves et al. 2019, ESWC |
+| Semantic aspect discovery | Wu et al. 2024 (FASTopic), NeurIPS, arXiv:2405.17978 |
+| Phrase-slot mining | IE / slot-induction literature (ACM CSUR 2022) |
+| LLM label refinement pattern | Eren et al. 2024 (TopicTag), DocEng, arXiv:2407.19616 |
+| Local LLM (used for refinement) | Qwen Team 2024 (Qwen 2.5), arXiv:2412.15115 |
+| Hierarchy quality diagnostics | Wu et al. 2024 (TraCo), AAAI, arXiv:2401.14113 |
+| Group-anchored entry strategy | Motamedi, Novalija, Rei 2026, Springer BISE |
+| Multidimensional taxonomy motivation | Kargupta et al. 2025 (TaxoAdapt), ACL |
+| Future-work semantic consistency | SC-Taxo 2026, arXiv:2605.00620 |
+| Concept-label evaluation framework | Kejriwal et al. 2022 (TICL), EAAI |
+## Project layout
+```
+Hierarchy tool/
+├── baseline.py          # Pure clustering baseline (Streamlit app)
+├── approach_1.py        # Approach 1 (Streamlit app)
+├── approach_2.py        # Approach 2 (Streamlit app)
+├── approach_1.ipynb     # Approach 1 reproducible notebook
+├── approach_2.ipynb     # Approach 2 reproducible notebook
+├── baseline.ipynb       # Baseline reproducible notebook
+├── launcher.py          # Run all three apps simultaneously on different ports
+├── data/                # Sample input CSVs (AI-MIND, HCP, etc.)
+├── outputs/             # Generated hierarchies (JSON)
+└── requirements.txt
+```
+## Running locally
+### 1. Install Python dependencies
+```bash
+pip install -r requirements.txt
+```
+Python 3.10 or 3.11 recommended.
+### 2. (Approach 2 only) Install Ollama for the local-LLM label refinement layer
+**This is optional — Approach 2 produces deterministic labels without it.**  If you want
+the optional TopicTag-style LLM label refinement:
+1. Download and install Ollama from https://ollama.com/download
+2. Open Ollama once so the background service starts (icon in the system tray)
+3. Pull the recommended model:
+   ```bash
+   ollama pull qwen2.5:3b-instruct
+   ```
+   (For higher quality at higher RAM cost: `ollama pull qwen2.5:7b-instruct`.)
+4. Verify the server is reachable:
+   - In a browser open `http://localhost:11434/api/tags`
+   - Or run `ollama list`
+When Approach 2 starts it auto-detects Ollama and the "Refine labels with LLM" checkbox
+defaults to ON. Uncheck any time. The deterministic pipeline is the canonical thesis
+result; the LLM is an optional re-phraser of evidence already in the CSV.
+To override the default URL or model:
+```bash
+# Optional environment variables (Windows cmd syntax; on macOS/Linux use `export NAME=value`)
+set OLLAMA_URL=http://localhost:11434/v1
+set OLLAMA_MODEL=qwen2.5:3b-instruct
+```
+Or change them live in the Approach 2 sidebar.
+### 3. Run one app at a time
+```bash
+streamlit run baseline.py
+# or
+streamlit run approach_1.py
+# or
+streamlit run approach_2.py
+```
+Each opens at http://localhost:8501 by default.
+### 4. Run all three apps simultaneously (for side-by-side comparison)
+```bash
+python launcher.py
+```
+This opens three browser tabs:
+- http://localhost:8501 — Baseline
+- http://localhost:8502 — Approach 1
+- http://localhost:8503 — Approach 2
+Press **Enter** in the launcher terminal to stop all servers.
+## Using the apps
+1. Upload one or more metadata CSV / TSV / XLSX / JSON files in the main area of the build page.
+2. Confirm the auto-detected column roles (leaf / group / text / meta).
+3. Click **Build hierarchy**.
+4. Inspect the LoD tree, evaluation metrics, label provenance (Approach 2), and export JSON.
+Sample data is in `data/`:
+- `ai-mind-variable-descriptions(in).csv`
+- `HCP_S1200_DataDictionary_Oct_30_2023.csv`
+## Outputs
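+- **Baseline** exports one JSON file for visualization:
+  - `*_baseline_hierarchy.json` — primary LoD tree
+- **Approach 1** exports two JSON files for visualization:
+  - `*_approach1_hierarchy.json` — primary LoD tree
+  - `*_approach1_facets.json` — parallel Castanet facet trees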
+- **Approach 2** exports a single LoD JSON:
+  - `*_approach2_lod.json` — primary LoD tree (every aggregation node carries
+    `label_provenance` with source stage, confidence, and evidence terms)
+Filenames are derived from the uploaded CSV file name, so different CSVs export under
+different filenames into `outputs/approach_2/`.
+Existing output examples are in `outputs/baseline/`, `outputs/approach_1/`, and `outputs/approach_2/`.
+## Defensibility highlights for Approach 2
+- **No domain hardcoding.** Slot names, group anchors, and labels are all derived from the
+  detected metadata columns + the uploaded CSV — no hand-curated domain vocabulary.
+- **Deterministic by default.** Tree topology and all five label-generation stages are
+  reproducible from the input CSV alone. Local LLM is opt-in.
+- **Grounded LLM refinement.** Every LLM-proposed label must pass a strict grounding
+  check — every word in the label must appear in the extracted evidence. Failed proposals
+  are rejected and the deterministic label is used instead. Per-node provenance lets
+  you answer "did the LLM invent this?" with hard evidence.
+- **Local-only LLM.** Qwen 2.5 runs on the thesis machine via Ollama. No external API
+  calls, no third-party data sharing, no key management.
+## Troubleshooting
+| Symptom | Fix |
+|---|---|
+| `FASTopic not installed` warning | `pip install fastopic` (also installs `torch`) |
+| `openai` package missing | `pip install openai` |
+| `Ollama not reachable` in sidebar | Open the Ollama app from Start menu; the service runs in the system tray |
+| Model not found | `ollama pull qwen2.5:3b-instruct` |
+| Build very slow with LLM on | Expected for HCP — ~15–40 min on CPU with a 3B model. Disable LLM for fast iteration. |
+| `LLM-labeled nodes: 0/N` after build | The grounding check rejected every LLM proposal. Check the **Label Provenance** tab — counts under `llm_rejected = True` show what happened. |
+| Hierarchy too shallow | Increase `Max LoD tree depth` slider (top of sidebar in Approach 2) |
+## License
+Released under the MIT License (see `LICENSE`). The prototype was developed for thesis evaluation.

version2/approach_1.py ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/approach_2.py ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/baseline.py ADDED Viewed

	@@ -0,0 +1,1086 @@

+# baseline.py — Metadata Hierarchy Builder — Baseline (Taxonomizer)
+#
+# Baseline = Taxonomizer (Mahmood & Mueller, IEEE TVCG 2019), semantic-space
+# pipeline, adapted to a metadata-only setting.  No hardcoded domain patterns.
+#
+# Pipeline:
+#   1. Load metadata file (CSV / TSV / XLSX / JSON)
+#   2. Detect column roles (leaf / context / text / meta) — same as Approach 1 / 2
+#   3. Build canonical schema (incl. _semantic_text = description values only)
+#   4. Embed each variable's short attribute NAME (the name clause of its
+#      description) by averaging pre-trained word vectors, giving the
+#      semantic space [TAX §3.2]
+#   5. Recursively cluster (Ward-linkage agglomerative on the L2-normalised
+#      vectors) into the dendrogram taxonomy; internal-node labels =
+#      data-driven contrastive terms of each cluster
+#   6. Visualise (Sunburst / Treemap / Node-link)
+#   7. Export visualization-ready JSON + canonical CSV
+#
+# Paper & justified adaptations (metadata/schema setting, fully automatic):
+#   [TAX] Mahmood & Mueller — Taxonomizer, IEEE TVCG 2019.
+#         Builds a SEMANTIC space (cosine over word2vec skip-gram embeddings of
+#         attribute names; gensim, Wikipedia, window=5, dim=128) merged with a
+#         DATA space (correlation over raw values), clustered into a dendrogram;
+#         inner nodes labelled semi-automatically by distributional degree-of-
+#         entailment + WordNet synonyms.
+#   Adaptations (all documented):
+#     1. No DATA space — a schema/dictionary has no raw values, so we use the
+#        semantic space alone (Taxonomizer with semantic weight = 1.0).
+#     2. Embed the attribute's short NAME (the description's name clause), since
+#        the bare code goes out-of-vocabulary (a limitation the paper flags,
+#        e.g. "BP").  Taxonomizer embeds the NAME ("a few words"), not a
+#        paragraph; using the short name (not the full description prose) keeps
+#        domain-specific words from being diluted by shared explanatory text.
+#     3. Fully-automatic labels — the paper's labelling is semi-automatic
+#        (human picks from suggestions); a baseline must be non-interactive, so
+#        we use data-driven contrastive terms from each cluster's members.
+#
+# Dependencies: gensim
+#   pip install gensim
+from __future__ import annotations
+import csv, json, re, warnings
+from collections import Counter, defaultdict
+from pathlib import Path
+import tempfile
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+import streamlit as st
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, silhouette_score
+from sklearn.preprocessing import LabelEncoder
+# Suppress every Python warning for the whole process (no category filter).
+warnings.filterwarnings('ignore')
+# Page chrome: these Streamlit calls execute at module top level, i.e. each
+# time the script runs.
+st.set_page_config(page_title='Metadata Hierarchy — Baseline', layout='wide')
+st.title('Metadata Hierarchy Builder — Baseline (Taxonomizer)')
+st.caption(
+    'Taxonomizer baseline [Mahmood & Mueller, IEEE TVCG 2019]: Word2Vec skip-gram '
+    'semantic space (short attribute names) + balanced Ward agglomerative clustering '
+    'into the dendrogram taxonomy; nodes labelled by data-driven contrastive terms. '
+    'Semantic space only (no raw data values); no hardcoded patterns, no external APIs.'
+)
+# ─────────────────────────────────────────────────────────────────────────────
+# CONSTANTS
+# ─────────────────────────────────────────────────────────────────────────────
+# Keyword lists used to score column names per role.  kscore() counts how many
+# of these occur as SUBSTRINGS of the normalised column name, so short entries
+# (e.g. 'id', 'var') also match inside longer words.
+LEAF_KEYS  = 'variable var field column attribute name code id item indicator question measure concept'.split()
+GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split()
+TEXT_KEYS  = 'description definition desc label title question meaning note notes text display full details explanation comment'.split()
+META_KEYS  = 'type dtype data_type datatype unit units format decimal precision values value coding codebook range min max scale'.split()
+# URL pattern — strip embedded links (e.g. HCP FreeSurfer NeuroLex URLs) so web
+# tokens cannot dominate the embedding or the cluster label.  [shared with A1]
+# Matches http(s):// links, www.* links, and bare host names ending in
+# .org/.com/.net/.gov/.edu together with any trailing path.
+_URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
+                     re.IGNORECASE)
+# ─────────────────────────────────────────────────────────────────────────────
+# FILE LOADING
+# ─────────────────────────────────────────────────────────────────────────────
+def safe_name(name: str) -> str:
+    return ''.join(ch if ch.isalnum() or ch in '-_.' else '_' for ch in name)
+def try_read_csv(path: Path) -> pd.DataFrame:
+    """Read a delimited text file by trying several encodings and separators.
+
+    Every (encoding, separator) combination that parses is scored as
+    ``n_columns * 10 - mean NaN fraction``; the first frame to reach the highest
+    score is kept.  If that frame looks like whole rows packed into its first
+    column, the file is re-split line by line on commas.  Column names are
+    stripped and have ';' removed.
+
+    Raises:
+        ValueError: if no (encoding, separator) combination could be parsed.
+    """
+    best, best_score = None, -1
+    for enc in ['utf-8-sig', 'utf-8', 'latin1']:
+        # sep=None asks the python engine to sniff the delimiter itself
+        for sep in [None, ',', '\t', ';', '|']:
+            try:
+                df = pd.read_csv(path, sep=sep, engine='python', encoding=enc)
+                # more columns wins; the NaN fraction only breaks ties
+                score = df.shape[1] * 10 - float(df.isna().mean().mean())
+                if score > best_score:  # strict '>' keeps the earliest best candidate
+                    best, best_score = df, score
+            except Exception:
+                pass  # best-effort: a failing combination is simply not a candidate
+    if best is None:
+        raise ValueError(f'Could not read {path.name}')
+    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
+    # Repair comma-packed rows (AI-Mind format)
+    if len(best) > 0:
+        first = best.iloc[:, 0].astype(str)
+        other_null = best.iloc[:, 1:].isna().mean().mean() if best.shape[1] > 1 else 1.0
+        # trigger: >50% of first-column cells contain a comma AND the remaining
+        # columns are >70% empty (or there are no other columns)
+        if first.str.contains(',').mean() > 0.50 and other_null > 0.70:
+            lines = path.read_text(encoding='utf-8-sig', errors='replace').splitlines()
+            if lines:
+                header = [h.strip().replace(';', '') for h in lines[0].split(',')]
+                rows = []
+                for line in lines[1:]:
+                    line = line.strip().rstrip(';')
+                    if not line:
+                        continue
+                    # unwrap a line that is quoted as a whole
+                    if line.startswith('"') and line.endswith('"'):
+                        line = line[1:-1]
+                    try:
+                        parts = next(csv.reader([line], quotechar='"'))
+                    except Exception:
+                        continue
+                    # rows with too few fields are dropped; extra fields are cut off
+                    if len(parts) >= len(header):
+                        rows.append(parts[:len(header)])
+                if rows:
+                    best = pd.DataFrame(rows, columns=header)
+    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
+    return best
+def load_any(path: Path) -> pd.DataFrame:
+    s = path.suffix.lower()
+    if s in ['.csv', '.tsv', '.txt']:
+        return try_read_csv(path)
+    if s in ['.xlsx', '.xls']:
+        return pd.read_excel(path)
+    if s == '.json':
+        obj = json.loads(path.read_text(encoding='utf-8', errors='replace'))
+        if isinstance(obj, list):
+            return pd.json_normalize(obj)
+        if isinstance(obj, dict):
+            for v in obj.values():
+                if isinstance(v, list):
+                    return pd.json_normalize(v)
+    raise ValueError(f'Unsupported file type: {s}')
+def save_upload(f) -> Path:
+    tmp = Path(tempfile.mkdtemp(prefix='baseline_'))
+    p = tmp / safe_name(f.name)
+    p.write_bytes(f.getbuffer())
+    return p
+# ─────────────────────────────────────────────────────────────────────────────
+# ROLE DETECTION  [GON]
+# ─────────────────────────────────────────────────────────────────────────────
+def norm(c: str) -> str:
+    return re.sub(r'[^a-z0-9]+', '_', str(c).strip().lower()).strip('_')
+def kscore(c: str, keys: list) -> int:
+    nc = norm(c)
+    return sum(1 for k in keys if k in nc)
+def profile_columns(df: pd.DataFrame) -> pd.DataFrame:
+    out = []
+    n = max(len(df), 1)
+    for col in df.columns:
+        s = df[col]
+        non = float(s.notna().mean())
+        nun = int(s.nunique(dropna=True))
+        ur  = nun / n
+        avg = float(s.dropna().astype(str).map(len).mean()) if s.notna().any() else 0
+        out.append({
+            'column':         str(col),
+            'non_null':       round(non, 3),
+            'unique_values':  nun,
+            'unique_ratio':   round(ur, 3),
+            'avg_length':     round(avg, 1),
+            'leaf_score':     4*kscore(col, LEAF_KEYS)  + (3 if 0.5 <= ur <= 1 else 0) + (1 if avg < 80 else 0),
+            'group_score':    4*kscore(col, GROUP_KEYS) + (3 if 1 < nun < min(n*0.5, 80) else 0) + (1 if avg < 60 else 0),
+            'text_score':     5*kscore(col, TEXT_KEYS)  + (4 if avg > 50 else 0) + (1 if non > 0.5 else 0),
+            'metadata_score': 4*kscore(col, META_KEYS)  + (2 if 1 < nun < min(n*0.8, 100) else 0),
+        })
+    return pd.DataFrame(out)
+def detect_roles(df: pd.DataFrame) -> tuple:
+    """Auto-detect column roles.  Identical logic to Approach 1 / 2 so the
+    preprocessing up to the canonical table is comparable across all apps.
+
+    Returns a 2-tuple ``(roles, prof)``: ``roles`` is a dict with the keys
+    'leaf_cols', 'group_cols', 'text_cols' and 'metadata_cols' (lists of column
+    names); ``prof`` is the per-column profile table from profile_columns().
+    """
+    prof  = profile_columns(df)
+    # exactly one leaf column: best leaf_score, ties broken by unique_ratio
+    leaf  = prof.sort_values(['leaf_score', 'unique_ratio'], ascending=False).head(1)['column'].tolist()
+    # text columns: decent text score or long values; falls back to the leaf column
+    text  = (prof[(prof.text_score >= 4) | (prof.avg_length > 80)]
+             .sort_values('text_score', ascending=False)['column'].tolist()) or leaf.copy()
+    # at most three grouping columns, never the leaf and never constant columns
+    group = (prof[(prof.group_score >= 4) & (~prof.column.isin(leaf)) & (prof.unique_values > 1)]
+             .sort_values('group_score', ascending=False)['column'].head(3).tolist())
+    # at most five metadata columns not already claimed by another role
+    meta  = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
+             .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
+    # Representation columns (decimal/precision/unit/type/format/…) must never
+    # become structural levels; prefer them as metadata. [GON][TAX]
+    _META_SUBSTR_BLOCK = {
+        'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
+        'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
+    }
+    def _is_repr(col_name):
+        # substring test on the column name with all non-alphanumerics removed
+        nc = re.sub(r'[^a-z0-9]', '', str(col_name).lower())
+        return any(sub in nc for sub in _META_SUBSTR_BLOCK)
+    meta_extra = [c for c in prof['column'].tolist()
+                  if _is_repr(c) and c not in text and c not in leaf and c not in meta]
+    group = [c for c in group if not _is_repr(c)]
+    # merge, de-duplicate in order, and cap the metadata list at eight columns
+    meta  = list(dict.fromkeys(meta + meta_extra))[:8]
+    return {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'metadata_cols': meta}, prof
+# ─────────────────────────────────────────────────────────────────────────────
+# CANONICAL SCHEMA  [GON]
+# ─────────────────────────────────────────────────────────────────────────────
+def sv(x) -> str:
+    return '' if pd.isna(x) else str(x).strip()
+def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
+    leaf_cols  = cfg.get('leaf_cols', [])
+    group_cols = cfg.get('group_cols', [])
+    text_cols  = cfg.get('text_cols', [])
+    meta_cols  = cfg.get('metadata_cols', [])
+    rows = []
+    for i, row in df.iterrows():
+        leaf_parts  = [sv(row.get(c, '')) for c in leaf_cols]
+        leaf_parts  = [p for p in leaf_parts if p]
+        label       = ' / '.join(leaf_parts) if leaf_parts else f'variable_{i+1}'
+        group_parts = [sv(row.get(c, '')) for c in group_cols]
+        group_parts = [p for p in group_parts if p and p.lower() not in ['nan', 'none']]
+        gpath       = ' > '.join(group_parts) if group_parts else 'Ungrouped'
+        parts = []
+        for c in list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols)):
+            v = sv(row.get(c, ''))
+            if v:
+                parts.append(f'{c}: {v}')
+        text = ' | '.join(parts) if parts else label
+        # _semantic_text: description VALUES only — no "fieldname:" prefixes, no
+        # other fields, URLs stripped.  This is the clean text Taxonomizer embeds
+        # (the attribute's meaning), identical in spirit to Approach 1's column.
+        sem_parts = [sv(row.get(c, '')) for c in text_cols]
+        sem_parts = [p for p in sem_parts if p]
+        if not sem_parts:
+            sem_parts = list(leaf_parts)
+        semantic = _URL_RE.sub(' ', ' '.join(sem_parts)) if sem_parts else label
+        rows.append({
+            '_source_file':   source,
+            '_row_index':     int(i),
+            '_leaf_label':    label,
+            '_leaf_id':       f'{gpath}.{label}' if gpath != 'Ungrouped' else label,
+            '_group_path':    gpath,
+            '_text':          text,
+            '_semantic_text': semantic,
+        })
+    can = pd.DataFrame(rows)
+    if can['_leaf_id'].duplicated().any():
+        cnt: dict = defaultdict(int)
+        ids = []
+        for lid in can['_leaf_id']:
+            cnt[lid] += 1
+            ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
+        can['_leaf_id'] = ids
+    return can
+# ─────────────────────────────────────────────────────────────────────────────
+# TAXONOMIZER CORE  [TAX — Mahmood & Mueller, IEEE TVCG 2019]
+#
+# Taxonomizer builds the taxonomy from a SEMANTIC SPACE (cosine distance between
+# word2vec skip-gram embeddings of attribute names) merged with a DATA SPACE
+# (correlation over the raw values).  In a metadata/schema setting we have no
+# raw data values, so we use the semantic space alone (= Taxonomizer with
+# semantic weight 1.0).  Because the attribute codes here are opaque and go
+# out-of-vocabulary — a limitation the paper explicitly flags (e.g. "BP") — we
+# embed the short name clause of each description, so real words carry the
+# meaning (OOV tokens are skipped during averaging).  Internal-node labels:
+# the paper uses semi-automatic
+# distributional degree-of-entailment + WordNet synonyms; a baseline must be
+# fully automatic, so we use data-driven contrastive terms drawn from the data.
+# ─────────────────────────────────────────────────────────────────────────────
+# English stop words that _tokenize() drops before tokens are embedded or
+# counted for cluster labels.
+_W2V_STOP = frozenset(
+    'a an the and or but if in on at to of for with by is are was were be '
+    'been being have has had do does did will would could should may might '
+    'shall can this that these those i you he she it we they me him her us '
+    'them my your his her its our their what which who whom when where why '
+    'how all each every few more most other some such no not only same so '
+    'than too very just because as until while'.split()
+)
+@st.cache_resource(show_spinner=False)
+def _load_w2v():
+    """Load pre-trained Word2Vec / GloVe model via gensim downloader.
+    We prefer glove-wiki-gigaword-100 (~66 MB) because its Wikipedia training
+    corpus and skip-gram-style objective most closely match Taxonomizer's
+    described word2vec-Wikipedia-dim128 model.
+    """
+    try:
+        import gensim.downloader as api
+        return api.load('glove-wiki-gigaword-100')
+    except Exception as e:
+        st.error(
+            f'Could not load Word2Vec model: {e}\n\n'
+            'Run:  pip install gensim  and restart the app.\n'
+            'The model (~66 MB) is downloaded automatically on first use.'
+        )
+        return None
+def _tokenize(label: str) -> list[str]:
+    return [t for t in re.sub(r'[^a-zA-Z]+', ' ', label).lower().split()
+            if len(t) > 2 and t not in _W2V_STOP]
+def attribute_name(text: str) -> str:
+    """The attribute's short NAME — what Taxonomizer actually embeds [TAX §3.2].
+    The paper embeds the attribute name ("not more than a few words long"), not a
+    paragraph.  Descriptions here are formatted '<name>: <full sentence>' (some
+    prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
+    that is not a pure all-caps marker.  Embedding this short name — rather than
+    the full description prose — keeps the domain-specific words from being
+    diluted by shared explanatory text, so the taxonomy clusters more by theme
+    (e.g. DMS / PAL / SWM).
+    """
+    text = str(text)
+    for clause in re.split(r'[:\n]', text):
+        clause = clause.strip()
+        if clause and not all(2 <= len(w) <= 6 and w.isupper() for w in clause.split()):
+            return clause
+    return text.strip()
+def embed_labels_w2v(labels: list[str], model) -> np.ndarray:
+    """Average Word2Vec vectors for each label's tokens [TAX §4.1].
+    Falls back to a zero vector for labels where none of the tokens are in the
+    model vocabulary (rare for standard English attribute names).
+    """
+    dim = model.vector_size
+    out = np.zeros((len(labels), dim), dtype=np.float32)
+    for i, label in enumerate(labels):
+        toks = _tokenize(label)
+        vecs = [model[t] for t in toks if t in model]
+        if vecs:
+            out[i] = np.mean(vecs, axis=0)
+    # L2-normalise so cosine distance = 1 - dot
+    norms = np.linalg.norm(out, axis=1, keepdims=True)
+    norms[norms == 0] = 1.0
+    return out / norms
+def _cluster(X: np.ndarray, k: int) -> np.ndarray:
+    """Ward-linkage agglomerative cut into k clusters.
+    Ward (on the L2-normalised embedding vectors, where Euclidean ∝ √cosine)
+    minimises within-cluster variance and so produces *balanced* clusters.
+    This avoids the average/single-linkage chaining pathology that otherwise
+    peels off tiny clusters and leaves one giant residual (i.e. no real
+    hierarchy forms).
+    """
+    return AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(X)
+def best_k(X: np.ndarray, n: int, k_min: int = 2, k_max: int = 8) -> int:
+    """Pick the number of clusters that maximises the silhouette score.
+    Fully data-driven — no fixed cluster count.  Returns 1 only when the node
+    is too small to split (n <= k_min).
+    """
+    k_hi = min(k_max, n - 1)
+    if k_hi < k_min:
+        return 1
+    best, best_s = 1, -1.0
+    for k in range(k_min, k_hi + 1):
+        labels = _cluster(X, k)
+        if len(set(labels)) < 2:
+            continue
+        try:
+            s = silhouette_score(X, labels)
+        except Exception:
+            continue
+        if s > best_s:
+            best_s, best = s, k
+    return best
+def _doc_freq(texts: list[str]) -> Counter:
+    """Document frequency: how many member texts each content word appears in."""
+    c: Counter = Counter()
+    for t in texts:
+        for w in set(_tokenize(t)):
+            c[w] += 1
+    return c
+def cluster_term_label(member_texts: list[str], sibling_texts: list[str],
+                       used: set, vocab=None, top_n: int = 2) -> str:
+    """Label a node with the content words most characteristic of its members.
+    Data-driven labelling: each candidate word is scored by how much more
+    frequent it is *inside* the cluster than in the sibling pool (contrastive
+    document frequency), so labels are domain terms drawn from the dataset
+    itself — not external ontology words.  This replaces Taxonomizer's
+    WordNet degree-of-entailment, which produces over-general, off-domain
+    abstractions on specialised scientific metadata.
+    If `vocab` is given (the Word2Vec model), only real dictionary words are
+    eligible, so opaque attribute codes (e.g. 'dms', 'motml') are filtered out
+    of labels.  Codes are used only as a last-resort fallback.
+    """
+    def in_vocab(w: str) -> bool:
+        return vocab is None or w in vocab
+    n_in  = max(len(member_texts), 1)
+    n_out = max(len(sibling_texts), 1)
+    cin   = _doc_freq(member_texts)
+    cout  = _doc_freq(sibling_texts)
+    scores: dict[str, float] = {}
+    for w, f in cin.items():
+        if w in used or len(w) <= 2 or not in_vocab(w):
+            continue
+        p_in  = f / n_in
+        p_out = cout.get(w, 0) / n_out
+        # ignore single-occurrence noise unless the term is widely shared
+        if f < 2 and p_in < 0.5:
+            continue
+        scores[w] = p_in - p_out
+    picks = [w for w, _ in sorted(scores.items(), key=lambda x: -x[1])[:top_n]
+             if scores[w] > 0]
+    if not picks:
+        # fallback: most frequent shared real word, then any shared token
+        for require_vocab in (True, False):
+            for w, _ in cin.most_common():
+                if w not in used and len(w) > 2 and (not require_vocab or in_vocab(w)):
+                    picks = [w]
+                    break
+            if picks:
+                break
+    return ' / '.join(p.title() for p in picks) if picks else 'Group'
+# ─────────────────────────────────────────────────────────────────────────────
+# HIERARCHY CONSTRUCTION  [TAX + GON]
+# ─────────────────────────────────────────────────────────────────────────────
+def _nmap(nodes: list) -> dict:
+    return {int(n['id']): n for n in nodes}
+def _next_id(nodes: list) -> int:
+    return max((int(n['id']) for n in nodes), default=0) + 1
+def _add_child(nodes: list, parent_id: int, child_id: int):
+    m = _nmap(nodes)
+    p = m.get(int(parent_id))
+    if p is None:
+        return
+    rel = list(p.get('related', []))
+    if int(child_id) not in rel:
+        rel.append(int(child_id))
+    p['related'] = rel
+def _make_agg(nid: int, name: str, desc: str = '') -> dict:
+    return {'id': int(nid), 'name': str(name), 'related': [],
+            'type': 'aggregation', 'isShown': True, 'desc': desc, 'dtype': 'determine'}
+def _leaf_ids(nodes: list, nid: int) -> list:
+    m = _nmap(nodes)
+    out: list = []
+    def rec(x):
+        n = m.get(int(x))
+        if not n:
+            return
+        if n.get('type') == 'attribute':
+            out.append(int(x))
+            return
+        for c in n.get('related', []):
+            rec(int(c))
+    rec(nid)
+    return list(dict.fromkeys(out))
+def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
+                    max_depth: int = 3, min_cluster_size: int = 6,
+                    branch_max: int = 8) -> list:
+    """Taxonomizer semantic-space construction [TAX].
+    Embeds each variable from its short attribute NAME (Word2Vec skip-gram
+    average) — the name clause of the description, as Taxonomizer specifies.
+    Recursively clusters via balanced Ward linkage — the semantic-space
+    dendrogram.  Labels each internal node with the contrastive content terms of
+    its members (data-driven, fully automatic). No hardcoding.
+    """
+    # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
+    nodes: list = [{'id': 0, 'name': project, 'type': 'root',
+                    'dtype': 'root', 'isShown': True, 'related': [], 'desc': 'Root node'}]
+    row_to_node: list = []
+    embed_list: list[str] = []    # short attribute name → embedding input + labels
+    for i, (_, r) in enumerate(can.iterrows(), start=1):
+        sem  = str(r.get('_semantic_text', '') or r['_leaf_label'])
+        name = attribute_name(sem) or str(r['_leaf_label'])
+        nodes.append({'id': i, 'name': r['_leaf_label'], 'dtype': 'determine',
+                      'related': [], 'isShown': True, 'type': 'attribute',
+                      'desc': r['_text'],
+                      'metadata': {'leaf_id': r['_leaf_id'], 'group_path': r['_group_path']}})
+        row_to_node.append(i)
+        embed_list.append(name)
+    label_list = embed_list
+    row_to_node = np.array(row_to_node)
+    # ── Word2Vec semantic-space embeddings [TAX §3.2] ─────────────────────────
+    emb = embed_labels_w2v(embed_list, w2v_model)   # (N, dim), L2-normalised
+    # ── recursive clustering down the Ward dendrogram ─────────────────────────
+    def attach_leaves(parent_id: int, idx: np.ndarray):
+        for i in idx:
+            _add_child(nodes, parent_id, int(row_to_node[i]))
+    def recurse(parent_id: int, idx: np.ndarray, depth: int, used: set):
+        n = len(idx)
+        if n <= min_cluster_size or depth >= max_depth:
+            attach_leaves(parent_id, idx)
+            return
+        sub = emb[idx]
+        k_cap = min(branch_max, n - 1)
+        # Branching floor: a node with n leaves and `remaining` levels left must
+        # fan out enough to fit all its leaves into buckets of ~min_cluster_size
+        # by the depth cap, i.e. k >= (n / min_cluster_size) ** (1/remaining).
+        # Without this, silhouette keeps picking k=2 on overlapping data (e.g.
+        # HCP), giving a near-binary tree that dumps ~100 leaves per bottom node.
+        remaining = max(1, max_depth - depth)
+        k_floor = int(np.ceil((n / max(min_cluster_size, 1)) ** (1.0 / remaining)))
+        k_floor = max(2, min(k_floor, k_cap))
+        k = best_k(sub, n, k_min=k_floor, k_max=k_cap)
+        if k <= 1:
+            k = min(k_floor, k_cap) if n > min_cluster_size else 1
+        if k <= 1:
+            attach_leaves(parent_id, idx)
+            return
+        cluster_labels = _cluster(sub, k)
+        for c in range(k):
+            mask    = cluster_labels == c
+            members = idx[mask]
+            if len(members) == 0:
+                continue
+            if len(members) == 1:           # don't create singleton internal nodes
+                _add_child(nodes, parent_id, int(row_to_node[members[0]]))
+                continue
+            mset = set(members.tolist())
+            member_texts  = [label_list[i] for i in members]
+            sibling_texts = [label_list[i] for i in idx if i not in mset]
+            # data-driven contrastive-term labelling
+            label = cluster_term_label(member_texts, sibling_texts, used)
+            nid = _next_id(nodes)
+            nodes.append(_make_agg(nid, label,
+                                   desc=f'Cluster of {len(members)} variables — '
+                                        f'label terms: {label}'))
+            _add_child(nodes, parent_id, nid)
+            recurse(nid, members, depth + 1, used | {label.lower()})
+    recurse(0, np.arange(len(can)), 0, set())
+    for n in nodes:
+        n['related'] = list(dict.fromkeys(int(x) for x in n.get('related', [])))
+    return nodes
+# ─────────────────────────────────────────────────────────────────────────────
+# VISUALISATION
+# ─────────────────────────────────────────────────────────────────────────────
+def _parent_map(nodes: list) -> dict:
+    pm: dict = {}
+    for n in nodes:
+        for c in n.get('related', []):
+            if int(c) not in pm:
+                pm[int(c)] = int(n['id'])
+    return pm
+# ─────────────────────────────────────────────────────────────────────────────
+# EVALUATION HELPERS
+# ─────────────────────────────────────────────────────────────────────────────
+def _eval_cluster_assignments(nodes: list, can: pd.DataFrame) -> list[int]:
+    """Return predicted cluster id (depth-1 aggregation ancestor) for each row in can."""
+    pm = _parent_map(nodes)
+    def depth1(nid: int) -> int:
+        # Walk up until our parent is root (id==0) or we have no parent
+        while pm.get(nid, -1) not in (-1, 0):
+            nid = pm[nid]
+        return nid
+    lid_to_nid = {n['metadata']['leaf_id']: int(n['id'])
+                  for n in nodes if n.get('type') == 'attribute' and 'metadata' in n}
+    return [depth1(lid_to_nid[lid]) if lid in lid_to_nid else -1
+            for lid in can['_leaf_id']]
+def _purity(y_true, y_pred) -> float:
+    from collections import Counter
+    clusters: dict = {}
+    for t, p in zip(y_true, y_pred):
+        clusters.setdefault(p, []).append(t)
+    correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values())
+    return correct / max(len(y_true), 1)
+def _structural_stats(nodes: list) -> dict:
+    pm = _parent_map(nodes)
+    def depth_of(nid: int) -> int:
+        d = 0
+        while nid in pm:
+            nid = pm[nid]; d += 1
+        return d
+    agg   = [n for n in nodes if n.get('type') == 'aggregation']
+    leafs = [n for n in nodes if n.get('type') == 'attribute']
+    depths   = [depth_of(int(n['id'])) for n in leafs]
+    branches = [len(n.get('related', [])) for n in agg]
+    singletons = sum(1 for b in branches if b == 1)
+    return {
+        'n_aggregation_nodes':  len(agg),
+        'max_depth':            int(max(depths, default=0)),
+        'avg_leaf_depth':       round(float(np.mean(depths)), 2) if depths else 0.0,
+        'avg_branching_factor': round(float(np.mean(branches)), 2) if branches else 0.0,
+        'singleton_nodes_%':    round(100.0 * singletons / max(len(agg), 1), 1),
+    }
+def _wrap(text: str, width: int = 70) -> str:
+    """Wrap long hover text onto multiple <br> lines so it never runs off-screen."""
+    import textwrap
+    text = str(text).replace('<', '&lt;')
+    lines: list = []
+    for para in text.split('\n'):
+        wrapped = textwrap.wrap(para, width=width) or ['']
+        lines.extend(wrapped)
+    return '<br>'.join(lines)
+def plot_sunburst(nodes: list, max_depth: int = 4) -> go.Figure:
+    pm = _parent_map(nodes)
+    ids, labels, parents, values, hover = [], [], [], [], []
+    for n in nodes:
+        nid = int(n['id'])
+        lc  = len(_leaf_ids(nodes, nid))
+        ids.append(str(nid))
+        labels.append(str(n.get('name', ''))[:40])
+        parents.append('' if nid == 0 else str(pm.get(nid, 0)))
+        values.append(max(1, lc))
+        desc = _wrap(n.get('desc', ''))
+        hover.append(f'<b>{_wrap(n.get("name",""))}</b><br>Type: {n.get("type","")}'
+                     f'<br>Variables: {lc}<br><br>{desc}')
+    fig = go.Figure(go.Sunburst(
+        ids=ids, labels=labels, parents=parents, values=values,
+        branchvalues='total', hovertext=hover, hoverinfo='text',
+        maxdepth=max_depth, insidetextorientation='radial',
+        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
+    ))
+    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
+                      title='Click a sector to drill down — click centre to go back')
+    return fig
+def plot_treemap(nodes: list) -> go.Figure:
+    pm = _parent_map(nodes)
+    ids, labels, parents, values, hover = [], [], [], [], []
+    for n in nodes:
+        nid = int(n['id'])
+        lc  = len(_leaf_ids(nodes, nid))
+        ids.append(str(nid))
+        labels.append(str(n.get('name', ''))[:40])
+        parents.append('' if nid == 0 else str(pm.get(nid, 0)))
+        values.append(max(1, lc))
+        desc = _wrap(n.get('desc', ''))
+        hover.append(f'<b>{_wrap(n.get("name",""))}</b><br>Variables: {lc}<br>{desc}')
+    fig = go.Figure(go.Treemap(
+        ids=ids, labels=labels, parents=parents, values=values,
+        branchvalues='total', hovertext=hover, hoverinfo='text',
+        textinfo='label+value',
+        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
+    ))
+    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
+    return fig
+# ─────────────────────────────────────────────────────────────────────────────
+# NODE-LINK TREE  (Reingold–Tilford layout — matches Approach 1 / 2 interface)
+# ─────────────────────────────────────────────────────────────────────────────
+def _bl_node_color(n: dict) -> str:
+    t = n.get('type', '')
+    if t == 'root':      return '#2a7d2a'
+    if t == 'attribute': return '#74c476'
+    if t == 'collapsed': return '#bbbbbb'
+    return '#238b45'
+def _display_graph(nodes: list, max_depth: int = 4):
+    """Walk the tree to the chosen depth, inserting 'collapsed' placeholders for
+    branches cut off below max_depth (the Level-of-Detail control).
+
+    Returns ``(display_nodes, edges)``: the visited node dicts plus the
+    synthetic placeholder nodes, and a list of ``(parent_id, child_id)`` pairs.
+    """
+    m = _nmap(nodes)
+    dnodes: dict = {}
+    edges: list  = []
+    # placeholder ids count up from 10**9 + 1, presumably to stay clear of real
+    # node ids — NOTE(review): confirm no real id can reach this range
+    counter = 10 ** 9
+    def rec(nid, depth):
+        nonlocal counter
+        n = m.get(int(nid))
+        if not n:
+            return
+        dnodes[int(nid)] = n
+        # at the depth limit a node that still has children gets a single
+        # placeholder child summarising the hidden leaves; descent stops here
+        if depth >= max_depth and n.get('related'):
+            counter += 1
+            cid = counter
+            n_leaves = len(_leaf_ids(nodes, nid))
+            dnodes[cid] = {'id': cid, 'name': f'… {n_leaves} variables',
+                           'type': 'collapsed', 'related': [],
+                           'desc': f"Collapsed: {n.get('name')}", 'isShown': True}
+            edges.append((int(nid), cid))
+            return
+        for c in n.get('related', []):
+            if int(c) not in m:  # skip dangling child ids
+                continue
+            edges.append((int(nid), int(c)))
+            rec(int(c), depth + 1)
+    rec(0, 0)
+    return list(dnodes.values()), edges
+def _positions(edges: list):
+    """Reingold–Tilford style positions: x = depth, y = subtree-aware vertical."""
+    H_SCALE, V_SPACE = 3.0, 1.8
+    children: dict = defaultdict(list)
+    for p, c in edges:
+        children[p].append(c)
+    pos: dict = {}
+    counter = {'v': 0}
+    def rec(nid, depth):
+        ch = children.get(nid, [])
+        if not ch:
+            y = counter['v'] * V_SPACE
+            counter['v'] += 1
+            pos[nid] = (depth * H_SCALE, y)
+            return y
+        y = float(np.mean([rec(c, depth + 1) for c in ch]))
+        pos[nid] = (depth * H_SCALE, y)
+        return y
+    rec(0, 0)
+    return pos
+def plot_node_link(nodes: list, max_depth: int = 4, show_leaf_labels: bool = False) -> go.Figure:
+    """Node-link tree with elbow edges. Best for inspecting structure at moderate
+    depth; Sunburst is recommended for large hierarchies (Taxonomizer)."""
+    dnodes, edges = _display_graph(nodes, max_depth)
+    pos = _positions(edges)
+    ex, ey = [], []
+    for p, c in edges:
+        if p not in pos or c not in pos:
+            continue
+        x0, y0 = pos[p]; x1, y1 = pos[c]
+        xm = (x0 + x1) / 2
+        ex += [x0, xm, xm, x1, None]
+        ey += [y0, y0, y1, y1, None]
+    traces = [go.Scatter(x=ex, y=ey, mode='lines',
+                         line=dict(width=1, color='#c8c8c8'),
+                         hoverinfo='skip', showlegend=False)]
+    agg_x, agg_y, agg_l, agg_c, agg_h = [], [], [], [], []
+    lf_x,  lf_y,  lf_l,  lf_c,  lf_h  = [], [], [], [], []
+    for n in dnodes:
+        nid = int(n['id'])
+        if nid not in pos:
+            continue
+        x, y = pos[nid]
+        lc   = len(_leaf_ids(nodes, nid))
+        lab  = str(n.get('name', nid))
+        htxt = (f"<b>{_wrap(n.get('name',''))}</b><br>Type: {n.get('type','')}"
+                f"<br>Variables: {lc}<br><br>{_wrap(n.get('desc',''))}")
+        col  = _bl_node_color(n)
+        if n.get('type') in ('root', 'aggregation', 'collapsed'):
+            agg_x.append(x); agg_y.append(y)
+            agg_l.append((lab + (f' ({lc})' if lc else ''))[:50])
+            agg_c.append(col); agg_h.append(htxt)
+        else:
+            lf_x.append(x); lf_y.append(y)
+            lf_l.append(lab[:40] if show_leaf_labels else '')
+            lf_c.append(col); lf_h.append(htxt)
+    if agg_x:
+        traces.append(go.Scatter(
+            x=agg_x, y=agg_y, mode='markers+text', text=agg_l,
+            textposition='middle right', hovertext=agg_h, hoverinfo='text',
+            marker=dict(size=16, color=agg_c, line=dict(color='white', width=2)),
+            showlegend=False))
+    if lf_x:
+        traces.append(go.Scatter(
+            x=lf_x, y=lf_y, mode='markers+text', text=lf_l,
+            textposition='middle right', hovertext=lf_h, hoverinfo='text',
+            marker=dict(size=7, color=lf_c, symbol='circle', opacity=0.75,
+                        line=dict(color='white', width=1)),
+            showlegend=False))
+    n_leaves = max(12, len(lf_x))
+    fig = go.Figure(traces)
+    fig.update_layout(
+        height=max(700, min(4000, int(n_leaves * 32))),
+        margin=dict(l=20, r=220, t=30, b=20),
+        plot_bgcolor='white', paper_bgcolor='white',
+        xaxis=dict(visible=False, fixedrange=False),
+        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
+        dragmode='pan')
+    return fig
+# ─────────────────────────────────────────────────────────────────────────────
+# SIDEBAR
+# ─────────────────────────────────────────────────────────────────────────────
+# All inputs below are rendered in the sidebar; their values are consumed by
+# the main-area code further down in this script.
+with st.sidebar:
+    st.header('1. Upload')
+    # Single-file upload restricted to the listed extensions.
+    uploaded = st.file_uploader(
+        'Upload a metadata file',
+        type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
+        accept_multiple_files=False,
+    )
+    st.header('2. Taxonomizer settings')
+    # Slider arguments are (label, min, max, default, step). These three
+    # values are forwarded to build_hierarchy() as max_depth,
+    # min_cluster_size and branch_max respectively.
+    tx_max_depth = st.slider('Max taxonomy depth', 2, 6, 3, 1,
+                             help='How many abstract-to-concrete levels to build')
+    tx_min_size  = st.slider('Min cluster size', 3, 20, 6, 1,
+                             help='Clusters smaller than this stop splitting (leaves attach directly)')
+    tx_branch    = st.slider('Max branches per node', 3, 12, 8, 1,
+                             help='Upper bound on clusters per split; the actual number is chosen by silhouette')
+    st.header('3. Display')
+    # Row cap applied with DataFrame.head() to the canonical table before
+    # the hierarchy is built.
+    max_items     = st.slider('Maximum variables', 25, 1200, 900, 25,
+                              help='Cap on variables included (lower only to speed up very large files). '
+                                   'Default keeps full datasets like HCP (813).')
+    # Case-insensitive substring filter matched against the '_group_path'
+    # column of the canonical table; an empty string disables filtering.
+    group_filter  = st.text_input('Row filter (optional)', value='',
+                                  help='Filter rows by contextual path text before building')
+# ─────────────────────────────────────────────────────────────────────────────
+# MAIN
+# ─────────────────────────────────────────────────────────────────────────────
+# Landing state: with no file uploaded, show a short method description and
+# halt this script run so nothing below executes.
+if not uploaded:
+    st.info('Upload a metadata CSV / XLSX / JSON file to begin.')
+    st.markdown("""
+    ### Baseline algorithm — Taxonomizer (semantic space)
+    Based on **Mahmood & Mueller, IEEE TVCG 2019** (Taxonomizer), adapted to a
+    metadata-only setting. No hardcoded domain patterns, no external APIs.
+    | Step | Method | Paper |
+    |------|--------|-------|
+    | Variable representation | **short attribute name** (description's name clause; codes are OOV) | Taxonomizer §3.2 / §4.1 |
+    | Embedding | Word2Vec skip-gram — average of word vectors (`glove-wiki-gigaword-100`) | Taxonomizer §3.2 |
+    | Semantic space | Cosine-distance matrix (no data space — schema has no raw values) | Taxonomizer §3.2 *(adapted)* |
+    | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
+    | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* |
+    This page is the pure Taxonomizer-style semantic-space reference method:
+    variable meanings are embedded and recursively clustered into a hierarchy,
+    with node labels generated from contrastive terms.
+    **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
+    **Approach 2** adds NMF/FASTopic aspect discovery + GMM clustering + optional LLM labels.
+    """)
+    # st.stop() ends the current rerun here.
+    st.stop()
+# Hand the upload to save_upload(); the returned path is what gets loaded.
+path = save_upload(uploaded)
+@st.cache_data(show_spinner=False)
+def _load_profile(path_str: str):
+    """Load the file at *path_str* and auto-detect its column roles.
+
+    Returns ``(df, cfg, prof)``: the table produced by ``load_any`` followed
+    by the two values returned by ``detect_roles`` (used below as the default
+    role configuration and the per-column score table).
+
+    The argument is a plain string so ``st.cache_data`` can key the cached
+    result on it.
+    """
+    # NOTE(review): the cache key is the path only. If save_upload() reuses
+    # the same path for a different file with the same name, a stale table
+    # would be returned; confirm against save_upload().
+    df = load_any(Path(path_str))
+    cfg, prof = detect_roles(df)
+    return df, cfg, prof
+with st.spinner('Loading file…'):
+    df, auto_cfg, prof = _load_profile(str(path))
+# Step 1: collapsed preview of the first rows plus the role-detection scores.
+st.subheader('Step 1 — File preview')
+with st.expander(f'{uploaded.name}  ({len(df):,} rows, {len(df.columns)} columns)',
+                 expanded=False):
+    st.dataframe(df.head(10), use_container_width=True)
+    # Only show the score columns that the profile table actually contains.
+    score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
+                  if c in prof.columns]
+    # NOTE(review): the sort key 'leaf_score' is not guarded by the filter
+    # above; this line raises if the profile lacks that column.
+    st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
+                 use_container_width=True)
+# Step 2: let the user confirm or override the auto-detected column roles.
+st.subheader('Step 2 — Confirm column roles')
+cols = list(df.columns)
+# Scope widget keys to the uploaded file so a NEW file always shows its own
+# auto-detected defaults (Streamlit otherwise keeps the previous file's
+# selections under a fixed key, which silently overrides the new defaults).
+_fk = safe_name(uploaded.name)
+with st.expander('Column configuration', expanded=True):
+    left, right = st.columns(2)
+    # Each default list is filtered against the actual columns so a detected
+    # name that is not present in the table is never offered as a default.
+    with left:
+        leaf_cols = st.multiselect('Leaf variable column(s)', cols,
+            default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
+        group_cols = st.multiselect('Context column(s) (optional)', cols,
+            default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}',
+            help='Optional contextual columns for display/filtering.')
+    with right:
+        text_cols = st.multiselect('Text/description column(s)', cols,
+            default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
+        meta_cols = st.multiselect('Metadata/type column(s)', cols,
+            default=[c for c in auto_cfg.get('metadata_cols', []) if c in cols], key=f'meta_{_fk}')
+# A leaf column is the only mandatory role; halt the run without one.
+if not leaf_cols:
+    st.error('Choose at least one leaf variable column.')
+    st.stop()
+# Role configuration passed to build_canonical() when the user builds.
+cfg = {'leaf_cols': leaf_cols, 'group_cols': group_cols,
+       'text_cols': text_cols, 'metadata_cols': meta_cols}
+if st.button('Build baseline hierarchy', type='primary'):
+    # ── load Word2Vec model (cached after first call) ──────────────────────
+    with st.spinner('Loading Word2Vec model (first run downloads ~66 MB)…'):
+        _w2v = _load_w2v()
+    if _w2v is None:
+        st.stop()
+    with st.spinner('Building hierarchy…'):
+        _can = build_canonical(df, cfg, source=Path(uploaded.name).stem)
+        if group_filter.strip():
+            _can = _can[_can['_group_path'].str.contains(
+                group_filter.strip(), case=False, na=False)].copy()
+        if len(_can) > max_items:
+            _can = _can.head(max_items).copy()
+        _can = _can.reset_index(drop=True)
+        if len(_can) < 2:
+            st.error('Need at least 2 variables after filtering.')
+            st.stop()
+        _pname = Path(uploaded.name).stem
+        _nodes = build_hierarchy(_can, _w2v, project=_pname,
+                                 max_depth=tx_max_depth,
+                                 min_cluster_size=tx_min_size,
+                                 branch_max=tx_branch)
+    st.session_state['_bl_nodes']   = _nodes
+    st.session_state['_bl_can']     = _can
+    st.session_state['_bl_project'] = _pname
+if '_bl_nodes' not in st.session_state:
+    st.info('Configure columns above then click **Build baseline hierarchy**.')
+    st.stop()
+# From here on, work with the most recently built result held in session state.
+nodes        = st.session_state['_bl_nodes']
+can          = st.session_state['_bl_can']
+project_name = st.session_state['_bl_project']
+# Headline numbers: node counts by type plus two values read from
+# _structural_stats() ('max_depth' and 'avg_branching_factor').
+_sm = _structural_stats(nodes)
+n_leaves   = len([n for n in nodes if n['type'] == 'attribute'])
+n_internal = len([n for n in nodes if n['type'] == 'aggregation'])
+st.divider()
+c1, c2, c3, c4 = st.columns(4)
+c1.metric('Variables', n_leaves)
+c2.metric('Aggregation nodes', n_internal)
+c3.metric('Max depth', _sm['max_depth'])
+c4.metric('Avg branching', _sm['avg_branching_factor'])
+# Tab order matters: the blocks below address the tabs by index.
+tabs = st.tabs(['Visualization', 'Node detail', 'Canonical table', 'Export', 'Evaluation'])
+with tabs[0]:
+    # ── Visualization controls (above chart — matches Approach 1 / 2) ─────────
+    vc1, vc2, vc3 = st.columns([3, 2, 1])
+    with vc1:
+        viz_mode = st.radio(
+            'View mode',
+            ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
+            horizontal=True, index=0,
+            help='Sunburst best for large hierarchies [Taxonomizer]. '
+                 'Node-link best for inspecting structure at moderate depth.')
+    with vc2:
+        # Passed as max_depth to the sunburst and node-link plots only; the
+        # treemap call below does not receive it.
+        display_depth = st.slider('Depth (Level of Detail)', 1, 8, 4, 1,
+                                  help='How many levels to reveal at once.')
+    with vc3:
+        # Only used by the node-link plot.
+        show_leaf_labels = st.checkbox('Leaf labels', value=False,
+                                       help='Show variable names on the node-link tree.')
+    st.divider()
+    # Render exactly one chart, chosen by the radio selection above.
+    if viz_mode == 'Sunburst (drill-down)':
+        st.plotly_chart(plot_sunburst(nodes, max_depth=display_depth),
+                        use_container_width=True)
+        st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')
+    elif viz_mode == 'Treemap':
+        st.plotly_chart(plot_treemap(nodes), use_container_width=True)
+    else:
+        st.plotly_chart(plot_node_link(nodes, max_depth=display_depth,
+                                       show_leaf_labels=show_leaf_labels),
+                        use_container_width=True)
+with tabs[1]:
+    nm = _nmap(nodes)
+    agg_nodes = [n for n in nodes if n['type'] in ('aggregation', 'root')]
+    options   = [f'{n["name"]}  [{len(_leaf_ids(nodes, int(n["id"])))} vars]'
+                 for n in agg_nodes]
+    if options:
+        sel      = st.selectbox('Select a node', options)
+        sel_name = sel.split('  [')[0]
+        sel_node = next((n for n in agg_nodes if n['name'] == sel_name), None)
+        if sel_node:
+            lids = _leaf_ids(nodes, int(sel_node['id']))
+            leaf_ids_set = {nm[i]['metadata']['leaf_id']
+                            for i in lids if i in nm and 'metadata' in nm[i]}
+            sub = can[can['_leaf_id'].isin(leaf_ids_set)]
+            st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
+            st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
+                         use_container_width=True)
+# Canonical table tab: the full table minus the '_group_path' column
+# (errors='ignore' tolerates its absence).
+with tabs[2]:
+    st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), use_container_width=True)
+# Export tab: browser downloads plus an optional server-side save.
+with tabs[3]:
+    # Base name for all output files, derived from the stored project name.
+    _base = safe_name(project_name)
+    col1, col2 = st.columns(2)
+    with col1:
+        st.download_button(
+            'Hierarchy JSON',
+            data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
+            file_name=f'{_base}_baseline_hierarchy.json',
+            mime='application/json',
+            use_container_width=True,
+        )
+    with col2:
+        st.download_button(
+            'Canonical CSV',
+            data=can.to_csv(index=False).encode('utf-8'),
+            file_name=f'{_base}_baseline_canonical.csv',
+            mime='text/csv',
+            use_container_width=True,
+        )
+    st.divider()
+    # ── Save directly into the project's outputs/baseline/ folder ──────────────
+    # Resolved relative to this script's own directory, not the working directory.
+    _out_dir = Path(__file__).resolve().parent / 'outputs' / 'baseline'
+    st.markdown('### Save to project folder')
+    st.caption(
+        "The download buttons above go to your browser's Downloads folder (a browser "
+        f'restriction). This button instead writes the files into `{_out_dir}` with the '
+        'dataset name — convenient for `evaluate_all.py`.'
+    )
+    if st.button('Save all to outputs/baseline/', type='primary',
+                 use_container_width=True):
+        # Writes the same two artefacts as the download buttons, using the
+        # same file names; existing files of that name are overwritten.
+        try:
+            _out_dir.mkdir(parents=True, exist_ok=True)
+            (_out_dir / f'{_base}_baseline_hierarchy.json').write_text(
+                json.dumps(nodes, indent=2, ensure_ascii=False), encoding='utf-8')
+            can.to_csv(_out_dir / f'{_base}_baseline_canonical.csv', index=False)
+            st.success(f'Saved to `{_out_dir}`:\n\n'
+                       f'- {_base}_baseline_hierarchy.json\n'
+                       f'- {_base}_baseline_canonical.csv')
+        # Any failure is reported in the UI instead of crashing the page.
+        except Exception as _e:
+            st.error(f'Could not save: {_e}')
+# Evaluation tab: reference-free quality metrics computed by hierarchy_eval.
+with tabs[4]:
+    # Imported here rather than at the top of the file.
+    import hierarchy_eval as he
+    st.subheader('Hierarchy Quality Evaluation')
+    st.caption(
+        'No manually curated reference taxonomy is available for this experiment. '
+        'The metrics below are reference-free: they assess hierarchy structure, '
+        'label coherence and interpretability directly.'
+    )
+    with st.spinner('Computing reference-free metrics…'):
+        # tm is read below via the keys 'pc_coherence', 'sibling_diversity'
+        # and 'encoder'; npmi is displayed as a single metric value.
+        tm = he.traco_metrics(nodes)
+        npmi = he.npmi_coherence(nodes, can['_text'].tolist())
+    # ── PRIMARY: reference-free hierarchy quality ─────────────────────────────
+    st.markdown('#### Primary — reference-free hierarchy quality')
+    p1, p2, p3 = st.columns(3)
+    p1.metric('Parent–child coherence', tm['pc_coherence'],
+              help='TraCo (Wu et al., AAAI 2024). Mean similarity of each node to its parent. '
+                   'Higher = children correctly nest under their parent theme.')
+    p2.metric('Sibling diversity', tm['sibling_diversity'],
+              help='TraCo (Wu et al., AAAI 2024). Mean distance between sibling nodes. '
+                   'Higher = siblings are distinct (LOW = redundant/repeated siblings).')
+    p3.metric('NPMI label coherence', npmi,
+              help='Lau et al., EACL 2014. Whether node-label terms genuinely co-occur in the '
+                   'data. Higher = meaningful labels, not arbitrary term salads.')
+    st.caption(f'Embedding backend: **{tm["encoder"]}**.  '
+               'Coherence & diversity ∈ [−1, 1]; NPMI ∈ ≈[−1, 1].')
+    # ── Label-quality proxies (interpretability) ──────────────────────────────
+    st.markdown('#### Label quality *(interpretability — reference-free)*')
+    # lq is read via 'concept_label_pct', 'redundancy_pct' and 'avg_label_words'.
+    lq = he.label_quality(nodes)
+    l1, l2, l3 = st.columns(3)
+    l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%",
+              help='% of internal labels that read as a real concept (short noun '
+                   'phrase, WordNet head) rather than a "/"-joined term fragment.')
+    l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%",
+              help='% of internal labels duplicating a sibling label (lower is better).')
+    l3.metric('Avg label words', lq['avg_label_words'],
+              help='Mean label length in words (shorter = more name-like).')
+    # ── Structural metrics ────────────────────────────────────────────────────
+    st.markdown('#### Structural statistics')
+    # Computed with hierarchy_eval.structural_stats, separately from the
+    # _structural_stats() call that feeds the headline metrics above the tabs.
+    sm = he.structural_stats(nodes)
+    s1, s2, s3, s4, s5 = st.columns(5)
+    s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
+    s2.metric('Max leaf depth',    sm['max_depth'])
+    s3.metric('Avg leaf depth',    sm['avg_leaf_depth'])
+    s4.metric('Avg branching',     sm['avg_branching_factor'])
+    s5.metric('Singleton nodes',   f"{sm['singleton_nodes_%']}%",
+              help='Aggregation nodes with a single child (sparse-hierarchy indicator)')

version2/data/HCP_S1200_DataDictionary_Oct_30_2023.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/data/ai-mind-variable-descriptions(in).csv ADDED Viewed

	@@ -0,0 +1,109 @@

+Task,Variant,name,description,Decimal Places
+DMS,DMS Recommended Standard,DMSCC,"DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays).",2
+DMS,DMS Recommended Standard,DMSL0SD,"DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay.",4
+DMS,DMS Recommended Standard,DMSL12SD,"DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay.",4
+DMS,DMS Recommended Standard,DMSL4SD,"DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay.",4
+DMS,DMS Recommended Standard,DMSLADSD,"DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay.",4
+DMS,DMS Recommended Standard,DMSLSD,DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,4
+DMS,DMS Recommended Standard,DMSLSSD,"DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations.",4
+DMS,DMS Recommended Standard,DMSMDL,DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,4
+DMS,DMS Recommended Standard,DMSMDL0,DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay.,4
+DMS,DMS Recommended Standard,DMSMDL12,DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay.,4
+DMS,DMS Recommended Standard,DMSMDL4,DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay.,4
+DMS,DMS Recommended Standard,DMSMDLAD,DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay.,4
+DMS,DMS Recommended Standard,DMSMDLS,DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation.,4
+DMS,DMS Recommended Standard,DMSML,DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,4
+DMS,DMS Recommended Standard,DMSML0,DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay.,4
+DMS,DMS Recommended Standard,DMSML12,DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay.,4
+DMS,DMS Recommended Standard,DMSML4,DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay.,4
+DMS,DMS Recommended Standard,DMSMLAD,DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay.,4
+DMS,DMS Recommended Standard,DMSMLS,DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation.,4
+DMS,DMS Recommended Standard,DMSPC,DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays).,0
+DMS,DMS Recommended Standard,DMSPC0,KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay.,0
+DMS,DMS Recommended Standard,DMSPC12,KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay.,0
+DMS,DMS Recommended Standard,DMSPC4,KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay.,0
+DMS,DMS Recommended Standard,DMSPCAD,KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay.,0
+DMS,DMS Recommended Standard,DMSPCS,KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli.,0
+DMS,DMS Recommended Standard,DMSPEGC,DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays).,4
+DMS,DMS Recommended Standard,DMSPEGE,KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays).,4
+DMS,DMS Recommended Standard,DMSTC,DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays).,0
+DMS,DMS Recommended Standard,DMSTC0,DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds.,0
+DMS,DMS Recommended Standard,DMSTC12,DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds.,0
+DMS,DMS Recommended Standard,DMSTC4,DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds.,0
+DMS,DMS Recommended Standard,DMSTCAD,DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay.,0
+DMS,DMS Recommended Standard,DMSTCS,DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli.,0
+DMS,DMS Recommended Standard,DMSTE,"DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen.",0
+DMS,DMS Recommended Standard,DMSTEAD,DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component.,0
+DMS,DMS Recommended Standard,DMSTEC,"DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays).",0
+DMS,DMS Recommended Standard,DMSTECAD,"DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component.",0
+DMS,DMS Recommended Standard,DMSTED,"DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays).",0
+DMS,DMS Recommended Standard,DMSTEDAD,"DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component.",0
+DMS,DMS Recommended Standard,DMSTEP,"DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays).",0
+DMS,DMS Recommended Standard,DMSTEPAD,"DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component.",0
+MOT,MOT Tone 2.0,MOTML,The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials.,1
+MOT,MOT Tone 2.0,MOTSDL,"This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials.",2
+MOT,MOT Tone 2.0,MOTTC,The total number of assessment trials on which the subject made a correct response.,0
+MOT,MOT Tone 2.0,MOTTE,The total number of assessment trials on which the subject failed to make a correct response.,0
+PAL,PAL Recommended Standard Extended,PALFAMS28,"KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard..",0
+PAL,PAL Recommended Standard Extended,PALMETS28,PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage.  Does not include 12 box level to provide a direct comparison to Recommended Standard.,0
+PAL,PAL Recommended Standard Extended,PALNPR28,PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached.,0
+PAL,PAL Recommended Standard Extended,PALTA12,PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall.,0
+PAL,PAL Recommended Standard Extended,PALTA2,PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall.,0
+PAL,PAL Recommended Standard Extended,PALTA28,PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems.  Does not include 12 box level to provide a direct comparison to Recommended Standard.,0
+PAL,PAL Recommended Standard Extended,PALTA4,PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall.,0
+PAL,PAL Recommended Standard Extended,PALTA6,PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall.,0
+PAL,PAL Recommended Standard Extended,PALTA8,PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall.,0
+PAL,PAL Recommended Standard Extended,PALTE12,PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials.,0
+PAL,PAL Recommended Standard Extended,PALTE2,PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials.,0
+PAL,PAL Recommended Standard Extended,PALTE28,PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials.  Does not include 12 box level to provide a direct comparison to Recommended Standard.,0
+PAL,PAL Recommended Standard Extended,PALTE4,PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials.,0
+PAL,PAL Recommended Standard Extended,PALTE6,PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials.,0
+PAL,PAL Recommended Standard Extended,PALTE8,PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials.,0
+PAL,PAL Recommended Standard Extended,PALTEA12,"PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach.",0
+PAL,PAL Recommended Standard Extended,PALTEA2,"PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach.",0
+PAL,PAL Recommended Standard Extended,PALTEA28,"KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task.  In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard.",0
+PAL,PAL Recommended Standard Extended,PALTEA4,"PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach.",0
+PAL,PAL Recommended Standard Extended,PALTEA6,"PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach.",0
+PAL,PAL Recommended Standard Extended,PALTEA8,"PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach.",0
+PRM,PRM Recommended Standard 18 Extended,PRMCLSDD,"PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds.",2
+PRM,PRM Recommended Standard 18 Extended,PRMCLSDI,"PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds.",2
+PRM,PRM Recommended Standard 18 Extended,PRMMCLD,"PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds.",2
+PRM,PRM Recommended Standard 18 Extended,PRMMCLI,"PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds.",2
+PRM,PRM Recommended Standard 18 Extended,PRMMDCLD,"PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds.",2
+PRM,PRM Recommended Standard 18 Extended,PRMMDCLI,"PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds.",2
+PRM,PRM Recommended Standard 18 Extended,PRMPCD,"KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage.",2
+PRM,PRM Recommended Standard 18 Extended,PRMPCI,"KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage.",2
+PRM,PRM Recommended Standard 18 Extended,PRMTSDSP,PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition.,2
+RVP,RVP 3 Targets,RVPA,"KEY: RVP A′: A′ (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences.",4
+RVP,RVP 3 Targets,RVPLSD,RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials.,4
+RVP,RVP 3 Targets,RVPMDL,KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials.,4
+RVP,RVP 3 Targets,RVPML,RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials.,4
+RVP,RVP 3 Targets,RVPPFA,KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms ÷ (False Alarms + Correct Rejections)),4
+RVP,RVP 3 Targets,RVPPH,"RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits ÷ total number of sequences)",4
+RVP,RVP 3 Targets,RVPTFA,RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms.,0
+RVP,RVP 3 Targets,RVPTH,RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks.,0
+RVP,RVP 3 Targets,RVPTM,RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMBE12,KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMBE4,KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMBE468,"KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMBE6,KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMBE8,KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMDE12,SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMDE4,SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMDE468,"SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMDE6,SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMDE8,SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMPR,"SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMS,"KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMS6,"SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMSX,"SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMTE12,"SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMTE4,"SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMTE468,"SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMTE6,"SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMTE8,"SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMWE12,SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMWE4,SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMWE468,"SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials.",0
+SWM,SWM Recommended Standard 2.0 Extended,SWMWE6,SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only.,0
+SWM,SWM Recommended Standard 2.0 Extended,SWMWE8,SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only.,0

version2/data/dictionary_harmonized_categories.csv ADDED Viewed

	@@ -0,0 +1,571 @@

+,variable_codename_use,variable_description_use,harmonized_categories,harmonized_categories_description,in_dataset
+1,DMDBORN4,In what country {were you/was SP} born?,1,"Born in 50 US states or Washington, DC",Demographics
+2,DMDBORN4,In what country {were you/was SP} born?,2,Others,Demographics
+3,DMDBORN4,In what country {were you/was SP} born?,77,Refused,Demographics
+4,DMDBORN4,In what country {were you/was SP} born?,99,Don't Know,Demographics
+5,DMDEDUC2,Education level - Adults 20+,1,Less Than 9th Grade,Demographics
+6,DMDEDUC2,Education level - Adults 20+,2,9-11th Grade (Includes 12th grade with no diploma),Demographics
+7,DMDEDUC2,Education level - Adults 20+,3,High School Grad/GED or Equivalent,Demographics
+8,DMDEDUC2,Education level - Adults 20+,4,Some College or AA degree,Demographics
+9,DMDEDUC2,Education level - Adults 20+,5,College Graduate or above,Demographics
+10,DMDEDUC2,Education level - Adults 20+,7,Refused,Demographics
+11,DMDEDUC2,Education level - Adults 20+,9,Don't know,Demographics
+12,DMDEDUC3,Education level - Children/Youth 6-19,0,Never Attended / Kindergarten Only,Demographics
+13,DMDEDUC3,Education level - Children/Youth 6-19,1,1st Grade,Demographics
+14,DMDEDUC3,Education level - Children/Youth 6-19,2,2nd Grade,Demographics
+15,DMDEDUC3,Education level - Children/Youth 6-19,3,3rd Grade,Demographics
+16,DMDEDUC3,Education level - Children/Youth 6-19,4,4th Grade,Demographics
+17,DMDEDUC3,Education level - Children/Youth 6-19,5,5th Grade,Demographics
+18,DMDEDUC3,Education level - Children/Youth 6-19,6,6th Grade,Demographics
+19,DMDEDUC3,Education level - Children/Youth 6-19,7,7th Grade,Demographics
+20,DMDEDUC3,Education level - Children/Youth 6-19,8,8th Grade,Demographics
+21,DMDEDUC3,Education level - Children/Youth 6-19,9,9th Grade,Demographics
+22,DMDEDUC3,Education level - Children/Youth 6-19,10,10th Grade,Demographics
+23,DMDEDUC3,Education level - Children/Youth 6-19,11,11th Grade,Demographics
+24,DMDEDUC3,Education level - Children/Youth 6-19,12,"12th Grade, No Diploma",Demographics
+25,DMDEDUC3,Education level - Children/Youth 6-19,13,High School Graduate,Demographics
+26,DMDEDUC3,Education level - Children/Youth 6-19,14,GED or Equivalent,Demographics
+27,DMDEDUC3,Education level - Children/Youth 6-19,15,More than high school,Demographics
+28,DMDEDUC3,Education level - Children/Youth 6-19,55,Less Than 5th Grade,Demographics
+29,DMDEDUC3,Education level - Children/Youth 6-19,66,Less Than 9th Grade,Demographics
+30,DMDEDUC3,Education level - Children/Youth 6-19,77,Refused,Demographics
+31,DMDEDUC3,Education level - Children/Youth 6-19,99,Don't know,Demographics
+32,DMDFMSIZ,Total number of people in the Family,1,1,Demographics
+33,DMDFMSIZ,Total number of people in the Family,2,2,Demographics
+34,DMDFMSIZ,Total number of people in the Family,3,3,Demographics
+35,DMDFMSIZ,Total number of people in the Family,4,4,Demographics
+36,DMDFMSIZ,Total number of people in the Family,5,5,Demographics
+37,DMDFMSIZ,Total number of people in the Family,6,6,Demographics
+38,DMDFMSIZ,Total number of people in the Family,7,7 or more people in the Family,Demographics
+39,DMDHHSIZ,Total number of people in the Household,1,1,Demographics
+40,DMDHHSIZ,Total number of people in the Household,2,2,Demographics
+41,DMDHHSIZ,Total number of people in the Household,3,3,Demographics
+42,DMDHHSIZ,Total number of people in the Household,4,4,Demographics
+43,DMDHHSIZ,Total number of people in the Household,5,5,Demographics
+44,DMDHHSIZ,Total number of people in the Household,6,6,Demographics
+45,DMDHHSIZ,Total number of people in the Household,7,7 or more people in the Household,Demographics
+46,DMDHRAGE,Age in years of the household reference person at the time of HH screening.,1,<20 years,Demographics
+47,DMDHRAGE,Age in years of the household reference person at the time of HH screening.,2,20-39 years,Demographics
+48,DMDHRAGE,Age in years of the household reference person at the time of HH screening.,3,40-59 years,Demographics
+49,DMDHRAGE,Age in years of the household reference person at the time of HH screening.,4,60+ years,Demographics
+50,DMDYRSUS,Length of time the participant has been in the US.,1,Less than 1 year,Demographics
+51,DMDYRSUS,Length of time the participant has been in the US.,2,"1 yr., less than 5 yrs.",Demographics
+52,DMDYRSUS,Length of time the participant has been in the US.,3,"5 yrs., less than 10 yrs.",Demographics
+53,DMDYRSUS,Length of time the participant has been in the US.,4,"10 yrs., less than 15 yrs.",Demographics
+54,DMDHRBR4,HH reference person's country of birth,1,"Born in 50 US states or Washington, DC",Demographics
+55,DMDHRBR4,HH reference person's country of birth,2,Others,Demographics
+56,DMDHRBR4,HH reference person's country of birth,77,Refused,Demographics
+57,DMDHRBR4,HH reference person's country of birth,99,Don't Know,Demographics
+58,DMDHREDU,HH reference person's education level,1,Less than high school degree,Demographics
+59,DMDHREDU,HH reference person's education level,2,High school grad/GED or some college/AA degree,Demographics
+60,DMDHREDU,HH reference person's education level,3,College graduate or above,Demographics
+61,DMDHREDU,HH reference person's education level,7,Refused,Demographics
+62,DMDHREDU,HH reference person's education level,9,Don't know,Demographics
+63,DMDHREDU,HH reference person's education level,9,Don't Know,Demographics
+64,DMDHREDU,HH reference person's education level,3,High school grad/GED or some college/AA degree,Demographics
+65,DMDHRGND,Gender of the household reference person,1,Male,Demographics
+66,DMDHRGND,Gender of the household reference person,2,Female,Demographics
+67,DMDHRMAR,Marital Status of household reference person,1,Married/Living with partner,Demographics
+68,DMDHRMAR,Marital Status of household reference person,2,Widowed/Divorced/Separated,Demographics
+69,DMDHRMAR,Marital Status of household reference person,3,Never Married,Demographics
+70,DMDHRMAR,Marital Status of household reference person,77,Refused,Demographics
+71,DMDHRMAR,Marital Status of household reference person,99,Don't Know,Demographics
+72,DMDHSEDU,HH reference person's spouse's education level,1,Less than high school degree,Demographics
+73,DMDHSEDU,HH reference person's spouse's education level,2,High school grad/GED or some college/AA degree,Demographics
+74,DMDHSEDU,HH reference person's spouse's education level,3,College graduate or above,Demographics
+75,DMDHSEDU,HH reference person's spouse's education level,7,Refused,Demographics
+76,DMDHSEDU,HH reference person's spouse's education level,9,Don't Know,Demographics
+77,DMDMARTL,Marital status,1,Married,Demographics
+78,DMDMARTL,Marital status,2,Widowed,Demographics
+79,DMDMARTL,Marital status,3,Divorced,Demographics
+80,DMDMARTL,Marital status,4,Separated,Demographics
+81,DMDMARTL,Marital status,5,Never married,Demographics
+82,DMDMARTL,Marital status,6,Living with partner,Demographics
+83,DMDMARTL,Marital status,77,Refused,Demographics
+84,DMDMARTL,Marital status,99,Don't know,Demographics
+85,DMDYRSUS,Length of time the participant has been in the US.,5,"15 yrs., less than 20 yrs.",Demographics
+86,DMDYRSUS,Length of time the participant has been in the US.,6,"20 yrs., less than 30 yrs.",Demographics
+87,DMDYRSUS,Length of time the participant has been in the US.,7,"30 yrs., less than 40 yrs.",Demographics
+88,DMDYRSUS,Length of time the participant has been in the US.,8,"40 yrs., less than 50 yrs.",Demographics
+89,DMDYRSUS,Length of time the participant has been in the US.,9,50 years or more,Demographics
+90,DMDYRSUS,Length of time the participant has been in the US.,77,Refused,Demographics
+91,DMDYRSUS,Length of time the participant has been in the US.,99,Don't know,Demographics
+92,FIALANG,Language of the Family Interview Instrument,1,English,Demographics
+93,FIALANG,Language of the Family Interview Instrument,2,Spanish,Demographics
+94,FIALANG,Language of the Family Interview Instrument,3,Other,Demographics
+95,INDFMIN2,Total family income (reported as a range value in dollars),1,"$ 0 to $ 4,999",Demographics
+96,INDFMIN2,Total family income (reported as a range value in dollars),2,"$ 5,000 to $ 9,999",Demographics
+97,INDFMIN2,Total family income (reported as a range value in dollars),3,"$10,000 to $14,999",Demographics
+98,INDFMIN2,Total family income (reported as a range value in dollars),4,"$15,000 to $19,999",Demographics
+99,INDFMIN2,Total family income (reported as a range value in dollars),5,"$20,000 to $24,999",Demographics
+100,INDFMIN2,Total family income (reported as a range value in dollars),6,"$25,000 to $34,999",Demographics
+101,INDFMIN2,Total family income (reported as a range value in dollars),7,"$35,000 to $44,999",Demographics
+102,INDFMIN2,Total family income (reported as a range value in dollars),8,"$45,000 to $54,999",Demographics
+103,INDFMIN2,Total family income (reported as a range value in dollars),16,"$50,000 and over",Demographics
+104,INDFMIN2,Total family income (reported as a range value in dollars),99,Don't know,Demographics
+105,INDFMIN2,Total family income (reported as a range value in dollars),9,"$55,000 to $64,999",Demographics
+106,INDFMIN2,Total family income (reported as a range value in dollars),10,"$65,000 to $74,999",Demographics
+107,INDFMIN2,Total family income (reported as a range value in dollars),12,"$20,000 and Over",Demographics
+108,INDFMIN2,Total family income (reported as a range value in dollars),13,"Under $20,000",Demographics
+109,INDFMIN2,Total family income (reported as a range value in dollars),14,"$75,000 to $99,999",Demographics
+110,INDFMIN2,Total family income (reported as a range value in dollars),15,"$100,000 and Over",Demographics
+111,INDFMIN2,Total family income (reported as a range value in dollars),77,Refused,Demographics
+112,INDFMIN2,Total family income (reported as a range value in dollars),11,"$75,000 and Over",Demographics
+113,RIDRETH1,Recode of reported race and Hispanic origin information,1,Mexican American,Demographics
+114,RIDRETH1,Recode of reported race and Hispanic origin information,3,Non-Hispanic White,Demographics
+115,RIDRETH1,Recode of reported race and Hispanic origin information,4,Non-Hispanic Black,Demographics
+116,RIDRETH1,Recode of reported race and Hispanic origin information,5,Other Race - Including Multi-Racial,Demographics
+117,RIDRETH1,Recode of reported race and Hispanic origin information,2,Other Hispanic,Demographics
+118,RIDSTATR,Interview and Examination Status of the Sample Person.,1,Interviewed Only,Demographics
+119,RIDSTATR,Interview and Examination Status of the Sample Person.,2,Both Interviewed and MEC examined,Demographics
+120,MCD180B,Age when told you had congestive heart failure,16,16 years or younger,Questionnaire
+121,MCD180B,Age when told you had congestive heart failure,17-79,17-79 years old,Questionnaire
+122,MCD180B,Age when told you had congestive heart failure,17-84,17-84 years old,Questionnaire
+123,MCD180B,Age when told you had congestive heart failure,17-89,17-89 years old,Questionnaire
+124,MCD180B,Age when told you had congestive heart failure,18-79,18-79 years old,Questionnaire
+125,MCD180B,Age when told you had congestive heart failure,80,80 years or older,Questionnaire
+126,MCD180B,Age when told you had congestive heart failure,85,85 years or older,Questionnaire
+127,MCD180B,Age when told you had congestive heart failure,90,90 + years,Questionnaire
+128,MCD180B,Age when told you had congestive heart failure,99999,Don't know,Questionnaire
+129,MCD180B,Age when told you had congestive heart failure,77777,Refused,Questionnaire
+130,MCD180C,Age when told had coronary heart disease,16,16 years or younger,Questionnaire
+131,MCD180C,Age when told had coronary heart disease,17-79,17-79 years old,Questionnaire
+132,MCD180C,Age when told had coronary heart disease,17-84,17-84 years old,Questionnaire
+133,MCD180C,Age when told had coronary heart disease,20-79,20-79 years old,Questionnaire
+134,MCD180C,Age when told had coronary heart disease,80,80 years or older,Questionnaire
+135,MCD180C,Age when told had coronary heart disease,85,85 years or older,Questionnaire
+136,MCD180C,Age when told had coronary heart disease,99999,Don't know,Questionnaire
+137,MCD180C,Age when told had coronary heart disease,77777,Refused,Questionnaire
+138,MCD180D,Age when told you had angina pectoris,16,16 years or younger,Questionnaire
+139,MCD180D,Age when told you had angina pectoris,17-84,17-84 years old,Questionnaire
+140,MCD180D,Age when told you had angina pectoris,85,85 years or older,Questionnaire
+141,MCD180D,Age when told you had angina pectoris,99999,Don't know,Questionnaire
+142,MCD180D,Age when told you had angina pectoris,77777,Refused,Questionnaire
+143,MCD180D,Age when told you had angina pectoris,17-79,17-79 years old,Questionnaire
+144,MCD180D,Age when told you had angina pectoris,20-79,20-79 years old,Questionnaire
+145,MCD180D,Age when told you had angina pectoris,80,80 years or older,Questionnaire
+146,MCD180E,Age when told you had heart attack,16,16 years or younger,Questionnaire
+147,MCD180E,Age when told you had heart attack,17-79,17-79 years old,Questionnaire
+148,MCD180E,Age when told you had heart attack,17-84,17-84 years old,Questionnaire
+149,MCD180E,Age when told you had heart attack,17-88,17-88 years old,Questionnaire
+150,MCD180E,Age when told you had heart attack,19-79,19-79 years old,Questionnaire
+151,MCD180E,Age when told you had heart attack,80,80 years or older,Questionnaire
+152,MCD180E,Age when told you had heart attack,85,85 years or older,Questionnaire
+153,MCD180E,Age when told you had heart attack,90,90 + years,Questionnaire
+154,MCD180E,Age when told you had heart attack,99999,Don't know,Questionnaire
+155,MCD180E,Age when told you had heart attack,77777,Refused,Questionnaire
+156,MCD180F,Age when told you had a stroke,16,16 years or younger,Questionnaire
+157,MCD180F,Age when told you had a stroke,17-79,17-79 years old,Questionnaire
+158,MCD180F,Age when told you had a stroke,17-84,17-84 years old,Questionnaire
+159,MCD180F,Age when told you had a stroke,17-89,17-89 years old,Questionnaire
+160,MCD180F,Age when told you had a stroke,80,80 years or older,Questionnaire
+161,MCD180F,Age when told you had a stroke,85,85 years or older,Questionnaire
+162,MCD180F,Age when told you had a stroke,90,90 + years,Questionnaire
+163,MCD180F,Age when told you had a stroke,99999,Don't know,Questionnaire
+164,MCD180F,Age when told you had a stroke,77777,Refused,Questionnaire
+165,MCD180G,Age when told you had emphysema,16,16 years or younger,Questionnaire
+166,MCD180G,Age when told you had emphysema,17-79,17-79 years old,Questionnaire
+167,MCD180G,Age when told you had emphysema,17-84,17-84 years old,Questionnaire
+168,MCD180G,Age when told you had emphysema,17-89,17-89 years old,Questionnaire
+169,MCD180G,Age when told you had emphysema,80,80 years or older,Questionnaire
+170,MCD180G,Age when told you had emphysema,85,85 years or older,Questionnaire
+171,MCD180G,Age when told you had emphysema,90,90 + years,Questionnaire
+172,MCD180G,Age when told you had emphysema,99999,Don't know,Questionnaire
+173,MCD180G,Age when told you had emphysema,77777,Refused,Questionnaire
+174,MCD180K,Age when told you had chronic bronchitis,16,16 years or younger,Questionnaire
+175,MCD180K,Age when told you had chronic bronchitis,17-79,17-79 years old,Questionnaire
+176,MCD180K,Age when told you had chronic bronchitis,17-83,17-83 years old,Questionnaire
+177,MCD180K,Age when told you had chronic bronchitis,17-89,17-89 years old,Questionnaire
+178,MCD180K,Age when told you had chronic bronchitis,80,80 years or older,Questionnaire
+179,MCD180K,Age when told you had chronic bronchitis,85,85 years or older,Questionnaire
+180,MCD180K,Age when told you had chronic bronchitis,90,90 + years,Questionnaire
+181,MCD180K,Age when told you had chronic bronchitis,99999,Don't know,Questionnaire
+182,MCD180K,Age when told you had chronic bronchitis,77777,Refused,Questionnaire
+183,MCD180L,Age when told you had a liver condition,16,16 years or younger,Questionnaire
+184,MCD180L,Age when told you had a liver condition,17-78,17-78 years old,Questionnaire
+185,MCD180L,Age when told you had a liver condition,17-79,17-79 years old,Questionnaire
+186,MCD180L,Age when told you had a liver condition,17-83,17-83 years old,Questionnaire
+187,MCD180L,Age when told you had a liver condition,80,80 years or older,Questionnaire
+188,MCD180L,Age when told you had a liver condition,85,85 years or older,Questionnaire
+189,MCD180L,Age when told you had a liver condition,99999,Don't know,Questionnaire
+190,MCD180L,Age when told you had a liver condition,77777,Refused,Questionnaire
+191,MCQ180H,Age when told you had a goiter,16,16 years or younger,Questionnaire
+192,MCQ180H,Age when told you had a goiter,17-84,17-84 years old,Questionnaire
+193,MCQ180H,Age when told you had a goiter,90,90 + years,Questionnaire
+194,MCQ180H,Age when told you had a goiter,99999,Don't know,Questionnaire
+195,MCD180M,Age when told you had thyroid problem,17-89,17-89 years old,Questionnaire
+196,MCD180M,Age when told you had thyroid problem,16,16 years or younger,Questionnaire
+197,MCD180M,Age when told you had thyroid problem,99999,Don't know,Questionnaire
+198,MCD180M,Age when told you had thyroid problem,17-84,17-84 years old,Questionnaire
+199,MCD180M,Age when told you had thyroid problem,80,80 years or older,Questionnaire
+200,MCD180M,Age when told you had thyroid problem,85,85 years or older,Questionnaire
+201,MCD180M,Age when told you had thyroid problem,77777,Refused,Questionnaire
+202,MCD180M,Age when told you had thyroid problem,17-79,17-79 years old,Questionnaire
+203,MCD180N,Age when told you had gout,16,16 years or younger,Questionnaire
+204,MCD180N,Age when told you had gout,17-79,17-79 years old,Questionnaire
+205,MCD180N,Age when told you had gout,17-86,17-86 years old,Questionnaire
+206,MCD180N,Age when told you had gout,80,80 years or older,Questionnaire
+207,MCD180N,Age when told you had gout,99999,Don't know,Questionnaire
+208,MCD180N,Age when told you had gout,77777,Refused,Questionnaire
+209,MCQ025,Age when first had asthma,Jan-19,1-19 years old,Questionnaire
+210,MCQ025,Age when first had asthma,Jan-79,1-79 years old,Questionnaire
+211,MCQ025,Age when first had asthma,Jan-84,1-84 years old,Questionnaire
+212,MCQ025,Age when first had asthma,Jan-88,1-88 years old,Questionnaire
+213,MCQ025,Age when first had asthma,80,80 years or older,Questionnaire
+214,MCQ025,Age when first had asthma,85,85 years or older,Questionnaire
+215,MCQ025,Age when first had asthma,99999,Don't know,Questionnaire
+216,MCQ025,Age when first had asthma,1,Less than 1 year,Questionnaire
+217,MCQ025,Age when first had asthma,77777,Refused,Questionnaire
+218,MCD180A,Age when told you had arthritis,16,16 years or younger,Questionnaire
+219,MCD180A,Age when told you had arthritis,17-89,17-89 years old,Questionnaire
+220,MCD180A,Age when told you had arthritis,90,90 + years,Questionnaire
+221,MCD180A,Age when told you had arthritis,99999,Don't know,Questionnaire
+222,MCD180A,Age when told you had arthritis,17-79,17-79 years old,Questionnaire
+223,MCD180A,Age when told you had arthritis,80,80 years or older,Questionnaire
+224,MCD180A,Age when told you had arthritis,77777,Refused,Questionnaire
+225,MCD180A,Age when told you had arthritis,17-84,17-84 years old,Questionnaire
+226,MCD180A,Age when told you had arthritis,85,85 years or older,Questionnaire
+227,MCQ180H,Age when told you had a goiter,17-72,17-72 years old,Questionnaire
+228,MCQ180H,Age when told you had a goiter,85,85 years or older,Questionnaire
+229,MCQ180H,Age when told you had a goiter,77777,Refused,Questionnaire
+230,MCQ195,Which type of arthritis was it,9,Don't know,Questionnaire
+231,MCQ195,Which type of arthritis was it,2,Osteoarthritis or degenerative arthritis,Questionnaire
+232,MCQ195,Which type of arthritis was it,4,Other,Questionnaire
+233,MCQ195,Which type of arthritis was it,3,Psoriatic arthritis,Questionnaire
+234,MCQ195,Which type of arthritis was it,7,Refused,Questionnaire
+235,MCQ195,Which type of arthritis was it,1,Rheumatoid arthritis,Questionnaire
+236,MCQ240A,Age when bladder cancer first diagnosed,17-78,17-78 years old,Questionnaire
+237,MCQ240A,Age when bladder cancer first diagnosed,17-83,17-83 years old,Questionnaire
+238,MCQ240A,Age when bladder cancer first diagnosed,16,16 years or younger,Questionnaire
+239,MCQ240A,Age when bladder cancer first diagnosed,80,80 years or older,Questionnaire
+240,MCQ240A,Age when bladder cancer first diagnosed,85,85 years or older,Questionnaire
+241,MCQ240A,Age when bladder cancer first diagnosed,99999,Don't know,Questionnaire
+242,MCQ240A,Age when bladder cancer first diagnosed,77777,Refused,Questionnaire
+243,MCQ240B,Age when blood cancer was first diagnosed,16,16 years or younger,Questionnaire
+244,MCQ240B,Age when blood cancer was first diagnosed,17-66,17-66 years old,Questionnaire
+245,MCQ240B,Age when blood cancer was first diagnosed,17-70,17-70 years old,Questionnaire
+246,MCQ240B,Age when blood cancer was first diagnosed,80,80 years or older,Questionnaire
+247,MCQ240B,Age when blood cancer was first diagnosed,85,85 years or older,Questionnaire
+248,MCQ240B,Age when blood cancer was first diagnosed,99999,Don't know,Questionnaire
+249,MCQ240B,Age when blood cancer was first diagnosed,77777,Refused,Questionnaire
+250,MCQ240C,Age when bone cancer was first diagnosed,17-77,17-77 years old,Questionnaire
+251,MCQ240C,Age when bone cancer was first diagnosed,17-84,17-84 years old,Questionnaire
+252,MCQ240C,Age when bone cancer was first diagnosed,16,16 years or younger,Questionnaire
+253,MCQ240C,Age when bone cancer was first diagnosed,55-76,55-76 years old,Questionnaire
+254,MCQ240C,Age when bone cancer was first diagnosed,80,80 years or older,Questionnaire
+255,MCQ240C,Age when bone cancer was first diagnosed,85,85 years or older,Questionnaire
+256,MCQ240C,Age when bone cancer was first diagnosed,99999,Don't know,Questionnaire
+257,MCQ240C,Age when bone cancer was first diagnosed,77777,Refused,Questionnaire
+258,MCQ240CC,Age when uterine cancer was first diagnosed,16,16 years or younger,Questionnaire
+259,MCQ240CC,Age when uterine cancer was first diagnosed,17-77,17-77 years old,Questionnaire
+260,MCQ240CC,Age when uterine cancer was first diagnosed,17-84,17-84 years old,Questionnaire
+261,MCQ240CC,Age when uterine cancer was first diagnosed,20-72,20-72 years old,Questionnaire
+262,MCQ240CC,Age when uterine cancer was first diagnosed,80,80 years or older,Questionnaire
+263,MCQ240CC,Age when uterine cancer was first diagnosed,85,85 years or older,Questionnaire
+264,MCQ240CC,Age when uterine cancer was first diagnosed,99999,Don't know,Questionnaire
+265,MCQ240CC,Age when uterine cancer was first diagnosed,77777,Refused,Questionnaire
+266,MCQ240D,Age when brain cancer was first diagnosed,16,16 years or younger,Questionnaire
+267,MCQ240D,Age when brain cancer was first diagnosed,17-73,17-73 years old,Questionnaire
+268,MCQ240D,Age when brain cancer was first diagnosed,17-75,17-75 years old,Questionnaire
+269,MCQ240D,Age when brain cancer was first diagnosed,80,80 years or older,Questionnaire
+270,MCQ240D,Age when brain cancer was first diagnosed,85,85 years or older,Questionnaire
+271,MCQ240D,Age when brain cancer was first diagnosed,99999,Don't know,Questionnaire
+272,MCQ240D,Age when brain cancer was first diagnosed,77777,Refused,Questionnaire
+273,MCQ240DD,Age when some other type of cancer was first diagnosed,16,16 years or younger,Questionnaire
+274,MCQ240DD,Age when some other type of cancer was first diagnosed,17-77,17-77 years old,Questionnaire
+275,MCQ240DD,Age when some other type of cancer was first diagnosed,17-78,17-78 years old,Questionnaire
+276,MCQ240DD,Age when some other type of cancer was first diagnosed,17-83,17-83 years old,Questionnaire
+277,MCQ240DD,Age when some other type of cancer was first diagnosed,80,80 years or older,Questionnaire
+278,MCQ240DD,Age when some other type of cancer was first diagnosed,85,85 years or older,Questionnaire
+279,MCQ240DD,Age when some other type of cancer was first diagnosed,99999,Don't know,Questionnaire
+280,MCQ240DD,Age when some other type of cancer was first diagnosed,77777,Refused,Questionnaire
+281,MCQ240DK,Age when cancer was first diagnosed,20-80,20-80 years old,Questionnaire
+282,MCQ240DK,Age when cancer was first diagnosed,23-47,23-47 years old,Questionnaire
+283,MCQ240DK,Age when cancer was first diagnosed,80,80 years or older,Questionnaire
+284,MCQ240DK,Age when cancer was first diagnosed,85,85 years or older,Questionnaire
+285,MCQ240DK,Age when cancer was first diagnosed,99999,Don't know,Questionnaire
+286,MCQ240DK,Age when cancer was first diagnosed,77777,Refused,Questionnaire
+287,MCQ240E,Age when breast cancer was first diagnosed,16,16 years or younger,Questionnaire
+288,MCQ240E,Age when breast cancer was first diagnosed,17-78,17-78 years old,Questionnaire
+289,MCQ240E,Age when breast cancer was first diagnosed,17-79,17-79 years old,Questionnaire
+290,MCQ240E,Age when breast cancer was first diagnosed,17-84,17-84 years old,Questionnaire
+291,MCQ240E,Age when breast cancer was first diagnosed,80,80 years or older,Questionnaire
+292,MCQ240E,Age when breast cancer was first diagnosed,85,85 years or older,Questionnaire
+293,MCQ240E,Age when breast cancer was first diagnosed,99999,Don't know,Questionnaire
+294,MCQ240E,Age when breast cancer was first diagnosed,77777,Refused,Questionnaire
+295,MCQ240F,Age when cervical cancer was first diagnosed,16,16 years or younger,Questionnaire
+296,MCQ240F,Age when cervical cancer was first diagnosed,17-65,17-65 years old,Questionnaire
+297,MCQ240F,Age when cervical cancer was first diagnosed,17-73,17-73 years old,Questionnaire
+298,MCQ240F,Age when cervical cancer was first diagnosed,80,80 years or older,Questionnaire
+299,MCQ240F,Age when cervical cancer was first diagnosed,85,85 years or older,Questionnaire
+300,MCQ240F,Age when cervical cancer was first diagnosed,99999,Don't know,Questionnaire
+301,MCQ240F,Age when cervical cancer was first diagnosed,77777,Refused,Questionnaire
+302,MCQ240G,Age when colon cancer was first diagnosed,16,16 years or younger,Questionnaire
+303,MCQ240G,Age when colon cancer was first diagnosed,17-79,17-79 years old,Questionnaire
+304,MCQ240G,Age when colon cancer was first diagnosed,17-84,17-84 years old,Questionnaire
+305,MCQ240G,Age when colon cancer was first diagnosed,21-79,21-79 years old,Questionnaire
+306,MCQ240G,Age when colon cancer was first diagnosed,80,80 years or older,Questionnaire
+307,MCQ240G,Age when colon cancer was first diagnosed,85,85 years or older,Questionnaire
+308,MCQ240G,Age when colon cancer was first diagnosed,99999,Don't know,Questionnaire
+309,MCQ240G,Age when colon cancer was first diagnosed,77777,Refused,Questionnaire
+310,MCQ240L,Age when leukemia was first diagnosed,17-70,17-70 years old,Questionnaire
+311,MCQ240L,Age when leukemia was first diagnosed,17-75,17-75 years old,Questionnaire
+312,MCQ240L,Age when leukemia was first diagnosed,28-84,28-84 years old,Questionnaire
+313,MCQ240L,Age when leukemia was first diagnosed,16,16 years or younger,Questionnaire
+314,MCQ240L,Age when leukemia was first diagnosed,80,80 years or older,Questionnaire
+315,MCQ240L,Age when leukemia was first diagnosed,85,85 years or older,Questionnaire
+316,MCQ240L,Age when leukemia was first diagnosed,99999,Don't know,Questionnaire
+317,MCQ240L,Age when leukemia was first diagnosed,77777,Refused,Questionnaire
+318,MCQ240N,Age when lung cancer was first diagnosed,16,16 years or younger,Questionnaire
+319,MCQ240N,Age when lung cancer was first diagnosed,17-76,17-76 years old,Questionnaire
+320,MCQ240N,Age when lung cancer was first diagnosed,17-84,17-84 years old,Questionnaire
+321,MCQ240N,Age when lung cancer was first diagnosed,29-79,29-79 years old,Questionnaire
+322,MCQ240N,Age when lung cancer was first diagnosed,80,80 years or older,Questionnaire
+323,MCQ240N,Age when lung cancer was first diagnosed,85,85 years or older,Questionnaire
+324,MCQ240N,Age when lung cancer was first diagnosed,99999,Don't know,Questionnaire
+325,MCQ240N,Age when lung cancer was first diagnosed,77777,Refused,Questionnaire
+326,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,16,16 years or younger,Questionnaire
+327,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,17-76,17-76 years old,Questionnaire
+328,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,17-80,17-80 years old,Questionnaire
+329,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,19-79,19-79 years old,Questionnaire
+330,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,80,80 years or older,Questionnaire
+331,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,85,85 years or older,Questionnaire
+332,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,99999,Don't know,Questionnaire
+333,MCQ240O,Age when lymphoma or Hodgkin's Disease was first diagnosed,77777,Refused,Questionnaire
+334,MCQ240P,Age when melanoma was first diagnosed,16,16 years or younger,Questionnaire
+335,MCQ240P,Age when melanoma was first diagnosed,17-78,17-78 years old,Questionnaire
+336,MCQ240P,Age when melanoma was first diagnosed,17-79,17-79 years old,Questionnaire
+337,MCQ240P,Age when melanoma was first diagnosed,17-83,17-83 years old,Questionnaire
+338,MCQ240P,Age when melanoma was first diagnosed,80,80 years or older,Questionnaire
+339,MCQ240P,Age when melanoma was first diagnosed,85,85 years or older,Questionnaire
+340,MCQ240P,Age when melanoma was first diagnosed,99999,Don't know,Questionnaire
+341,MCQ240P,Age when melanoma was first diagnosed,77777,Refused,Questionnaire
+342,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",16,16 years or younger,Questionnaire
+343,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",17-79,17-79 years old,Questionnaire
+344,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",27-70,27-70 years old,Questionnaire
+345,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",30-70,30-70 years old,Questionnaire
+346,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",80,80 years or older,Questionnaire
+347,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",85,85 years or older,Questionnaire
+348,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",99999,Don't know,Questionnaire
+349,MCQ240Q,"Age when mouth, tongue, or lip cancer was first diagnosed",77777,Refused,Questionnaire
+350,MCQ240U,Age when prostate cancer was first diagnosed,17-79,17-79 years old,Questionnaire
+351,MCQ240U,Age when prostate cancer was first diagnosed,17-84,17-84 years old,Questionnaire
+352,MCQ240U,Age when prostate cancer was first diagnosed,16,16 years or younger,Questionnaire
+353,MCQ240U,Age when prostate cancer was first diagnosed,32-79,32-79 years old,Questionnaire
+354,MCQ240U,Age when prostate cancer was first diagnosed,80,80 years or older,Questionnaire
+355,MCQ240U,Age when prostate cancer was first diagnosed,85,85 years or older,Questionnaire
+356,MCQ240U,Age when prostate cancer was first diagnosed,99999,Don't know,Questionnaire
+357,MCQ240U,Age when prostate cancer was first diagnosed,77777,Refused,Questionnaire
+358,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,16,16 years or younger,Questionnaire
+359,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,17-78,17-78 years old,Questionnaire
+360,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,17-79,17-79 years old,Questionnaire
+361,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,17-84,17-84 years old,Questionnaire
+362,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,80,80 years or older,Questionnaire
+363,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,85,85 years or older,Questionnaire
+364,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,99999,Don't know,Questionnaire
+365,MCQ240W,Age when non-melanoma skin cancer was first diagnosed,77777,Refused,Questionnaire
+366,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,16,16 years or younger,Questionnaire
+367,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,17-79,17-79 years old,Questionnaire
+368,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,17-84,17-84 years old,Questionnaire
+369,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,18-79,18-79 years old,Questionnaire
+370,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,80,80 years or older,Questionnaire
+371,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,85,85 years or older,Questionnaire
+372,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,99999,Don't know,Questionnaire
+373,MCQ240X,Age when the unknown kind of skin cancer was first diagnosed,77777,Refused,Questionnaire
+374,MCQ240Z,Age when stomach cancer was first diagnosed,16,16 years or younger,Questionnaire
+375,MCQ240Z,Age when stomach cancer was first diagnosed,17-79,17-79 years old,Questionnaire
+376,MCQ240Z,Age when stomach cancer was first diagnosed,22-82,22-82 years old,Questionnaire
+377,MCQ240Z,Age when stomach cancer was first diagnosed,32-76,32-76 years old,Questionnaire
+378,MCQ240Z,Age when stomach cancer was first diagnosed,80,80 years or older,Questionnaire
+379,MCQ240Z,Age when stomach cancer was first diagnosed,85,85 years or older,Questionnaire
+380,MCQ240Z,Age when stomach cancer was first diagnosed,99999,Don't know,Questionnaire
+381,MCQ240Z,Age when stomach cancer was first diagnosed,77777,Refused,Questionnaire
+382,MCQ280,About how old was she when she fractured her hip (the first time)?,1-101,1-101 years old,Questionnaire
+383,MCQ280,About how old was she when she fractured her hip (the first time)?,555,50 +,Questionnaire
+384,MCQ280,About how old was she when she fractured her hip (the first time)?,9-107,9-107 years old,Questionnaire
+385,MCQ280,About how old was she when she fractured her hip (the first time)?,99999,Don't know,Questionnaire
+386,MCQ280,About how old was she when she fractured her hip (the first time)?,77777,Refused,Questionnaire
+387,MCQ280,About how old was she when she fractured her hip (the first time)?,444,Under 50,Questionnaire
+388,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,16,16 years or younger,Questionnaire
+389,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,17-79,17-79 years old,Questionnaire
+390,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,17-85,17-85 years old,Questionnaire
+391,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,80,80 years or older,Questionnaire
+392,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,999,Don't know,Questionnaire
+393,MCQ320,How old {were you/was SP} when {you/he} first had {your/his} PSA test?,777,Refused,Questionnaire
+394,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,Nov-99,11-99 years old,Questionnaire
+395,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,15-79,15-79 years old,Questionnaire
+396,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,20-87,20-87 years old,Questionnaire
+397,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,80,80 years or older,Questionnaire
+398,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,90,90 + years,Questionnaire
+399,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,99999,Don't know,Questionnaire
+400,MCQ570,How old {were you/was SP} when {you /s/he} first had gallbladder surgery?,77777,Refused,Questionnaire
+401,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",Jan-95,1-95 times,Questionnaire
+402,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",Jan-99,1-99 times,Questionnaire
+403,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",100,100 +,Questionnaire
+404,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",99999,Don't know,Questionnaire
+405,PAD120,"[Over the past 30 days], how often did {you/SP} do these tasks in or around {your/his/her} home or yard, that is tasks requiring at least moderate effort? [Such as raking leaves, mowing the lawn or heavy cleaning.]",77777,Refused,Questionnaire
+406,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",Jan-91,1-91 times,Questionnaire
+407,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",Jan-99,1-99 times,Questionnaire
+408,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",100,100 +,Questionnaire
+409,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",999,Don't know,Questionnaire
+410,PAD460,"[Over the past 30 days], how often did {you/SP} do these physical activities? [Activities designed to strengthen {your/his/her} muscles such as lifting weights, push-ups or sit-ups.]",777,Refused,Questionnaire
+411,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",Jan-91,1-91 times,Questionnaire
+412,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",Jan-99,1-99 times,Questionnaire
+413,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",100,100 +,Questionnaire
+414,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",99999,Don't know,Questionnaire
+415,PAQ050Q,"[Over the past 30 days], how often did {you/SP} do this? [Walk or bicycle as part of getting to and from work, or school, or to do errands.] PROBE: How many times per day, per week, or per month did {you/s/he} do these activities?",77777,Refused,Questionnaire
+416,BMIWAIST,Waist Circumference Comment,1,Could not obtain,Response
+417,NA,NA,1,Breakfast,Dietary
+418,NA,NA,2,Brunch,Dietary
+419,NA,NA,3,Lunch,Dietary
+420,NA,NA,4,Snack/beverage,Dietary
+421,NA,NA,5,Dinner/supper,Dietary
+422,NA,NA,6,Infant feeding,Dietary
+423,NA,NA,7,Extended consumption,Dietary
+424,NA,NA,8,Other,Dietary
+425,NA,NA,9,Desayuno (Spanish),Dietary
+426,NA,NA,10,Almuerzo (Spanish),Dietary
+427,NA,NA,11,Comida (Spanish),Dietary
+428,NA,NA,12,Merienda (Spanish),Dietary
+429,NA,NA,13,Cena (Spanish),Dietary
+430,NA,NA,14,Entre comida/bebida (Spanish),Dietary
+431,NA,NA,15,Bocadillo (Spanish),Dietary
+432,NA,NA,16,Botana (Spanish),Dietary
+433,NA,NA,99,Don't know,Dietary
+434,NA,NA,2,Lunch,Dietary
+435,NA,NA,3,Dinner/supper,Dietary
+436,NA,NA,5,Brunch,Dietary
+437,NA,NA,6,Snack/beverage,Dietary
+438,NA,NA,8,Infant feeding,Dietary
+439,NA,NA,9,Extended consumption,Dietary
+440,NA,NA,10,Desayano (Spanish),Dietary
+441,NA,NA,11,Almuerzo (Spanish),Dietary
+442,NA,NA,12,Comida (Spanish),Dietary
+443,NA,NA,13,Merienda (Spanish),Dietary
+444,NA,NA,14,Cena (Spanish),Dietary
+445,NA,NA,15,Entre comida/bebida/tentempie (Spanish),Dietary
+446,NA,NA,17,Bocadillo (Spanish),Dietary
+447,NA,NA,91,Other,Dietary
+448,NA,NA,3,Dinner,Dietary
+449,NA,NA,4,Supper,Dietary
+450,NA,NA,6,Snack,Dietary
+451,NA,NA,7,Drink,Dietary
+452,NA,NA,10,Desayano (breakfast),Dietary
+453,NA,NA,11,Almuerzo (breakfast),Dietary
+454,NA,NA,12,Comida (lunch),Dietary
+455,NA,NA,13,Merienda (snack),Dietary
+456,NA,NA,14,Cena (dinner),Dietary
+457,NA,NA,15,Entre comida (snack),Dietary
+458,NA,NA,16,Botana (snack),Dietary
+459,NA,NA,17,Bocadillo (snack),Dietary
+460,NA,NA,18,Tentempie (snack),Dietary
+461,NA,NA,19,Bebida (drink),Dietary
+462,NA,NA,0,Non-combination food,Dietary
+463,NA,NA,90,Other mixtures,Dietary
+464,NA,NA,9,Dried beans and vegetable w/ additions,Dietary
+465,NA,NA,1,Beverage w/ additions,Dietary
+466,NA,NA,3,Bread/baked products w/ additions,Dietary
+467,NA,NA,2,Cereal w/ additions,Dietary
+468,NA,NA,14,Chips w/ additions,Dietary
+469,NA,NA,12,"Meat, poultry, fish",Dietary
+470,NA,NA,7,Frozen meals,Dietary
+471,NA,NA,10,Fruit w/ additions,Dietary
+472,NA,NA,4,Salad,Dietary
+473,NA,NA,5,Sandwiches,Dietary
+474,NA,NA,6,Soup,Dietary
+475,NA,NA,11,Tortilla products,Dietary
+476,NA,NA,1,Beverage w/ adds,Dietary
+477,NA,NA,2,Cereal w/ adds,Dietary
+478,NA,NA,3,Bread/baked products w/ adds,Dietary
+479,NA,NA,8,Ice cream/frozen yogurt w/ additions,Dietary
+480,NA,NA,9,Dried beans and vegetable w/ adds,Dietary
+481,NA,NA,10,Fruit w/ adds,Dietary
+482,NA,NA,11,Tortilla Products,Dietary
+483,NA,NA,13,Lunchables,Dietary
+484,DRXDRSTZ,Dietary Recall Status,1,Reliable and met the minimum criteria,Dietary
+485,DRXDRSTZ,Dietary Recall Status,2,Not reliable or not met the minimum criteria,Dietary
+486,DRXDRSTZ,Dietary Recall Status,9,Interview lost due to computer malfunction or file transfer problem,Dietary
+487,DRXDRSTZ,Dietary Recall Status,4,Reported consuming breast-milk,Dietary
+488,DRXDRSTZ,Dietary Recall Status,88,Blank but applicable,Dietary
+489,DRXDRSTZ,Dietary Recall Status,5,Not done,Dietary
+490,NA,NA,2,No,Dietary
+491,NA,NA,1,Yes (home),Dietary
+492,NA,NA,7,Refused,Dietary
+493,NA,NA,9,Don't know,Dietary
+494,DRXTWSZ,Tap Water Source,1,Community supply,Dietary
+495,DRXTWSZ,Tap Water Source,91,Other,Dietary
+496,DRXTWSZ,Tap Water Source,4,Don't drink tap water,Dietary
+497,DRXTWSZ,Tap Water Source,99,Don't know,Dietary
+498,DBQ095Z,Type of salt you usually add at table,4,Doesn't use or add salt products at the table,Dietary
+499,DBQ095Z,Type of salt you usually add at table,1,"Ordinary salt [includes regular iodized salt, sea salt and seasoning salts made with regular salt]",Dietary
+500,DBQ095Z,Type of salt you usually add at table,2,Lite salt,Dietary
+501,DBQ095Z,Type of salt you usually add at table,3,Salt substitute,Dietary
+502,DBQ095Z,Type of salt you usually add at table,88,Blank but applicable,Dietary
+503,DBQ095Z,Type of salt you usually add at table,99,Don't know,Dietary
+504,DBQ095Z,Type of salt you usually add at table,7,Refused,Dietary
+505,DBQ095Z,Type of salt you usually add at table,91,Other,Dietary
+506,DRXHELP,Who helped in responding for this interview,1,SP,Dietary
+507,DRXHELP,Who helped in responding for this interview,4,Parent of SP,Dietary
+508,DRXHELP,Who helped in responding for this interview,5,Spouse of SP,Dietary
+509,DRXHELP,Who helped in responding for this interview,6,Child of SP,Dietary
+510,DRXHELP,Who helped in responding for this interview,7,Grandparent of SP,Dietary
+511,DRXHELP,Who helped in responding for this interview,8,"Friend, Partner, Non Relative",Dietary
+512,DRXHELP,Who helped in responding for this interview,9,"Translator, not a HH member",Dietary
+513,DRXHELP,Who helped in responding for this interview,10,"Child care provider, Caretaker",Dietary
+514,DRXHELP,Who helped in responding for this interview,11,Other Relative,Dietary
+515,DRXHELP,Who helped in responding for this interview,12,No One,Dietary
+516,DRXHELP,Who helped in responding for this interview,14,Other specify,Dietary
+517,DRXHELP,Who helped in responding for this interview,77,Refused,Dietary
+518,DRXHELP,Who helped in responding for this interview,99,Don't know,Dietary
+519,DRXMRESP,Who was the main respondent for this interview?,1,SP,Dietary
+520,DRXMRESP,Who was the main respondent for this interview?,97,Proxy,Dietary
+521,DRXMRESP,Who was the main respondent for this interview?,98,SP and proxy,Dietary
+522,DRXMRESP,Who was the main respondent for this interview?,88,Blank but applicable,Dietary
+523,DRXMRESP,Who was the main respondent for this interview?,2,Mother of SP,Dietary
+524,DRXMRESP,Who was the main respondent for this interview?,3,Father of SP,Dietary
+525,DRXMRESP,Who was the main respondent for this interview?,5,Spouse of SP,Dietary
+526,DRXMRESP,Who was the main respondent for this interview?,6,Child of SP,Dietary
+527,DRXMRESP,Who was the main respondent for this interview?,7,Grandparent of SP,Dietary
+528,DRXMRESP,Who was the main respondent for this interview?,8,"Friend, Partner, Non Relative",Dietary
+529,DRXMRESP,Who was the main respondent for this interview?,9,"Translator, not a HH member",Dietary
+530,DRXMRESP,Who was the main respondent for this interview?,10,"Child care provider, Caretaker",Dietary
+531,DRXMRESP,Who was the main respondent for this interview?,11,Other Relative,Dietary
+532,DRXMRESP,Who was the main respondent for this interview?,14,Other specify,Dietary
+533,DRXMRESP,Who was the main respondent for this interview?,77,Refused,Dietary
+534,DRXMRESP,Who was the main respondent for this interview?,99,Don't know,Dietary
+535,DRXTWSZ,Tap Water Source,1,Community supply,Dietary
+536,DRXTWSZ,Tap Water Source,91,Other,Dietary
+537,DRXTWSZ,Tap Water Source,4,Don't drink tap water,Dietary
+538,DRXTWSZ,Tap Water Source,99,Don't know,Dietary
+539,DRXHELP,Who helped in responding for this interview,1,SP,Dietary
+540,DRXHELP,Who helped in responding for this interview,4,Parent of SP,Dietary
+541,DRXHELP,Who helped in responding for this interview,5,Spouse of SP,Dietary
+542,DRXHELP,Who helped in responding for this interview,6,Child of SP,Dietary
+543,DRXHELP,Who helped in responding for this interview,7,Grandparent of SP,Dietary
+544,DRXHELP,Who helped in responding for this interview,8,"Friend, Partner, Non Relative",Dietary
+545,DRXHELP,Who helped in responding for this interview,9,"Translator, not a HH member",Dietary
+546,DRXHELP,Who helped in responding for this interview,10,"Child care provider, Caretaker",Dietary
+547,DRXHELP,Who helped in responding for this interview,11,Other Relative,Dietary
+548,DRXHELP,Who helped in responding for this interview,12,No One,Dietary
+549,DRXHELP,Who helped in responding for this interview,14,Other specify,Dietary
+550,DRXHELP,Who helped in responding for this interview,77,Refused,Dietary
+551,DRXHELP,Who helped in responding for this interview,99,Don't know,Dietary
+552,DRXMRESP,Who was the main respondent for this interview?,1,SP,Dietary
+553,DRXMRESP,Who was the main respondent for this interview?,2,Mother of SP,Dietary
+554,DRXMRESP,Who was the main respondent for this interview?,3,Father of SP,Dietary
+555,DRXMRESP,Who was the main respondent for this interview?,5,Spouse of SP,Dietary
+556,DRXMRESP,Who was the main respondent for this interview?,6,Child of SP,Dietary
+557,DRXMRESP,Who was the main respondent for this interview?,7,Grandparent of SP,Dietary
+558,DRXMRESP,Who was the main respondent for this interview?,8,"Friend, Partner, Non Relative",Dietary
+559,DRXMRESP,Who was the main respondent for this interview?,9,"Translator, not a HH member",Dietary
+560,DRXMRESP,Who was the main respondent for this interview?,10,"Child care provider, Caretaker",Dietary
+561,DRXMRESP,Who was the main respondent for this interview?,11,Other Relative,Dietary
+562,DRXMRESP,Who was the main respondent for this interview?,14,Other specify,Dietary
+563,DRXMRESP,Who was the main respondent for this interview?,77,Refused,Dietary
+564,DRXMRESP,Who was the main respondent for this interview?,99,Don't know,Dietary
+565,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,1,Rarely,Dietary
+566,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,2,Occasionally,Dietary
+567,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,3,Very often,Dietary
+568,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,88,Blank but applicable,Dietary
+569,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,9,Don't know,Dietary
+570,DBD100,How often {do you/does SP} add ordinary salt to {your/his/her/SP's} food at the table? Would you say . . .,7,Refused,Dietary

version2/data/tidytuesday_json_val.json ADDED Viewed

	@@ -0,0 +1,1911 @@

+[
+    {
+        "date_posted": "2023-02-28",
+        "project_name": "African Language Sentiment",
+        "project_source": [
+            "https://r4ds.io/join",
+            "https://arxiv.org/pdf/2302.08956.pdf",
+            "https://github.com/shmuhammad2004",
+            "https://github.com/afrisenti-semeval/afrisent-semeval-2023"
+        ],
+        "description": "The data this week comes fromAfriSenti: Sentiment Analysis dataset for 14 African languagesvia@shmuhammad2004(the corresponding author on theassociated paper, and an active member of theR4DS Online Learning Community Slack). This repository contains data for the SemEval 2023 Shared Task 12: Sentiment Analysis in African Languages (AfriSenti-SemEval). The source repository also includes sentiment lexicons for several languages.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "language_iso_code",
+                    "tweet",
+                    "label",
+                    "intended_use"
+                ],
+                "class": [
+                    "character",
+                    "character",
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "The unique code used to identify the language",
+                    "The text content of a tweet",
+                    "A sentiment label of positive, negative, or neutral assigned by a native speaker of that language",
+                    "Whether the data came from the dev, test, or train set for that language"
+                ]
+            },
+            {
+                "variable": [
+                    "language_iso_code",
+                    "language"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "The unique code used to identify the language",
+                    "The name of the language"
+                ]
+            },
+            {
+                "variable": [
+                    "language_iso_code",
+                    "script"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "The unique code used to identify the language",
+                    "The script used to write the language"
+                ]
+            },
+            {
+                "variable": [
+                    "language_iso_code",
+                    "country"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "The unique code used to identify the language",
+                    "A country in which the language is spoken"
+                ]
+            },
+            {
+                "variable": [
+                    "country",
+                    "region"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "A country in which the language is spoken",
+                    "The region of Africa in which that country is categorized. Note that Mozambique is categorized as \\\"East Africa\\\", \\\"Southern Africa\\\", and \\\"Southeastern Africa\\\""
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "afrisenti.csv",
+                "country_regions.csv",
+                "language_countries.csv",
+                "language_scripts.csv",
+                "languages.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/afrisenti.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/country_regions.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/language_countries.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/language_scripts.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-28/languages.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "afrisenti.csv",
+                "country_regions.csv",
+                "language_countries.csv",
+                "language_scripts.csv",
+                "languages.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/afrisenti.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/country_regions.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/language_countries.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/language_scripts.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-28/languages.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-05-02",
+        "project_name": "The Portal Project",
+        "project_source": [
+            "https://www.weecology.org/",
+            "https://weecology.github.io/portalr/",
+            "https://portal.weecology.org/",
+            "https://datacarpentry.org/ecology-workshop/",
+            "https://www.data-retriever.org/"
+        ],
+        "description": "The data this week comes from thePortal Project. This is a long-term ecological research site studying the dynamics of desert rodents, plants, ants and weather in Arizona. The Portal Project is a long-term ecological study being conducted near Portal, AZ. Since 1977, the site has been used to study the interactions among rodents, ants and plants and their respective responses to climate. To study the interactions among organisms, they experimentally manipulate access to 24 study plots. This study has produced over 100 scientific papers and is one of the longest running ecological studies in the U.S. TheWeecology research groupmonitors rodents, plants, ants, and weather. All data from the Portal Project are made openly available in near real-time so that they can provide the maximum benefit to scientific research and outreach. The core dataset is managed using an automated living data workflow run using GitHub and Continuous Analysis. This dataset focuses on the rodent data. Full data is available through these resources: The Portal Project data can also be accessed through the Data Retriever, a package manager for data. Data Retriever A teaching focused version of the dataset is also maintained with some of the complexities of the data removed to make it easy to use for computational training purposes. This dataset serves as the core dataset for theData Carpentry Ecologymaterial and has been downloaded almost 50,000 times. Thanks to @ethanwhite for the data cleaning script. This script downloads the data using the{portalr}package. It filters for the species and plot data, and years greater than 1977.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-02",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "plot",
+                    "treatment"
+                ],
+                "class": [
+                    "double",
+                    "character"
+                ],
+                "description": [
+                    "Plot number",
+                    "Treatment type"
+                ]
+            },
+            {
+                "variable": [
+                    "species",
+                    "scientificname",
+                    "taxa",
+                    "commonname",
+                    "censustarget",
+                    "unidentified",
+                    "rodent",
+                    "granivore",
+                    "minhfl",
+                    "meanhfl",
+                    "maxhfl",
+                    "minwgt",
+                    "meanwgt",
+                    "maxwgt",
+                    "juvwgt"
+                ],
+                "class": [
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "Species",
+                    "Scientific Name",
+                    "Taxa",
+                    "Common Name",
+                    "Target species (0 or 1)",
+                    "Unidentified (0 or 1)",
+                    "Rodent (0 or 1)",
+                    "Granivore (0 or 1)",
+                    "Minimum hindfoot length",
+                    "Mean hindfoot length",
+                    "Maximum hindfoot length",
+                    "Minimum weight",
+                    "Mean weight",
+                    "Maximum weight",
+                    "Juvenile weight"
+                ]
+            },
+            {
+                "variable": [
+                    "censusdate",
+                    "month",
+                    "day",
+                    "year",
+                    "treatment",
+                    "plot",
+                    "stake",
+                    "species",
+                    "sex",
+                    "reprod",
+                    "age",
+                    "testes",
+                    "vagina",
+                    "pregnant",
+                    "nipples",
+                    "lactation",
+                    "hfl",
+                    "wgt",
+                    "tag",
+                    "note2",
+                    "ltag",
+                    "note3"
+                ],
+                "class": [
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "Census date",
+                    "Month",
+                    "Day",
+                    "Year",
+                    "Treatment type",
+                    "Plot number",
+                    "Stake number",
+                    "Species code",
+                    "Sex",
+                    "Reproductive condition",
+                    "Age",
+                    "Testes (Scrotal, Recent, or Minor)",
+                    "Vagina (Swollen, Plugged, or Both)",
+                    "Pregnant",
+                    "Nipples (Enlarged, Swollen, or Both)",
+                    "Lactating",
+                    "Hindfoot length",
+                    "Weight",
+                    "Primary individual identifier",
+                    "Newly tagged individual for 'tag'",
+                    "Secondary tag information when ear tags were used in both ears",
+                    "Newly tagged individual for 'ltag'"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "plots.csv",
+                "species.csv",
+                "surveys.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-02/plots.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-02/species.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-02/surveys.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "plots.csv",
+                "species.csv",
+                "surveys.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-05-02/plots.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-05-02/species.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-05-02/surveys.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-04-04",
+        "project_name": "Premier League Match Data 2021-2022",
+        "project_source": [
+            "https://www.kaggle.com/datasets/evangower/premier-league-match-data",
+            "https://theathletic.com/3459766/2022/07/29/liverpool-manchester-city-premier-league-fouls-yellow-card/",
+            "https://github.com/evangower",
+            "https://www.kaggle.com/code/evangower/who-wins-the-epl-if-games-end-at-half-time/"
+        ],
+        "description": "The data this week comes from thePremier League Match Data 2021-2022viaEvan Goweron Kaggle. You can explore match day statistics of every game and every team during the 2021-22 season of the English Premier League Data. Data includes teams playing, date, referee, and stats for home and away side such as fouls, shots, cards, and more! Also included is a dataset of the weekly rankings for the season. The data was collected from the official website of the Premier League. Evan then cleaned the data using google sheets. Evan did an analysis ofWho wins the EPL if games end at half time?and there'san article from the Athleticabout fouls conceded per yellow card article. No data cleaning",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-04-04",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "Date",
+                    "HomeTeam",
+                    "AwayTeam",
+                    "FTHG",
+                    "FTAG",
+                    "FTR",
+                    "HTHG",
+                    "HTAG",
+                    "HTR",
+                    "Referee",
+                    "HS",
+                    "AS",
+                    "HST",
+                    "AST",
+                    "HF",
+                    "AF",
+                    "HC",
+                    "AC",
+                    "HY",
+                    "AY",
+                    "HR",
+                    "AR"
+                ],
+                "class": [
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "The date when the match was played",
+                    "The home team",
+                    "The away team",
+                    "Full time home goals",
+                    "Full time away goals",
+                    "Full time result",
+                    "Halftime home goals",
+                    "Halftime away goals",
+                    "Halftime results",
+                    "Referee of the match",
+                    "Number of shots taken by the home team",
+                    "Number of shots taken by the away team",
+                    "Number of shots on target by the home team",
+                    "Number of shots on target by the away team",
+                    "Number of fouls by the home team",
+                    "Number of fouls by the away team",
+                    "Number of corners taken by the home team",
+                    "Number of corners taken by the away team",
+                    "Number of yellow cards received by the home team",
+                    "Number of yellow cards received by the away team",
+                    "Number of red cards received by the home team",
+                    "Number of red cards received by the away team"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "soccer21-22.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-04-04/soccer21-22.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "soccer21-22.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-04-04/soccer21-22.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-02-07",
+        "project_name": "Big Tech Stock Prices",
+        "project_source": [
+            "https://github.com/rfordatascience/tidytuesday/issues/509",
+            "https://www.morningstar.com/articles/1129535/5-charts-on-big-tech-stocks-collapse",
+            "https://www.kaggle.com/datasets/evangower/big-tech-stock-prices"
+        ],
+        "description": "The data this week comes from Yahoo Finance viaKaggle(byEvan Gower). This dataset consists of the daily stock prices and volume of 14 different tech companies, including Apple (AAPL), Amazon (AMZN), Alphabet (GOOGL), and Meta Platforms (META) and more! A number of articles have examined the collapse of \"Big Tech\" stock prices, includingthis article from morningstar.com. Note: Allstock_symbols have 3271 prices, except META (2688) and TSLA (3148) because they were not publicly traded for part of the period examined.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-07",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "stock_symbol",
+                    "date",
+                    "open",
+                    "high",
+                    "low",
+                    "close",
+                    "adj_close",
+                    "volume"
+                ],
+                "class": [
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "stock_symbol",
+                    "date",
+                    "The price at market open.",
+                    "The highest price for that day.",
+                    "The lowest price for that day.",
+                    "The price at market close, adjusted for splits.",
+                    "The closing price after adjustments for all applicable splits and dividend distributions. Data is adjusted using appropriate split and dividend multipliers, adhering to Center for Research in Security Prices (CRSP) standards.",
+                    "The number of shares traded on that day."
+                ]
+            },
+            {
+                "variable": [
+                    "stock_symbol",
+                    "company"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "stock_symbol",
+                    "Full name of the company."
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "big_tech_companies.csv",
+                "big_tech_stock_prices.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-07/big_tech_companies.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-07/big_tech_stock_prices.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "big_tech_companies.csv",
+                "big_tech_stock_prices.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_companies.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_stock_prices.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-03-21",
+        "project_name": "Programming Languages",
+        "project_source": [
+            "https://github.com/rfordatascience/tidytuesday/issues/530",
+            "https://pldb.com/posts/does-every-programming-language-support-line-comments.html",
+            "https://pldb.com/csv.html",
+            "https://pldb.com/index.html",
+            "https://pldb.com/posts/index.html"
+        ],
+        "description": "The data this week comes from theProgramming Language DataBase. Thanks toJesus M. Castagnettofor the suggestion! The PLDB has ablogwith numerous articles exploring the data, such asDoes every programming language have line comments?. The data is user-submitted, so you might want to confirm the accuracy of anything particularly surprising that you find before stating it with certainty! Thefull data dictionaryis available from PLDB.com.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-21",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "pldb_id",
+                    "title",
+                    "description",
+                    "type",
+                    "appeared",
+                    "creators",
+                    "website",
+                    "domain_name",
+                    "domain_name_registered",
+                    "reference",
+                    "isbndb",
+                    "book_count",
+                    "semantic_scholar",
+                    "language_rank",
+                    "github_repo",
+                    "github_repo_stars",
+                    "github_repo_forks",
+                    "github_repo_updated",
+                    "github_repo_subscribers",
+                    "github_repo_created",
+                    "github_repo_description",
+                    "github_repo_issues",
+                    "github_repo_first_commit",
+                    "github_language",
+                    "github_language_tm_scope",
+                    "github_language_type",
+                    "github_language_ace_mode",
+                    "github_language_file_extensions",
+                    "github_language_repos",
+                    "wikipedia",
+                    "wikipedia_daily_page_views",
+                    "wikipedia_backlinks_count",
+                    "wikipedia_summary",
+                    "wikipedia_page_id",
+                    "wikipedia_appeared",
+                    "wikipedia_created",
+                    "wikipedia_revision_count",
+                    "wikipedia_related",
+                    "features_has_comments",
+                    "features_has_semantic_indentation",
+                    "features_has_line_comments",
+                    "line_comment_token",
+                    "last_activity",
+                    "number_of_users",
+                    "number_of_jobs",
+                    "origin_community",
+                    "central_package_repository_count",
+                    "file_type",
+                    "is_open_source"
+                ],
+                "class": [
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "character",
+                    "double",
+                    "double",
+                    "integer",
+                    "double",
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "character",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "character",
+                    "double",
+                    "character",
+                    "logical"
+                ],
+                "description": [
+                    "A standardized, uniquified version of the language name, used as an ID on the PLDB site.",
+                    "The official title of the language.",
+                    "Description of the repo on GitHub.",
+                    "Which category in PLDB's subjective ontology does this entity fit into.",
+                    "What year was the language publicly released and/or announced?",
+                    "Name(s) of the original creators of the language delimited by \\\" and \\\"",
+                    "URL of the official homepage for the language project.",
+                    "If the project website is on its own domain.",
+                    "When was this domain first registered?",
+                    "A link to more info about this entity.",
+                    "Books about this language from ISBNdb.",
+                    "Computed; the number of books found for this language at isbndb.com",
+                    "Papers about this language from Semantic Scholar.",
+                    "Computed; A rank for the language, taking into account various online rankings. The computation for this column is not currently clear.",
+                    "URL of the official GitHub repo for the project if it hosted there.",
+                    "How many stars of the repo?",
+                    "How many forks of the repo?",
+                    "What year was the last commit made?",
+                    "How many subscribers to the repo?",
+                    "When was the Github repo for this entity created?",
+                    "Description of the repo on GitHub.",
+                    "How many isses on the repo?",
+                    "What year the first commit made in this git repo?",
+                    "GitHub has a set of supported languages as defined here",
+                    "The TextMate scope that represents this programming language.",
+                    "Either data, programming, markup, prose, or nil.",
+                    "A String name of the Ace Mode used for highlighting whenever a file is edited. This must match one of the filenames in http://git.io/3XO_Cg. Use \\\"text\\\" if a mode does not exist.",
+                    "An Array of associated extensions (the first one is considered the primary extension, the others should be listed alphabetically).",
+                    "How many repos for this language does GitHub report?",
+                    "URL of the entity on Wikipedia, if and only if it has a page dedicated to it.",
+                    "How many page views per day does this Wikipedia page get? Useful as a signal for rankings. Available via WP api.",
+                    "How many pages on WP link to this page?",
+                    "What is the text summary of the language from the Wikipedia page?",
+                    "What is the internal ID for this entity on WP?",
+                    "When does Wikipedia claim this entity first appeared?",
+                    "When was the Wikipedia page for this entity created?",
+                    "How many revisions does this page have?",
+                    "What languages does Wikipedia have as related?",
+                    "Does this language have a comment character?",
+                    "Does indentation have semantic meaning in this language?",
+                    "Does this language support inline comments (as opposed to comments that must span an entire line)?",
+                    "Defined as a token that can be placed anywhere on a line and starts a comment that cannot be stopped except by a line break character or end of file.",
+                    "Computed; The most recent of any year field in the PLDB for this language.",
+                    "Computed; \\\"Crude user estimate from a linear model.",
+                    "Computed; The estimated number of job openings for programmers in this language.",
+                    "In what community(ies) did the language first originate?",
+                    "Number of packages in a central repository. If this value is not known, it is set to 0 (so \\\"0\\\" can mean \\\"no repository exists\\\", \\\"the repository exists but is empty\\\" (unlikely), or \\\"we do not know if a repository exists\\\". This value is definitely incorrect for R.",
+                    "What is the file encoding for programs in this language?",
+                    "Is it an open source project?"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "languages.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-21/languages.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "languages.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-03-21/languages.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-05-23",
+        "project_name": "Central Park Squirrel Census",
+        "project_source": [
+            "https://data.cityofnewyork.us/Environment/2018-Central-Park-Squirrel-Census-Squirrel-Data/vfnx-vebw",
+            "https://www.thesquirrelcensus.com/"
+        ],
+        "description": "Squirrel data! The data this week comes from the 2018 Central Park Squirrel Census. The Squirrel Census is a multimedia science, design, and storytelling project focusing on the Eastern gray (Sciurus carolinensis). They count squirrels and present their findings to the public. The dataset contains squirrel data for each of the 3,023 sightings, including location coordinates, age, primary and secondary fur color, elevation, activities, communications, and interactions between squirrels and with humans. No data cleaning",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-23",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "X",
+                    "Y",
+                    "Unique Squirrel ID",
+                    "Hectare",
+                    "Shift",
+                    "Date",
+                    "Hectare Squirrel Number",
+                    "Age",
+                    "Primary Fur Color",
+                    "Highlight Fur Color",
+                    "Combination of Primary and Highlight Color",
+                    "Color notes",
+                    "Location",
+                    "Above Ground Sighter Measurement",
+                    "Specific Location",
+                    "Running",
+                    "Chasing",
+                    "Climbing",
+                    "Eating",
+                    "Foraging",
+                    "Other Activities",
+                    "Kuks",
+                    "Quaas",
+                    "Moans",
+                    "Tail flags",
+                    "Tail twitches",
+                    "Approaches",
+                    "Indifferent",
+                    "Runs from",
+                    "Other Interactions",
+                    "Lat/Long"
+                ],
+                "class": [
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "character",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "Longitude coordinate for squirrel sighting point",
+                    "Latitude coordinate for squirrel sighting point",
+                    "Identification tag for each squirrel sighting. The tag is comprised of \\\"Hectare ID\\\" + \\\"Shift\\\" + \\\"Date\\\" + \\\"Hectare Squirrel Number.\\\"",
+                    "ID tag, which is derived from the hectare grid used to divide and count the park area. One axis that runs predominantly north-to-south is numerical (1-42), and the axis that runs predominantly east-to-west is roman characters (A-I).",
+                    "Value is either \\\"AM\\\" or \\\"PM,\\\" to communicate whether or not the sighting session occurred in the morning or late afternoon.",
+                    "Concatenation of the sighting session day and month.",
+                    "Number within the chronological sequence of squirrel sightings for a discrete sighting session.",
+                    "Value is either \\\"Adult\\\" or \\\"Juvenile.\\\"",
+                    "Primary Fur Color - value is either \\\"Gray,\\\" \\\"Cinnamon\\\" or \\\"Black.\\\"",
+                    "Discrete value or string values comprised of \\\"Gray,\\\" \\\"Cinnamon\\\" or \\\"Black.\\\"",
+                    "A combination of the previous two columns; this column gives the total permutations of primary and highlight colors observed.",
+                    "Sighters occasionally added commentary on the squirrel fur conditions. These notes are provided here.",
+                    "Value is either \\\"Ground Plane\\\" or \\\"Above Ground.\\\" Sighters were instructed to indicate the location of where the squirrel was when first sighted.",
+                    "For squirrel sightings on the ground plane, fields were populated with a value of \\\"FALSE.\\\"",
+                    "Sighters occasionally added commentary on the squirrel location. These notes are provided here.",
+                    "Squirrel was seen running.",
+                    "Squirrel was seen chasing another squirrel.",
+                    "Squirrel was seen climbing a tree or other environmental landmark.",
+                    "Squirrel was seen eating.",
+                    "Squirrel was seen foraging for food.",
+                    "Other activities squirrels were observed doing.",
+                    "Squirrel was heard kukking, a chirpy vocal communication used for a variety of reasons.",
+                    "Squirrel was heard quaaing, an elongated vocal communication which can indicate the presence of a ground predator such as a dog.",
+                    "Squirrel was heard moaning, a high-pitched vocal communication which can indicate the presence of an air predator such as a hawk.",
+                    "Squirrel was seen flagging its tail. Flagging is a whipping motion used to exaggerate squirrel's size and confuse rivals or predators. Looks as if the squirrel is scribbling with tail into the air.",
+                    "Squirrel was seen twitching its tail. Looks like a wave running through the tail, like a breakdancer doing the arm wave. Often used to communicate interest, curiosity.",
+                    "Squirrel was seen approaching human, seeking food.",
+                    "Squirrel was indifferent to human presence.",
+                    "Squirrel was seen running from humans, seeing them as a threat.",
+                    "Sighter notes on other types of interactions between squirrels and humans.",
+                    "Latitude and longitude"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "squirrel_data.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-05-23/squirrel_data.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "squirrel_data.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-05-23/squirrel_data.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-01-17",
+        "project_name": "Art History",
+        "project_source": [
+            "https://research.repository.duke.edu/concern/datasets/q811kk70n?locale=en",
+            "https://github.com/hollandstam1/thesis",
+            "https://saralemus7.github.io/arthistory/",
+            "https://github.com/saralemus7/arthistory"
+        ],
+        "description": "The data this week comes from the arthistory data package. This dataset contains data that was used for Holland Stam's thesis work, titled Quantifying art historical narratives. The data was collected to assess the demographic representation of artists through editions of Janson's History of Art and Gardner's Art Through the Ages, two of the most popular art history textbooks used in the American education system. In this package specifically, both artist-level and work-level data was collected along with variables regarding the artists' demographics and numeric metrics for describing how much space they or their work took up in each edition of each textbook. This package contains three datasets: Acknowledging arthistory Citation Lemus S, Stam H (2022). arthistory: Art History Textbook Data. https://github.com/saralemus7/arthistory, https://saralemus7.github.io/arthistory/. Examples of analyses are included in Holland Stam's thesis in Quarto files. No data cleaning",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-01-17",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "artist_name",
+                    "edition_number",
+                    "year",
+                    "artist_nationality",
+                    "artist_nationality_other",
+                    "artist_gender",
+                    "artist_race",
+                    "artist_ethnicity",
+                    "book",
+                    "space_ratio_per_page_total",
+                    "artist_unique_id",
+                    "moma_count_to_year",
+                    "whitney_count_to_year",
+                    "artist_race_nwi"
+                ],
+                "class": [
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "character"
+                ],
+                "description": [
+                    "The name of each artist",
+                    "The edition number of the textbook from either Janson's History of Art or Gardner's Art Through the Ages.",
+                    "The year of publication for a given edition of Janson or Gardner.",
+                    "The nationality of a given artist.",
+                    "The nationality of the artist. Of the total count of artists through all editions of Janson's History of Art and Gardner's Art Through the Ages, 77.32% account for French, Spanish, British, American and German. Therefore, the categorical strings of this variable are French, Spanish, British, American, German and Other",
+                    "The gender of the artist",
+                    "The race of the artist",
+                    "The ethnicity of the artist",
+                    "Which book, either Janson or Gardner the particular artist at that particular time was included.",
+                    "The area in centimeters squared of both the text and the figure of a particular artist in a given edition of Janson's History of Art divided by the area in centimeters squared of a single page of the respective edition. This variable is continuous.",
+                    "The unique identifying number assigned to artists across books is denoted in alphabetical order. This variable is discrete.",
+                    "The total count of exhibitions ever held by the Museum of Modern Art (MoMA) of a particular artist at a given year of publication. This variable is discrete.",
+                    "The count of exhibitions held by The Whitney of a particular artist at a particular moment of time, as highlighted by year. This variable is discrete.",
+                    "The non-white indicator for artist race, meaning if an artist's race is denoted as either white or non-white."
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "artists.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-01-17/artists.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "artists.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-01-17/artists.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-07-04",
+        "project_name": "Historical Markers",
+        "project_source": [
+            "http://www.geonames.org/",
+            "https://www.hmdb.org/geolists.asp?c=United%20States%20of%20America",
+            "https://www.hmdb.org/stats.asp",
+            "https://www.hmdb.org/",
+            "https://github.com/rfordatascience/tidytuesday/issues/574#issuecomment-1601050053"
+        ],
+        "description": "The data this week comes from the Historical Marker Database USA Index. Learn more about the markers on the HMDb.org site, which includes a number of articles, including Database Counts and Statistics. We included a dataset of places that do not have entries in the Historical Markers Database. You might try to combine that with information from geonames.org (code: HSTS) to find markers that need to be submitted. Thanks to Jesus M. Castagnetto for the geonames tip!",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-04",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "marker_id",
+                    "marker_no",
+                    "title",
+                    "subtitle",
+                    "addl_subtitle",
+                    "year_erected",
+                    "erected_by",
+                    "latitude_minus_s",
+                    "longitude_minus_w",
+                    "street_address",
+                    "city_or_town",
+                    "section_or_quarter",
+                    "county_or_parish",
+                    "state_or_prov",
+                    "location",
+                    "missing",
+                    "link"
+                ],
+                "class": [
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "integer",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "Unique ID for this marker in the HMdb.",
+                    "Number of this marker in the state numbering scheme.",
+                    "Main title of the marker.",
+                    "Subtitle of the marker, if any.",
+                    "Additional subtitle text.",
+                    "The year in which the marker was erected.",
+                    "The organization which erected the marker.",
+                    "The latitude of the marker.",
+                    "The longitude of the marker.",
+                    "The street address of the marker, if available.",
+                    "The city, town, etc in which the marker is located.",
+                    "The section of the city, town, etc, when available.",
+                    "The county, parish, or similar designation in which the marker appears.",
+                    "The state, province, territory, etc in which the marker appears.",
+                    "A description of the marker's location.",
+                    "Whether the marker is \\\"Reported missing\\\" or \\\"Confirmed missing\\\". NA values indicate that the marker has neither been reported missing nor confirmed as missing.",
+                    "The HMDb link to the marker. Links include additional details, such as photos and topic lists to which this marker belongs."
+                ]
+            },
+            {
+                "variable": [
+                    "county",
+                    "state"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "County or equivalent.",
+                    "State or territory."
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "historical_markers.csv",
+                "no_markers.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-04/historical_markers.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-04/no_markers.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "historical_markers.csv",
+                "no_markers.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-07-04/historical_markers.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-07-04/no_markers.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-02-14",
+        "project_name": "Hollywood Age Gaps",
+        "project_source": [
+            "https://www.data-is-plural.com/archive/2018-02-07-edition/",
+            "https://tidytues.day/2021/2021-03-09",
+            "https://hollywoodagegap.com/"
+        ],
+        "description": "The data this week comes from Hollywood Age Gap via Data Is Plural. An informational site showing the age gap between movie love interests. The data follows certain rules: The two (or more) actors play actual love interests (not just friends, coworkers, or some other non-romantic type of relationship) The youngest of the two actors is at least 17 years old Not animated characters We previously provided a dataset about the Bechdel Test. It might be interesting to see whether there is any correlation between these datasets! The Bechdel Test dataset also included additional information about the films that were used in that dataset. Note: The age gaps dataset includes \"gender\" columns, which always contain the values \"man\" or \"woman\". These values appear to indicate how the characters in each film identify. Some of these values do not match how the actor identifies. We apologize if any characters are misgendered in the data!",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-14",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "movie_name",
+                    "release_year",
+                    "director",
+                    "age_difference",
+                    "couple_number",
+                    "actor_1_name",
+                    "actor_2_name",
+                    "character_1_gender",
+                    "character_2_gender",
+                    "actor_1_birthdate",
+                    "actor_2_birthdate",
+                    "actor_1_age",
+                    "actor_2_age"
+                ],
+                "class": [
+                    "character",
+                    "integer",
+                    "character",
+                    "integer",
+                    "integer",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "date",
+                    "date",
+                    "integer",
+                    "integer"
+                ],
+                "description": [
+                    "Name of the film",
+                    "Release year",
+                    "Director of the film",
+                    "Age difference between the characters in whole years",
+                    "An identifier for the couple in case multiple couples are listed for this film",
+                    "The name of the older actor in this couple",
+                    "The name of the younger actor in this couple",
+                    "The gender of the older character, as identified by the person who submitted the data for this couple",
+                    "The gender of the younger character, as identified by the person who submitted the data for this couple",
+                    "The birthdate of the older member of the couple",
+                    "The birthdate of the younger member of the couple",
+                    "The age of the older actor when the film was released",
+                    "The age of the younger actor when the film was released"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "age_gaps.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-02-14/age_gaps.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "age_gaps.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-14/age_gaps.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-08-15",
+        "project_name": "Spam E-mail",
+        "project_source": [
+            "https://vincentarelbundock.github.io/Rdatasets/index.html",
+            "https://archive.ics.uci.edu/dataset/94/spambase",
+            "https://search.r-project.org/CRAN/refmans/kernlab/html/spam.html",
+            "https://vincentarelbundock.github.io/Rdatasets/doc/DAAG/spam7.html"
+        ],
+        "description": "The data this week comes from Vincent Arel-Bundock's Rdatasets package (https://vincentarelbundock.github.io/Rdatasets/index.html). Rdatasets is a collection of 2246 datasets which were originally distributed alongside the statistical software environment R and some of its add-on packages. The goal is to make these data more broadly accessible for teaching and statistical software development. We're working with the spam email dataset. This is a subset of the spam e-mail database. This is a dataset collected at Hewlett-Packard Labs by Mark Hopkins, Erik Reeber, George Forman, and Jaap Suermondt and shared with the UCI Machine Learning Repository. The dataset classifies 4601 e-mails as spam or non-spam, with additional variables indicating the frequency of certain words and characters in the e-mail. First column was removed.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-15",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "crl.tot",
+                    "dollar",
+                    "bang",
+                    "money",
+                    "n000",
+                    "make",
+                    "yesno"
+                ],
+                "class": [
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "character"
+                ],
+                "description": [
+                    "Total length of uninterrupted sequences of capitals",
+                    "Occurrences of the dollar sign, as percent of total number of characters",
+                    "Occurrences of ‘!’, as percent of total number of characters",
+                    "Occurrences of ‘money’, as percent of total number of characters",
+                    "Occurrences of the string ‘000’, as percent of total number of words",
+                    "Occurrences of ‘make’, as a percent of total number of words",
+                    "Outcome variable, a factor with levels 'n' not spam, 'y' spam"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "spam.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-15/spam.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "spam.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-15/spam.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-03-07",
+        "project_name": "Numbats in Australia",
+        "project_source": [
+            "/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-07/data/numbats.csv",
+            "https://www.ala.org.au",
+            "https://github.com/numbats/numbats-tidytuesday",
+            "https://bie.ala.org.au/species/https://biodiversity.org.au/afd/taxa/6c72d199-f0f1-44d3-8197-224a2f7cff5f"
+        ],
+        "description": "The data this week comes from the Atlas of Living Australia. Thanks to Di Cook for preparing this week's dataset! This Numbat page at the Atlas of Living Australia talks about these endangered species in greater detail. A csv file of numbat sightings is provided. The code to refresh the data is below. Questions that would be interesting to answer are:",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-07",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "decimalLatitude",
+                    "decimalLongitude",
+                    "eventDate",
+                    "scientificName",
+                    "taxonConceptID",
+                    "recordID",
+                    "dataResourceName",
+                    "year",
+                    "month",
+                    "wday",
+                    "hour",
+                    "day",
+                    "dryandra",
+                    "prcp",
+                    "tmax",
+                    "tmin"
+                ],
+                "class": [
+                    "double",
+                    "double",
+                    "datetime",
+                    "factor",
+                    "factor",
+                    "character",
+                    "factor",
+                    "integer",
+                    "factor",
+                    "factor",
+                    "integer",
+                    "date",
+                    "logical",
+                    "double",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "decimalLatitude",
+                    "decimalLongitude",
+                    "eventDate",
+                    "Either \\\"Myrmecobius fasciatus\\\" or \\\"Myrmecobius fasciatus rufus\\\"",
+                    "The URL for this (sub)species",
+                    "recordID",
+                    "dataResourceName",
+                    "The 4-digit year of the event (when available)",
+                    "The 3-letter month abbreviation of the event (when available)",
+                    "The 3-letter weekday abbreviation of the event (when available)",
+                    "The hour of the event (when available)",
+                    "The date of the event (when available)",
+                    "whether the observation was in Dryandra Woodland",
+                    "Precipitation on that day in Dryandra Woodland (when relevant), in millimeters",
+                    "Maximum temperature on that day in Dryandra Woodland (when relevant), in degrees Celsius",
+                    "Minimum temperature on that day in Dryandra Woodland (when relevant), in degrees Celsius"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "numbats.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-03-07/numbats.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "numbats.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-03-07/numbats.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-11-28",
+        "project_name": "Doctor Who Episodes",
+        "project_source": [
+            "https://en.wikipedia.org/wiki/List_of_Doctor_Who_episodes_(2005%E2%80%93present)",
+            "https://github.com/KittJonathan/datardis/tree/main/misc",
+            "https://cran.r-project.org/package=datardis",
+            "https://github.com/KittJonathan/datardis"
+        ],
+        "description": "Doctor Who is an extremely long-running British television program. The show was revived in 2005, and has proven very popular since then. To celebrate this year's 60th anniversary of Doctor Who, we have three datasets. The data this week comes from Wikipedia's [List of Doctor Who episodes](https://en.wikipedia.org/wiki/List_of_Doctor_Who_episodes_(2005%E2%80%93present)via the{datardis} packagebyJonathan Kitt. Thank you to Jonathan for compiling and sharing this data! As of 2023-11-24, the data only includes episodes from the \"revived\" era. For an added challenge, consider submitting a pull request to the {datardis} package to update thedata-extraction scriptsto also fetch the \"classic\" era data! Clean data from the{datardis} package.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-28",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "era",
+                    "season_number",
+                    "serial_title",
+                    "story_number",
+                    "episode_number",
+                    "episode_title",
+                    "type",
+                    "first_aired",
+                    "production_code",
+                    "uk_viewers",
+                    "rating",
+                    "duration"
+                ],
+                "class": [
+                    "character",
+                    "double",
+                    "character",
+                    "character",
+                    "double",
+                    "character",
+                    "character",
+                    "double",
+                    "character",
+                    "double",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "Whether the episode is in the \\\"classic\\\" or \\\"revived\\\" era. All data in this dataset is within the \\\"revived\\\" era.",
+                    "The season number within the era. Note that some episodes are outside of a season.",
+                    "Serial title if available",
+                    "Story number",
+                    "Episode number in season",
+                    "Episode title",
+                    "\\\"episode\\\" or \\\"special\\\"",
+                    "Date the episode first aired in the U.K.",
+                    "Episode's production code if available",
+                    "Number of U.K. viewers (millions)",
+                    "Episode's rating",
+                    "Episode's duration in minutes"
+                ]
+            },
+            {
+                "variable": [
+                    "story_number",
+                    "director"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "Story number",
+                    "Episode's director"
+                ]
+            },
+            {
+                "variable": [
+                    "story_number",
+                    "writer"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "Story number",
+                    "Episode's writer"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "drwho_directors.csv",
+                "drwho_episodes.csv",
+                "drwho_writers.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-28/drwho_directors.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-28/drwho_episodes.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-28/drwho_writers.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "drwho_directors.csv",
+                "drwho_episodes.csv",
+                "drwho_writers.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-28/drwho_directors.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-28/drwho_episodes.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-28/drwho_writers.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-11-14",
+        "project_name": "Diwali Sales Data",
+        "project_source": [
+            "https://www.kaggle.com/code/bhushanshelke69/diwali-data-exploration",
+            "https://github.com/vikasvachheta08/Diwali_Sales_Analysis_Using_Python",
+            "https://www.kaggle.com/datasets/saadharoon27/diwali-sales-dataset"
+        ],
+        "description": "This week is Diwali, the festival of lights! The data this week comes fromsales datafor a retail store during the Diwali festival period in India. The data is shared on Kaggle by Saad Haroon. This week we're sharing Python data analysis examples! There's a few out there, but these ones fromBrushan ShelkeorVikas Vachheta(see the Diwali_Sales_Analysis.ipynb file for the code) are some data exploration analyses. Data was downloaded fromKaggle, and theStatusandunnamed1columns removed.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-14",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "User_ID",
+                    "Cust_name",
+                    "Product_ID",
+                    "Gender",
+                    "Age Group",
+                    "Age",
+                    "Marital_Status",
+                    "State",
+                    "Zone",
+                    "Occupation",
+                    "Product_Category",
+                    "Orders",
+                    "Amount"
+                ],
+                "class": [
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "User identification number",
+                    "Customer name",
+                    "Product identification number",
+                    "Gender of the customer (e.g. Male, Female)",
+                    "Age group of the customer",
+                    "Age of the customer",
+                    "Marital status of the customer (e.g. Married, Single)",
+                    "State of the customer",
+                    "Geographic zone of the customer",
+                    "Occupation of the customer",
+                    "Category of the product",
+                    "Number of orders made by the customer",
+                    "Amount in Indian rupees spent by the customer"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "diwali_sales_data.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-14/diwali_sales_data.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "diwali_sales_data.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-14/diwali_sales_data.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-12-12",
+        "project_name": "Holiday Movies",
+        "project_source": [
+            "https://networkdatascience.ceu.edu/article/2019-12-16/christmas-movies",
+            "https://developer.imdb.com/non-commercial-datasets/"
+        ],
+        "description": "Happy holidays! This week we're exploring \"holiday\" movies: movies with \"holiday\", \"Christmas\", \"Hanukkah\", or \"Kwanzaa\" (or variants thereof) in their title! The data this week comes from theInternet Movie Database. We don't have an article using exactly this dataset, but you might get inspiration from thisChristmas Moviesblog post by Milán Janosov at Central European University.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-12-12",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "tconst",
+                    "title_type",
+                    "primary_title",
+                    "original_title",
+                    "year",
+                    "runtime_minutes",
+                    "genres",
+                    "simple_title",
+                    "average_rating",
+                    "num_votes",
+                    "christmas",
+                    "hanukkah",
+                    "kwanzaa",
+                    "holiday"
+                ],
+                "class": [
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "logical",
+                    "logical",
+                    "logical",
+                    "logical"
+                ],
+                "description": [
+                    "alphanumeric unique identifier of the title",
+                    "the type/format of the title (movie, video, or tvMovie)",
+                    "the more popular title / the title used by the filmmakers on promotional materials at the point of release",
+                    "original title, in the original language",
+                    "the release year of a title",
+                    "primary runtime of the title, in minutes",
+                    "includes up to three genres associated with the title (comma-delimited)",
+                    "the title in lowercase, with punctuation removed, for easier filtering and grouping",
+                    "weighted average of all the individual user ratings on IMDb",
+                    "number of votes the title has received on IMDb (titles with fewer than 10 votes were not included in this dataset)",
+                    "whether the title includes \\\"christmas\\\", \\\"xmas\\\", \\\"x mas\\\", etc",
+                    "whether the title includes \\\"hanukkah\\\", \\\"chanukah\\\", etc",
+                    "whether the title includes \\\"kwanzaa\\\"",
+                    "whether the title includes the word \\\"holiday\\\""
+                ]
+            },
+            {
+                "variable": [
+                    "tconst",
+                    "genres"
+                ],
+                "class": [
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "alphanumeric unique identifier of the title",
+                    "genres associated with the title, one row per genre"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "holiday_movie_genres.csv",
+                "holiday_movies.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-12-12/holiday_movie_genres.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-12-12/holiday_movies.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "holiday_movie_genres.csv",
+                "holiday_movies.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-12-12/holiday_movie_genres.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-12-12/holiday_movies.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2024-02-13",
+        "project_name": "Valentine's Day Consumer Data",
+        "project_source": [
+            "https://github.com/rfordatascience/tidytuesday/tree/master/data/2022/2022-01-25",
+            "https://nrf.com/research-insights/holiday-data-and-trends/valentines-day/valentines-day-data-center",
+            "https://www.kaggle.com/datasets/infinator/happy-valentines-day-2022",
+            "https://github.com/rfordatascience/tidytuesday/tree/master/data/2022/2022-01-18"
+        ],
+        "description": "Happy Valentine's Day! This week we're exploringValentine's Day survey data. The National Retail Federation in the United States conducts surveys and has created aValentine's Day Data Centerso you can explore the data on how consumers celebrate. The NRF has surveyed consumers about how they plan to celebrate Valentine’s Day annually for over a decade. Take a deeper dive into the data from the last 10 years, and use the interactive charts to explore a demographic breakdown of total spending, average spending, types of gifts planned and spending per type of gift. The NRF has continued to collect data. The data for this week is from 2010 to 2022, as organized by Suraj Das for a Kaggle dataset. In the historical surveys gender was collected as only 'Men' and 'Women', which does not accurately include all genders. If you're looking for other Valentine's Day type datasets, check out previous datasets onchocolateorboard games(a good Valentine's Day activity!). Data was downloaded fromSunja aa Kaggle dataset. Data from historical_gift_trends_per_person_spending.csv, historical_spending_average_expected_spending.csv and historical_spending_percent_celebrating.csv were combined into historical_spending.csv. Data from planned_gifts_age.csv and spending_or_celebrating_age_1.csv were combined into gifts_age.csv. Data from planned_gifts_gender.csv and spending_or_celebrating_gender_1.csv were combined into gifts_gender.csv. Percentage signs and dollar signs were removed from all numerical values.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-02-13",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "Year",
+                    "PercentCelebrating",
+                    "PerPerson",
+                    "Candy",
+                    "Flowers",
+                    "Jewelry",
+                    "GreetingCards",
+                    "EveningOut",
+                    "Clothing",
+                    "GiftCards"
+                ],
+                "class": [
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "Year",
+                    "Percent of people celebrating Valentines Day",
+                    "Average amount each person is spending",
+                    "Average amount spending on candy",
+                    "Average amount spending on flowers",
+                    "Average amount spending on jewelry",
+                    "Average amount spending on greeting cards",
+                    "Average amount spending on an evening out",
+                    "Average amount spending on clothing",
+                    "Average amount spending on gift cards"
+                ]
+            },
+            {
+                "variable": [
+                    "Age",
+                    "SpendingCelebrating",
+                    "Candy",
+                    "Flowers",
+                    "Jewelry",
+                    "GreetingCards",
+                    "EveningOut",
+                    "Clothing",
+                    "GiftCards"
+                ],
+                "class": [
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "Age",
+                    "Percent spending money on or celebrating Valentines Day",
+                    "Average percent spending on candy",
+                    "Average percent spending on flowers",
+                    "Average percent spending on jewelry",
+                    "Average percent spending on greeting cards",
+                    "Average percent spending on an evening out",
+                    "Average percent spending on clothing",
+                    "Average percent spending on gift cards"
+                ]
+            },
+            {
+                "variable": [
+                    "Gender",
+                    "SpendingCelebrating",
+                    "Candy",
+                    "Flowers",
+                    "Jewelry",
+                    "GreetingCards",
+                    "EveningOut",
+                    "Clothing",
+                    "GiftCards"
+                ],
+                "class": [
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double",
+                    "double"
+                ],
+                "description": [
+                    "Gender only including Men or Women",
+                    "Percent spending money on or celebrating Valentines Day",
+                    "Average percent spending on candy",
+                    "Average percent spending on flowers",
+                    "Average percent spending on jewelry",
+                    "Average percent spending on greeting cards",
+                    "Average percent spending on an evening out",
+                    "Average percent spending on clothing",
+                    "Average percent spending on gift cards"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "gifts_age.csv",
+                "gifts_gender.csv",
+                "historical_spending.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-02-13/gifts_age.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-02-13/gifts_gender.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2024/2024-02-13/historical_spending.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "gifts_age.csv",
+                "gifts_gender.csv",
+                "historical_spending.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-13/gifts_age.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-13/gifts_gender.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-13/historical_spending.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-08-08",
+        "project_name": "Hot Ones Episodes",
+        "project_source": [
+            "https://en.wikipedia.org/wiki/List_of_Hot_Ones_episodes",
+            "https://github.com/borstell",
+            "https://github.com/rfordatascience/tidytuesday/issues/591",
+            "https://en.wikipedia.org/wiki/Hot_Ones"
+        ],
+        "description": "The data this week comes from Wikipedia articles:Hot OnesandList of Hot Ones episodes. Thank you toCarl Börstellfor thesuggestion and cleaning script! Hot Ones is an American YouTube talk show, created by Chris Schonberger, hosted by Sean Evans and produced by First We Feast and Complex Media. Its basic premise involves celebrities being interviewed by Evans over a platter of increasingly spicy chicken wings.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-08",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "season",
+                    "episode_overall",
+                    "episode_season",
+                    "title",
+                    "original_release",
+                    "guest",
+                    "guest_appearance_number",
+                    "finished"
+                ],
+                "class": [
+                    "integer",
+                    "integer",
+                    "integer",
+                    "character",
+                    "date",
+                    "character",
+                    "integer",
+                    "logical"
+                ],
+                "description": [
+                    "The season number.",
+                    "The overall count of this episode, from 1-300.",
+                    "The count of this episode within this season.",
+                    "The title of the episode.",
+                    "The date on which the episode was originally available on YouTube.",
+                    "The name of the guest.",
+                    "The number of appearances by this guest so far as of this date.",
+                    "Whether the guest finished trying all of the sauces."
+                ]
+            },
+            {
+                "variable": [
+                    "season",
+                    "sauce_number",
+                    "sauce_name",
+                    "scoville"
+                ],
+                "class": [
+                    "integer",
+                    "integer",
+                    "character",
+                    "integer"
+                ],
+                "description": [
+                    "The season number.",
+                    "The number of this sauce, from 1 (least hot) to 10 (hottest).",
+                    "The name of the sauce.",
+                    "The rating of the sauce in Scoville heat units."
+                ]
+            },
+            {
+                "variable": [
+                    "season",
+                    "episodes",
+                    "note",
+                    "original_release",
+                    "last_release"
+                ],
+                "class": [
+                    "integer",
+                    "integer",
+                    "character",
+                    "date",
+                    "date"
+                ],
+                "description": [
+                    "The season number.",
+                    "The count of episodes in this season.",
+                    "Notes about this season.",
+                    "The date of the first episode in this season.",
+                    "The date of the last episode of this season (if that episode has aired at the time of scraping)."
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "episodes.csv",
+                "sauces.csv",
+                "seasons.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-08/episodes.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-08/sauces.csv",
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-08/seasons.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "episodes.csv",
+                "sauces.csv",
+                "seasons.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-08/episodes.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-08/sauces.csv",
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-08/seasons.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-07-25",
+        "project_name": "Scurvy",
+        "project_source": [
+            "https://github.com/higgi13425/medicaldata/tree/master/data-raw",
+            "https://htmlpreview.github.io/?https://github.com/higgi13425/medicaldata/blob/master/man/description_docs/scurvy_desc.html",
+            "https://higgi13425.github.io/medicaldata/"
+        ],
+        "description": "The data this week comes from themedicaldata R package. This is a data package from Peter Higgins, with 19 medical datasets for teaching Reproducible Medical Research with R. We're using thescurvy dataset. Source: This data set is from a study published in 1757 in A Treatise on the Scurvy in Three Parts, by James Lind. This data set contains 12 participants with scurvy. In 1757, it was not known that scurvy is a manifestation of vitamin C deficiency. A variety of remedies had been anecdotally reported, but Lind was the first to test different regimens of acidic substances (including citrus fruits) against each other in a randomized, controlled trial. 6 distinct therapies were tested in 12 seamen with symptomatic scurvy, who were selected for similar severity. Six days of therapy were provided, and endpoints were reported in the text at the end of 6 days. These include rotting of the gums, skin sores, weakness of the knees, and lassitude, which are described in terms of severity. These have been translated into Likert scales from 0(none) to 3(severe). A dichotomous endpoint, fitness for duty, was also reported. Scurvy was a common affliction of seamen on long voyages, leading to mouth sores, skin lesions, weakness of the knees, and lassitude. Scurvy could be fatal on long voyages. James Lind reported the treatment of 12 seamen with scurvy in 1757, in _A Treatise on the Scurvy in Three Parts). This 476 page bloviation can be found scanned to the Google Books website A Treatise on the Scurvy. Pages 149-153 are a rare gem among what can be generously described as 400+ pages of evidence-free blathering, and these 4 pages may represent the first report of a controlled clinical trial. Lind was the ship’s surgeon on board the HMS Salisbury, and had a number of scurvy-affected seamen at his disposal. Many remedies had been described and advocated for, with no more than anecdotal evidence. 
On May 20, 1747, Lind decided to try the 6 therapies on the Salisbury in a comparative study in 12 affected seamen. He selected 12 with roughly similar severity, with notable skin and mouth sores, weakness of the knees, and significant lassitude, making them unfit for duty. They each received the standard shipboard diet of gruel and mutton broth, supplemented with occasional biscuits and puddings. Each treatment was a dietary supplement (including citrus fruits) or a medicinal. This data frame was reconstructed from Lind’s account as recorded on these 4 pages, with his estimates of severity translated to a 4 point Likert scale (0-3) for each of the symptoms he described at his chosen endpoint on day 6. A somewhat fanciful study_id variable was added, along with detailed descriptions of the dosing schedule of each treatment. Of note, there is some dispute about whether this was truly the first clinical trial, or whether it actually happened, as there are no contemporaneous corroborating accounts. See link about the historical debate. Lind reported that the seamen treated with 2 lemons and an orange daily did best, followed by those treated with cider. Those treated with elixir of vitriol only had improvement in mouth sores. One imagines that acidic substances (like dilute sulfuric acid, vinegar, cider, and citrus fruits) might have been rather painful on these mouth sores. Unfortunately, the burial of the 4 valuable pages of data in 476 pages of noise, a publication delay of 10 years, and Lind’s half-hearted conclusions (he was focused on acidity), meant that it took until 1795 before the British Navy mandated daily limes for seamen. The first column was removed from the scurvy.csv file available athttps://github.com/higgi13425/medicaldata/tree/master/data-raw.",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-25",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "study_id",
+                    "treatment",
+                    "dosing_regimen_for_scurvy",
+                    "gum_rot_d6",
+                    "skin_sores_d6",
+                    "weakness_of_the_knees_d6",
+                    "lassitude_d6",
+                    "fit_for_duty_d6"
+                ],
+                "class": [
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character",
+                    "character"
+                ],
+                "description": [
+                    "Participant ID",
+                    "Treatment; cider, dilute_sulfuric_acid, vinegar, sea_water, citrus, purgative_mixture",
+                    "Dosing Regimen; 1 quart per day; 25 drops of elixir of vitriol, three times a day; two spoonfuls, three times daily; half pint daily; two lemons and an orange daily; a nutmeg-sized paste of garlic, mustard seed, horseradish, balsam of Peru, and gum myrrh three times a day",
+                    "Gum Rot on Day 6; 0_none, 1_mild, 2_moderate, 3_severe",
+                    "Skin Sores on Day 6; 0_none, 1_mild, 2_moderate, 3_severe",
+                    "Weakness of the Knees on Day 6; 0_none, 1_mild, 2_moderate, 3_severe",
+                    "Lassitude on Day 6; 0_none, 1_mild, 2_moderate, 3_severe",
+                    "Fit for Duty on Day 6; 0_no, 1_yes"
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "scurvy.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-07-25/scurvy.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "scurvy.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-07-25/scurvy.csv"
+            ]
+        }
+    },
+    {
+        "date_posted": "2023-11-07",
+        "project_name": "US House Election Results",
+        "project_source": [
+            "https://electionlab.mit.edu/",
+            "https://electionlab.mit.edu/articles/new-report-how-we-voted-2022",
+            "https://docs.posit.co/ide/user/ide/guide/tools/copilot.html",
+            "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/IG0UN2"
+        ],
+        "description": "It's election day in the United States! To celebrate, the data this week comes from theMIT Election Data and Science Lab(MEDSL). Hat tip this week to theRStudio GitHub Copilot integration, which suggested the MEDSL. From the MEDSL's reportNew Report: How We Voted in 2022: The Survey of the Performance of American Elections (SPAE) provides information about how Americans experienced voting in the most recent federal election. The survey has been conducted after federal elections since 2008, and is the only public opinion project in the country that is dedicated explicitly to understanding how voters themselves experience the election process. We're specifically providing data on House elections from 1976-2022. Check out theMEDSL websitefor additional datasets and tools. Be sure to cite the MEDSL in your work: Clean data and dictionary downloaded from theHarvard Dataverse",
+        "data_source_url": "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-07",
+        "data_dictionary": [
+            {
+                "variable": [
+                    "year",
+                    "state",
+                    "state_po",
+                    "state_fips",
+                    "state_cen",
+                    "state_ic",
+                    "office",
+                    "district",
+                    "stage",
+                    "runoff",
+                    "special",
+                    "candidate",
+                    "party",
+                    "writein",
+                    "mode",
+                    "candidatevotes",
+                    "totalvotes",
+                    "unofficial",
+                    "version",
+                    "fusion_ticket"
+                ],
+                "class": [
+                    "double",
+                    "character",
+                    "character",
+                    "double",
+                    "double",
+                    "double",
+                    "character",
+                    "character",
+                    "character",
+                    "logical",
+                    "logical",
+                    "character",
+                    "character",
+                    "logical",
+                    "character",
+                    "double",
+                    "double",
+                    "logical",
+                    "double",
+                    "logical"
+                ],
+                "description": [
+                    "year in which election was held",
+                    "state name",
+                    "U.S. postal code state abbreviation",
+                    "State FIPS code",
+                    "U.S. Census state code",
+                    "ICPSR state code",
+                    "U.S. House (constant)",
+                    "district number. At-large districts are coded as 0 (zero)",
+                    "electoral stage (gen = general elections, pri = primary elections)",
+                    "runoff election",
+                    "special election",
+                    "name of the candidate as it appears in the House Clerk report",
+                    "party of the candidate (always entirely lowercase) (Parties are as they appear in the House Clerk report. In states that allow candidates to appear on multiple party lines, separate vote totals are indicated for each party. Therefore, for analysis that involves candidate totals, it will be necessary to aggregate across all party lines within a district. For analysis that focuses on two-party vote totals, it will be necessary to account for major party candidates who receive votes under multiple party labels. Minnesota party labels are given as they appear on the Minnesota ballots. Future versions of this file will include codes for candidates who are endorsed by major parties, regardless of the party label under which they receive votes.)",
+                    "vote totals associated with write-in candidates",
+                    "mode of voting; states with data that doesn't break down returns by mode are marked as \\\"total\\\"",
+                    "votes received by this candidate for this particular party",
+                    "total number of votes cast for this election",
+                    "TRUE/FALSE indicator for unofficial result (to be updated later); this appears only for 2018 data in some cases",
+                    "date when this dataset was finalized",
+                    "A TRUE/FALSE indicator as to whether the given candidate is running on a fusion party ticket, which will in turn mean that a candidate will appear multiple times, but by different parties, for a given election. States with fusion tickets include Connecticut, New Jersey, New York, and South Carolina."
+                ]
+            }
+        ],
+        "data": {
+            "file_name": [
+                "house.csv"
+            ],
+            "file_url": [
+                "https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-11-07/house.csv"
+            ]
+        },
+        "data_load": {
+            "file_name": [
+                "house.csv"
+            ],
+            "file_url": [
+                "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-11-07/house.csv"
+            ]
+        }
+    }
+]

version2/demo.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""
+Metadata Hierarchy Explorer — TFM 2026
+Navigation router (Streamlit st.navigation).
+Sidebar layout (top to bottom, as rendered below):
+    Metadata Hierarchy Explorer / TFM 2026   (branding, top)
+    Built Hierarchy  (collapsed expander)    (upload a CSV and run a method)
+         • the three methods (descriptive names from methods.py)
+    Demo View                                (pre-built results viewer)
+"""
+import sys
+from pathlib import Path
+import streamlit as st
+# Shared method names live in views/methods.py — make it importable.
+# The views/ directory is put first on sys.path so the bare `methods` import
+# below resolves to views/methods.py.
+sys.path.insert(0, str(Path(__file__).resolve().parent / "views"))
+from methods import METHODS  # noqa: E402
+st.set_page_config(
+    page_title="Metadata Hierarchy Explorer",
+    layout="wide",
+)
+# ── Pages ────────────────────────────────────────────────────────────────────
+# One st.Page per view script; the three build pages take their titles from
+# METHODS so the names stay in sync with the sidebar links further down.
+viewer = st.Page("views/viewer.py",         title="Demo View", default=True)
+base   = st.Page("views/run_baseline.py",   title=METHODS["Baseline"]["title"])
+appr1  = st.Page("views/run_approach_1.py", title=METHODS["Approach 1"]["title"])
+appr2  = st.Page("views/run_approach_2.py", title=METHODS["Approach 2"]["title"])
+# Hidden default nav — we render our own links so we control the order.
+pg = st.navigation([viewer, base, appr1, appr2], position="hidden")
+# ── Sidebar: branding + navigation (Built Hierarchy above Demo View) ─────────
+with st.sidebar:
+    st.title("Metadata Hierarchy Explorer")
+    st.caption("TFM 2026 — Metadata hierarchy construction")
+    st.markdown("---")
+    # The three build pages are grouped in an expander that starts collapsed.
+    with st.expander("Built Hierarchy", expanded=False):
+        st.caption("Upload a CSV and run a method live.")
+        st.page_link(base,  label=METHODS["Baseline"]["title"])
+        st.page_link(appr1, label=METHODS["Approach 1"]["title"])
+        st.page_link(appr2, label=METHODS["Approach 2"]["title"])
+    # Demo View is linked last, below the expander.
+    st.page_link(viewer, label="Demo View")
+# ── The selected page renders here (its own controls included) ───────────────
+pg.run()

version2/hierarchy_eval.py ADDED Viewed

	@@ -0,0 +1,622 @@

+"""
+hierarchy_eval.py — shared, reference-free hierarchy evaluation for the TFM.
+WHY REFERENCE-FREE?
+-------------------
+No manually curated reference taxonomy is bundled with the thesis experiments.
+The dataset group columns are metadata supplied by the input file, not an
+independent reference taxonomy.  Approach 1 and Approach 2 use group information
+during construction; the Baseline avoids it during construction, but it still
+does not become a manually verified taxonomy.  The defensible headline
+evaluation is therefore reference-free.
+PRIMARY METRICS (no manual reference required) — fair cross-approach comparison
+-------------------------------------------
+  • Parent–child coherence   — TraCo (Wu et al., AAAI 2024, arXiv:2401.14113)
+  • Sibling diversity        — TraCo (same paper)
+  • NPMI label coherence     — Lau et al., EACL 2014 (aclanthology.org/E14-1056);
+                               orig. Mimno et al., EMNLP 2010
+  • Label quality            — interpretability proxies (concept-valid label %,
+                               sibling redundancy, avg label words).  Captures the
+                               dimension coherence misses (meaningful inner labels,
+                               Taxonomizer's stated goal).
+  • Structural statistics    — HiExpan-style reporting (Shen et al., KDD 2018)
+All of the above use the SAME encoder/corpus for every approach, so the
+cross-approach comparison is fair.  NOTE: coherence (TraCo/NPMI) can favour the
+data-derived baseline, so interpretability + a human study are needed to show
+the approaches' advantage.
+GROUP-COLUMN METRICS (ARI / AMI / NMI / Purity) — descriptive only
+------------------------------------------------------------------
+These compare a system partition with input grouping metadata.  They are useful
+sanity checks, but they are not thesis accuracy scores and are not comparable as
+reference-taxonomy recovery.  NMI and Purity are especially inflated by over-splitting.
+"""
+from __future__ import annotations
+import re
+from collections import Counter
+import numpy as np
+# ──────────────────────────────────────────────────────────────────────────────
+# Tree helpers
+# ──────────────────────────────────────────────────────────────────────────────
+def build_parent_map(nodes: list) -> dict:
+    pm: dict = {}
+    for n in nodes:
+        for c in n.get('related', []):
+            cid = int(c)
+            if cid not in pm:
+                pm[cid] = int(n['id'])
+    return pm
+def structural_stats(nodes: list) -> dict:
+    pm = build_parent_map(nodes)
+    def depth_of(nid: int) -> int:
+        d = 0
+        while nid in pm:
+            nid = pm[nid]; d += 1
+        return d
+    agg   = [n for n in nodes if n.get('type') == 'aggregation']
+    leafs = [n for n in nodes if n.get('type') == 'attribute']
+    depths   = [depth_of(int(n['id'])) for n in leafs]
+    branches = [len(n.get('related', [])) for n in agg]
+    singletons = sum(1 for b in branches if b == 1)
+    return {
+        'n_aggregation_nodes':  len(agg),
+        'max_depth':            int(max(depths, default=0)),
+        'avg_leaf_depth':       round(float(np.mean(depths)), 2) if depths else 0.0,
+        'avg_branching_factor': round(float(np.mean(branches)), 2) if branches else 0.0,
+        'singleton_nodes_%':    round(100.0 * singletons / max(len(agg), 1), 1),
+    }
+# ──────────────────────────────────────────────────────────────────────────────
+# Encoder — SBERT if available, TF-IDF fallback.  Loaded once, reused.
+# ──────────────────────────────────────────────────────────────────────────────
+_SBERT = None
+_SBERT_TRIED = False
+def _get_sbert():
+    global _SBERT, _SBERT_TRIED
+    if _SBERT_TRIED:
+        return _SBERT
+    _SBERT_TRIED = True
+    try:
+        from sentence_transformers import SentenceTransformer
+        _SBERT = SentenceTransformer('all-MiniLM-L6-v2')
+    except Exception:
+        _SBERT = None
+    return _SBERT
+def encode(texts: list):
+    """Return (unit-normalised vectors, backend_name)."""
+    texts = [str(t) if str(t).strip() else '_' for t in texts]
+    model = _get_sbert()
+    if model is not None:
+        v = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
+        return np.asarray(v, dtype=float), 'SBERT (all-MiniLM-L6-v2)'
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    X = TfidfVectorizer(stop_words='english', max_features=2000,
+                        min_df=1).fit_transform(texts).toarray().astype(float)
+    norms = np.linalg.norm(X, axis=1, keepdims=True)
+    return X / np.where(norms == 0, 1.0, norms), 'TF-IDF (SBERT unavailable)'
+# ──────────────────────────────────────────────────────────────────────────────
+# TraCo reference-free metrics  (Wu et al., AAAI 2024)
+# ──────────────────────────────────────────────────────────────────────────────
+def traco_metrics(nodes: list) -> dict:
+    """Parent–child coherence and sibling diversity over node *labels*."""
+    usable = [n for n in nodes if n.get('type') in ('aggregation', 'attribute')]
+    if len(usable) < 2:
+        return {'pc_coherence': 0.0, 'sibling_diversity': 0.0, 'encoder': 'n/a'}
+    ids    = [int(n['id']) for n in usable]
+    labels = [str(n.get('name', '')) for n in usable]
+    vecs, backend = encode(labels)
+    id2v = {i: vecs[k] for k, i in enumerate(ids)}
+    pc_sims, sib_divs = [], []
+    for n in nodes:
+        if n.get('type') == 'root':
+            continue
+        pid = int(n['id'])
+        if pid not in id2v:
+            continue
+        children = [int(c) for c in n.get('related', []) if int(c) in id2v]
+        for cid in children:
+            pc_sims.append(float(np.dot(id2v[pid], id2v[cid])))
+        if len(children) >= 2:
+            cv = np.array([id2v[c] for c in children])
+            S  = cv @ cv.T
+            nc = len(children)
+            divs = [1.0 - float(S[i, j]) for i in range(nc) for j in range(i + 1, nc)]
+            sib_divs.append(float(np.mean(divs)))
+    return {
+        'pc_coherence':      round(float(np.mean(pc_sims)),  4) if pc_sims  else 0.0,
+        'sibling_diversity': round(float(np.mean(sib_divs)), 4) if sib_divs else 0.0,
+        'encoder':           backend,
+    }
+# ──────────────────────────────────────────────────────────────────────────────
+# NPMI label coherence  (Lau et al., EACL 2014; Mimno et al., EMNLP 2010)
+# Reference corpus = the variable descriptions themselves.
+# ──────────────────────────────────────────────────────────────────────────────
+_TOKEN_RE = re.compile(r'[a-z][a-z]{2,}')
+_STOP = set(
+    'the a an and or of to in for on with by at from as is are be this that these '
+    'those it its was were has have had not no than then so such can will may '
+    'group description name label value type using used per each'.split()
+)
+def _tokens(text: str) -> set:
+    return {w for w in _TOKEN_RE.findall(str(text).lower()) if w not in _STOP}
+def npmi_coherence(nodes: list, corpus_texts: list, topn: int = 5) -> float:
+    """Average NPMI of each aggregation node's label terms over the corpus.
+    Returns a value in roughly [-1, 1]; higher = node labels use term
+    combinations that genuinely co-occur in the data (meaningful, not random).
+    """
+    docs = [_tokens(t) for t in corpus_texts]
+    docs = [d for d in docs if d]
+    N = len(docs)
+    if N < 2:
+        return 0.0
+    df: Counter = Counter()
+    for d in docs:
+        for w in d:
+            df[w] += 1
+    # Collect the term sets we actually need (node labels)
+    label_termsets: list = []
+    needed_terms: set = set()
+    for n in nodes:
+        if n.get('type') != 'aggregation':
+            continue
+        terms = [w for w in _tokens(n.get('name', '')) if df.get(w, 0) > 0]
+        terms = sorted(terms, key=lambda w: df[w], reverse=True)[:topn]
+        if len(terms) >= 2:
+            label_termsets.append(terms)
+            needed_terms.update(terms)
+    if not label_termsets:
+        return 0.0
+    # Pair co-occurrence counts (only for needed pairs)
+    needed_pairs = set()
+    for terms in label_termsets:
+        for i in range(len(terms)):
+            for j in range(i + 1, len(terms)):
+                needed_pairs.add(frozenset((terms[i], terms[j])))
+    co: Counter = Counter()
+    for d in docs:
+        present = d & needed_terms
+        if len(present) < 2:
+            continue
+        pl = list(present)
+        for i in range(len(pl)):
+            for j in range(i + 1, len(pl)):
+                pair = frozenset((pl[i], pl[j]))
+                if pair in needed_pairs:
+                    co[pair] += 1
+    eps = 1e-12
+    node_scores: list = []
+    for terms in label_termsets:
+        pair_npmis: list = []
+        for i in range(len(terms)):
+            for j in range(i + 1, len(terms)):
+                wi, wj = terms[i], terms[j]
+                c_ij = co.get(frozenset((wi, wj)), 0)
+                p_ij = (c_ij + eps) / N
+                p_i  = df[wi] / N
+                p_j  = df[wj] / N
+                pmi  = np.log(p_ij / (p_i * p_j + eps) + eps)
+                npmi = pmi / (-np.log(p_ij + eps))
+                pair_npmis.append(float(npmi))
+        if pair_npmis:
+            node_scores.append(float(np.mean(pair_npmis)))
+    return round(float(np.mean(node_scores)), 4) if node_scores else 0.0
+# ──────────────────────────────────────────────────────────────────────────────
+# Secondary (descriptive, caveated): group-structure preservation
+# ──────────────────────────────────────────────────────────────────────────────
+def _depth1_assignments(nodes: list, can) -> list:
+    pm = build_parent_map(nodes)
+    def depth1(nid: int) -> int:
+        while pm.get(nid, -1) not in (-1, 0):
+            nid = pm[nid]
+        return nid
+    lid_to_nid = {}
+    row_to_nid = {}
+    for n in nodes:
+        if n.get('type') != 'attribute' or 'metadata' not in n:
+            continue
+        meta = n.get('metadata', {})
+        if meta.get('leaf_id'):
+            lid_to_nid[str(meta['leaf_id'])] = int(n['id'])
+        if meta.get('row_index') is not None:
+            try:
+                row_to_nid[int(meta['row_index'])] = int(n['id'])
+            except Exception:
+                pass
+    leaf_col = '_leaf_id' if '_leaf_id' in can.columns else '_id'
+    row_col = '_row' if '_row' in can.columns else None
+    out = []
+    for i, row in can.iterrows():
+        lid = str(row.get(leaf_col, ''))
+        if lid in lid_to_nid:
+            out.append(depth1(lid_to_nid[lid]))
+            continue
+        try:
+            rid = int(row.get(row_col, i)) if row_col else int(i)
+        except Exception:
+            rid = int(i)
+        out.append(depth1(row_to_nid[rid]) if rid in row_to_nid else -1)
+    return out
+def _purity(y_true, y_pred) -> float:
+    clusters: dict = {}
+    for t, p in zip(y_true, y_pred):
+        clusters.setdefault(p, []).append(t)
+    correct = sum(Counter(v).most_common(1)[0][1] for v in clusters.values())
+    return correct / max(len(y_true), 1)
+def group_preservation(nodes: list, can) -> dict:
+    """NMI / ARI / Purity of the depth-1 partition vs input grouping metadata.
+    CAVEAT: the group column is not a manually curated reference taxonomy, so
+    this is a descriptive 'structure preservation' figure, NOT an accuracy metric.
+    """
+    from sklearn.metrics import (normalized_mutual_info_score, adjusted_rand_score,
+                                 adjusted_mutual_info_score)
+    from sklearn.preprocessing import LabelEncoder
+    import pandas as pd
+    # group column robust to either canonical schema (_group_path or _group)
+    gcol = '_group_path' if '_group_path' in can.columns else '_group'
+    y_true_raw = can[gcol].apply(
+        lambda x: str(x).split(' > ')[0].strip()
+        if pd.notna(x) and str(x) not in ('', 'nan') else 'Ungrouped'
+    ).tolist()
+    y_pred_raw = _depth1_assignments(nodes, can)
+    y_true = LabelEncoder().fit_transform(y_true_raw)
+    y_pred = LabelEncoder().fit_transform(y_pred_raw)
+    return {
+        # ARI and AMI are chance-corrected — the trustworthy numbers.
+        'ARI':    round(float(adjusted_rand_score(y_true, y_pred)), 4),
+        'AMI':    round(float(adjusted_mutual_info_score(y_true, y_pred)), 4),
+        # NMI and Purity are reported for completeness but are inflated by
+        # over-splitting (more clusters → higher), so they are NOT headline.
+        'NMI':    round(float(normalized_mutual_info_score(
+                     y_true, y_pred, average_method='arithmetic')), 4),
+        'Purity': round(_purity(y_true_raw, y_pred_raw), 4),
+    }
+def label_quality(nodes: list) -> dict:
+    """Reference-free interpretability proxies for internal-node labels.
+    Captures the dimension Taxonomizer is *about* — meaningful inner-node labels —
+    which coherence metrics miss.  Fully automatic, no manual reference required:
+      • concept_label_pct  — % of internal labels that read as a real concept:
+        a short phrase (<=3 words) whose head word is a known English noun
+        (WordNet).  Penalises '/'-joined contrastive term fragments.
+      • redundancy_pct     — % of internal labels that duplicate a sibling's
+        label (same normalised text under the same parent).
+      • avg_label_words    — mean label length in words (shorter = more name-like).
+    """
+    pm = build_parent_map(nodes)
+    internal = [n for n in nodes if n.get('type') == 'aggregation']
+    if not internal:
+        return {'concept_label_pct': 0.0, 'redundancy_pct': 0.0, 'avg_label_words': 0.0}
+    # WordNet noun check (optional; degrade gracefully if unavailable)
+    try:
+        from nltk.corpus import wordnet as wn
+        def _is_noun(w):
+            return bool(wn.synsets(w, pos=wn.NOUN))
+    except Exception:
+        def _is_noun(w):
+            return len(w) > 2  # fallback: any real-ish word
+    def _norm(s): return re.sub(r'[^a-z0-9]+', ' ', str(s).lower()).strip()
+    concept = 0
+    wordcounts = []
+    for n in internal:
+        raw = str(n.get('name', ''))
+        words = _norm(raw).split()
+        wordcounts.append(len(words))
+        # '/'-joined fragments are NOT concept labels
+        is_fragment = '/' in raw
+        head = words[-1] if words else ''
+        if (not is_fragment) and 1 <= len(words) <= 3 and head and _is_noun(head):
+            concept += 1
+    # sibling redundancy
+    by_parent: dict = {}
+    for n in internal:
+        p = pm.get(int(n['id']), -1)
+        by_parent.setdefault(p, []).append(_norm(n.get('name', '')))
+    redundant = 0
+    for sibs in by_parent.values():
+        seen = set()
+        for s in sibs:
+            if s in seen:
+                redundant += 1
+            seen.add(s)
+    n_int = len(internal)
+    return {
+        'concept_label_pct': round(100.0 * concept / n_int, 1),
+        'redundancy_pct':    round(100.0 * redundant / n_int, 1),
+        'avg_label_words':   round(float(np.mean(wordcounts)), 2),
+    }
+# ──────────────────────────────────────────────────────────────────────────────
+# Optional manual-reference comparison — Edge-F1 / Ancestor-F1
+#
+# HiExpan (Shen et al., KDD 2018) scores a system taxonomy against a hand-built
+# reference taxonomy with Edge-F1 (direct parent–child links) and Ancestor-F1
+# (all ancestor links).  Because our internal-node *labels* differ between a
+# manual reference tree and each system, we use the label-free leaf-pair formulation (the
+# pair-counting tradition, Fowlkes & Mallows 1983):
+#
+#   • Edge-F1     — over pairs of leaves that share the same IMMEDIATE parent
+#                   (i.e. they are siblings).  Strict: rewards correct granularity.
+#   • Ancestor-F1 — over pairs of leaves that share ANY non-root ancestor
+#                   (i.e. they are grouped together somewhere).  Lenient.
+#
+# Leaves are matched between reference and system by their attribute-node NAME (the
+# variable label) — the one field all three approaches expose for every leaf.
+# Only leaves present in BOTH the manual subset and the system tree are scored,
+# so a small hand-built subset could evaluate a full hierarchy if one is created.
+# ──────────────────────────────────────────────────────────────────────────────
+def _pred_leaf_lineage(nodes: list) -> dict:
+    """leaf name → list of ancestor node ids (root-most first, excl. root & leaf)."""
+    pm = build_parent_map(nodes)
+    id_to_node = {int(n['id']): n for n in nodes}
+    lineage: dict = {}
+    for n in nodes:
+        if n.get('type') != 'attribute':
+            continue
+        name = str(n.get('name', ''))
+        cur  = int(n['id'])
+        anc, seen = [], set()
+        while cur in pm and cur not in seen:
+            seen.add(cur)
+            cur = pm[cur]
+            nd = id_to_node.get(cur)
+            if nd is None or nd.get('type') == 'root':
+                break
+            anc.append(cur)
+        anc.reverse()
+        lineage[name] = anc
+    return lineage
+def _gold_leaf_lineage(gold_df) -> dict:
+    """leaf name → list of cumulative path-prefix strings from a manual reference."""
+    lineage: dict = {}
+    for _, r in gold_df.iterrows():
+        name = str(r['leaf_label'])
+        path = str(r.get('gold_path', '') or '')
+        comps = [c.strip() for c in path.split('>')
+                 if c.strip() and c.strip().lower() != 'ungrouped']
+        anc, pref = [], ''
+        for c in comps:
+            pref = c if not pref else f'{pref} > {c}'
+            anc.append(pref)
+        lineage[name] = anc
+    return lineage
+def _sibling_pairs(lineage: dict) -> set:
+    from collections import defaultdict
+    groups: dict = defaultdict(list)
+    for name, anc in lineage.items():
+        if anc:
+            groups[anc[-1]].append(name)
+    pairs: set = set()
+    for members in groups.values():
+        m = sorted(members)
+        for i in range(len(m)):
+            for j in range(i + 1, len(m)):
+                pairs.add((m[i], m[j]))
+    return pairs
+def _cogrouped_pairs(lineage: dict) -> set:
+    from collections import defaultdict
+    occ: dict = defaultdict(set)
+    for name, anc in lineage.items():
+        for a in anc:
+            occ[a].add(name)
+    pairs: set = set()
+    for members in occ.values():
+        m = sorted(members)
+        for i in range(len(m)):
+            for j in range(i + 1, len(m)):
+                pairs.add((m[i], m[j]))
+    return pairs
+def _prf(pred_set: set, gold_set: set) -> dict:
+    if not pred_set and not gold_set:
+        return {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
+    tp = len(pred_set & gold_set)
+    p = tp / len(pred_set) if pred_set else 0.0
+    r = tp / len(gold_set) if gold_set else 0.0
+    f = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
+    return {'precision': round(p, 4), 'recall': round(r, 4), 'f1': round(f, 4)}
+def gold_comparison(nodes: list, gold_df) -> dict:
+    """Edge-F1 and Ancestor-F1 of a system tree vs an optional manual reference."""
+    pred = _pred_leaf_lineage(nodes)
+    gold = _gold_leaf_lineage(gold_df)
+    shared = set(pred) & set(gold)
+    pred = {k: v for k, v in pred.items() if k in shared}
+    gold = {k: v for k, v in gold.items() if k in shared}
+    return {
+        'n_matched_leaves': len(shared),
+        'edge_f1':     _prf(_sibling_pairs(pred),   _sibling_pairs(gold)),
+        'ancestor_f1': _prf(_cogrouped_pairs(pred), _cogrouped_pairs(gold)),
+    }
+# ──────────────────────────────────────────────────────────────────────────────
+# Granularity-tolerant, label-independent structural F1  (set-overlap matching)
+#
+# Edge-F1 punishes a system for adding *correct* extra depth, because two leaves
+# that a manual reference lists as siblings stop being immediate siblings once the system
+# refines them into sub-tiers.  That makes edge-F1 unfair to deliberately deeper
+# trees (Approaches 1 & 2).  Set-overlap F1 fixes this: it matches each reference
+# cluster (the set of leaves under a reference path-prefix) to the system node whose
+# leaf set overlaps it most (Jaccard), regardless of that node's depth or label.
+#
+#   • precision — for each system aggregation node, its best Jaccard with any
+#                 reference cluster, averaged.  Low when the system invents groups
+#                 the reference does not have (e.g. one node per delay value = over-split).
+#   • recall    — for each reference cluster, its best Jaccard with any system node,
+#                 averaged.  Low when the system fails to recover a reference group.
+#
+# This is the cluster-matching / overlap-F1 tradition (e.g. ontology alignment,
+# hierarchical-clustering evaluation).  Label-free, so it compares the three
+# approaches fairly even though their internal-node labels differ.
+# ──────────────────────────────────────────────────────────────────────────────
+def _system_clusters(nodes: list) -> list:
+    """Each aggregation node → frozenset of leaf NAMES in its subtree (size ≥ 2)."""
+    id_to_node = {int(n['id']): n for n in nodes}
+    out: list = []
+    for n in nodes:
+        if n.get('type') != 'aggregation':
+            continue
+        leaves: list = []
+        stack = [int(n['id'])]
+        seen: set = set()
+        while stack:
+            x = stack.pop()
+            if x in seen:
+                continue
+            seen.add(x)
+            nd = id_to_node.get(x)
+            if nd is None:
+                continue
+            if nd.get('type') == 'attribute':
+                leaves.append(str(nd.get('name', '')))
+            else:
+                stack.extend(int(c) for c in nd.get('related', []))
+        s = frozenset(leaves)
+        if len(s) >= 2:
+            out.append(s)
+    return out
+def _gold_clusters(gold_df) -> list:
+    """Each reference path-prefix → frozenset of leaf NAMES under it (size ≥ 2)."""
+    from collections import defaultdict
+    occ: dict = defaultdict(set)
+    for name, anc in _gold_leaf_lineage(gold_df).items():
+        for a in anc:
+            occ[a].add(name)
+    return [frozenset(v) for v in occ.values() if len(v) >= 2]
+def set_overlap_f1(nodes: list, gold_df) -> dict:
+    """Granularity-tolerant, label-free hierarchical F1 via best leaf-set Jaccard."""
+    pred_names = set(_pred_leaf_lineage(nodes))
+    gold_names = {str(x) for x in gold_df['leaf_label']}
+    shared = pred_names & gold_names
+    if len(shared) < 2:
+        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
+    sys_cl  = [c & shared for c in _system_clusters(nodes)]
+    sys_cl  = [c for c in sys_cl if len(c) >= 2]
+    gold_cl = [c & shared for c in _gold_clusters(gold_df)]
+    gold_cl = [c for c in gold_cl if len(c) >= 2]
+    if not sys_cl or not gold_cl:
+        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
+    def jac(a: frozenset, b: frozenset) -> float:
+        u = len(a | b)
+        return len(a & b) / u if u else 0.0
+    prec = float(np.mean([max(jac(s, g) for g in gold_cl) for s in sys_cl]))
+    rec  = float(np.mean([max(jac(s, g) for s in sys_cl) for g in gold_cl]))
+    f1   = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0
+    return {'precision': round(prec, 4), 'recall': round(rec, 4), 'f1': round(f1, 4)}
+def refinement_breakdown(nodes: list, gold_df) -> dict:
+    """Decompose edge-F1 disagreements into harmless refinement vs real errors.
+    • wrong_merge_rate — system sibling pairs that the reference does NOT co-group anywhere
+      (genuine mistakes: variables wrongly placed together).
+    • refinement_rate  — reference sibling pairs the system keeps co-grouped but at a
+      FINER level (split into sub-tiers).  These are deeper-but-consistent, the
+      thing edge-F1 unfairly penalises.
+    • missed_rate      — reference sibling pairs the system fails to co-group at all
+      (real recall failures).
+    """
+    pred = _pred_leaf_lineage(nodes)
+    gold = _gold_leaf_lineage(gold_df)
+    shared = set(pred) & set(gold)
+    pred = {k: v for k, v in pred.items() if k in shared}
+    gold = {k: v for k, v in gold.items() if k in shared}
+    sys_sib = _sibling_pairs(pred)
+    sys_cog = _cogrouped_pairs(pred)
+    gold_sib = _sibling_pairs(gold)
+    gold_cog = _cogrouped_pairs(gold)
+    wrong_merge = len(sys_sib - gold_cog)
+    refined     = len((gold_sib & sys_cog) - sys_sib)
+    missed      = len(gold_sib - sys_cog)
+    return {
+        'wrong_merge_rate': round(wrong_merge / len(sys_sib), 4) if sys_sib else 0.0,
+        'refinement_rate':  round(refined / len(gold_sib), 4) if gold_sib else 0.0,
+        'missed_rate':      round(missed / len(gold_sib), 4) if gold_sib else 0.0,
+    }
+# ──────────────────────────────────────────────────────────────────────────────
+# One-call bundle
+# ──────────────────────────────────────────────────────────────────────────────
+def evaluate(nodes: list, corpus_texts: list | None = None, can=None,
+             gold_df=None) -> dict:
+    """Compute the full metric bundle for one hierarchy."""
+    out: dict = {}
+    out.update(traco_metrics(nodes))
+    out['npmi_coherence'] = (npmi_coherence(nodes, corpus_texts)
+                             if corpus_texts is not None else None)
+    out.update({f'struct_{k}': v for k, v in structural_stats(nodes).items()})
+    if can is not None:
+        out['group_preservation'] = group_preservation(nodes, can)
+    if gold_df is not None:
+        out['gold'] = gold_comparison(nodes, gold_df)
+    return out

version2/launcher.py ADDED Viewed

	@@ -0,0 +1,137 @@

+"""
+launcher.py — start Baseline, Approach 1 and Approach 2 on different ports,
+              open them in browser tabs, and shut down all at once when you
+              press Enter.
+Usage:
+    python launcher.py
+Each app has its own file uploader — upload a different CSV to each tab to
+compare approaches side by side.
+"""
+from __future__ import annotations
+import socket
+import subprocess
+import sys
+import time
+import webbrowser
+from pathlib import Path
+# Directory that contains this launcher; the app scripts are resolved against it.
+HERE = Path(__file__).resolve().parent
+# One entry per app to launch: (script filename, server port, display label).
+JOBS = [
+    ('baseline.py',   8501, 'Baseline'),
+    ('approach_1.py', 8502, 'Approach 1'),
+    ('approach_2.py', 8503, 'Approach 2'),
+]
+# TIP: to compare TWO datasets at once you do NOT need extra ports. Streamlit
+# gives every browser tab its own independent session (separate upload + state),
+# so just open the same URL twice — e.g. open http://localhost:8501 in two tabs,
+# load AI-MIND in one and HCP in the other.
+# When True, main() opens one browser tab per app after the startup wait.
+OPEN_BROWSER      = True
+# Upper bound, in seconds, that main() waits for each server's port to respond.
+STARTUP_WAIT_SECS = 5
+def _port_in_use(port: int) -> bool:
+    """Return True if something is already listening on this port."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.settimeout(0.5)
+        return s.connect_ex(('127.0.0.1', port)) == 0
+def _kill_tree(p: subprocess.Popen) -> None:
+    """Kill a process and all its children (works reliably on Windows and POSIX)."""
+    if sys.platform == 'win32':
+        subprocess.call(
+            ['taskkill', '/F', '/T', '/PID', str(p.pid)],
+            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+        )
+    else:
+        try:
+            import os, signal
+            os.killpg(os.getpgid(p.pid), signal.SIGTERM)
+        except Exception:
+            p.terminate()
+    try:
+        p.wait(timeout=5)
+    except subprocess.TimeoutExpired:
+        p.kill()
+def main() -> int:
+    # Validate scripts
+    missing = [s for s, _, _ in JOBS if not (HERE / s).is_file()]
+    if missing:
+        print(f'ERROR: missing files: {missing}')
+        return 1
+    # Abort if any port is already occupied — prevents the duplicate-tab problem
+    busy = [(label, port) for _, port, label in JOBS if _port_in_use(port)]
+    if busy:
+        for label, port in busy:
+            print(f'ERROR: port {port} ({label}) is already in use.')
+        print('\nKill the existing servers first (Task Manager → python.exe → End Task),')
+        print('then run launcher.py again.')
+        return 1
+    procs: list[subprocess.Popen] = []
+    print(f'Working directory: {HERE}')
+    print(f'Launching {len(JOBS)} Streamlit instance(s)…\n')
+    for script, port, label in JOBS:
+        cmd = [
+            sys.executable, '-m', 'streamlit', 'run', str(HERE / script),
+            '--server.port', str(port),
+            '--server.headless', 'true',       # suppress Streamlit's own browser open
+            '--browser.gatherUsageStats', 'false',
+        ]
+        try:
+            # Do NOT use CREATE_NEW_PROCESS_GROUP — it breaks taskkill /T
+            p = subprocess.Popen(cmd)
+            procs.append(p)
+            print(f'  {label:<12} pid={p.pid:<6} → http://localhost:{port}')
+        except Exception as e:
+            print(f'  FAILED {label}: {e}')
+    if not procs:
+        print('Nothing started.')
+        return 1
+    # Wait for each server to actually be reachable before opening the browser
+    print(f'\nWaiting for servers to come up (max {STARTUP_WAIT_SECS}s each)…')
+    for _, port, label in JOBS:
+        for _ in range(STARTUP_WAIT_SECS * 2):
+            if _port_in_use(port):
+                print(f'  {label} ready')
+                break
+            time.sleep(0.5)
+        else:
+            print(f'  {label} did not respond in time — opening anyway')
+    if OPEN_BROWSER:
+        print('\nOpening browser tabs…')
+        for _, port, label in JOBS:
+            url = f'http://localhost:{port}'
+            webbrowser.open_new_tab(url)
+            print(f'  • {label}  →  {url}')
+            time.sleep(0.3)   # small gap so tabs open in order
+    print('\nAll servers running.')
+    print('Press Enter (in THIS terminal) to stop all servers and exit.\n')
+    try:
+        input()
+    except KeyboardInterrupt:
+        pass
+    print('\nStopping servers…')
+    for p in procs:
+        _kill_tree(p)
+    print('Done.')
+    return 0
+# Script entry point: run the launcher and use main()'s return value as the exit status.
+if __name__ == '__main__':
+    raise SystemExit(main())

version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_canonical.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_concept_labels.csv ADDED Viewed

	@@ -0,0 +1,159 @@

+Node,Confidence,Source,Embedding sim,Alternatives
+Self Reported Ethnicity,0.542,keybert,0.625,"self reported racial, id strictly self-report, Self Reported"
+Quarter Behavioral Data,0.266,keybert,0.525,"acquisition quarter, quarter behavioral, quarter relative start"
+Yes Relationship Status,0.319,keybert,0.601,"employment status, respondent still school, participant employment status"
+Gender,0.26,keybert,0.501,hcp id
+Group Participant Age Range,0.834,description_title,0.874,"group participant age, participant age range, age participant years"
+Percentage Task Fmri,0.41,keybert,0.84,"task fmri percent, fmri processing percent, fmri language percent"
+Gambling Task Fmri,0.438,keybert,0.845,"task fmri gambling, fmri gambling, fmri gambling percent"
+State Fmri Count,0.312,keybert,0.676,"fmri count, state fmri scans"
+Scans Collected Scan Count,0.79,description_title,0.836,"collected scan count, scan count, scans collected scan"
+Timepoints Task Fmri,0.329,keybert,0.698,"fmri protocol completed, task fmri protocol, relational task fmri"
+Percentage Diffusion Mri,0.41,keybert,0.788,"diffusion mri completed, diffusion mri percent, complete diffusion mri"
+Diffusion Mri Completed,0.385,keybert,0.729,"diffusion mri protocol, complete diffusion mri, full diffusion mri"
+Fmri Movie Protocol,0.332,keybert,0.669,"fmri protocol completed, fmri retinotopy protocol, movie task fmri"
+Data Complete Resting,0.275,keybert,0.532,"anatomy data complete, noise data complete, data complete noise"
+Story Math Data,0.372,keybert,0.652,"available story math, story math, complete story math"
+Ssaga Marijuana Dependence,0.378,keybert,0.666,"ssaga tobacco dependence, completed ssaga marijuana, data completed ssaga"
+Compl Description,0.448,description_title,0.212,"compl description penn, compl description mini, test completed"
+Non-Toolbox Battery Completed,0.433,keybert,0.747,"behavioral battery completed, toolbox battery completed, battery completed full"
+Asr-Syn Compl Description,0.52,description_title,0.34,"asr-syn compl, asr-syn, der asr-syn compl"
+Mr Session Scanner,0.41,keybert,0.847,"scanner particular mr, session scanner, scanner particular"
+Mri Session Labels,0.427,keybert,0.843,"label mri session, specific label mri, label mri"
+Parentheses Indicate Scan,0.339,keybert,0.658,"scan count type, indicate scan count, type scan mr"
+Scan Session Scans,0.295,keybert,0.611,"scanner particular, scanner particular session, scan session"
+Blood Sample,0.363,keybert,0.681,"hematocrit sample, women hematocrit sample, blood sample percentage"
+Participant Menstrual Cycles,0.376,keybert,0.747,"participant age menstrual, participant menstrual, menstrual age cycles"
+Bmi,0.715,description_title,0.698,"bmi body, bmi body mass, bmi their"
+Applicable Hypothyroidism Age,0.349,keybert,0.675,"applicable hyperthyroidism age, hypothyroidism age onset, hypothyroidism age"
+Birth Control Progesterone,0.369,keybert,0.675,"progesterone fertility drugs, participant birth control, control pills progesterone"
+Systolic Blood Pressure,0.375,keybert,0.7,"blood pressure systolic, diastolic blood pressure, blood pressure diastolic"
+Yes Father Bipolar,0.254,keybert,0.537,"father depression yes, yes father depression, father bipolar disorder"
+Disease Dementia Yes,0.277,keybert,0.55,"parkinson disease yes, father parkinson disease, father alzheimer disease"
+Drug Alcohol Problems,0.304,keybert,0.545,"father drug alcohol, mother drug alcohol, alcohol problems yes"
+Father Anxiety Yes,0.355,keybert,0.688,"yes father anxiety, mother anxiety yes, anxiety yes father"
+Schizophrenia Psychosis Yes,0.343,keybert,0.661,"mother schizophrenia psychosis, father schizophrenia psychosis, psychosis yes father"
+Father Tourette Syndrome,0.345,keybert,0.674,"mother tourette syndrome, tourette syndrome yes, yes father tourette"
+Psqi Compl Description,0.422,description_title,0.16,"psqi past month, psqi compl, psqi"
+Description,0.386,description_title,0.079,"description quality, describe, description quality index"
+Psqi Past Month,0.475,keybert,0.44,psqi past
+Der Psqi Compl,0.34,keybert,0.165,
+Index Psqi,0.376,keybert,0.286,
+Test Matrix,0.517,keybert,0.411,
+Delayed Reward Subjective,0.323,keybert,0.714,"delays fixed reward, larger delayed reward, undervaluing rewards delayed"
+Larger Delayed Reward,0.306,keybert,0.68,"delays fixed reward, undervaluing rewards delayed, rewards delayed time"
+Undervaluing Rewards Delayed,0.278,keybert,0.617,choice immediate amount
+Area Under Curve,0.51,description_title,0.292,"curve area under, under curve area, curve area"
+Total Positions Off,0.208,keybert,0.391,"trials total positions, total positions, positions off trials"
+Sum Cpn Fp,0.275,keybert,0.513,"cpt true positives, sum cpn tp, cpt false negatives"
+Non-Responses Longest Run,0.31,keybert,0.58,"longest run non-responses, non-responses longest, run non-responses longest"
+Anger Identifications Correct,0.288,keybert,0.575,"correct anger identifications, anger identifications, correct fear identifications"
+Aggression Scores Mean,0.31,keybert,0.62,"hostility scores mean, angry feelings scores, levels hostility scores"
+Self-Report Measure Adults,0.285,keybert,0.602,"scores indicate self-reported, self-reported scores mean, self-reported scores"
+Loneliness Scores,0.323,keybert,0.66,"loneliness scores mean, levels loneliness scores, scores indicative loneliness"
+Perceived Hostility Scores,0.335,keybert,0.659,"perceived hostility survey, perceived rejection scores, hostility scores mean"
+Brain Segmentation Volume,0.781,description_title,0.877,"brain segmentation, estimated intra-cranial volume, intra-cranial volume"
+Total Defect Holes,0.582,keybert,0.824,"Prior Fixing, lh prior fixing, defect holes rh"
+Etiv,0.582,description_title,0.418,"ratio maskvol etiv, ratio brainsegvol etiv, maskvol etiv"
+Supratentorial Volume,0.715,keybert,0.948,"Supratentorial, supratentorial ventricals volume, supratentorial ventricals"
+Gray Matter Volume,0.637,keybert,0.761,"white matter volume, Matter Volume, total gray matter"
+Wm-Hypointensities,0.439,keybert,0.785,"left-wm-hypointensities, left-non-wm-hypointensities, right-wm-hypointensities"
+Cc Anterior,0.154,keybert,0.319,"cc mid anterior, cc posterior, cc mid posterior"
+Left-Vessel,0.417,keybert,0.773,right-vessel
+Right-Putamen,0.139,keybert,0.308,"left-putamen, right-thalamus-proper, left-thalamus-proper"
+Right-Cerebellum-Cortex,0.228,keybert,0.483,"left-cerebellum-cortex, right-cerebellum-white-matter, left-cerebellum-white-matter"
+Left-Hippocampus,0.235,keybert,0.489,right-hippocampus
+Left-Amygdala,0.276,keybert,0.547,right-amygdala
+Right-Choroid-Plexus,0.285,keybert,0.551,left-choroid-plexus
+Rd-Ventricle,0.28,keybert,0.559,"th-ventricle, right-lateral-ventricle, left-lateral-ventricle"
+Right-Ventraldc,0.246,keybert,0.498,left-ventraldc
+Right-Inf-Lat-Vent,0.144,keybert,0.321,left-inf-lat-vent
+Gyrus Right Precentral,0.19,keybert,0.416,"gyrus right superiortemporal, gyrus right inferiortemporal, gyrus right middletemporal"
+Cortex Right Entorhinal,0.225,keybert,0.5,"cortex left entorhinal, cortex right inferiorparietal, cortex right superiorparietal"
+Inferior Frontal Gyrus,0.201,keybert,0.447,"gyrus left parsorbitalis, gyrus right parsorbitalis, gyrus right parsopercularis"
+Pole Right Frontalpole,0.189,keybert,0.371,"pole right temporalpole, pole left temporalpole, right frontalpole average"
+Gyrus Right Posteriorcingulate,0.219,keybert,0.46,"gyrus left posteriorcingulate, gyrus right isthmuscingulate, cingulate gyrus right"
+Gyrus Right Caudalmiddlefrontal,0.244,keybert,0.529,"gyrus right rostralmiddlefrontal, gyrus left caudalmiddlefrontal"
+Cortex Right Caudalanteriorcingulate,0.247,keybert,0.521,"cortex left caudalanteriorcingulate, cortex left rostralanteriorcingulate, anterior cingulate cortex"
+Gyrus Right Parahippocampal,0.258,keybert,0.539,"gyrus left parahippocampal, parahippocampal average, right parahippocampal average"
+Superior Temporal Sulcus,0.162,keybert,0.348,"superior temporal, temporal sulcus left, temporal sulcus"
+Right Insula Average,0.193,keybert,0.374,"insula average, left insula average, right insula"
+Cortex Right Medialorbitofrontal,0.222,keybert,0.488,"cortex left medialorbitofrontal, orbital frontal cortex, cortex right lateralorbitofrontal"
+Cortex Right Lateraloccipital,0.234,keybert,0.486,"cortex left lateraloccipital, occipital cortex right, occipital cortex"
+Sulcus Right Paracentral,0.166,keybert,0.327,"sulcus left paracentral, paracentral, right paracentral average"
+Cortex Right Transversetemporal,0.248,keybert,0.524,"cortex left transversetemporal, temporal cortex left, temporal cortex"
+Cortex Right Pericalcarine,0.241,keybert,0.51,"cortex left pericalcarine, right pericalcarine average, cortex"
+Gyrus Right Lingual,0.236,keybert,0.488,"gyrus left lingual, lingual, lingual average"
+Cortex Right Insula,0.241,keybert,0.53,"cortex left insula, cortex right transversetemporal, cortex left transversetemporal"
+Cortex Left Medialorbitofrontal,0.221,keybert,0.489,"cortex right medialorbitofrontal, orbital frontal cortex, cortex right lateralorbitofrontal"
+Cortex Left Caudalanteriorcingulate,0.256,keybert,0.543,"cortex right caudalanteriorcingulate, cortex left rostralanteriorcingulate, cortex right rostralanteriorcingulate"
+Gyrus Right Superiortemporal,0.218,keybert,0.485,"gyrus right inferiortemporal, gyrus right middletemporal, gyrus left superiortemporal"
+Cortex Right Inferiorparietal,0.246,keybert,0.526,"cortex right superiorparietal, cortex left inferiorparietal, cortex left superiorparietal"
+Median Reaction Times,0.34,keybert,0.646,"average median reaction, overall reaction time, face median reaction"
+Accuracy Percentage Face,0.361,keybert,0.677,"accuracy percentage overall, percentage overall accuracy, accuracy percentage shape"
+Reaction Time Reward,0.294,keybert,0.589,"reaction times trials, median reaction times, reaction time punish"
+Percentage Reward Trials,0.301,keybert,0.669,"percentage larger reward, percentage smaller reward, prediction percentage larger"
+Percentage Trials Response,0.307,keybert,0.663,"reward trials response, trials response logged, overall percentage trials"
+Percentage Larger Punish,0.339,keybert,0.729,"percentage smaller punish, percentage punish trials, percentage punish"
+Difficulty Level Stimuli,0.277,keybert,0.531,"story median reaction, correct reaction time, stimuli presented math"
+Accuracy Percentage Math,0.341,keybert,0.647,"accuracy percentage story, accuracy condition overall, accuracy percentage"
+Accuracy Percentage Overall,0.322,keybert,0.582,"percentage overall accuracy, accuracy percentage match, accuracy percentage blocks"
+Cial Tom Perc Random,0.582,description_title,0.509,"tom perc random, cial tom perc, percentage tom random"
+Percentage Stimuli Response,0.318,keybert,0.686,"stimuli response logged, overall percentage stimuli, percentage stimuli"
+Rating Percentage Unsure,0.286,keybert,0.59,unsure rating percentage
+Stimuli Received Unsure,0.263,keybert,0.584,"rating median reaction, average median reaction, median reaction times"
+Percentage Random Stimuli,0.364,keybert,0.763,"random stimuli subject, rated random percentage"
+Time Random Stimuli,0.303,keybert,0.629,"random median reaction, random stimuli subject, reaction time random"
+Accuracy Across Trials,0.265,keybert,0.589,"accuracy back place, trials back place, accuracy back"
+Trials Back Tool,0.248,keybert,0.55,"accuracy back tool, tool condition accuracy, back tool nontargets"
+Accuracy Back Face,0.303,keybert,0.618,"face condition accuracy, trials back face, back face targets"
+Median Reaction Time,0.217,keybert,0.482,"tool condition median, reaction time back, target trials back"
+Face Condition Median,0.225,keybert,0.459,"reaction time back, trials back face, reaction time across"
+Reaction Time Back,0.217,keybert,0.482,"condition median reaction, correct trials back, reaction time across"
+Accuracy Back Place,0.261,keybert,0.574,"trials back place, target trials back, accuracy back"
+Back Median Reaction,0.245,keybert,0.535,"reaction time conditions, average median"
+Accuracy Back Body,0.279,keybert,0.621,"body condition accuracy, trials back body, back body nontargets"
+Body Condition Median,0.21,keybert,0.442,"trials back body, reaction time across, back body targets"
+Body Condition Accuracy,0.264,keybert,0.579,"trials back body, back body nontargets"
+Accuracy Back Tool,0.259,keybert,0.567,"tool condition accuracy, back tool targets, target trials back"
+Conscientiousness Scale Neo-Ffi,0.382,keybert,0.785,"neo-ffi conscientiousness, neuroticism scale neo-ffi"
+Am Methodical Person,0.188,keybert,0.41,"methodical person am, high-spirited person am, tough-minded my attitudes"
+Energy Often Feel,0.238,keybert,0.475,"feel chill wave, often feel am, stress sometimes feel"
+Myself Especially Lighthearted,0.239,keybert,0.484,"laugh easily, easily laugh"
+Life Fast-Paced My,0.302,keybert,0.587,"my life fast-paced, fast-paced my life, life fast-paced"
+Depressed Am Seldom,0.294,keybert,0.59,"seldom sad depressed, am seldom sad, rarely feel lonely"
+T-Score Asr Anxiety,0.305,keybert,0.679,"t-score asr depressive, anxiety problems gender, depressive problems gender"
+Asr-Syn Compl Description,0.512,description_title,0.36,"compl description asr, asr-syn compl, description asr"
+Asr Somatic Problems,0.264,keybert,0.562,"raw asr somatic, raw asr ad, asr ad problems"
+Somatic Problems Gender,0.272,keybert,0.561,"gender age adjusted, ad problems gender, t-score asr somatic"
+Der Asr-Syn Compl,0.506,keybert,0.447,description asr
+Description Asr Withdrawn,0.483,keybert,0.614,asr withdrawn
+Description Asr Anxious,0.455,keybert,0.628,
+T-Score Asr Avoidant,0.263,keybert,0.585,"asr avoidant problems, adjusted t-score asr, asr avoidant"
+T-Score Asr Antisocial,0.335,keybert,0.708,"asr antisocial problems, asr antisocial, antisocial problems"
+Dsmiv Major Depressive,0.38,keybert,0.649,"depressive symptoms endorsed, lifetime depressive symptoms, experienced diagnosed dsmiv"
+Visual Acuity Numerator,0.362,keybert,0.714,"acuity numerator distance, visual acuity denominator, coded eyeglass correction"
+Left Eye Color,0.355,keybert,0.652,color eye
+Read Letter Test,0.227,keybert,0.485,"reads letters test, letters test, letters test card"
+Positive Amphetamines Tests,0.402,keybert,0.734,"positive opiates tests, positive cocaine tests, positive methamphetamine tests"
+Breathalyzers Administered Hcp,0.407,keybert,0.787,"any breathalyzers administered, breathalyzers administered, any breathalyzers"
+Avg Total Weekday,0.276,keybert,0.529,"avg total weekend, total weekday alcoholic, drinks past days"
+Drinks Past Days,0.281,keybert,0.625,"alcoholic drinks past, last hcp visit, total alcoholic drinks"
+Dsm Alc Criteria,0.291,keybert,0.577,"criteria dsm sometime, yes dsm criteria, criteria dsm"
+Drunk Past Months,0.49,keybert,0.531,"past months frequency, drinks past months, Past Months"
+Max Drinks Consumed,0.361,keybert,0.732,"lifetime max drinks, max drinks, female max drinks"
+Weekday Pipes Per,0.353,keybert,0.674,"avg weekday pipes, weekday pipes, pipes past days"
+Weekday Times Chew,0.355,keybert,0.684,"times chew per, chew past days, weekend times chew"
+Weekday Times Snuff,0.263,keybert,0.584,"avg weekend times, avg weekday times, snuff per past"
+Total Times Smoked,0.292,keybert,0.622,"visit times smoked, cigarettes past days, times smoked any"
+Avg Weekday Cigarettes,0.362,keybert,0.726,"avg weekend cigarettes, avg weekday cigars, weekday cigarettes per"
+Years Smoked,0.587,keybert,0.688,smoked years
+Fagerstrom Ftnd,0.622,keybert,0.522,"Fagerstrom, fagerstrom ftnd indicative, fagerstrom hsi"
+Dsm Criteria Withdrawal,0.29,keybert,0.528,"dsm criteria tolerance, dsm criteria difficulty, dsm tolerance"
+Cigarettes Per Day,0.376,keybert,0.745,"per day smoking, regularly cigarettes per, day cigarettes smoked"
+Times,0.502,description_title,0.339,"times sedatives, times sedatives never, times hallucinogens"
+Times Opiates,0.328,keybert,0.623,"times opiates never, times drugs, times cocaine"
+Lifetime Yes Dsm,0.2,keybert,0.339,"lifetime yes, never times"
+Trackfrac Min Trfrac,0.303,keybert,0.632,trackfrac min minimum
+Scan Trackfrac,0.305,keybert,0.648,
+Scan Trfrac,0.3,keybert,0.641,

version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/outputs/approach_1/HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_canonical.csv ADDED Viewed

	@@ -0,0 +1,109 @@

+_source_file,_row_index,_leaf_label,_leaf_id,_group_path,_text,_semantic_text,_dtype,_concept_label,_concept_score,_concept_source,_code_family,_facet_cond,_facet_task,_facet_variant,_facet_stat,_facet_outcome,_facet_prec
+ai-mind-variable-descriptions_in_.csv,0,DMSCC,DMS > DMS Recommended Standard.DMSCC,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSCC | description: DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays). | Decimal Places: 2","DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays).",determine,Mean Choices Correct,0.0,singleton_title,DMS,No Condition,DMS,DMS Recommended Standard,Mean,Other,2
+ai-mind-variable-descriptions_in_.csv,1,DMSL0SD,DMS > DMS Recommended Standard.DMSL0SD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSL0SD | description: DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSL,0,DMS,DMS Recommended Standard,Standard Deviation,Other,4
+ai-mind-variable-descriptions_in_.csv,2,DMSL12SD,DMS > DMS Recommended Standard.DMSL12SD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSL12SD | description: DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSL,12,DMS,DMS Recommended Standard,Standard Deviation,Other,4
+ai-mind-variable-descriptions_in_.csv,3,DMSL4SD,DMS > DMS Recommended Standard.DMSL4SD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSL4SD | description: DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSL,4,DMS,DMS Recommended Standard,Standard Deviation,Other,4
+ai-mind-variable-descriptions_in_.csv,4,DMSLADSD,DMS > DMS Recommended Standard.DMSLADSD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSLADSD | description: DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSL,No Condition,DMS,DMS Recommended Standard,Standard Deviation,Other,4
+ai-mind-variable-descriptions_in_.csv,5,DMSLSD,DMS > DMS Recommended Standard.DMSLSD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSLSD | description: DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4,DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,determine,Correct Latency Standard Deviation,0.687,description_title,DMSLS,No Condition,DMS,DMS Recommended Standard,Standard Deviation,Other,4
+ai-mind-variable-descriptions_in_.csv,6,DMSLSSD,DMS > DMS Recommended Standard.DMSLSSD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSLSSD | description: DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations. | Decimal Places: 4","DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations.",determine,Correct Latency Standard Deviation,0.687,description_title,DMSLS,No Condition,DMS,DMS Recommended Standard,Standard Deviation,Other,4
+ai-mind-variable-descriptions_in_.csv,7,DMSMDL,DMS > DMS Recommended Standard.DMSMDL,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL | description: DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4,DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,determine,Correct Latency Mean,0.625,keybert,DMSMDL,No Condition,DMS,DMS Recommended Standard,Median,Other,4
+ai-mind-variable-descriptions_in_.csv,8,DMSMDL0,DMS > DMS Recommended Standard.DMSMDL0,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL0 | description: DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4,DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,0,DMS,DMS Recommended Standard,Median,Other,4
+ai-mind-variable-descriptions_in_.csv,9,DMSMDL12,DMS > DMS Recommended Standard.DMSMDL12,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL12 | description: DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4,DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,12,DMS,DMS Recommended Standard,Median,Other,4
+ai-mind-variable-descriptions_in_.csv,10,DMSMDL4,DMS > DMS Recommended Standard.DMSMDL4,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL4 | description: DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4,DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,4,DMS,DMS Recommended Standard,Median,Other,4
+ai-mind-variable-descriptions_in_.csv,11,DMSMDLAD,DMS > DMS Recommended Standard.DMSMDLAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLAD | description: DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4,DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,No Condition,DMS,DMS Recommended Standard,Median,Other,4
+ai-mind-variable-descriptions_in_.csv,12,DMSMDLS,DMS > DMS Recommended Standard.DMSMDLS,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLS | description: DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4,DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation.,determine,Correct Latency Mean,0.625,keybert,DMSMDL,No Condition,DMS,DMS Recommended Standard,Median,Other,4
+ai-mind-variable-descriptions_in_.csv,13,DMSML,DMS > DMS Recommended Standard.DMSML,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSML | description: DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4,DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays).,determine,Correct Latency Mean,0.625,keybert,DMSML,No Condition,DMS,DMS Recommended Standard,Mean,Other,4
+ai-mind-variable-descriptions_in_.csv,14,DMSML0,DMS > DMS Recommended Standard.DMSML0,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSML0 | description: DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4,DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay.,determine,Correct Latency Mean,0.625,keybert,DMSML,0,DMS,DMS Recommended Standard,Mean,Other,4
+ai-mind-variable-descriptions_in_.csv,15,DMSML12,DMS > DMS Recommended Standard.DMSML12,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSML12 | description: DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4,DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay.,determine,Correct Latency Mean,0.625,keybert,DMSML,12,DMS,DMS Recommended Standard,Mean,Other,4
+ai-mind-variable-descriptions_in_.csv,16,DMSML4,DMS > DMS Recommended Standard.DMSML4,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSML4 | description: DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4,DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay.,determine,Correct Latency Mean,0.625,keybert,DMSML,4,DMS,DMS Recommended Standard,Mean,Other,4
+ai-mind-variable-descriptions_in_.csv,17,DMSMLAD,DMS > DMS Recommended Standard.DMSMLAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMLAD | description: DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4,DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay.,determine,Correct Latency Mean,0.625,keybert,DMSML,No Condition,DMS,DMS Recommended Standard,Mean,Other,4
+ai-mind-variable-descriptions_in_.csv,18,DMSMLS,DMS > DMS Recommended Standard.DMSMLS,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSMLS | description: DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4,DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation.,determine,Correct Latency Mean,0.625,keybert,DMSML,No Condition,DMS,DMS Recommended Standard,Mean,Other,4
+ai-mind-variable-descriptions_in_.csv,19,DMSPC,DMS > DMS Recommended Standard.DMSPC,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPC | description: DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0,DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays).,determine,Percent Correct Percentage,0.54,keybert,DMSPC,No Condition,DMS,DMS Recommended Standard,Percent,Other,0
+ai-mind-variable-descriptions_in_.csv,20,DMSPC0,DMS > DMS Recommended Standard.DMSPC0,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPC0 | description: KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 0,KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,0,DMS,DMS Recommended Standard,Percent,Other,0
+ai-mind-variable-descriptions_in_.csv,21,DMSPC12,DMS > DMS Recommended Standard.DMSPC12,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPC12 | description: KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 0,KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,12,DMS,DMS Recommended Standard,Percent,Other,0
+ai-mind-variable-descriptions_in_.csv,22,DMSPC4,DMS > DMS Recommended Standard.DMSPC4,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPC4 | description: KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay. | Decimal Places: 0,KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,4,DMS,DMS Recommended Standard,Percent,Other,0
+ai-mind-variable-descriptions_in_.csv,23,DMSPCAD,DMS > DMS Recommended Standard.DMSPCAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPCAD | description: KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay. | Decimal Places: 0,KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,No Condition,DMS,DMS Recommended Standard,Percent,Other,0
+ai-mind-variable-descriptions_in_.csv,24,DMSPCS,DMS > DMS Recommended Standard.DMSPCS,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPCS | description: KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli. | Decimal Places: 0,KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli.,determine,Percent Correct Percentage,0.54,keybert,DMSPC,No Condition,DMS,DMS Recommended Standard,Percent,Other,0
+ai-mind-variable-descriptions_in_.csv,25,DMSPEGC,DMS > DMS Recommended Standard.DMSPEGC,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGC | description: DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4,DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays).,determine,Probability Error Occurring,0.619,keybert,DMSPEG,No Condition,DMS,DMS Recommended Standard,Probability,Error,4
+ai-mind-variable-descriptions_in_.csv,26,DMSPEGE,DMS > DMS Recommended Standard.DMSPEGE,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGE | description: KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4,KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays).,determine,Probability Error Occurring,0.619,keybert,DMSPEG,No Condition,DMS,DMS Recommended Standard,Probability,Error,4
+ai-mind-variable-descriptions_in_.csv,27,DMSTC,DMS > DMS Recommended Standard.DMSTC,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTC | description: DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0,DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays).,determine,Total Correct,0.507,description_title,DMSTC,No Condition,DMS,DMS Recommended Standard,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,28,DMSTC0,DMS > DMS Recommended Standard.DMSTC0,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTC0 | description: DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds. | Decimal Places: 0,DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds.,determine,Total Correct,0.507,description_title,DMSTC,0,DMS,DMS Recommended Standard,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,29,DMSTC12,DMS > DMS Recommended Standard.DMSTC12,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTC12 | description: DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds. | Decimal Places: 0,DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds.,determine,Total Correct,0.507,description_title,DMSTC,12,DMS,DMS Recommended Standard,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,30,DMSTC4,DMS > DMS Recommended Standard.DMSTC4,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTC4 | description: DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds. | Decimal Places: 0,DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds.,determine,Total Correct,0.507,description_title,DMSTC,4,DMS,DMS Recommended Standard,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,31,DMSTCAD,DMS > DMS Recommended Standard.DMSTCAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTCAD | description: DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay. | Decimal Places: 0,DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay.,determine,Total Correct,0.507,description_title,DMSTC,No Condition,DMS,DMS Recommended Standard,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,32,DMSTCS,DMS > DMS Recommended Standard.DMSTCS,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTCS | description: DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli. | Decimal Places: 0,DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli.,determine,Total Correct,0.507,description_title,DMSTC,No Condition,DMS,DMS Recommended Standard,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,33,DMSTE,DMS > DMS Recommended Standard.DMSTE,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTE | description: DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen. | Decimal Places: 0","DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen.",determine,Errors Total,0.604,keybert,DMSTE,No Condition,DMS,DMS Recommended Standard,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,34,DMSTEAD,DMS > DMS Recommended Standard.DMSTEAD,DMS > DMS Recommended Standard,Task: DMS | Variant: DMS Recommended Standard | name: DMSTEAD | description: DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component. | Decimal Places: 0,DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component.,determine,Errors Total,0.604,keybert,DMSTE,No Condition,DMS,DMS Recommended Standard,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,35,DMSTEC,DMS > DMS Recommended Standard.DMSTEC,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTEC | description: DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0","DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays).",determine,Error,0.447,description_title,DMSTEC,No Condition,DMS,DMS Recommended Standard,Other,Error,0
+ai-mind-variable-descriptions_in_.csv,36,DMSTECAD,DMS > DMS Recommended Standard.DMSTECAD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTECAD | description: DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0","DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component.",determine,Error,0.447,description_title,DMSTEC,No Condition,DMS,DMS Recommended Standard,Other,Error,0
+ai-mind-variable-descriptions_in_.csv,37,DMSTED,DMS > DMS Recommended Standard.DMSTED,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTED | description: DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0","DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays).",determine,Error,0.447,description_title,DMSTED,No Condition,DMS,DMS Recommended Standard,Other,Error,0
+ai-mind-variable-descriptions_in_.csv,38,DMSTEDAD,DMS > DMS Recommended Standard.DMSTEDAD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTEDAD | description: DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0","DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component.",determine,Error,0.447,description_title,DMSTED,No Condition,DMS,DMS Recommended Standard,Other,Error,0
+ai-mind-variable-descriptions_in_.csv,39,DMSTEP,DMS > DMS Recommended Standard.DMSTEP,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTEP | description: DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0","DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays).",determine,Error,0.447,description_title,DMSTEP,No Condition,DMS,DMS Recommended Standard,Other,Error,0
+ai-mind-variable-descriptions_in_.csv,40,DMSTEPAD,DMS > DMS Recommended Standard.DMSTEPAD,DMS > DMS Recommended Standard,"Task: DMS | Variant: DMS Recommended Standard | name: DMSTEPAD | description: DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0","DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component.",determine,Error,0.447,description_title,DMSTEP,No Condition,DMS,DMS Recommended Standard,Other,Error,0
+ai-mind-variable-descriptions_in_.csv,41,MOTML,MOT > MOT Tone 2.0.MOTML,MOT > MOT Tone 2.0,Task: MOT | Variant: MOT Tone 2.0 | name: MOTML | description: The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 1,The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials.,determine,Latency Display Stimulus,0.418,keybert,MOT,No Condition,MOT,MOT Tone 2.0,Mean,Other,1
+ai-mind-variable-descriptions_in_.csv,42,MOTSDL,MOT > MOT Tone 2.0.MOTSDL,MOT > MOT Tone 2.0,"Task: MOT | Variant: MOT Tone 2.0 | name: MOTSDL | description: This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 2","This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials.",determine,Latency Display Stimulus,0.418,keybert,MOT,No Condition,MOT,MOT Tone 2.0,Standard Deviation,Other,2
+ai-mind-variable-descriptions_in_.csv,43,MOTTC,MOT > MOT Tone 2.0.MOTTC,MOT > MOT Tone 2.0,Task: MOT | Variant: MOT Tone 2.0 | name: MOTTC | description: The total number of assessment trials on which the subject made a correct response. | Decimal Places: 0,MOT The total number of assessment trials on which the subject made a correct response.,determine,Total Assessment Trials,0.313,keybert,MOTT,No Condition,MOT,MOT Tone 2.0,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,44,MOTTE,MOT > MOT Tone 2.0.MOTTE,MOT > MOT Tone 2.0,Task: MOT | Variant: MOT Tone 2.0 | name: MOTTE | description: The total number of assessment trials on which the subject failed to make a correct response. | Decimal Places: 0,MOT The total number of assessment trials on which the subject failed to make a correct response.,determine,Total Assessment Trials,0.313,keybert,MOTT,No Condition,MOT,MOT Tone 2.0,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,45,PALFAMS28,PAL > PAL Recommended Standard Extended.PALFAMS28,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALFAMS28 | description: KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard.. | Decimal Places: 0","KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard..",determine,First Attempt Memory,0.0,singleton_title,PAL,28,PAL,PAL Recommended Standard Extended,Other,Other,0
+ai-mind-variable-descriptions_in_.csv,46,PALMETS28,PAL > PAL Recommended Standard Extended.PALMETS28,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALMETS28 | description: PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0,PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage.  Does not include 12 box level to provide a direct comparison to Recommended Standard.,determine,Mean Errors Success,0.0,singleton_title,PAL,28,PAL,PAL Recommended Standard Extended,Mean,Errors,0
+ai-mind-variable-descriptions_in_.csv,47,PALNPR28,PAL > PAL Recommended Standard Extended.PALNPR28,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALNPR28 | description: PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached. | Decimal Places: 0,PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached.,determine,Patterns Reached,0.0,singleton_title,PAL,28,PAL,PAL Recommended Standard Extended,Other,Other,0
+ai-mind-variable-descriptions_in_.csv,48,PALTA12,PAL > PAL Recommended Standard Extended.PALTA12,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA12 | description: PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall. | Decimal Places: 0,PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,12,PAL,PAL Recommended Standard Extended,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,49,PALTA2,PAL > PAL Recommended Standard Extended.PALTA2,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA2 | description: PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall. | Decimal Places: 0,PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,2,PAL,PAL Recommended Standard Extended,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,50,PALTA28,PAL > PAL Recommended Standard Extended.PALTA28,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA28 | description: PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0,PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems.  Does not include 12 box level to provide a direct comparison to Recommended Standard.,determine,Total Attempts Made,0.605,keybert,PALTA,28,PAL,PAL Recommended Standard Extended,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,51,PALTA4,PAL > PAL Recommended Standard Extended.PALTA4,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA4 | description: PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall. | Decimal Places: 0,PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,4,PAL,PAL Recommended Standard Extended,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,52,PALTA6,PAL > PAL Recommended Standard Extended.PALTA6,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA6 | description: PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall. | Decimal Places: 0,PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,6,PAL,PAL Recommended Standard Extended,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,53,PALTA8,PAL > PAL Recommended Standard Extended.PALTA8,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA8 | description: PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall. | Decimal Places: 0,PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall.,determine,Total Attempts Made,0.605,keybert,PALTA,8,PAL,PAL Recommended Standard Extended,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,54,PALTE12,PAL > PAL Recommended Standard Extended.PALTE12,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE12 | description: PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,12,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
+ai-mind-variable-descriptions_in_.csv,55,PALTE2,PAL > PAL Recommended Standard Extended.PALTE2,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE2 | description: PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,2,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
+ai-mind-variable-descriptions_in_.csv,56,PALTE28,PAL > PAL Recommended Standard Extended.PALTE28,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE28 | description: PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials.  Does not include 12 box level to provide a direct comparison to Recommended Standard.,determine,Errors Patterns Total,0.296,keybert,PALTE,28,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
+ai-mind-variable-descriptions_in_.csv,57,PALTE4,PAL > PAL Recommended Standard Extended.PALTE4,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE4 | description: PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,4,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
+ai-mind-variable-descriptions_in_.csv,58,PALTE6,PAL > PAL Recommended Standard Extended.PALTE6,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE6 | description: PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,6,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
+ai-mind-variable-descriptions_in_.csv,59,PALTE8,PAL > PAL Recommended Standard Extended.PALTE8,PAL > PAL Recommended Standard Extended,Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE8 | description: PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials. | Decimal Places: 0,chose the incorrect box for a stimulus on assessment problems PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials.,determine,Errors Patterns Total,0.296,keybert,PALTE,8,PAL,PAL Recommended Standard Extended,Total,Incorrect,0
+ai-mind-variable-descriptions_in_.csv,60,PALTEA12,PAL > PAL Recommended Standard Extended.PALTEA12,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA12 | description: PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,12,PAL,PAL Recommended Standard Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,61,PALTEA2,PAL > PAL Recommended Standard Extended.PALTEA2,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA2 | description: PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,2,PAL,PAL Recommended Standard Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,62,PALTEA28,PAL > PAL Recommended Standard Extended.PALTEA28,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA28 | description: KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task.  In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0","KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task.  In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard.",determine,Total Errors,0.0,singleton_title,PALTEA,28,PAL,PAL Recommended Standard Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,63,PALTEA4,PAL > PAL Recommended Standard Extended.PALTEA4,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA4 | description: PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,4,PAL,PAL Recommended Standard Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,64,PALTEA6,PAL > PAL Recommended Standard Extended.PALTEA6,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA6 | description: PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,6,PAL,PAL Recommended Standard Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,65,PALTEA8,PAL > PAL Recommended Standard Extended.PALTEA8,PAL > PAL Recommended Standard Extended,"Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA8 | description: PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0","does not include PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach.",determine,Include Total Errors Shapes,0.609,description_title,PALTEA,8,PAL,PAL Recommended Standard Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,66,PRMCLSDD,PRM > PRM Recommended Standard 18 Extended.PRMCLSDD,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDD | description: PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMCLSD,No Condition,PRM,PRM Recommended Standard 18 Extended,Standard Deviation,Other,2
+ai-mind-variable-descriptions_in_.csv,67,PRMCLSDI,PRM > PRM Recommended Standard 18 Extended.PRMCLSDI,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDI | description: PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMCLSD,No Condition,PRM,PRM Recommended Standard 18 Extended,Standard Deviation,Other,2
+ai-mind-variable-descriptions_in_.csv,68,PRMMCLD,PRM > PRM Recommended Standard 18 Extended.PRMMCLD,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLD | description: PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMMCL,No Condition,PRM,PRM Recommended Standard 18 Extended,Mean,Other,2
+ai-mind-variable-descriptions_in_.csv,69,PRMMCLI,PRM > PRM Recommended Standard 18 Extended.PRMMCLI,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLI | description: PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMMCL,No Condition,PRM,PRM Recommended Standard 18 Extended,Mean,Other,2
+ai-mind-variable-descriptions_in_.csv,70,PRMMDCLD,PRM > PRM Recommended Standard 18 Extended.PRMMDCLD,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLD | description: PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMMDCL,No Condition,PRM,PRM Recommended Standard 18 Extended,Median,Other,2
+ai-mind-variable-descriptions_in_.csv,71,PRMMDCLI,PRM > PRM Recommended Standard 18 Extended.PRMMDCLI,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLI | description: PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2","PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds.",determine,Latency Immediate Standard,0.653,keybert,PRMMDCL,No Condition,PRM,PRM Recommended Standard 18 Extended,Median,Other,2
+ai-mind-variable-descriptions_in_.csv,72,PRMPCD,PRM > PRM Recommended Standard 18 Extended.PRMPCD,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCD | description: KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage. | Decimal Places: 2","KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage.",determine,Percent Correct Immediate,0.596,keybert,PRMPC,No Condition,PRM,PRM Recommended Standard 18 Extended,Percent,Other,2
+ai-mind-variable-descriptions_in_.csv,73,PRMPCI,PRM > PRM Recommended Standard 18 Extended.PRMPCI,PRM > PRM Recommended Standard 18 Extended,"Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCI | description: KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage. | Decimal Places: 2","KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage.",determine,Percent Correct Immediate,0.596,keybert,PRMPC,No Condition,PRM,PRM Recommended Standard 18 Extended,Percent,Other,2
+ai-mind-variable-descriptions_in_.csv,74,PRMTSDSP,PRM > PRM Recommended Standard 18 Extended.PRMTSDSP,PRM > PRM Recommended Standard 18 Extended,Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMTSDSP | description: PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition. | Decimal Places: 2,PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition.,determine,Time Since Delayed Stimuli,0.0,singleton_title,PRM,No Condition,PRM,PRM Recommended Standard 18 Extended,Other,Other,2
+ai-mind-variable-descriptions_in_.csv,75,RVPA,RVP > RVP 3 Targets.RVPA,RVP > RVP 3 Targets,"Task: RVP | Variant: RVP 3 Targets | name: RVPA | description: KEY: RVP A?: A? (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences. | Decimal Places: 4","KEY: RVP A?: A? (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences.",determine,Detection Measure,0.0,singleton_keybert,RVP,No Condition,RVP,RVP 3 Targets,Range,Other,4
+ai-mind-variable-descriptions_in_.csv,76,RVPLSD,RVP > RVP 3 Targets.RVPLSD,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPLSD | description: RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4,RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials.,determine,Response Latency Mean,0.676,keybert,RVP,No Condition,RVP,RVP 3 Targets,Standard Deviation,Other,4
+ai-mind-variable-descriptions_in_.csv,77,RVPMDL,RVP > RVP 3 Targets.RVPMDL,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPMDL | description: KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4,KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials.,determine,Response Latency Mean,0.676,keybert,RVPM,No Condition,RVP,RVP 3 Targets,Median,Other,4
+ai-mind-variable-descriptions_in_.csv,78,RVPML,RVP > RVP 3 Targets.RVPML,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPML | description: RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4,RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials.,determine,Response Latency Mean,0.676,keybert,RVPM,No Condition,RVP,RVP 3 Targets,Mean,Other,4
+ai-mind-variable-descriptions_in_.csv,79,RVPPFA,RVP > RVP 3 Targets.RVPPFA,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPPFA | description: KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms ÷ (False Alarms + Correct Rejections)) | Decimal Places: 4,KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms ÷ (False Alarms + Correct Rejections)),determine,Total,0.407,description_title,RVPP,No Condition,RVP,RVP 3 Targets,Probability,False Alarm,4
+ai-mind-variable-descriptions_in_.csv,80,RVPPH,RVP > RVP 3 Targets.RVPPH,RVP > RVP 3 Targets,"Task: RVP | Variant: RVP 3 Targets | name: RVPPH | description: RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits ÷ total number of sequences) | Decimal Places: 4","RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits ÷ total number of sequences)",determine,Total,0.407,description_title,RVPP,No Condition,RVP,RVP 3 Targets,Probability,Other,4
+ai-mind-variable-descriptions_in_.csv,81,RVPTFA,RVP > RVP 3 Targets.RVPTFA,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPTFA | description: RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms. | Decimal Places: 0,RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms.,determine,Total,0.407,description_title,RVPT,No Condition,RVP,RVP 3 Targets,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,82,RVPTH,RVP > RVP 3 Targets.RVPTH,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPTH | description: RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks. | Decimal Places: 0,RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks.,determine,Total,0.407,description_title,RVPT,No Condition,RVP,RVP 3 Targets,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,83,RVPTM,RVP > RVP 3 Targets.RVPTM,RVP > RVP 3 Targets,Task: RVP | Variant: RVP 3 Targets | name: RVPTM | description: RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks. | Decimal Places: 0,RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks.,determine,Total,0.407,description_title,RVPT,No Condition,RVP,RVP 3 Targets,Total,Other,0
+ai-mind-variable-descriptions_in_.csv,84,SWMBE12,SWM > SWM Recommended Standard 2.0 Extended.SWMBE12,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE12 | description: KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only. | Decimal Places: 0,KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only.,determine,Errors Boxes Times,0.515,keybert,SWMBE,12,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,85,SWMBE4,SWM > SWM Recommended Standard 2.0 Extended.SWMBE4,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE4 | description: KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only. | Decimal Places: 0,KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only.,determine,Errors Boxes Times,0.515,keybert,SWMBE,4,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,86,SWMBE468,SWM > SWM Recommended Standard 2.0 Extended.SWMBE468,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE468 | description: KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0","KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials.",determine,Errors Boxes Times,0.515,keybert,SWMBE,468,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,87,SWMBE6,SWM > SWM Recommended Standard 2.0 Extended.SWMBE6,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE6 | description: KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only. | Decimal Places: 0,KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only.,determine,Errors Boxes Times,0.515,keybert,SWMBE,6,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,88,SWMBE8,SWM > SWM Recommended Standard 2.0 Extended.SWMBE8,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE8 | description: KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only. | Decimal Places: 0,KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only.,determine,Errors Boxes Times,0.515,keybert,SWMBE,8,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,89,SWMDE12,SWM > SWM Recommended Standard 2.0 Extended.SWMDE12,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE12 | description: SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only. | Decimal Places: 0,SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only.,determine,Double Errors Boxes,0.594,description_title,SWMDE,12,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,90,SWMDE4,SWM > SWM Recommended Standard 2.0 Extended.SWMDE4,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE4 | description: SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only. | Decimal Places: 0,SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only.,determine,Double Errors Boxes,0.594,description_title,SWMDE,4,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,91,SWMDE468,SWM > SWM Recommended Standard 2.0 Extended.SWMDE468,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE468 | description: SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0","SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials.",determine,Double Errors,0.0,singleton_title,SWMDE,468,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,92,SWMDE6,SWM > SWM Recommended Standard 2.0 Extended.SWMDE6,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE6 | description: SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only. | Decimal Places: 0,SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only.,determine,Double Errors Boxes,0.594,description_title,SWMDE,6,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,93,SWMDE8,SWM > SWM Recommended Standard 2.0 Extended.SWMDE8,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE8 | description: SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only. | Decimal Places: 0,SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only.,determine,Double Errors Boxes,0.594,description_title,SWMDE,8,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,94,SWMPR,SWM > SWM Recommended Standard 2.0 Extended.SWMPR,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMPR | description: SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete. | Decimal Places: 0","SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete.",determine,Problem Reached,0.0,singleton_title,SWM,No Condition,SWM,SWM Recommended Standard 2.0 Extended,Other,Other,0
+ai-mind-variable-descriptions_in_.csv,95,SWMS,SWM > SWM Recommended Standard 2.0 Extended.SWMS,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS | description: KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens. | Decimal Places: 0","KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens.",determine,Strategy High,0.569,keybert,SWMS,No Condition,SWM,SWM Recommended Standard 2.0 Extended,Other,Other,0
+ai-mind-variable-descriptions_in_.csv,96,SWMS6,SWM > SWM Recommended Standard 2.0 Extended.SWMS6,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS6 | description: SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. | Decimal Places: 0","SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes.",determine,Strategy High,0.569,keybert,SWMS,6,SWM,SWM Recommended Standard 2.0 Extended,Other,Other,0
+ai-mind-variable-descriptions_in_.csv,97,SWMSX,SWM > SWM Recommended Standard 2.0 Extended.SWMSX,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMSX | description: SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more. | Decimal Places: 0","SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more.",determine,Strategy High,0.569,keybert,SWMS,No Condition,SWM,SWM Recommended Standard 2.0 Extended,Other,Other,0
+ai-mind-variable-descriptions_in_.csv,98,SWMTE12,SWM > SWM Recommended Standard 2.0 Extended.SWMTE12,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE12 | description: SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only. | Decimal Places: 0","SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only.",determine,Errors Total,0.593,keybert,SWMTE,12,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,99,SWMTE4,SWM > SWM Recommended Standard 2.0 Extended.SWMTE4,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE4 | description: SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only. | Decimal Places: 0","SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only.",determine,Errors Total,0.593,keybert,SWMTE,4,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,100,SWMTE468,SWM > SWM Recommended Standard 2.0 Extended.SWMTE468,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE468 | description: SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0","SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials.",determine,Errors Total,0.593,keybert,SWMTE,468,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,101,SWMTE6,SWM > SWM Recommended Standard 2.0 Extended.SWMTE6,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE6 | description: SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only. | Decimal Places: 0","SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only.",determine,Errors Total,0.593,keybert,SWMTE,6,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,102,SWMTE8,SWM > SWM Recommended Standard 2.0 Extended.SWMTE8,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE8 | description: SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only. | Decimal Places: 0","SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only.",determine,Errors Total,0.593,keybert,SWMTE,8,SWM,SWM Recommended Standard 2.0 Extended,Total,Errors,0
+ai-mind-variable-descriptions_in_.csv,103,SWMWE12,SWM > SWM Recommended Standard 2.0 Extended.SWMWE12,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE12 | description: SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only. | Decimal Places: 0,SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only.,determine,Within Errors,0.412,keybert,SWMWE,12,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,104,SWMWE4,SWM > SWM Recommended Standard 2.0 Extended.SWMWE4,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE4 | description: SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only. | Decimal Places: 0,SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only.,determine,Within Errors,0.412,keybert,SWMWE,4,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,105,SWMWE468,SWM > SWM Recommended Standard 2.0 Extended.SWMWE468,SWM > SWM Recommended Standard 2.0 Extended,"Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE468 | description: SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0","SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials.",determine,Within Errors,0.0,singleton_title,SWMWE,468,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,106,SWMWE6,SWM > SWM Recommended Standard 2.0 Extended.SWMWE6,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE6 | description: SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only. | Decimal Places: 0,SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only.,determine,Within Errors,0.412,keybert,SWMWE,6,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0
+ai-mind-variable-descriptions_in_.csv,107,SWMWE8,SWM > SWM Recommended Standard 2.0 Extended.SWMWE8,SWM > SWM Recommended Standard 2.0 Extended,Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE8 | description: SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only. | Decimal Places: 0,SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only.,determine,Within Errors,0.412,keybert,SWMWE,8,SWM,SWM Recommended Standard 2.0 Extended,Other,Errors,0

version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_concept_labels.csv ADDED Viewed

	@@ -0,0 +1,21 @@

+Node,Confidence,Source,Embedding sim,Alternatives
+Total Correct,0.507,description_title,0.319,"correct total, correct total times"
+Error,0.447,description_title,0.216,"error times subject, error times, failed"
+Mean Latency,0.625,keybert,0.676,latency mean
+Errors Total,0.604,keybert,0.543,"errors total times, Total Errors"
+Standard Deviation,0.687,description_title,0.684,"latency standard deviation, deviation response latencies"
+Probability Error Occurring,0.619,keybert,0.578,"Probability Error, probability error made, reports probability error"
+Percent Correct Percentage,0.54,keybert,0.473,"correct percentage assessment, correct percentage, Percent Correct"
+Latency Display Stimulus,0.418,keybert,0.732,"mean latency display, standard deviation latency, deviation latency calculated"
+Total Assessment Trials,0.313,keybert,0.629,"assessment trials subject, trials subject failed, trials subject"
+Total Attempts Made,0.605,keybert,0.535,attempts total
+Errors Patterns Total,0.296,keybert,0.619,"box stimulus assessment, stimulus assessment problems, incorrect box stimulus"
+Include Shapes,0.609,description_title,0.549,"total errors shapes, errors shapes times, errors shapes"
+Latency Immediate Standard,0.653,keybert,0.715,"correct latency immediate, latency immediate, correct latency delayed"
+Percent Correct Immediate,0.596,keybert,0.671,"Percent Correct, key percent correct, percent correct delayed"
+Total,0.407,description_title,0.111,"total hits, hits total"
+Response Latency Mean,0.676,keybert,0.683,"Response Latency, response latency trials, latency mean response"
+Times Errors,0.515,keybert,0.447,"Errors Boxes, key errors boxes, errors times"
+Strategy High,0.569,keybert,0.509,"Strategy, strategy finding, high strategy"
+Within Errors,0.412,keybert,0.303,boxes times subject
+Errors Total,0.593,keybert,0.537,"errors total times, Total Errors"

version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_facets.json ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/outputs/approach_1/ai-mind-variable-descriptions_in__approach1_hierarchy.json ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/outputs/approach_2/HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/outputs/approach_2/ai-mind-variable-descriptions_in__approach2_lod.json ADDED Viewed

	@@ -0,0 +1,2716 @@

+[
+  {
+    "id": 0,
+    "name": "project",
+    "related": [
+      109,
+      120,
+      124,
+      130,
+      136,
+      143
+    ],
+    "type": "root",
+    "desc": "Root node",
+    "dtype": "determine",
+    "isShown": true,
+    "post_build_stats": {
+      "sibling_factor_nodes_inserted": 0,
+      "low_quality_nodes_dissolved": 0,
+      "group_prefix_labels_stripped": 6,
+      "dag_links_removed": 9
+    }
+  },
+  {
+    "id": 1,
+    "name": "DMSCC",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSCC | description: DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays). | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 0,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 2,
+    "name": "DMSL0SD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL0SD | description: DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 1,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 3,
+    "name": "DMSL12SD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL12SD | description: DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 2,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 4,
+    "name": "DMSL4SD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL4SD | description: DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 3,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 5,
+    "name": "DMSLADSD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLADSD | description: DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 4,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 6,
+    "name": "DMSLSD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLSD | description: DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 5,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 7,
+    "name": "DMSLSSD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLSSD | description: DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 6,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 8,
+    "name": "DMSMDL",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL | description: DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 7,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 9,
+    "name": "DMSMDL0",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL0 | description: DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 8,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 10,
+    "name": "DMSMDL12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL12 | description: DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 9,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 11,
+    "name": "DMSMDL4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL4 | description: DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 10,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 12,
+    "name": "DMSMDLAD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLAD | description: DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 11,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 13,
+    "name": "DMSMDLS",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLS | description: DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 12,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 14,
+    "name": "DMSML",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML | description: DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 13,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 15,
+    "name": "DMSML0",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML0 | description: DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 14,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 16,
+    "name": "DMSML12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML12 | description: DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 15,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 17,
+    "name": "DMSML4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML4 | description: DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 16,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 18,
+    "name": "DMSMLAD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMLAD | description: DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 17,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 19,
+    "name": "DMSMLS",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMLS | description: DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 18,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 20,
+    "name": "DMSPC",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC | description: DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 19,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 21,
+    "name": "DMSPC0",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC0 | description: KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 20,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 22,
+    "name": "DMSPC12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC12 | description: KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 21,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 23,
+    "name": "DMSPC4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC4 | description: KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 22,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 24,
+    "name": "DMSPCAD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPCAD | description: KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 23,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 25,
+    "name": "DMSPCS",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPCS | description: KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 24,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 26,
+    "name": "DMSPEGC",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGC | description: DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 25,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 27,
+    "name": "DMSPEGE",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGE | description: KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 26,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 28,
+    "name": "DMSTC",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC | description: DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 27,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 29,
+    "name": "DMSTC0",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC0 | description: DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 28,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 30,
+    "name": "DMSTC12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC12 | description: DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 29,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 31,
+    "name": "DMSTC4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC4 | description: DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 30,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 32,
+    "name": "DMSTCAD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTCAD | description: DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 31,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 33,
+    "name": "DMSTCS",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTCS | description: DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 32,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 34,
+    "name": "DMSTE",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTE | description: DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 33,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 35,
+    "name": "DMSTEAD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEAD | description: DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 34,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 36,
+    "name": "DMSTEC",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEC | description: DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 35,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 37,
+    "name": "DMSTECAD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTECAD | description: DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 36,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 38,
+    "name": "DMSTED",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTED | description: DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 37,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 39,
+    "name": "DMSTEDAD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEDAD | description: DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 38,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 40,
+    "name": "DMSTEP",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEP | description: DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 39,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 41,
+    "name": "DMSTEPAD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEPAD | description: DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 40,
+      "group": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 42,
+    "name": "MOTML",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTML | description: The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 1",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 41,
+      "group": "MOT > MOT Tone 2.0"
+    }
+  },
+  {
+    "id": 43,
+    "name": "MOTSDL",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTSDL | description: This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 42,
+      "group": "MOT > MOT Tone 2.0"
+    }
+  },
+  {
+    "id": 44,
+    "name": "MOTTC",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTTC | description: The total number of assessment trials on which the subject made a correct response. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 43,
+      "group": "MOT > MOT Tone 2.0"
+    }
+  },
+  {
+    "id": 45,
+    "name": "MOTTE",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTTE | description: The total number of assessment trials on which the subject failed to make a correct response. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 44,
+      "group": "MOT > MOT Tone 2.0"
+    }
+  },
+  {
+    "id": 46,
+    "name": "PALFAMS28",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALFAMS28 | description: KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 45,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 47,
+    "name": "PALMETS28",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALMETS28 | description: PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 46,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 48,
+    "name": "PALNPR28",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALNPR28 | description: PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 47,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 49,
+    "name": "PALTA12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA12 | description: PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 48,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 50,
+    "name": "PALTA2",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA2 | description: PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 49,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 51,
+    "name": "PALTA28",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA28 | description: PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 50,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 52,
+    "name": "PALTA4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA4 | description: PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 51,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 53,
+    "name": "PALTA6",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA6 | description: PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 52,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 54,
+    "name": "PALTA8",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA8 | description: PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 53,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 55,
+    "name": "PALTE12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE12 | description: PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 54,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 56,
+    "name": "PALTE2",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE2 | description: PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 55,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 57,
+    "name": "PALTE28",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE28 | description: PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 56,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 58,
+    "name": "PALTE4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE4 | description: PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 57,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 59,
+    "name": "PALTE6",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE6 | description: PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 58,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 60,
+    "name": "PALTE8",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE8 | description: PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 59,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 61,
+    "name": "PALTEA12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA12 | description: PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 60,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 62,
+    "name": "PALTEA2",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA2 | description: PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 61,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 63,
+    "name": "PALTEA28",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA28 | description: KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task.  In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 62,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 64,
+    "name": "PALTEA4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA4 | description: PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 63,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 65,
+    "name": "PALTEA6",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA6 | description: PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 64,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 66,
+    "name": "PALTEA8",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA8 | description: PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 65,
+      "group": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 67,
+    "name": "PRMCLSDD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDD | description: PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 66,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 68,
+    "name": "PRMCLSDI",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDI | description: PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 67,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 69,
+    "name": "PRMMCLD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLD | description: PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 68,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 70,
+    "name": "PRMMCLI",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLI | description: PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 69,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 71,
+    "name": "PRMMDCLD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLD | description: PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 70,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 72,
+    "name": "PRMMDCLI",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLI | description: PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 71,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 73,
+    "name": "PRMPCD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCD | description: KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 72,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 74,
+    "name": "PRMPCI",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCI | description: KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 73,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 75,
+    "name": "PRMTSDSP",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMTSDSP | description: PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition. | Decimal Places: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 74,
+      "group": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 76,
+    "name": "RVPA",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPA | description: KEY: RVP A?: A? (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 75,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 77,
+    "name": "RVPLSD",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPLSD | description: RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 76,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 78,
+    "name": "RVPMDL",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPMDL | description: KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 77,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 79,
+    "name": "RVPML",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPML | description: RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 78,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 80,
+    "name": "RVPPFA",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPPFA | description: KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms ÷ (False Alarms + Correct Rejections)) | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 79,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 81,
+    "name": "RVPPH",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPPH | description: RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits ÷ total number of sequences) | Decimal Places: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 80,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 82,
+    "name": "RVPTFA",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTFA | description: RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 81,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 83,
+    "name": "RVPTH",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTH | description: RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 82,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 84,
+    "name": "RVPTM",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTM | description: RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 83,
+      "group": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 85,
+    "name": "SWMBE12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE12 | description: KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 84,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 86,
+    "name": "SWMBE4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE4 | description: KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 85,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 87,
+    "name": "SWMBE468",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE468 | description: KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 86,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 88,
+    "name": "SWMBE6",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE6 | description: KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 87,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 89,
+    "name": "SWMBE8",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE8 | description: KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 88,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 90,
+    "name": "SWMDE12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE12 | description: SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 89,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 91,
+    "name": "SWMDE4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE4 | description: SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 90,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 92,
+    "name": "SWMDE468",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE468 | description: SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 91,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 93,
+    "name": "SWMDE6",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE6 | description: SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 92,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 94,
+    "name": "SWMDE8",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE8 | description: SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 93,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 95,
+    "name": "SWMPR",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMPR | description: SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 94,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 96,
+    "name": "SWMS",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS | description: KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 95,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 97,
+    "name": "SWMS6",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS6 | description: SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 96,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 98,
+    "name": "SWMSX",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMSX | description: SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 97,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 99,
+    "name": "SWMTE12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE12 | description: SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 98,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 100,
+    "name": "SWMTE4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE4 | description: SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 99,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 101,
+    "name": "SWMTE468",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE468 | description: SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 100,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 102,
+    "name": "SWMTE6",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE6 | description: SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 101,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 103,
+    "name": "SWMTE8",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE8 | description: SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 102,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 104,
+    "name": "SWMWE12",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE12 | description: SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 103,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 105,
+    "name": "SWMWE4",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE4 | description: SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 104,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 106,
+    "name": "SWMWE468",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE468 | description: SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 105,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 107,
+    "name": "SWMWE6",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE6 | description: SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 106,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 108,
+    "name": "SWMWE8",
+    "related": [],
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE8 | description: SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
+    "dtype": "determine",
+    "isShown": true,
+    "metadata": {
+      "row_index": 107,
+      "group": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 109,
+    "name": "DMS",
+    "related": [
+      110
+    ],
+    "type": "aggregation",
+    "desc": "Group: DMS",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 110,
+    "name": "Recommended Standard",
+    "related": [
+      111,
+      115,
+      117,
+      118,
+      119,
+      1,
+      14,
+      26
+    ],
+    "type": "aggregation",
+    "desc": "Group: DMS > DMS Recommended Standard",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null,
+      "phrase_regularity": 1.0,
+      "route_used": "per_row_llm_extraction"
+    }
+  },
+  {
+    "id": 111,
+    "name": "Correct Latency",
+    "related": [
+      112,
+      113,
+      114
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Correct Latency\" | Variables: 17 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Correct Latency"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 112,
+    "name": "Standard Deviation",
+    "related": [
+      2,
+      3,
+      4,
+      5,
+      7,
+      6
+    ],
+    "type": "aggregation",
+    "desc": "Role: statistic | Value: \"Standard Deviation\" | Variables: 6 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Standard Deviation"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "statistic"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "statistic",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 113,
+    "name": "Median",
+    "related": [
+      9,
+      10,
+      11,
+      12,
+      13,
+      8
+    ],
+    "type": "aggregation",
+    "desc": "Role: statistic | Value: \"Median\" | Variables: 6 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Median"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "statistic"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "statistic",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 114,
+    "name": "Mean",
+    "related": [
+      15,
+      16,
+      17,
+      18,
+      19
+    ],
+    "type": "aggregation",
+    "desc": "Role: statistic | Value: \"Mean\" | Variables: 5 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Mean"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "statistic"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "statistic",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 115,
+    "name": "Error",
+    "related": [
+      116,
+      27,
+      36,
+      38,
+      40
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Error\" | Variables: 7 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Error"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 116,
+    "name": "All Delays",
+    "related": [
+      37,
+      39,
+      41
+    ],
+    "type": "aggregation",
+    "desc": "Role: condition | Value: \"all delays\" | Variables: 3 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "all delays"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "condition"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "condition",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 117,
+    "name": "Percent Correct",
+    "related": [
+      20,
+      21,
+      22,
+      23,
+      24,
+      25
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Percent Correct\" | Variables: 6 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Percent Correct"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 118,
+    "name": "Correct",
+    "related": [
+      29,
+      30,
+      31,
+      32,
+      33,
+      28
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Correct\" | Variables: 6 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Correct"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 119,
+    "name": "Errors",
+    "related": [
+      34,
+      35
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Errors\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Errors"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 120,
+    "name": "MOT",
+    "related": [
+      121
+    ],
+    "type": "aggregation",
+    "desc": "Group: MOT",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 121,
+    "name": "Tone 2.0",
+    "related": [
+      122,
+      123
+    ],
+    "type": "aggregation",
+    "desc": "Group: MOT > MOT Tone 2.0",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null,
+      "phrase_regularity": 1.0,
+      "route_used": "per_row_llm_extraction"
+    }
+  },
+  {
+    "id": 122,
+    "name": "Latency",
+    "related": [
+      42,
+      43
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"latency\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "latency"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 123,
+    "name": "Assessment Trials",
+    "related": [
+      44,
+      45
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"assessment trials\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "assessment trials"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 124,
+    "name": "PAL",
+    "related": [
+      125
+    ],
+    "type": "aggregation",
+    "desc": "Group: PAL",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 125,
+    "name": "Recommended Standard Extended",
+    "related": [
+      126,
+      128,
+      46,
+      52,
+      65,
+      66,
+      48
+    ],
+    "type": "aggregation",
+    "desc": "Group: PAL > PAL Recommended Standard Extended",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null,
+      "phrase_regularity": 1.0,
+      "route_used": "per_row_llm_extraction"
+    }
+  },
+  {
+    "id": 126,
+    "name": "Errors",
+    "related": [
+      127,
+      47
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Errors\" | Variables: 11 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Errors"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 127,
+    "name": "Total",
+    "related": [
+      55,
+      56,
+      58,
+      59,
+      60,
+      61,
+      62,
+      63,
+      64,
+      57
+    ],
+    "type": "aggregation",
+    "desc": "Role: statistic | Value: \"Total\" | Variables: 10 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Total"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "statistic"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "statistic",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 128,
+    "name": "Attempts",
+    "related": [
+      129,
+      51
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Attempts\" | Variables: 5 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Attempts"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 129,
+    "name": "Total",
+    "related": [
+      49,
+      50,
+      53,
+      54
+    ],
+    "type": "aggregation",
+    "desc": "Role: statistic | Value: \"Total\" | Variables: 4 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Total"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "statistic"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "statistic",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 130,
+    "name": "PRM",
+    "related": [
+      131
+    ],
+    "type": "aggregation",
+    "desc": "Group: PRM",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 131,
+    "name": "Recommended Standard 18 Extended",
+    "related": [
+      132,
+      134,
+      135,
+      75
+    ],
+    "type": "aggregation",
+    "desc": "Group: PRM > PRM Recommended Standard 18 Extended",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null,
+      "phrase_regularity": 1.0,
+      "route_used": "per_row_llm_extraction"
+    }
+  },
+  {
+    "id": 132,
+    "name": "Latency",
+    "related": [
+      133,
+      67,
+      68
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"latency\" | Variables: 4 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "latency"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 133,
+    "name": "Median",
+    "related": [
+      71,
+      72
+    ],
+    "type": "aggregation",
+    "desc": "Role: statistic | Value: \"median\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "median"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "statistic"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "statistic",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 134,
+    "name": "Latency",
+    "related": [
+      69,
+      70
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Latency\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Latency"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 135,
+    "name": "Percent Correct",
+    "related": [
+      73,
+      74
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Percent Correct\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Percent Correct"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 136,
+    "name": "RVP",
+    "related": [
+      137
+    ],
+    "type": "aggregation",
+    "desc": "Group: RVP",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 137,
+    "name": "3 Targets",
+    "related": [
+      138,
+      139,
+      142
+    ],
+    "type": "aggregation",
+    "desc": "Group: RVP > RVP 3 Targets",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null,
+      "phrase_regularity": 0.2222,
+      "route_used": "aspect_clustering_fallback"
+    }
+  },
+  {
+    "id": 138,
+    "name": "Response Latency",
+    "related": [
+      77,
+      78,
+      79
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Response Latency\" | Variables: 3 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Response Latency"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 139,
+    "name": "False Alarm Number",
+    "related": [
+      76,
+      140,
+      141
+    ],
+    "type": "aggregation",
+    "desc": "Aspect: trials / latency / response latency / response | Silhouette: 0.626 | Variables: 6",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "llm",
+      "evidence_terms": [
+        "false",
+        "alarms",
+        "number"
+      ],
+      "confidence": 0.97,
+      "llm_used": true,
+      "llm_rejected": false,
+      "llm_raw_label": "False Alarm Number",
+      "llm_reason": "accepted"
+    },
+    "structure_provenance": {
+      "route": "aspect_clustering",
+      "aspect_method": "nmf",
+      "silhouette": 0.6255,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 140,
+    "name": "False Alarm Presentations",
+    "related": [
+      80,
+      82
+    ],
+    "type": "aggregation",
+    "desc": "Aspect: trials / latency / response latency / response | Silhouette: 0.601 | Variables: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "llm",
+      "evidence_terms": [
+        "false",
+        "alarms",
+        "presentations"
+      ],
+      "confidence": 0.97,
+      "llm_used": true,
+      "llm_rejected": false,
+      "llm_raw_label": "False Alarm Presentations",
+      "llm_reason": "accepted"
+    },
+    "structure_provenance": {
+      "route": "aspect_clustering",
+      "aspect_method": "nmf",
+      "silhouette": 0.6006,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 141,
+    "name": "Target Hit Number",
+    "related": [
+      81,
+      83,
+      84
+    ],
+    "type": "aggregation",
+    "desc": "Aspect: trials / latency / response latency / response | Silhouette: 0.601 | Variables: 3",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "llm",
+      "evidence_terms": [
+        "number",
+        "target",
+        "hits"
+      ],
+      "confidence": 0.92,
+      "llm_used": true,
+      "llm_rejected": false,
+      "llm_raw_label": "Target Hit Number",
+      "llm_reason": "accepted"
+    },
+    "structure_provenance": {
+      "route": "aspect_clustering",
+      "aspect_method": "nmf",
+      "silhouette": 0.6006,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 142,
+    "name": "RVP Response Latency",
+    "related": [],
+    "type": "aggregation",
+    "desc": "Aspect: trials / latency / response latency / response | Silhouette: 0.626 | Variables: 3",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "llm",
+      "evidence_terms": [
+        "response latency",
+        "latency",
+        "response"
+      ],
+      "confidence": 0.97,
+      "llm_used": true,
+      "llm_rejected": false,
+      "llm_raw_label": "RVP Response Latency",
+      "llm_reason": "accepted"
+    },
+    "structure_provenance": {
+      "route": "aspect_clustering",
+      "aspect_method": "nmf",
+      "silhouette": 0.6255,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 143,
+    "name": "SWM",
+    "related": [
+      144
+    ],
+    "type": "aggregation",
+    "desc": "Group: SWM",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 144,
+    "name": "Recommended Standard 2.0 Extended",
+    "related": [
+      145,
+      152,
+      153,
+      87,
+      92,
+      95,
+      97
+    ],
+    "type": "aggregation",
+    "desc": "Group: SWM > SWM Recommended Standard 2.0 Extended",
+    "dtype": "determine",
+    "isShown": true,
+    "structure_provenance": {
+      "route": "group_anchor",
+      "aspect_method": null,
+      "silhouette": null,
+      "slot_coverage": null,
+      "phrase_regularity": 1.0,
+      "route_used": "per_row_llm_extraction"
+    }
+  },
+  {
+    "id": 145,
+    "name": "Errors",
+    "related": [
+      146,
+      147,
+      148,
+      151
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"errors\" | Variables: 16 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "errors"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 146,
+    "name": "Total",
+    "related": [
+      99,
+      100,
+      101
+    ],
+    "type": "aggregation",
+    "desc": "Role: statistic | Value: \"total\" | Variables: 3 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "total"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "statistic"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "statistic",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 147,
+    "name": "Total",
+    "related": [
+      102,
+      103
+    ],
+    "type": "aggregation",
+    "desc": "Role: statistic | Value: \"Total\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Total"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "statistic"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "statistic",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 148,
+    "name": "SWM box revisits",
+    "related": [
+      149,
+      150
+    ],
+    "type": "aggregation",
+    "desc": "Aspect: swm / token previously / description key / box token | Silhouette: 0.794 | Variables: 7",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "llm",
+      "evidence_terms": [
+        "swm",
+        "box",
+        "revisits"
+      ],
+      "confidence": 0.95,
+      "llm_used": true,
+      "llm_rejected": false,
+      "llm_raw_label": "SWM box revisits",
+      "llm_reason": "accepted"
+    },
+    "structure_provenance": {
+      "route": "aspect_clustering",
+      "aspect_method": "nmf",
+      "silhouette": 0.7936,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 149,
+    "name": "SWM token previously",
+    "related": [
+      86,
+      88
+    ],
+    "type": "aggregation",
+    "desc": "Aspect: swm / token previously / description key / box token | Silhouette: 0.731 | Variables: 2",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "llm",
+      "evidence_terms": [
+        "token",
+        "previously",
+        "key",
+        "swm"
+      ],
+      "confidence": 0.92,
+      "llm_used": true,
+      "llm_rejected": false,
+      "llm_raw_label": "SWM token previously",
+      "llm_reason": "accepted"
+    },
+    "structure_provenance": {
+      "route": "aspect_clustering",
+      "aspect_method": "nmf",
+      "silhouette": 0.7313,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 150,
+    "name": "SWM Within Errors Box Revisits",
+    "related": [
+      104,
+      105,
+      106,
+      107,
+      108
+    ],
+    "type": "aggregation",
+    "desc": "Aspect: swm / token previously / description key / box token | Silhouette: 0.731 | Variables: 5",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "llm",
+      "evidence_terms": [
+        "search",
+        "box search",
+        "description swm",
+        "swmwe4",
+        "swmwe8",
+        "swmwe6",
+        "extended swmwe8",
+        "extended swmwe6"
+      ],
+      "confidence": 0.92,
+      "llm_used": true,
+      "llm_rejected": false,
+      "llm_raw_label": "SWM Within Errors Box Revisits",
+      "llm_reason": "accepted"
+    },
+    "structure_provenance": {
+      "route": "aspect_clustering",
+      "aspect_method": "nmf",
+      "silhouette": 0.7313,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 151,
+    "name": "SWM Double Errors",
+    "related": [
+      90,
+      91,
+      93,
+      94
+    ],
+    "type": "aggregation",
+    "desc": "Aspect: swm / token previously / description key / box token | Silhouette: 0.794 | Variables: 4",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "llm",
+      "evidence_terms": [
+        "swm double",
+        "double errors",
+        "error"
+      ],
+      "confidence": 0.98,
+      "llm_used": true,
+      "llm_rejected": false,
+      "llm_raw_label": "SWM Double Errors",
+      "llm_reason": "accepted"
+    },
+    "structure_provenance": {
+      "route": "aspect_clustering",
+      "aspect_method": "nmf",
+      "silhouette": 0.7936,
+      "slot_coverage": null
+    }
+  },
+  {
+    "id": 152,
+    "name": "Between Errors",
+    "related": [
+      85,
+      89
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"Between errors\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "Between errors"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  },
+  {
+    "id": 153,
+    "name": "Swm Strategy",
+    "related": [
+      96,
+      98
+    ],
+    "type": "aggregation",
+    "desc": "Role: measure | Value: \"SWM Strategy\" | Variables: 2 | Source: per-row LLM extraction (Zhu et al. 2025)",
+    "dtype": "determine",
+    "isShown": true,
+    "label_provenance": {
+      "label_source": "per_row_llm_role",
+      "evidence_terms": [
+        "SWM Strategy"
+      ],
+      "confidence": 1.0,
+      "llm_used": true,
+      "llm_rejected": false,
+      "role": "measure"
+    },
+    "structure_provenance": {
+      "route": "per_row_llm_extraction",
+      "aspect_method": "per_row_llm_extraction",
+      "slot_role": "measure",
+      "phrase_silhouette": null,
+      "regularity": 1.0,
+      "n_clusters": null
+    }
+  }
+]

version2/outputs/baseline/HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/outputs/baseline/ai-mind-variable-descriptions_in__baseline_hierarchy.json ADDED Viewed

	@@ -0,0 +1,1876 @@

+[
+  {
+    "id": 0,
+    "name": "ai-mind-variable-descriptions(in)",
+    "type": "root",
+    "dtype": "root",
+    "isShown": true,
+    "related": [
+      109,
+      114,
+      115,
+      122,
+      128,
+      132,
+      139,
+      140
+    ],
+    "desc": "Root node"
+  },
+  {
+    "id": 1,
+    "name": "DMSCC",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSCC | description: DMS Mean Choices to Correct: The mean number of choices that the subject made on each trial, including the correct choice. Calculated across all trials where the subject eventually made the correct choice (simultaneous and all delays). | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSCC",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 2,
+    "name": "DMSL0SD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL0SD | description: DMS Correct Latency Standard Deviation (SD) (0 second delay): The standard deviation of response latencies for trials containing a zero second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSL0SD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 3,
+    "name": "DMSL12SD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL12SD | description: DMS Correct Latency Standard Deviation (SD) (12 second delay): The standard deviation of response latencies for trials containing a twelve second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSL12SD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 4,
+    "name": "DMSL4SD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSL4SD | description: DMS Correct Latency Standard Deviation (SD) (4 second delay): The standard deviation of response latencies for trials containing a four second delay between the presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSL4SD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 5,
+    "name": "DMSLADSD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLADSD | description: DMS Correct Latency Standard Deviation (SD) (all delays): The standard deviation of response latencies for trials containing a delay between the presentation of target stimulus and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSLADSD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 6,
+    "name": "DMSLSD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLSD | description: DMS Correct Latency Standard Deviation (SD): The standard deviation of response latencies for trials where subjects selected the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSLSD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 7,
+    "name": "DMSLSSD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSLSSD | description: DMS Correct Latency Standard Deviation (SD) (simultaneous): The standard deviation of response latencies for trials containing a simultaneous presentation of target and response stimuli, where subjects selected the correct box on their first attempt. Calculated across all assessed trials containing simultaneous presentations. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSLSSD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 8,
+    "name": "DMSMDL",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL | description: DMS Median Correct Latency: The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSMDL",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 9,
+    "name": "DMSMDL0",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL0 | description: DMS Median Correct Latency (0 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSMDL0",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 10,
+    "name": "DMSMDL12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL12 | description: DMS Median Correct Latency (12 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSMDL12",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 11,
+    "name": "DMSMDL4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDL4 | description: DMS Median Correct Latency (4 seconds delay): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSMDL4",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 12,
+    "name": "DMSMDLAD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLAD | description: DMS Median Correct Latency (all delays): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSMDLAD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 13,
+    "name": "DMSMDLS",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMDLS | description: DMS Median Correct Latency (simultaneous): The median latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSMDLS",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 14,
+    "name": "DMSML",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML | description: DMS Mean Correct Latency: The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt. Calculated across all correct assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSML",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 15,
+    "name": "DMSML0",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML0 | description: DMS Mean Correct Latency (0 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a zero second delay. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSML0",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 16,
+    "name": "DMSML12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML12 | description: DMS Mean Correct Latency (12 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a twelve second delay. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSML12",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 17,
+    "name": "DMSML4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSML4 | description: DMS Mean Correct Latency (4 seconds delay): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a four second delay. Calculated across all assessed trials containing a four second delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSML4",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 18,
+    "name": "DMSMLAD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMLAD | description: DMS Mean Correct Latency (all delays): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a delay between target and response stimuli presentation. Calculated across all assessed trials containing a delay. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSMLAD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 19,
+    "name": "DMSMLS",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSMLS | description: DMS Mean Correct Latency (simultaneous): The mean latency between the presentation of the response stimuli options and the subject selecting the correct box on their first attempt for trials containing a simultaneous presentation of target and response stimuli. Calculated across all assessed trials containing simultaneous presentation. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSMLS",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 20,
+    "name": "DMSPC",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC | description: DMS Percent Correct: The percentage of assessment trials during which the subject chose the correct box on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSPC",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 21,
+    "name": "DMSPC0",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC0 | description: KEY: DMS Percent Correct (0 seconds delay): The percentage of assessment trials containing a zero second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a zero second delay. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSPC0",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 22,
+    "name": "DMSPC12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC12 | description: KEY: DMS Percent Correct (12 second delay): The percentage of assessment trials containing a twelve second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a twelve second delay. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSPC12",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 23,
+    "name": "DMSPC4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPC4 | description: KEY: DMS Percent Correct (4 second delay): The percentage of assessment trials containing a four second delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a four second delay. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSPC4",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 24,
+    "name": "DMSPCAD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPCAD | description: KEY: DMS Percent Correct (all delays): The percentage of assessment trials containing a delay during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing a delay. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSPCAD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 25,
+    "name": "DMSPCS",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPCS | description: KEY: DMS Percent Correct (simultaneous): The percentage of assessment trials where the target and response stimuli were presented simultaneously during which the subject chose the correct box on their first box choice. Calculated across all assessed trials containing the simultaneous presentation of stimuli. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSPCS",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 26,
+    "name": "DMSPEGC",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGC | description: DMS Probability of Error Given Correct: This measure reports the probability of an error being made when the previous trial was responded to correctly by the subject. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSPEGC",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 27,
+    "name": "DMSPEGE",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSPEGE | description: KEY: DMS Probability of Error Given Error: This measure reports the probability of an error occurring when the previous trial was responded to incorrectly. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSPEGE",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 28,
+    "name": "DMSTC",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC | description: DMS Total Correct: The total number of times a subject chose the correct answer on their first box choice. Calculated across all assessed trials (simultaneous presentation and all delays). | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTC",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 29,
+    "name": "DMSTC0",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC0 | description: DMS Total Correct (0 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 0 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of zero seconds. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTC0",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 30,
+    "name": "DMSTC12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC12 | description: DMS Total Correct (12 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 12 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of twelve seconds. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTC12",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 31,
+    "name": "DMSTC4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTC4 | description: DMS Total Correct (4 second delay): The total number of times a subject chose the correct answer on their first box choice for trials where the response stimuli appeared on screen after a 4 second delay after the target stimulus was shown. Calculated across all assessed trials which contained a delay of four seconds. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTC4",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 32,
+    "name": "DMSTCAD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTCAD | description: DMS Total Correct (all delays): The total number of times a subject chose the correct answer on their first box choice for all trials where the response stimuli were presented after a delay. Calculated across all assessed trials containing a delay. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTCAD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 33,
+    "name": "DMSTCS",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTCS | description: DMS Total Correct (simultaneous): The total number of times a subject chose the correct answer on their first box choice for trials where the target stimulus and response stimuli appeared on screen simultaneously. Calculated across all assessed trials that included a simultaneous presentation (no delay) of target and response stimuli. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTCS",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 34,
+    "name": "DMSTE",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTE | description: DMS Total Errors: The total number of times a subject failed to choose the correct box on their first selection, thus making an error. Calculated across all assessed trials (simultaneous and all delays) regardless of which incorrect box (out of the 3 possible incorrect boxes) was chosen. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTE",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 35,
+    "name": "DMSTEAD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEAD | description: DMS Total Errors (all delays): The total number of times a subject failed to choose the correct box on their first selection for any trial containing a delay between the presentation of the target stimulus and response stimuli. Calculated across all assessed trials containing a delay component. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTEAD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 36,
+    "name": "DMSTEC",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEC | description: DMS Error (incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colours. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTEC",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 37,
+    "name": "DMSTECAD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTECAD | description: DMS Error (all delays, incorrect colour): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different physical attributes. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTECAD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 38,
+    "name": "DMSTED",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTED | description: DMS Error (distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTED",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 39,
+    "name": "DMSTEDAD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEDAD | description: DMS Error (all delays, distractor): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained no common elements to the original target stimulus. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTEDAD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 40,
+    "name": "DMSTEP",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEP | description: DMS Error (incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same colour elements, but different pattern/ physical attributes. Calculated across all assessed trials (simultaneous and all delays). | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTEP",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 41,
+    "name": "DMSTEPAD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: DMS | Variant: DMS Recommended Standard | name: DMSTEPAD | description: DMS Error (all delays, incorrect pattern): The number of times that the subject failed to select the correct box on their first selection, and instead chose the distractor stimulus that contained the same pattern/ physical attributes, but different colour elements. Calculated across all assessed trials which contained a delay component. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "DMS > DMS Recommended Standard.DMSTEPAD",
+      "group_path": "DMS > DMS Recommended Standard"
+    }
+  },
+  {
+    "id": 42,
+    "name": "MOTML",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTML | description: The mean latency from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 1",
+    "metadata": {
+      "leaf_id": "MOT > MOT Tone 2.0.MOTML",
+      "group_path": "MOT > MOT Tone 2.0"
+    }
+  },
+  {
+    "id": 43,
+    "name": "MOTSDL",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTSDL | description: This is the standard deviation of the latency, calculated from the display of a stimulus to a correct response to that stimulus during assessment trials. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "MOT > MOT Tone 2.0.MOTSDL",
+      "group_path": "MOT > MOT Tone 2.0"
+    }
+  },
+  {
+    "id": 44,
+    "name": "MOTTC",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTTC | description: The total number of assessment trials on which the subject made a correct response. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "MOT > MOT Tone 2.0.MOTTC",
+      "group_path": "MOT > MOT Tone 2.0"
+    }
+  },
+  {
+    "id": 45,
+    "name": "MOTTE",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: MOT | Variant: MOT Tone 2.0 | name: MOTTE | description: The total number of assessment trials on which the subject failed to make a correct response. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "MOT > MOT Tone 2.0.MOTTE",
+      "group_path": "MOT > MOT Tone 2.0"
+    }
+  },
+  {
+    "id": 46,
+    "name": "PALFAMS28",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALFAMS28 | description: KEY: PAL First Attempt Memory Score: The number of times a subject chose the correct box on their first attempt when recalling the pattern locations. Calculated across assessed trials, omitting 12 box level to provide a direct comparison to Recommended Standard.. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALFAMS28",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 47,
+    "name": "PALMETS28",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALMETS28 | description: PAL Mean Errors to Success: The mean number of attempts made by a subject needed for them to successfully complete the stage.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALMETS28",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 48,
+    "name": "PALNPR28",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALNPR28 | description: PAL Number of Patterns Reached: The number of patterns presented to the subject on the last problem they reached. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALNPR28",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 49,
+    "name": "PALTA12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA12 | description: PAL Total Attempts 12 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 12 shapes to recall. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA12",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 50,
+    "name": "PALTA2",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA2 | description: PAL Total Attempts 2 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 2 shapes to recall. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA2",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 51,
+    "name": "PALTA28",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA28 | description: PAL Total Attempts: The total number of attempts made (but not necessarily completed) by the subject during assessment problems.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA28",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 52,
+    "name": "PALTA4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA4 | description: PAL Total Attempts 4 patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 4 shapes to recall. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA4",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 53,
+    "name": "PALTA6",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA6 | description: PAL Total Attempts 6 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 6 shapes to recall. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA6",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 54,
+    "name": "PALTA8",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTA8 | description: PAL Total Attempts 8 Patterns: The total number of attempts made (but not necessarily completed) by the subject during assessment problems containing a total of 8 shapes to recall. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTA8",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 55,
+    "name": "PALTE12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE12 | description: PAL Total Errors 12 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 12 patterns. Calculated across all 12-pattern assessed trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE12",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 56,
+    "name": "PALTE2",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE2 | description: PAL Total Errors 2 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 2 patterns. Calculated across all 2-pattern assessed trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE2",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 57,
+    "name": "PALTE28",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE28 | description: PAL Total Errors: The total number of times a subject selected an incorrect box when attempting to recall a pattern location. Calculated across all assessed trials.  Does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE28",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 58,
+    "name": "PALTE4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE4 | description: PAL Total Errors 4 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 4 patterns. Calculated across all 4-pattern assessed trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE4",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 59,
+    "name": "PALTE6",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE6 | description: PAL Total Errors 6 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 6 patterns. Calculated across all 6-pattern assessed trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE6",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 60,
+    "name": "PALTE8",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTE8 | description: PAL Total Errors 8 Patterns: The total number of times a subject selected an incorrect box when attempting to recall a pattern location on trials containing a total of 8 patterns. Calculated across all 8-pattern assessed trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTE8",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 61,
+    "name": "PALTEA12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA12 | description: PAL Total Errors 12 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 12 (PALTE12), plus an adjustment for the estimated number of errors they would have made on any other 12 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA12",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 62,
+    "name": "PALTEA2",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA2 | description: PAL Total Errors 2 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes required to remember was equal to 2 (PALTE2), plus an adjustment for the estimated number of errors they would have made on any other 2 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA2",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 63,
+    "name": "PALTEA28",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA28 | description: KEY: PAL Total Errors (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems (PALTE), plus an adjustment for the estimated number of errors they would have made on any problems, attempts and recalls they did not reach. This measure allows you to compare performance on errors made across all subjects regardless of those who terminated early versus those completing the final stage of the task.  In this task variant PALTEA does not include 12 box level to provide a direct comparison to Recommended Standard. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA28",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 64,
+    "name": "PALTEA4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA4 | description: PAL Total Errors 4 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 4 (PALTE4), plus an adjustment for the estimated number of errors they would have made on any other 4 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA4",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 65,
+    "name": "PALTEA6",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA6 | description: PAL Total Errors 6 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 6 (PALTE6), plus an adjustment for the estimated number of errors they would have made on any other 6 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA6",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 66,
+    "name": "PALTEA8",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PAL | Variant: PAL Recommended Standard Extended | name: PALTEA8 | description: PAL Total Errors 8 Shapes (Adjusted): The number of times the subject chose the incorrect box for a stimulus on assessment problems, where the number of shapes was equal to 8 (PALTE8), plus an adjustment for the estimated number of errors they would have made on any other 8 pattern problems, attempts and recalls they did not reach. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "PAL > PAL Recommended Standard Extended.PALTEA8",
+      "group_path": "PAL > PAL Recommended Standard Extended"
+    }
+  },
+  {
+    "id": 67,
+    "name": "PRMCLSDD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDD | description: PRM Correct Latency (SD) Delayed: The standard deviation for the latency of a subject's response to correctly choose the appropriate pattern in the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMCLSDD",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 68,
+    "name": "PRMCLSDI",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMCLSDI | description: PRM Correct Latency (SD) Immediate: The standard deviation for the latency of a subject's response to correctly select the appropriate pattern in the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMCLSDI",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 69,
+    "name": "PRMMCLD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLD | description: PRM Mean Correct Latency Delayed: The mean latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMMCLD",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 70,
+    "name": "PRMMCLI",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMCLI | description: PRM Mean Correct Latency Immediate: The mean latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMMCLI",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 71,
+    "name": "PRMMDCLD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLD | description: PRM Median Correct Latency Delayed: The median latency for a subject to correctly select the appropriate pattern during the delayed forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMMDCLD",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 72,
+    "name": "PRMMDCLI",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMMDCLI | description: PRM Median Correct Latency Immediate: The median latency for a subject to correctly select the appropriate pattern during the immediate forced-choice condition, measured in milliseconds. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMMDCLI",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 73,
+    "name": "PRMPCD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCD | description: KEY: PRM Percent Correct Delayed: The number of correct patterns selected by the subject in the delayed forced-choice condition, expressed as a percentage. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMPCD",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 74,
+    "name": "PRMPCI",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMPCI | description: KEY: PRM Percent Correct Immediate: The number of correct patterns selected by the subject in the immediate forced-choice condition, expressed as a percentage. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMPCI",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 75,
+    "name": "PRMTSDSP",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: PRM | Variant: PRM Recommended Standard 18 Extended | name: PRMTSDSP | description: PRM Time Since Delayed Stimuli Presentation: The length of time between the end of the stimuli presentation for the delayed phase and the start of the delayed forced-choice condition. | Decimal Places: 2",
+    "metadata": {
+      "leaf_id": "PRM > PRM Recommended Standard 18 Extended.PRMTSDSP",
+      "group_path": "PRM > PRM Recommended Standard 18 Extended"
+    }
+  },
+  {
+    "id": 76,
+    "name": "RVPA",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPA | description: KEY: RVP A?: A? (A prime) is the signal detection measure of a subject's sensitivity to the target sequence (string of three numbers), regardless of response tendency (the expected range is 0.00 to 1.00; bad to good). In essence, this metric is a measure of how good the subject is at detecting target sequences. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPA",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 77,
+    "name": "RVPLSD",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPLSD | description: RVP Response Latency (SD): The standard deviation of response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPLSD",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 78,
+    "name": "RVPMDL",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPMDL | description: KEY: RVP Median Response Latency: The median response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPMDL",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 79,
+    "name": "RVPML",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPML | description: RVP Mean Response Latency: The mean response latency on trials where the subject responded correctly. Calculated across all assessed trials. | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPML",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 80,
+    "name": "RVPPFA",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPPFA | description: KEY: RVP Probability of False Alarm: The number of sequence presentations that were false alarms divided by the number of sequence presentations that were false alarms plus the number of sequence presentations that were correct rejections: (False Alarms ÷ (False Alarms + Correct Rejections)) | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPPFA",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 81,
+    "name": "RVPPH",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPPH | description: RVP Probability of Hit: The number of target sequences during assessment blocks that were correctly responded to within the time allowed, divided by the number of target sequences during assessment blocks (Correct hits ÷ total number of sequences) | Decimal Places: 4",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPPH",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 82,
+    "name": "RVPTFA",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTFA | description: RVP Total False Alarms: The total number of stimulus presentations during assessment blocks that were false alarms. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPTFA",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 83,
+    "name": "RVPTH",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTH | description: RVP Total Hits: The total number of target sequences that were correctly responded to (Correct Hits) within the allowed time during assessment sequence blocks. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPTH",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 84,
+    "name": "RVPTM",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: RVP | Variant: RVP 3 Targets | name: RVPTM | description: RVP Total Misses: The total number of target sequences that were not responded to within the allowed time during assessment sequence blocks. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "RVP > RVP 3 Targets.RVPTM",
+      "group_path": "RVP > RVP 3 Targets"
+    }
+  },
+  {
+    "id": 85,
+    "name": "SWMBE12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE12 | description: KEY: SWM Between errors 12 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE12",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 86,
+    "name": "SWMBE4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE4 | description: KEY: SWM Between errors 4 boxes: The number of times a subject revisits a box in which a token has previously been found. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE4",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 87,
+    "name": "SWMBE468",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE468 | description: KEY: SWM Between Errors: The number of times the subject incorrectly revisits a box in which a token has previously been found. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE468",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 88,
+    "name": "SWMBE6",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE6 | description: KEY: SWM Between errors 6 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE6",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 89,
+    "name": "SWMBE8",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMBE8 | description: KEY: SWM Between errors 8 boxes: The number of times the subject revisits a box in which a token has previously been found. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMBE8",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 90,
+    "name": "SWMDE12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE12 | description: SWM Double errors 12 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE12",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 91,
+    "name": "SWMDE4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE4 | description: SWM Double errors 4 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE4",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 92,
+    "name": "SWMDE468",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE468 | description: SWM Double Errors: The number of times a subject commits an error that is both a within error and a between error. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE468",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 93,
+    "name": "SWMDE6",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE6 | description: SWM Double errors 6 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE6",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 94,
+    "name": "SWMDE8",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMDE8 | description: SWM Double errors 8 boxes: The number of times a subject commits an error that is both a within error and a between error. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMDE8",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 95,
+    "name": "SWMPR",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMPR | description: SWM Problem Reached: This measure reports the problem number that the subject reached, but did not necessarily complete. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMPR",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 96,
+    "name": "SWMS",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS | description: KEY: SWM Strategy (6-8 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or 8 tokens. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMS",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 97,
+    "name": "SWMS6",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMS6 | description: SWM Strategy (6 box only): This measure computes the strategy score for the 6 box stage of the task only. The strategy score is calculated based on the number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMS6",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 98,
+    "name": "SWMSX",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMSX | description: SWM Strategy (6-12 boxes): The number of times a subject begins a new search pattern from the same box they started with previously. If they always begin a search from the same starting point we infer that the subject is employing a planned strategy for finding the tokens. Therefore a low score indicates high strategy use (1 = they always begin the search from the same box), a high score indicates that they are beginning their searches from many different boxes. Calculated across assessed trials with 6 tokens or more. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMSX",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 99,
+    "name": "SWMTE12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE12 | description: SWM Total errors 12 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE12",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 100,
+    "name": "SWMTE4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE4 | description: SWM Total errors 4 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE4",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 101,
+    "name": "SWMTE468",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE468 | description: SWM Total Errors: The total number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE468",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 102,
+    "name": "SWMTE6",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE6 | description: SWM Total errors 6 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE6",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 103,
+    "name": "SWMTE8",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMTE8 | description: SWM Total errors 8 boxes: The number of times a box is selected that is certain not to contain a token and therefore should not have been visited by the subject, i.e. between errors + within errors - double errors. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMTE8",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 104,
+    "name": "SWMWE12",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE12 | description: SWM Within errors 12 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 12 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE12",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 105,
+    "name": "SWMWE4",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE4 | description: SWM Within errors 4 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 4 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE4",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 106,
+    "name": "SWMWE468",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE468 | description: SWM Within Errors: The number of times a subject revisits a box already shown to be empty during the same search. Calculated across all assessed four, six and eight token trials. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE468",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 107,
+    "name": "SWMWE6",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE6 | description: SWM Within errors 6 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 6 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE6",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 108,
+    "name": "SWMWE8",
+    "dtype": "determine",
+    "related": [],
+    "isShown": true,
+    "type": "attribute",
+    "desc": "Task: SWM | Variant: SWM Recommended Standard 2.0 Extended | name: SWMWE8 | description: SWM Within errors 8 boxes: The number of times a subject revisits a box already found to be empty during the same search. Calculated across all trials with 8 tokens only. | Decimal Places: 0",
+    "metadata": {
+      "leaf_id": "SWM > SWM Recommended Standard 2.0 Extended.SWMWE8",
+      "group_path": "SWM > SWM Recommended Standard 2.0 Extended"
+    }
+  },
+  {
+    "id": 109,
+    "name": "Pal / Total",
+    "related": [
+      110,
+      111,
+      112,
+      113,
+      46,
+      47,
+      48,
+      51
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 21 variables — label terms: Pal / Total",
+    "dtype": "determine"
+  },
+  {
+    "id": 110,
+    "name": "Errors / Adjusted",
+    "related": [
+      57,
+      63
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 2 variables — label terms: Errors / Adjusted",
+    "dtype": "determine"
+  },
+  {
+    "id": 111,
+    "name": "Shapes / Adjusted",
+    "related": [
+      61,
+      62,
+      64,
+      65,
+      66
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 5 variables — label terms: Shapes / Adjusted",
+    "dtype": "determine"
+  },
+  {
+    "id": 112,
+    "name": "Patterns / Errors",
+    "related": [
+      55,
+      56,
+      58,
+      59,
+      60
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 5 variables — label terms: Patterns / Errors",
+    "dtype": "determine"
+  },
+  {
+    "id": 113,
+    "name": "Attempts / Patterns",
+    "related": [
+      49,
+      50,
+      52,
+      53,
+      54
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 5 variables — label terms: Attempts / Patterns",
+    "dtype": "determine"
+  },
+  {
+    "id": 114,
+    "name": "Rvp / False",
+    "related": [
+      80,
+      81,
+      82,
+      83,
+      84
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 5 variables — label terms: Rvp / False",
+    "dtype": "determine"
+  },
+  {
+    "id": 115,
+    "name": "Latency / Correct",
+    "related": [
+      116,
+      117,
+      118,
+      119,
+      120,
+      121
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 28 variables — label terms: Latency / Correct",
+    "dtype": "determine"
+  },
+  {
+    "id": 116,
+    "name": "Mean / Dms",
+    "related": [
+      1,
+      14,
+      18,
+      19
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 4 variables — label terms: Mean / Dms",
+    "dtype": "determine"
+  },
+  {
+    "id": 117,
+    "name": "Median / Prm",
+    "related": [
+      8,
+      12,
+      13,
+      71,
+      72,
+      78
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 6 variables — label terms: Median / Prm",
+    "dtype": "determine"
+  },
+  {
+    "id": 118,
+    "name": "Prm / Delayed",
+    "related": [
+      67,
+      68,
+      69,
+      70
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 4 variables — label terms: Prm / Delayed",
+    "dtype": "determine"
+  },
+  {
+    "id": 119,
+    "name": "Seconds / Delay",
+    "related": [
+      9,
+      10,
+      11,
+      15,
+      16,
+      17
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 6 variables — label terms: Seconds / Delay",
+    "dtype": "determine"
+  },
+  {
+    "id": 120,
+    "name": "Standard / Deviation",
+    "related": [
+      2,
+      3,
+      4,
+      5,
+      6,
+      7
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 6 variables — label terms: Standard / Deviation",
+    "dtype": "determine"
+  },
+  {
+    "id": 121,
+    "name": "Rvp / Response",
+    "related": [
+      77,
+      79
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 2 variables — label terms: Rvp / Response",
+    "dtype": "determine"
+  },
+  {
+    "id": 122,
+    "name": "Swm / Errors",
+    "related": [
+      123,
+      124,
+      125,
+      95,
+      126,
+      127,
+      101,
+      92
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 21 variables — label terms: Swm / Errors",
+    "dtype": "determine"
+  },
+  {
+    "id": 123,
+    "name": "Between / Within",
+    "related": [
+      87,
+      106
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 2 variables — label terms: Between / Within",
+    "dtype": "determine"
+  },
+  {
+    "id": 124,
+    "name": "Total / Boxes",
+    "related": [
+      99,
+      100,
+      102,
+      103
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 4 variables — label terms: Total / Boxes",
+    "dtype": "determine"
+  },
+  {
+    "id": 125,
+    "name": "Double / Boxes",
+    "related": [
+      90,
+      91,
+      93,
+      94
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 4 variables — label terms: Double / Boxes",
+    "dtype": "determine"
+  },
+  {
+    "id": 126,
+    "name": "Within / Boxes",
+    "related": [
+      104,
+      105,
+      107,
+      108
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 4 variables — label terms: Within / Boxes",
+    "dtype": "determine"
+  },
+  {
+    "id": 127,
+    "name": "Between / Boxes",
+    "related": [
+      85,
+      86,
+      88,
+      89
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 4 variables — label terms: Between / Boxes",
+    "dtype": "determine"
+  },
+  {
+    "id": 128,
+    "name": "Error / Dms",
+    "related": [
+      129,
+      130,
+      131
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 8 variables — label terms: Error / Dms",
+    "dtype": "determine"
+  },
+  {
+    "id": 129,
+    "name": "Distractor / Delays",
+    "related": [
+      38,
+      39
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 2 variables — label terms: Distractor / Delays",
+    "dtype": "determine"
+  },
+  {
+    "id": 130,
+    "name": "Incorrect / Colour",
+    "related": [
+      36,
+      37,
+      40,
+      41
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 4 variables — label terms: Incorrect / Colour",
+    "dtype": "determine"
+  },
+  {
+    "id": 131,
+    "name": "Probability / Given",
+    "related": [
+      26,
+      27
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 2 variables — label terms: Probability / Given",
+    "dtype": "determine"
+  },
+  {
+    "id": 132,
+    "name": "Dms / Correct",
+    "related": [
+      133,
+      134,
+      135,
+      136,
+      137,
+      138
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 16 variables — label terms: Dms / Correct",
+    "dtype": "determine"
+  },
+  {
+    "id": 133,
+    "name": "Percent / Dms",
+    "related": [
+      20,
+      24,
+      25
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 3 variables — label terms: Percent / Dms",
+    "dtype": "determine"
+  },
+  {
+    "id": 134,
+    "name": "Errors / Total",
+    "related": [
+      32,
+      34,
+      35
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 3 variables — label terms: Errors / Total",
+    "dtype": "determine"
+  },
+  {
+    "id": 135,
+    "name": "Delay / Percent",
+    "related": [
+      21,
+      22,
+      23
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 3 variables — label terms: Delay / Percent",
+    "dtype": "determine"
+  },
+  {
+    "id": 136,
+    "name": "Total / Simultaneous",
+    "related": [
+      28,
+      33
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 2 variables — label terms: Total / Simultaneous",
+    "dtype": "determine"
+  },
+  {
+    "id": 137,
+    "name": "Prm / Percent",
+    "related": [
+      73,
+      74
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 2 variables — label terms: Prm / Percent",
+    "dtype": "determine"
+  },
+  {
+    "id": 138,
+    "name": "Second / Delay",
+    "related": [
+      29,
+      30,
+      31
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 3 variables — label terms: Second / Delay",
+    "dtype": "determine"
+  },
+  {
+    "id": 139,
+    "name": "Strategy / Swm",
+    "related": [
+      96,
+      97,
+      98
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 3 variables — label terms: Strategy / Swm",
+    "dtype": "determine"
+  },
+  {
+    "id": 140,
+    "name": "Response / Trials",
+    "related": [
+      42,
+      43,
+      44,
+      45,
+      75,
+      76
+    ],
+    "type": "aggregation",
+    "isShown": true,
+    "desc": "Cluster of 6 variables — label terms: Response / Trials",
+    "dtype": "determine"
+  }
+]

version2/requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+streamlit>=1.43
+pandas>=2.0
+numpy>=1.24
+scikit-learn>=1.3
+plotly>=5.18
+sentence-transformers>=2.5
+requests>=2.31
+openpyxl>=3.1
+# Approach 1 — WordNet concept lookups (optional, auto-downloads corpus at runtime)
+nltk>=3.8
+# Approach 2 — semantic aspect discovery (optional; torch already pulled by sentence-transformers)
+fastopic>=0.0.5
+# Approach 2 — OpenAI-compatible client for optional local-LLM label refinement
+openai>=1.30

version2/views/methods.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""
+methods.py — single source of truth for method naming, descriptions and display
+config, shared by the Demo View (viewer.py) and the Build pages (run_*.py).
+Metadata Hierarchy Explorer — TFM 2026.
+The internal keys ("Baseline" / "Approach 1" / "Approach 2") are kept stable on
+purpose: the pre-built output filenames and the thesis cross-references depend on
+them. The user-facing *title* is what gets shown in the app.
+"""
+from __future__ import annotations
+# Internal method keys, in the order used to build TITLES below.
+METHOD_ORDER = ["Baseline", "Approach 1", "Approach 2"]
+# Display configuration per internal key.  "title", "tag" and "about" are the
+# texts returned by the accessor functions at the end of this module.
+# "color", "compress" and "node_link" are not read in this module; presumably
+# they are viewer options (colour scale name, tree compression, node-link view
+# availability) — TODO confirm against viewer.py.
+METHODS: dict[str, dict] = {
+    "Baseline": {
+        "title": "Baseline: Taxonomizer Semantic Space Hierarchy",
+        "tag":   "Baseline · Word2Vec semantic space + agglomerative clustering "
+                 "(Mahmood & Mueller, IEEE TVCG 2019)",
+        "color":     "Greens",
+        "compress":  False,
+        "node_link": True,
+        "about": (
+            "Classical clustering baseline. Word2Vec skip-gram embeddings of the "
+            "attribute names build a cosine semantic space, then balanced Ward "
+            "agglomerative clustering produces the tree; node labels are the most "
+            "discriminative terms per cluster. No external knowledge bases and no "
+            "neural language models — a deliberately simple reference point."
+        ),
+    },
+    "Approach 1": {
+        "title": "Approach 1: External Concept Alignment Hierarchy",
+        "tag":   "Approach 1 · SBERT + Gonçalves N×M alignment + HiExpan + Castanet facets",
+        "color":     "Blues",
+        "compress":  False,
+        "node_link": True,
+        "about": (
+            "Aligns each variable to concepts drawn from external knowledge bases. "
+            "SBERT embeddings and an N×M concept-similarity matrix (Gonçalves 2019) "
+            "match variables to candidate concepts retrieved from Wikidata, Wikipedia, "
+            "WordNet and BioPortal; HiExpan refines the tree and Castanet builds "
+            "parallel facets. External enrichment activates automatically for "
+            "biomedical, cognitive and neurological domains."
+        ),
+    },
+    "Approach 2": {
+        "title": "Approach 2: Dataset Constrained Multi Aspect Hierarchy",
+        "tag":   "Approach 2 · FASTopic + phrase-slot mining (Wu et al. NeurIPS 2024)",
+        "color":     "Viridis",
+        "compress":  True,
+        "node_link": True,
+        "about": (
+            "Builds the hierarchy using only evidence inside the dataset — no external "
+            "knowledge. Group structure anchors the top levels, phrase-slot mining and "
+            "FASTopic (Wu et al. 2024) discover semantic aspects, and per-aspect "
+            "clustering forms the branches. Labels are generated deterministically and "
+            "are fully auditable; an optional local LLM may re-phrase them under a "
+            "strict grounding check."
+        ),
+    },
+}
+# Reverse lookup: display title -> internal key.
+TITLE_TO_KEY = {m["title"]: k for k, m in METHODS.items()}
+# Display titles listed in METHOD_ORDER order.
+TITLES = [METHODS[k]["title"] for k in METHOD_ORDER]
+def title(key: str) -> str:
+    return METHODS[key]["title"]
+def tag(key: str) -> str:
+    return METHODS[key]["tag"]
+def about(key: str) -> str:
+    return METHODS[key]["about"]

version2/views/run_approach_1.py ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/views/run_approach_2.py ADDED Viewed

The diff for this file is too large to render. See raw diff

version2/views/run_baseline.py ADDED Viewed

	@@ -0,0 +1,1091 @@

+# baseline.py — Metadata Hierarchy Builder — Baseline (Taxonomizer)
+#
+# Baseline = Taxonomizer (Mahmood & Mueller, IEEE TVCG 2019), semantic-space
+# pipeline, adapted to a metadata-only setting.  No hardcoded domain patterns.
+#
+# Pipeline:
+#   1. Load metadata file (CSV / TSV / XLSX / JSON)
+#   2. Detect column roles (leaf / context / text / meta) — same as Approach 1 / 2
+#   3. Build canonical schema (incl. _semantic_text = description values only)
+#   4. Embed each variable's short attribute NAME (the name clause of its
+#      description; the leaf label is the fallback) by averaging pre-trained
+#      word vectors, and build the cosine semantic space [TAX §3.2]
+#   5. Recursively cluster (agglomerative, cosine) into the dendrogram taxonomy;
+#      internal-node labels = data-driven contrastive terms of each cluster
+#   6. Visualise (Sunburst / Treemap / Node-link)
+#   7. Export visualization-ready JSON + canonical CSV
+#
+# Paper & justified adaptations (metadata/schema setting, fully automatic):
+#   [TAX] Mahmood & Mueller — Taxonomizer, IEEE TVCG 2019.
+#         Builds a SEMANTIC space (cosine over word2vec skip-gram embeddings of
+#         attribute names; gensim, Wikipedia, window=5, dim=128) merged with a
+#         DATA space (correlation over raw values), clustered into a dendrogram;
+#         inner nodes labelled semi-automatically by distributional degree-of-
+#         entailment + WordNet synonyms.
+#   Adaptations (all documented):
+#     1. No DATA space — a schema/dictionary has no raw values, so we use the
+#        semantic space alone (Taxonomizer with semantic weight = 1.0).
+#     2. Embed the attribute's short NAME (the description's name clause), since
+#        the bare code goes out-of-vocabulary (a limitation the paper flags,
+#        e.g. "BP").  Taxonomizer embeds the NAME ("a few words"), not a
+#        paragraph; using the short name (not the full description prose) keeps
+#        domain-specific words from being diluted by shared explanatory text.
+#     3. Fully-automatic labels — the paper's labelling is semi-automatic
+#        (human picks from suggestions); a baseline must be non-interactive, so
+#        we use data-driven contrastive terms from each cluster's members.
+#
+# Dependencies: gensim
+#   pip install gensim
+from __future__ import annotations
+import csv, json, re, warnings
+from collections import Counter, defaultdict
+from pathlib import Path
+import tempfile
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+import streamlit as st
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, silhouette_score
+from sklearn.preprocessing import LabelEncoder
+# Suppress every Python warning for the whole process; this also applies to
+# warnings raised by the libraries imported above.
+warnings.filterwarnings('ignore')
+# set_page_config handled by the navigation router (demo.py)
+import sys
+# Put this file's own directory first on sys.path so the sibling `methods`
+# module can be imported by bare name; this must run before the import below.
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+import methods  # shared method names
+# Page header: the title text comes from methods.py, the caption is static.
+st.title(methods.title('Baseline'))
+st.caption('Upload a metadata file (CSV / TSV / XLSX / JSON), confirm the column '
+           'roles, then build. Semantic space only (no raw data values); no '
+           'hardcoded patterns, no external APIs.')
+# ─────────────────────────────────────────────────────────────────────────────
+# CONSTANTS
+# ─────────────────────────────────────────────────────────────────────────────
+# Keyword lists for column-role detection.  kscore() counts how many of these
+# occur as substrings of a normalised column name (see norm()), and
+# profile_columns() turns the counts into leaf / group / text / metadata scores.
+LEAF_KEYS  = 'variable var field column attribute name code id item indicator question measure concept'.split()
+GROUP_KEYS = 'task category domain module section table dataset assessment test variant group topic instrument form subscale construct'.split()
+TEXT_KEYS  = 'description definition desc label title question meaning note notes text display full details explanation comment'.split()
+META_KEYS  = 'type dtype data_type datatype unit units format decimal precision values value coding codebook range min max scale'.split()
+# URL pattern — strip embedded links (e.g. HCP FreeSurfer NeuroLex URLs) so web
+# tokens cannot dominate the embedding or the cluster label.  [shared with A1]
+# Matches http(s):// links, www.-prefixed hosts, and bare hosts ending in
+# .org/.com/.net/.gov/.edu together with any path that follows.
+_URL_RE = re.compile(r'(https?://\S+|www\.\S+|\b\w+\.(?:org|com|net|gov|edu)\b/?\S*)',
+                     re.IGNORECASE)
+# ─────────────────────────────────────────────────────────────────────────────
+# FILE LOADING
+# ─────────────────────────────────────────────────────────────────────────────
+def safe_name(name: str) -> str:
+    return ''.join(ch if ch.isalnum() or ch in '-_.' else '_' for ch in name)
+def try_read_csv(path: Path) -> pd.DataFrame:
+    """Read a delimited text file by trying several encodings and separators.
+    Every (encoding, separator) combination is parsed with pandas; the parse
+    with the most columns wins, with a lower overall missing-value fraction as
+    the tie-breaker (the first combination tried wins an exact tie).  If the
+    winning frame looks like whole rows were packed into its first column, the
+    file is re-read line by line and split on commas instead.
+    Raises:
+        ValueError: if no combination could be parsed.
+    """
+    best, best_score = None, -1
+    for enc in ['utf-8-sig', 'utf-8', 'latin1']:
+        # sep=None asks the python engine to sniff the delimiter.
+        for sep in [None, ',', '\t', ';', '|']:
+            try:
+                df = pd.read_csv(path, sep=sep, engine='python', encoding=enc)
+                # Column count dominates (x10); the NaN fraction is at most 1,
+                # so it only separates parses with the same number of columns.
+                score = df.shape[1] * 10 - float(df.isna().mean().mean())
+                if score > best_score:
+                    best, best_score = df, score
+            except Exception:
+                # Best effort: a combination that fails to parse is skipped.
+                pass
+    if best is None:
+        raise ValueError(f'Could not read {path.name}')
+    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
+    # Repair comma-packed rows (AI-Mind format)
+    if len(best) > 0:
+        first = best.iloc[:, 0].astype(str)
+        other_null = best.iloc[:, 1:].isna().mean().mean() if best.shape[1] > 1 else 1.0
+        # Trigger: most first-column cells contain a comma while the remaining
+        # columns are mostly empty.
+        if first.str.contains(',').mean() > 0.50 and other_null > 0.70:
+            # The raw text is always re-read as utf-8-sig here, independent of
+            # the encoding that produced `best`.
+            lines = path.read_text(encoding='utf-8-sig', errors='replace').splitlines()
+            if lines:
+                header = [h.strip().replace(';', '') for h in lines[0].split(',')]
+                rows = []
+                for line in lines[1:]:
+                    line = line.strip().rstrip(';')
+                    if not line:
+                        continue
+                    # Drop one pair of quotes wrapping the whole line.
+                    if line.startswith('"') and line.endswith('"'):
+                        line = line[1:-1]
+                    try:
+                        parts = next(csv.reader([line], quotechar='"'))
+                    except Exception:
+                        continue
+                    # Lines with fewer fields than the header are dropped;
+                    # surplus fields are truncated.
+                    if len(parts) >= len(header):
+                        rows.append(parts[:len(header)])
+                if rows:
+                    best = pd.DataFrame(rows, columns=header)
+    best.columns = [str(c).strip().replace(';', '') for c in best.columns]
+    return best
+def load_any(path: Path) -> pd.DataFrame:
+    s = path.suffix.lower()
+    if s in ['.csv', '.tsv', '.txt']:
+        return try_read_csv(path)
+    if s in ['.xlsx', '.xls']:
+        return pd.read_excel(path)
+    if s == '.json':
+        obj = json.loads(path.read_text(encoding='utf-8', errors='replace'))
+        if isinstance(obj, list):
+            return pd.json_normalize(obj)
+        if isinstance(obj, dict):
+            for v in obj.values():
+                if isinstance(v, list):
+                    return pd.json_normalize(v)
+    raise ValueError(f'Unsupported file type: {s}')
+def save_upload(f) -> Path:
+    tmp = Path(tempfile.mkdtemp(prefix='baseline_'))
+    p = tmp / safe_name(f.name)
+    p.write_bytes(f.getbuffer())
+    return p
+# ─────────────────────────────────────────────────────────────────────────────
+# ROLE DETECTION  [GON]
+# ─────────────────────────────────────────────────────────────────────────────
+def norm(c: str) -> str:
+    return re.sub(r'[^a-z0-9]+', '_', str(c).strip().lower()).strip('_')
+def kscore(c: str, keys: list) -> int:
+    nc = norm(c)
+    return sum(1 for k in keys if k in nc)
+def profile_columns(df: pd.DataFrame) -> pd.DataFrame:
+    out = []
+    n = max(len(df), 1)
+    for col in df.columns:
+        s = df[col]
+        non = float(s.notna().mean())
+        nun = int(s.nunique(dropna=True))
+        ur  = nun / n
+        avg = float(s.dropna().astype(str).map(len).mean()) if s.notna().any() else 0
+        out.append({
+            'column':         str(col),
+            'non_null':       round(non, 3),
+            'unique_values':  nun,
+            'unique_ratio':   round(ur, 3),
+            'avg_length':     round(avg, 1),
+            'leaf_score':     4*kscore(col, LEAF_KEYS)  + (3 if 0.5 <= ur <= 1 else 0) + (1 if avg < 80 else 0),
+            'group_score':    4*kscore(col, GROUP_KEYS) + (3 if 1 < nun < min(n*0.5, 80) else 0) + (1 if avg < 60 else 0),
+            'text_score':     5*kscore(col, TEXT_KEYS)  + (4 if avg > 50 else 0) + (1 if non > 0.5 else 0),
+            'metadata_score': 4*kscore(col, META_KEYS)  + (2 if 1 < nun < min(n*0.8, 100) else 0),
+        })
+    return pd.DataFrame(out)
+def detect_roles(df: pd.DataFrame) -> tuple:
+    """Auto-detect column roles.  Identical logic to Approach 1 / 2 so the
+    preprocessing up to the canonical table is comparable across all apps.
+    Returns a pair (roles, profile): `roles` is a dict with the keys
+    'leaf_cols', 'group_cols', 'text_cols' and 'metadata_cols', each a list of
+    column names; `profile` is the frame produced by profile_columns().
+    """
+    prof  = profile_columns(df)
+    # Leaf: the single best-scoring column (ties broken by unique ratio).
+    leaf  = prof.sort_values(['leaf_score', 'unique_ratio'], ascending=False).head(1)['column'].tolist()
+    # Text: every column with a text score >= 4 or long values; falls back to
+    # the leaf column when nothing qualifies.
+    text  = (prof[(prof.text_score >= 4) | (prof.avg_length > 80)]
+             .sort_values('text_score', ascending=False)['column'].tolist()) or leaf.copy()
+    # Group: up to three non-leaf columns with more than one distinct value.
+    group = (prof[(prof.group_score >= 4) & (~prof.column.isin(leaf)) & (prof.unique_values > 1)]
+             .sort_values('group_score', ascending=False)['column'].head(3).tolist())
+    # Metadata: up to five columns not already claimed by another role.
+    meta  = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
+             .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
+    # Representation columns (decimal/precision/unit/type/format/…) must never
+    # become structural levels; prefer them as metadata. [GON][TAX]
+    _META_SUBSTR_BLOCK = {
+        'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
+        'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
+    }
+    # NOTE(review): matching is by substring on the squashed name, so a column
+    # named e.g. 'subscale' (a GROUP_KEYS keyword) also matches 'scale' and is
+    # removed from the group candidates below — confirm this is intended.
+    def _is_repr(col_name):
+        nc = re.sub(r'[^a-z0-9]', '', str(col_name).lower())
+        return any(sub in nc for sub in _META_SUBSTR_BLOCK)
+    meta_extra = [c for c in prof['column'].tolist()
+                  if _is_repr(c) and c not in text and c not in leaf and c not in meta]
+    group = [c for c in group if not _is_repr(c)]
+    # De-duplicate while keeping order, then cap the metadata list at eight.
+    meta  = list(dict.fromkeys(meta + meta_extra))[:8]
+    return {'leaf_cols': leaf, 'group_cols': group, 'text_cols': text, 'metadata_cols': meta}, prof
+# ─────────────────────────────────────────────────────────────────────────────
+# CANONICAL SCHEMA  [GON]
+# ─────────────────────────────────────────────────────────────────────────────
+def sv(x) -> str:
+    return '' if pd.isna(x) else str(x).strip()
+def build_canonical(df: pd.DataFrame, cfg: dict, source: str) -> pd.DataFrame:
+    leaf_cols  = cfg.get('leaf_cols', [])
+    group_cols = cfg.get('group_cols', [])
+    text_cols  = cfg.get('text_cols', [])
+    meta_cols  = cfg.get('metadata_cols', [])
+    rows = []
+    for i, row in df.iterrows():
+        leaf_parts  = [sv(row.get(c, '')) for c in leaf_cols]
+        leaf_parts  = [p for p in leaf_parts if p]
+        label       = ' / '.join(leaf_parts) if leaf_parts else f'variable_{i+1}'
+        group_parts = [sv(row.get(c, '')) for c in group_cols]
+        group_parts = [p for p in group_parts if p and p.lower() not in ['nan', 'none']]
+        gpath       = ' > '.join(group_parts) if group_parts else 'Ungrouped'
+        parts = []
+        for c in list(dict.fromkeys(group_cols + leaf_cols + text_cols + meta_cols)):
+            v = sv(row.get(c, ''))
+            if v:
+                parts.append(f'{c}: {v}')
+        text = ' | '.join(parts) if parts else label
+        # _semantic_text: description VALUES only — no "fieldname:" prefixes, no
+        # other fields, URLs stripped.  This is the clean text Taxonomizer embeds
+        # (the attribute's meaning), identical in spirit to Approach 1's column.
+        sem_parts = [sv(row.get(c, '')) for c in text_cols]
+        sem_parts = [p for p in sem_parts if p]
+        if not sem_parts:
+            sem_parts = list(leaf_parts)
+        semantic = _URL_RE.sub(' ', ' '.join(sem_parts)) if sem_parts else label
+        rows.append({
+            '_source_file':   source,
+            '_row_index':     int(i),
+            '_leaf_label':    label,
+            '_leaf_id':       f'{gpath}.{label}' if gpath != 'Ungrouped' else label,
+            '_group_path':    gpath,
+            '_text':          text,
+            '_semantic_text': semantic,
+        })
+    can = pd.DataFrame(rows)
+    if can['_leaf_id'].duplicated().any():
+        cnt: dict = defaultdict(int)
+        ids = []
+        for lid in can['_leaf_id']:
+            cnt[lid] += 1
+            ids.append(lid if cnt[lid] == 1 else f'{lid}__{cnt[lid]}')
+        can['_leaf_id'] = ids
+    return can
+# ─────────────────────────────────────────────────────────────────────────────
+# TAXONOMIZER CORE  [TAX — Mahmood & Mueller, IEEE TVCG 2019]
+#
+# Taxonomizer builds the taxonomy from a SEMANTIC SPACE (cosine distance between
+# word2vec skip-gram embeddings of attribute names) merged with a DATA SPACE
+# (correlation over the raw values).  In a metadata/schema setting we have no
+# raw data values, so we use the semantic space alone (= Taxonomizer with
+# semantic weight 1.0).  Because attribute *names* here are opaque codes that go
+# out-of-vocabulary — a limitation the paper explicitly flags (e.g. "BP") — we
+# embed the short name clause of each description instead (see attribute_name),
+# so real words carry the meaning; out-of-vocabulary tokens are skipped during
+# averaging.  Internal-node labels: the paper uses semi-automatic
+# distributional degree-of-entailment + WordNet synonyms; a baseline must be
+# fully automatic, so we use data-driven contrastive terms drawn from the data.
+# ─────────────────────────────────────────────────────────────────────────────
+# English function words that _tokenize() drops before embedding and before
+# label scoring.
+_W2V_STOP = frozenset(
+    'a an the and or but if in on at to of for with by is are was were be '
+    'been being have has had do does did will would could should may might '
+    'shall can this that these those i you he she it we they me him her us '
+    'them my your his her its our their what which who whom when where why '
+    'how all each every few more most other some such no not only same so '
+    'than too very just because as until while'.split()
+)
+@st.cache_resource(show_spinner=False)
+def _load_w2v():
+    """Load pre-trained Word2Vec / GloVe model via gensim downloader.
+    We prefer glove-wiki-gigaword-100 (~66 MB) because its Wikipedia training
+    corpus and skip-gram-style objective most closely match Taxonomizer's
+    described word2vec-Wikipedia-dim128 model.
+    """
+    try:
+        import gensim.downloader as api
+        return api.load('glove-wiki-gigaword-100')
+    except Exception as e:
+        st.error(
+            f'Could not load Word2Vec model: {e}\n\n'
+            'Run:  pip install gensim  and restart the app.\n'
+            'The model (~66 MB) is downloaded automatically on first use.'
+        )
+        return None
+def _tokenize(label: str) -> list[str]:
+    return [t for t in re.sub(r'[^a-zA-Z]+', ' ', label).lower().split()
+            if len(t) > 2 and t not in _W2V_STOP]
+def attribute_name(text: str) -> str:
+    """The attribute's short NAME — what Taxonomizer actually embeds [TAX §3.2].
+    The paper embeds the attribute name ("not more than a few words long"), not a
+    paragraph.  Descriptions here are formatted '<name>: <full sentence>' (some
+    prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
+    that is not a pure all-caps marker.  Embedding this short name — rather than
+    the full description prose — keeps the domain-specific words from being
+    diluted by shared explanatory text, so the taxonomy clusters more by theme
+    (e.g. DMS / PAL / SWM).
+    """
+    text = str(text)
+    for clause in re.split(r'[:\n]', text):
+        clause = clause.strip()
+        if clause and not all(2 <= len(w) <= 6 and w.isupper() for w in clause.split()):
+            return clause
+    return text.strip()
+def embed_labels_w2v(labels: list[str], model) -> np.ndarray:
+    """Average Word2Vec vectors for each label's tokens [TAX §4.1].
+    Falls back to a zero vector for labels where none of the tokens are in the
+    model vocabulary (rare for standard English attribute names).
+    """
+    dim = model.vector_size
+    out = np.zeros((len(labels), dim), dtype=np.float32)
+    for i, label in enumerate(labels):
+        toks = _tokenize(label)
+        vecs = [model[t] for t in toks if t in model]
+        if vecs:
+            out[i] = np.mean(vecs, axis=0)
+    # L2-normalise so cosine distance = 1 - dot
+    norms = np.linalg.norm(out, axis=1, keepdims=True)
+    norms[norms == 0] = 1.0
+    return out / norms
+def _cluster(X: np.ndarray, k: int) -> np.ndarray:
+    """Ward-linkage agglomerative cut into k clusters.
+    Ward (on the L2-normalised embedding vectors, where Euclidean ∝ √cosine)
+    minimises within-cluster variance and so produces *balanced* clusters.
+    This avoids the average/single-linkage chaining pathology that otherwise
+    peels off tiny clusters and leaves one giant residual (i.e. no real
+    hierarchy forms).
+    """
+    return AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(X)
+def best_k(X: np.ndarray, n: int, k_min: int = 2, k_max: int = 8) -> int:
+    """Pick the number of clusters that maximises the silhouette score.
+    Fully data-driven — no fixed cluster count.  Returns 1 only when the node
+    is too small to split (n <= k_min).
+    """
+    k_hi = min(k_max, n - 1)
+    if k_hi < k_min:
+        return 1
+    best, best_s = 1, -1.0
+    for k in range(k_min, k_hi + 1):
+        labels = _cluster(X, k)
+        if len(set(labels)) < 2:
+            continue
+        try:
+            s = silhouette_score(X, labels)
+        except Exception:
+            continue
+        if s > best_s:
+            best_s, best = s, k
+    return best
+def _doc_freq(texts: list[str]) -> Counter:
+    """Document frequency: how many member texts each content word appears in."""
+    c: Counter = Counter()
+    for t in texts:
+        for w in set(_tokenize(t)):
+            c[w] += 1
+    return c
+def cluster_term_label(member_texts: list[str], sibling_texts: list[str],
+                       used: set, vocab=None, top_n: int = 2) -> str:
+    """Label a node with the content words most characteristic of its members.
+    Data-driven labelling: each candidate word is scored by how much more
+    frequent it is *inside* the cluster than in the sibling pool (contrastive
+    document frequency), so labels are domain terms drawn from the dataset
+    itself — not external ontology words.  This replaces Taxonomizer's
+    WordNet degree-of-entailment, which produces over-general, off-domain
+    abstractions on specialised scientific metadata.
+    If `vocab` is given (the Word2Vec model), only real dictionary words are
+    eligible, so opaque attribute codes (e.g. 'dms', 'motml') are filtered out
+    of labels.  Codes are used only as a last-resort fallback.
+    """
+    def in_vocab(w: str) -> bool:
+        return vocab is None or w in vocab
+    n_in  = max(len(member_texts), 1)
+    n_out = max(len(sibling_texts), 1)
+    cin   = _doc_freq(member_texts)
+    cout  = _doc_freq(sibling_texts)
+    scores: dict[str, float] = {}
+    for w, f in cin.items():
+        if w in used or len(w) <= 2 or not in_vocab(w):
+            continue
+        p_in  = f / n_in
+        p_out = cout.get(w, 0) / n_out
+        # ignore single-occurrence noise unless the term is widely shared
+        if f < 2 and p_in < 0.5:
+            continue
+        scores[w] = p_in - p_out
+    picks = [w for w, _ in sorted(scores.items(), key=lambda x: -x[1])[:top_n]
+             if scores[w] > 0]
+    if not picks:
+        # fallback: most frequent shared real word, then any shared token
+        for require_vocab in (True, False):
+            for w, _ in cin.most_common():
+                if w not in used and len(w) > 2 and (not require_vocab or in_vocab(w)):
+                    picks = [w]
+                    break
+            if picks:
+                break
+    return ' / '.join(p.title() for p in picks) if picks else 'Group'
+# ─────────────────────────────────────────────────────────────────────────────
+# HIERARCHY CONSTRUCTION  [TAX + GON]
+# ─────────────────────────────────────────────────────────────────────────────
+def _nmap(nodes: list) -> dict:
+    return {int(n['id']): n for n in nodes}
+def _next_id(nodes: list) -> int:
+    return max((int(n['id']) for n in nodes), default=0) + 1
+def _add_child(nodes: list, parent_id: int, child_id: int):
+    m = _nmap(nodes)
+    p = m.get(int(parent_id))
+    if p is None:
+        return
+    rel = list(p.get('related', []))
+    if int(child_id) not in rel:
+        rel.append(int(child_id))
+    p['related'] = rel
+def _make_agg(nid: int, name: str, desc: str = '') -> dict:
+    return {'id': int(nid), 'name': str(name), 'related': [],
+            'type': 'aggregation', 'isShown': True, 'desc': desc, 'dtype': 'determine'}
+def _leaf_ids(nodes: list, nid: int) -> list:
+    m = _nmap(nodes)
+    out: list = []
+    def rec(x):
+        n = m.get(int(x))
+        if not n:
+            return
+        if n.get('type') == 'attribute':
+            out.append(int(x))
+            return
+        for c in n.get('related', []):
+            rec(int(c))
+    rec(nid)
+    return list(dict.fromkeys(out))
+def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
+                    max_depth: int = 3, min_cluster_size: int = 6,
+                    branch_max: int = 8) -> list:
+    """Taxonomizer semantic-space construction [TAX].
+    Embeds each variable from its short attribute NAME (Word2Vec skip-gram
+    average) — the name clause of the description, as Taxonomizer specifies.
+    Recursively clusters via balanced Ward linkage — the semantic-space
+    dendrogram.  Labels each internal node with the contrastive content terms of
+    its members (data-driven, fully automatic). No hardcoding.
+
+    NOTE(review): the clustering itself happens in ``_cluster`` / ``best_k``,
+    which are defined elsewhere; the page's method table describes the linkage
+    as "cosine, average-linkage" — confirm which of the two is accurate.
+
+    Args:
+        can: canonical table, one row per variable.  This function reads the
+            columns ``_leaf_label``, ``_text``, ``_leaf_id`` and
+            ``_group_path``, plus ``_semantic_text`` when present.
+        w2v_model: embedding model handed unchanged to ``embed_labels_w2v``.
+        project: name given to the root node (id 0).
+        max_depth: recursion stops splitting once this depth is reached.
+        min_cluster_size: groups with at most this many members are not split;
+            their leaves attach directly to the current parent.
+        branch_max: upper bound on the number of clusters per split.
+
+    Returns:
+        Flat list of node dicts: the root (id 0), one attribute node per row
+        (ids 1..N in row order) and the aggregation nodes created while
+        clustering.  Parent→child links are stored in each node's ``related``.
+    """
+    # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
+    nodes: list = [{'id': 0, 'name': project, 'type': 'root',
+                    'dtype': 'root', 'isShown': True, 'related': [], 'desc': 'Root node'}]
+    row_to_node: list = []
+    embed_list: list[str] = []    # short attribute name → embedding input + labels
+    for i, (_, r) in enumerate(can.iterrows(), start=1):
+        # Fall back to the leaf label when there is no semantic text, and again
+        # when attribute_name() returns nothing usable.
+        sem  = str(r.get('_semantic_text', '') or r['_leaf_label'])
+        name = attribute_name(sem) or str(r['_leaf_label'])
+        nodes.append({'id': i, 'name': r['_leaf_label'], 'dtype': 'determine',
+                      'related': [], 'isShown': True, 'type': 'attribute',
+                      'desc': r['_text'],
+                      'metadata': {'leaf_id': r['_leaf_id'], 'group_path': r['_group_path']}})
+        row_to_node.append(i)
+        embed_list.append(name)
+    # Same list object: the texts that are embedded are also the texts used
+    # for cluster labelling.
+    label_list = embed_list
+    row_to_node = np.array(row_to_node)
+    # ── Word2Vec semantic-space embeddings [TAX §3.2] ─────────────────────────
+    emb = embed_labels_w2v(embed_list, w2v_model)   # (N, dim), L2-normalised
+    # ── recursive clustering down the Ward dendrogram ─────────────────────────
+    def attach_leaves(parent_id: int, idx: np.ndarray):
+        # idx holds row positions; row_to_node maps them to attribute node ids.
+        for i in idx:
+            _add_child(nodes, parent_id, int(row_to_node[i]))
+    def recurse(parent_id: int, idx: np.ndarray, depth: int, used: set):
+        n = len(idx)
+        # Stop splitting: group is small enough or the depth cap is reached.
+        if n <= min_cluster_size or depth >= max_depth:
+            attach_leaves(parent_id, idx)
+            return
+        sub = emb[idx]
+        k_cap = min(branch_max, n - 1)
+        # Branching floor: a node with n leaves and `remaining` levels left must
+        # fan out enough to fit all its leaves into buckets of ~min_cluster_size
+        # by the depth cap, i.e. k >= (n / min_cluster_size) ** (1/remaining).
+        # Without this, silhouette keeps picking k=2 on overlapping data (e.g.
+        # HCP), giving a near-binary tree that dumps ~100 leaves per bottom node.
+        remaining = max(1, max_depth - depth)
+        k_floor = int(np.ceil((n / max(min_cluster_size, 1)) ** (1.0 / remaining)))
+        k_floor = max(2, min(k_floor, k_cap))
+        k = best_k(sub, n, k_min=k_floor, k_max=k_cap)
+        # best_k may decline to split (k <= 1); fall back to the floor, and if
+        # that still leaves a single cluster, attach the leaves directly.
+        if k <= 1:
+            k = min(k_floor, k_cap) if n > min_cluster_size else 1
+        if k <= 1:
+            attach_leaves(parent_id, idx)
+            return
+        cluster_labels = _cluster(sub, k)
+        for c in range(k):
+            mask    = cluster_labels == c
+            members = idx[mask]
+            if len(members) == 0:
+                continue
+            if len(members) == 1:           # don't create singleton internal nodes
+                _add_child(nodes, parent_id, int(row_to_node[members[0]]))
+                continue
+            mset = set(members.tolist())
+            member_texts  = [label_list[i] for i in members]
+            # Siblings = everything else in the group being split at this level.
+            sibling_texts = [label_list[i] for i in idx if i not in mset]
+            # data-driven contrastive-term labelling
+            label = cluster_term_label(member_texts, sibling_texts, used)
+            nid = _next_id(nodes)
+            nodes.append(_make_agg(nid, label,
+                                   desc=f'Cluster of {len(members)} variables — '
+                                        f'label terms: {label}'))
+            _add_child(nodes, parent_id, nid)
+            # The child branch receives a new set extended with this label, so
+            # this call itself does not add the label to the siblings' `used`.
+            recurse(nid, members, depth + 1, used | {label.lower()})
+    recurse(0, np.arange(len(can)), 0, set())
+    # Normalise child ids to int and drop duplicate links, keeping order.
+    for n in nodes:
+        n['related'] = list(dict.fromkeys(int(x) for x in n.get('related', [])))
+    return nodes
+# ─────────────────────────────────────────────────────────────────────────────
+# VISUALISATION
+# ─────────────────────────────────────────────────────────────────────────────
+def _parent_map(nodes: list) -> dict:
+    """Map each child id to the id of the first node listing it in ``related``.
+
+    Nodes are scanned in list order, so when a child appears under several
+    parents the earliest one wins.
+    """
+    first_parent: dict = {}
+    for node in nodes:
+        for raw_child in node.get('related', []):
+            child_id = int(raw_child)
+            if child_id in first_parent:
+                continue
+            first_parent[child_id] = int(node['id'])
+    return first_parent
+# ─────────────────────────────────────────────────────────────────────────────
+# EVALUATION HELPERS
+# ─────────────────────────────────────────────────────────────────────────────
+def _eval_cluster_assignments(nodes: list, can: pd.DataFrame) -> list[int]:
+    """Return, per row of ``can``, the id of its top-level ancestor node.
+
+    The ancestor is found by walking up from the row's attribute node until
+    the parent is the root (id 0) or there is no parent.  Rows whose
+    ``_leaf_id`` has no attribute node get ``-1``.
+    """
+    parent_of = _parent_map(nodes)
+
+    # leaf_id (from node metadata) -> attribute node id
+    node_for_leaf = {}
+    for node in nodes:
+        if node.get('type') != 'attribute' or 'metadata' not in node:
+            continue
+        node_for_leaf[node['metadata']['leaf_id']] = int(node['id'])
+
+    def top_level(node_id: int) -> int:
+        while True:
+            parent = parent_of.get(node_id, -1)
+            if parent in (-1, 0):
+                return node_id
+            node_id = parent
+
+    assigned = []
+    for leaf_id in can['_leaf_id']:
+        if leaf_id in node_for_leaf:
+            assigned.append(top_level(node_for_leaf[leaf_id]))
+        else:
+            assigned.append(-1)
+    return assigned
+def _purity(y_true, y_pred) -> float:
+    """Cluster purity: share of items carrying their cluster's majority label.
+
+    Items are paired positionally; the denominator is ``len(y_true)`` (at
+    least 1, so empty input yields 0.0).
+    """
+    from collections import Counter, defaultdict
+    tallies = defaultdict(Counter)
+    for truth, predicted in zip(y_true, y_pred):
+        tallies[predicted][truth] += 1
+    majority_total = sum(max(tally.values()) for tally in tallies.values())
+    return majority_total / max(len(y_true), 1)
+def _structural_stats(nodes: list) -> dict:
+    """Summarise the shape of the hierarchy.
+
+    Reports the number of aggregation nodes, the maximum and mean depth of
+    attribute leaves (hops to a parentless node), the mean number of children
+    per aggregation node, and the percentage of aggregation nodes with exactly
+    one child.
+    """
+    parent_of = _parent_map(nodes)
+
+    def hops_to_top(node_id: int) -> int:
+        hops = 0
+        while node_id in parent_of:
+            node_id = parent_of[node_id]
+            hops += 1
+        return hops
+
+    internal = [node for node in nodes if node.get('type') == 'aggregation']
+    leaves = [node for node in nodes if node.get('type') == 'attribute']
+    leaf_depths = [hops_to_top(int(node['id'])) for node in leaves]
+    fan_out = [len(node.get('related', [])) for node in internal]
+    one_child = fan_out.count(1)
+
+    mean_depth = round(float(np.mean(leaf_depths)), 2) if leaf_depths else 0.0
+    mean_fan_out = round(float(np.mean(fan_out)), 2) if fan_out else 0.0
+    return {
+        'n_aggregation_nodes':  len(internal),
+        'max_depth':            int(max(leaf_depths, default=0)),
+        'avg_leaf_depth':       mean_depth,
+        'avg_branching_factor': mean_fan_out,
+        'singleton_nodes_%':    round(100.0 * one_child / max(len(internal), 1), 1),
+    }
+def _wrap(text: str, width: int = 70) -> str:
+    """Wrap long hover text onto multiple <br> lines so it never runs off-screen.
+
+    ``<`` is escaped so the text cannot open a tag in Plotly's hover markup.
+    Each input line is wrapped separately to ``width`` columns and empty lines
+    are kept.  ``None`` (e.g. a JSON ``null`` description) yields an empty
+    string rather than the literal text "None".
+    """
+    import textwrap
+    if text is None:
+        return ''
+    text = str(text).replace('<', '&lt;')
+    lines: list = []
+    for para in text.split('\n'):
+        # textwrap.wrap('') returns []; keep one empty entry so blank lines survive.
+        lines.extend(textwrap.wrap(para, width=width) or [''])
+    return '<br>'.join(lines)
+def plot_sunburst(nodes: list, max_depth: int = 4) -> go.Figure:
+    """Build the drill-down sunburst figure for the hierarchy.
+
+    Every node becomes one sector sized by the number of attribute leaves
+    beneath it (minimum 1).  ``max_depth`` limits how many rings are shown at
+    once.  Labels are truncated to 40 characters; the hover box carries the
+    full name, node type, leaf count and description.
+    """
+    parent_of = _parent_map(nodes)
+    sector_ids, sector_labels, sector_parents = [], [], []
+    sector_sizes, sector_hover = [], []
+    for node in nodes:
+        node_id = int(node['id'])
+        leaf_count = len(_leaf_ids(nodes, node_id))
+        # The root (id 0) has no parent sector; others default to the root.
+        if node_id == 0:
+            parent_ref = ''
+        else:
+            parent_ref = str(parent_of.get(node_id, 0))
+        body = _wrap(node.get('desc', ''))
+        title = _wrap(node.get("name", ""))
+        sector_ids.append(str(node_id))
+        sector_labels.append(str(node.get('name', ''))[:40])
+        sector_parents.append(parent_ref)
+        sector_sizes.append(max(1, leaf_count))
+        sector_hover.append(
+            f'<b>{title}</b><br>Type: {node.get("type","")}'
+            f'<br>Variables: {leaf_count}<br><br>{body}')
+    trace = go.Sunburst(
+        ids=sector_ids, labels=sector_labels, parents=sector_parents,
+        values=sector_sizes,
+        branchvalues='total', hovertext=sector_hover, hoverinfo='text',
+        maxdepth=max_depth, insidetextorientation='radial',
+        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
+    )
+    fig = go.Figure(trace)
+    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
+                      title='Click a sector to drill down — click centre to go back')
+    return fig
+def plot_treemap(nodes: list) -> go.Figure:
+    """Build the treemap figure for the hierarchy.
+
+    Every node becomes one tile sized by the number of attribute leaves
+    beneath it (minimum 1).  Tiles show a label truncated to 40 characters
+    plus the value; the hover box carries the full name, leaf count and
+    description.
+    """
+    parent_of = _parent_map(nodes)
+    tile_ids, tile_labels, tile_parents = [], [], []
+    tile_sizes, tile_hover = [], []
+    for node in nodes:
+        node_id = int(node['id'])
+        leaf_count = len(_leaf_ids(nodes, node_id))
+        # The root (id 0) has no parent tile; others default to the root.
+        if node_id == 0:
+            parent_ref = ''
+        else:
+            parent_ref = str(parent_of.get(node_id, 0))
+        body = _wrap(node.get('desc', ''))
+        title = _wrap(node.get("name", ""))
+        tile_ids.append(str(node_id))
+        tile_labels.append(str(node.get('name', ''))[:40])
+        tile_parents.append(parent_ref)
+        tile_sizes.append(max(1, leaf_count))
+        tile_hover.append(f'<b>{title}</b><br>Variables: {leaf_count}<br>{body}')
+    trace = go.Treemap(
+        ids=tile_ids, labels=tile_labels, parents=tile_parents,
+        values=tile_sizes,
+        branchvalues='total', hovertext=tile_hover, hoverinfo='text',
+        textinfo='label+value',
+        marker=dict(colorscale='Greens', line=dict(width=1, color='white')),
+    )
+    fig = go.Figure(trace)
+    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
+    return fig
+# ─────────────────────────────────────────────────────────────────────────────
+# NODE-LINK TREE  (Reingold–Tilford layout — matches Approach 1 / 2 interface)
+# ─────────────────────────────────────────────────────────────────────────────
+def _bl_node_color(n: dict) -> str:
+    """Pick the marker colour for a node from its ``type``.
+
+    Root, attribute and collapsed nodes have their own colours; any other
+    type (including a missing one) gets the aggregation colour.
+    """
+    kind = n.get('type', '')
+    for known, colour in (('root', '#2a7d2a'),
+                          ('attribute', '#74c476'),
+                          ('collapsed', '#bbbbbb')):
+        if kind == known:
+            return colour
+    return '#238b45'
+def _display_graph(nodes: list, max_depth: int = 4):
+    """Collect the nodes and edges visible at the chosen level of detail.
+
+    Walks down from the root (id 0).  A node at ``max_depth`` or deeper that
+    still has children is kept, but its subtree is replaced by one synthetic
+    'collapsed' placeholder reporting how many variables lie below it.
+    Returns ``(display_nodes, edges)`` with edges as ``(parent, child)`` pairs.
+    """
+    by_id = _nmap(nodes)
+    shown: dict = {}
+    links: list = []
+    # Placeholder ids start above 10**9; boxed in a list so the nested
+    # function can advance it.
+    last_placeholder = [10 ** 9]
+
+    def visit(node_id, depth):
+        node = by_id.get(int(node_id))
+        if not node:
+            return
+        shown[int(node_id)] = node
+        if depth >= max_depth and node.get('related'):
+            last_placeholder[0] += 1
+            stub_id = last_placeholder[0]
+            hidden_leaves = len(_leaf_ids(nodes, node_id))
+            shown[stub_id] = {'id': stub_id, 'name': f'… {hidden_leaves} variables',
+                              'type': 'collapsed', 'related': [],
+                              'desc': f"Collapsed: {node.get('name')}", 'isShown': True}
+            links.append((int(node_id), stub_id))
+            return
+        for child in node.get('related', []):
+            child_id = int(child)
+            if child_id in by_id:
+                links.append((int(node_id), child_id))
+                visit(child_id, depth + 1)
+
+    visit(0, 0)
+    return list(shown.values()), links
+def _positions(edges: list):
+    """Reingold–Tilford style positions: x = depth, y = subtree-aware vertical.
+
+    Childless nodes take consecutive rows in traversal order; every other node
+    sits at the mean y of its children.  The walk starts at node 0 and the
+    result maps node id to ``(x, y)``.
+    """
+    H_SCALE, V_SPACE = 3.0, 1.8
+    kids_of: dict = defaultdict(list)
+    for parent, child in edges:
+        kids_of[parent].append(child)
+    coords: dict = {}
+    next_row = 0
+
+    def place(node_id, depth):
+        nonlocal next_row
+        kids = kids_of.get(node_id, [])
+        if kids:
+            y = float(np.mean([place(kid, depth + 1) for kid in kids]))
+        else:
+            y = next_row * V_SPACE
+            next_row += 1
+        coords[node_id] = (depth * H_SCALE, y)
+        return y
+
+    place(0, 0)
+    return coords
+def plot_node_link(nodes: list, max_depth: int = 4, show_leaf_labels: bool = False) -> go.Figure:
+    """Node-link tree with elbow edges. Best for inspecting structure at moderate
+    depth; Sunburst is recommended for large hierarchies (Taxonomizer).
+
+    Args:
+        nodes: flat hierarchy node list.
+        max_depth: level of detail; deeper subtrees are shown as a single
+            'collapsed' placeholder.
+        show_leaf_labels: when True, attribute nodes are labelled with their
+            name (truncated to 40 characters); otherwise they are unlabelled.
+
+    Returns:
+        A Plotly figure with one line trace for the edges, one marker trace
+        for root/aggregation/collapsed nodes and one for attribute leaves.
+    """
+    dnodes, edges = _display_graph(nodes, max_depth)
+    pos = _positions(edges)
+    # Elbow edges: horizontal to the midpoint, vertical, horizontal again.
+    # The trailing None breaks the line between consecutive edges.
+    ex, ey = [], []
+    for p, c in edges:
+        if p not in pos or c not in pos:
+            continue
+        x0, y0 = pos[p]; x1, y1 = pos[c]
+        xm = (x0 + x1) / 2
+        ex += [x0, xm, xm, x1, None]
+        ey += [y0, y0, y1, y1, None]
+    traces = [go.Scatter(x=ex, y=ey, mode='lines',
+                         line=dict(width=1, color='#c8c8c8'),
+                         hoverinfo='skip', showlegend=False)]
+    agg_x, agg_y, agg_l, agg_c, agg_h = [], [], [], [], []
+    lf_x,  lf_y,  lf_l,  lf_c,  lf_h  = [], [], [], [], []
+    for n in dnodes:
+        nid = int(n['id'])
+        if nid not in pos:
+            continue
+        x, y = pos[nid]
+        # Placeholder ids are not in `nodes`, so their count is 0 here; their
+        # name already states how many variables they hide.
+        lc   = len(_leaf_ids(nodes, nid))
+        lab  = str(n.get('name', nid))
+        htxt = (f"<b>{_wrap(n.get('name',''))}</b><br>Type: {n.get('type','')}"
+                f"<br>Variables: {lc}<br><br>{_wrap(n.get('desc',''))}")
+        col  = _bl_node_color(n)
+        if n.get('type') in ('root', 'aggregation', 'collapsed'):
+            agg_x.append(x); agg_y.append(y)
+            agg_l.append((lab + (f' ({lc})' if lc else ''))[:50])
+            agg_c.append(col); agg_h.append(htxt)
+        else:
+            lf_x.append(x); lf_y.append(y)
+            lf_l.append(lab[:40] if show_leaf_labels else '')
+            lf_c.append(col); lf_h.append(htxt)
+    if agg_x:
+        traces.append(go.Scatter(
+            x=agg_x, y=agg_y, mode='markers+text', text=agg_l,
+            textposition='middle right', hovertext=agg_h, hoverinfo='text',
+            marker=dict(size=16, color=agg_c, line=dict(color='white', width=2)),
+            showlegend=False))
+    if lf_x:
+        traces.append(go.Scatter(
+            x=lf_x, y=lf_y, mode='markers+text', text=lf_l,
+            textposition='middle right', hovertext=lf_h, hoverinfo='text',
+            marker=dict(size=7, color=lf_c, symbol='circle', opacity=0.75,
+                        line=dict(color='white', width=1)),
+            showlegend=False))
+    # Size the figure by the number of layout rows, i.e. displayed nodes with
+    # no outgoing edge.  Counting attribute leaves alone ignores collapsed
+    # placeholders, which squeezed shallow views with many labelled
+    # placeholders into the minimum height.
+    has_children = {p for p, _ in edges}
+    n_rows = max(12, sum(1 for nid in pos if nid not in has_children))
+    fig = go.Figure(traces)
+    fig.update_layout(
+        height=max(700, min(4000, int(n_rows * 32))),
+        margin=dict(l=20, r=220, t=30, b=20),
+        plot_bgcolor='white', paper_bgcolor='white',
+        xaxis=dict(visible=False, fixedrange=False),
+        yaxis=dict(visible=False, autorange='reversed', fixedrange=False),
+        dragmode='pan')
+    return fig
+# ─────────────────────────────────────────────────────────────────────────────
+# INPUT + CONFIGURATION  (main area — UX v2: configuration is front-and-centre,
+# not buried in the sidebar; expert knobs live under "Advanced settings")
+# ─────────────────────────────────────────────────────────────────────────────
+# Upload widget: exactly one metadata file in one of the listed formats.
+st.subheader('Upload metadata')
+uploaded = st.file_uploader(
+    'Upload a metadata file',
+    type=['csv', 'tsv', 'txt', 'xlsx', 'xls', 'json'],
+    accept_multiple_files=False,
+)
+# Expert knobs, collapsed by default.  The left column feeds build_hierarchy
+# (max_depth, min_cluster_size, branch_max); the right column limits which
+# rows are used for the build.
+with st.expander('Advanced settings', expanded=False):
+    gc1, gc2 = st.columns(2)
+    with gc1:
+        st.markdown('**Taxonomizer**')
+        # Slider arguments are (label, min, max, default, step).
+        tx_max_depth = st.slider('Max taxonomy depth', 2, 6, 3, 1,
+                                 help='How many abstract-to-concrete levels to build')
+        tx_min_size  = st.slider('Min cluster size', 3, 20, 6, 1,
+                                 help='Clusters smaller than this stop splitting (leaves attach directly)')
+        tx_branch    = st.slider('Max branches per node', 3, 12, 8, 1,
+                                 help='Upper bound on clusters per split; the actual number is chosen by silhouette')
+    with gc2:
+        st.markdown('**Display**')
+        # Rows beyond max_items are dropped before the build (first rows kept).
+        max_items     = st.slider('Maximum variables', 25, 1200, 900, 25,
+                                  help='Cap on variables included (lower only to speed up very large files). '
+                                       'Default keeps full datasets like HCP (813).')
+        # Matched case-insensitively against the `_group_path` column.
+        group_filter  = st.text_input('Row filter (optional)', value='',
+                                      help='Filter rows by contextual path text before building')
+# ─────────────────────────────────────────────────────────────────────────────
+# MAIN
+# ─────────────────────────────────────────────────────────────────────────────
+# Landing state: with no upload yet, show a prompt plus the method summary and
+# halt this script run.
+# NOTE(review): the table below lists `glove-wiki-gigaword-100` under
+# "Word2Vec skip-gram" and "average-linkage" for the clustering, while the
+# build_hierarchy docstring mentions Ward linkage — confirm the wording against
+# the actual implementation.
+if not uploaded:
+    st.info('Upload a metadata CSV / XLSX / JSON file to begin.')
+    st.markdown("""
+    ### Baseline algorithm — Taxonomizer (semantic space)
+    Based on **Mahmood & Mueller, IEEE TVCG 2019** (Taxonomizer), adapted to a
+    metadata-only setting. No hardcoded domain patterns, no external APIs.
+    | Step | Method | Paper |
+    |------|--------|-------|
+    | Variable representation | **short attribute name** (description's name clause; codes are OOV) | Taxonomizer §3.2 / §4.1 |
+    | Embedding | Word2Vec skip-gram — average of word vectors (`glove-wiki-gigaword-100`) | Taxonomizer §3.2 |
+    | Semantic space | Cosine-distance matrix (no data space — schema has no raw values) | Taxonomizer §3.2 *(adapted)* |
+    | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
+    | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* |
+    This page is the pure Taxonomizer-style semantic-space reference method:
+    variable meanings are embedded and recursively clustered into a hierarchy,
+    with node labels generated from contrastive terms.
+    **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
+    **Approach 2** adds NMF/FASTopic aspect discovery + GMM clustering + optional LLM labels.
+    """)
+    # Nothing below runs until a file is uploaded.
+    st.stop()
+# Hand the upload to save_upload and keep the returned path for loading.
+path = save_upload(uploaded)
+@st.cache_data(show_spinner=False)
+def _load_profile(path_str: str):
+    """Load the file at ``path_str`` and auto-detect its column roles.
+
+    Returns ``(df, cfg, prof)``: the loaded table and the two values produced
+    by ``detect_roles``.  Results are cached by Streamlit, keyed on the path
+    string.
+    """
+    # NOTE(review): the cache key is the path only — presumably save_upload
+    # returns a different path when the file content changes; confirm,
+    # otherwise a re-upload under the same name could reuse a stale result.
+    df = load_any(Path(path_str))
+    cfg, prof = detect_roles(df)
+    return df, cfg, prof
+with st.spinner('Loading file…'):
+    df, auto_cfg, prof = _load_profile(str(path))
+# File preview: first rows of the upload plus the per-column role scores.
+st.subheader('File preview')
+with st.expander(f'{uploaded.name}  ({len(df):,} rows, {len(df.columns)} columns)',
+                 expanded=False):
+    st.dataframe(df.head(10), width='stretch')
+    # Show only the score columns the profile actually contains.
+    score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
+                  if c in prof.columns]
+    if score_cols:
+        _score_view = prof[score_cols]
+        # Sort only when the sort key survived the filter above; an
+        # unconditional sort raised KeyError for profiles without it.
+        if 'leaf_score' in score_cols:
+            _score_view = _score_view.sort_values('leaf_score', ascending=False)
+        st.dataframe(_score_view, width='stretch')
+# Column-role confirmation: auto-detected roles are offered as defaults and
+# the user can override them before building.
+st.subheader('Confirm column roles')
+cols = list(df.columns)
+# Scope widget keys to the uploaded file so a NEW file always shows its own
+# auto-detected defaults (Streamlit otherwise keeps the previous file's
+# selections under a fixed key, which silently overrides the new defaults).
+_fk = safe_name(uploaded.name)
+with st.expander('Column configuration', expanded=True):
+    left, right = st.columns(2)
+    with left:
+        # Defaults are restricted to columns that exist in this file.
+        leaf_cols = st.multiselect('Leaf variable column(s)', cols,
+            default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
+        group_cols = st.multiselect('Context column(s) (optional)', cols,
+            default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}',
+            help='Optional contextual columns for display/filtering.')
+    with right:
+        text_cols = st.multiselect('Text/description column(s)', cols,
+            default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
+        meta_cols = st.multiselect('Metadata/type column(s)', cols,
+            default=[c for c in auto_cfg.get('metadata_cols', []) if c in cols], key=f'meta_{_fk}')
+# A leaf column is mandatory; the other roles may be left empty.
+if not leaf_cols:
+    st.error('Choose at least one leaf variable column.')
+    st.stop()
+# Role configuration passed to build_canonical when the build is triggered.
+cfg = {'leaf_cols': leaf_cols, 'group_cols': group_cols,
+       'text_cols': text_cols, 'metadata_cols': meta_cols}
+# Build action: runs only on the script run triggered by the button click and
+# stores its results in session_state so they survive later reruns.
+# NOTE(review): the session keys are fixed, not scoped by file (unlike the
+# column widgets), so a previous build stays visible after a new upload until
+# the button is clicked again.
+if st.button('Build baseline hierarchy', type='primary'):
+    # ── load Word2Vec model (cached after first call) ──────────────────────
+    with st.spinner('Loading Word2Vec model (first run downloads ~66 MB)…'):
+        _w2v = _load_w2v()
+    if _w2v is None:
+        st.stop()
+    with st.spinner('Building hierarchy…'):
+        _can = build_canonical(df, cfg, source=Path(uploaded.name).stem)
+        if group_filter.strip():
+            # regex=False: the filter is plain text.  As a regex, input such as
+            # "(" or "[" raised re.error and crashed the page, and "." or "+"
+            # matched more than the user typed.
+            _can = _can[_can['_group_path'].str.contains(
+                group_filter.strip(), case=False, na=False, regex=False)].copy()
+        # Keep the first max_items rows when the file exceeds the cap.
+        if len(_can) > max_items:
+            _can = _can.head(max_items).copy()
+        # build_hierarchy indexes rows by position, so renumber from 0.
+        _can = _can.reset_index(drop=True)
+        if len(_can) < 2:
+            st.error('Need at least 2 variables after filtering.')
+            st.stop()
+        _pname = Path(uploaded.name).stem
+        _nodes = build_hierarchy(_can, _w2v, project=_pname,
+                                 max_depth=tx_max_depth,
+                                 min_cluster_size=tx_min_size,
+                                 branch_max=tx_branch)
+    st.session_state['_bl_nodes']   = _nodes
+    st.session_state['_bl_can']     = _can
+    st.session_state['_bl_project'] = _pname
+# Nothing built yet in this session: prompt the user and halt this run.
+if '_bl_nodes' not in st.session_state:
+    st.info('Configure columns above then click **Build baseline hierarchy**.')
+    st.stop()
+# Pull the latest build out of session state.
+nodes, can, project_name = (
+    st.session_state[_key] for _key in ('_bl_nodes', '_bl_can', '_bl_project'))
+_sm = _structural_stats(nodes)
+n_leaves   = sum(1 for n in nodes if n['type'] == 'attribute')
+n_internal = sum(1 for n in nodes if n['type'] == 'aggregation')
+st.divider()
+# Headline metrics, one per column.
+c1, c2, c3, c4 = st.columns(4)
+for _metric_col, _metric_label, _metric_value in (
+        (c1, 'Variables', n_leaves),
+        (c2, 'Aggregation nodes', n_internal),
+        (c3, 'Max depth', _sm['max_depth']),
+        (c4, 'Avg branching', _sm['avg_branching_factor'])):
+    _metric_col.metric(_metric_label, _metric_value)
+# Result tabs; the index order here is relied on by the `with tabs[i]` blocks.
+tabs = st.tabs(['Visualization', 'Node detail', 'Canonical table', 'Export', 'Evaluation'])
+with tabs[0]:
+    # ── Visualization controls (above chart — matches Approach 1 / 2) ─────────
+    vc1, vc2, vc3 = st.columns([3, 2, 1])
+    with vc1:
+        viz_mode = st.radio(
+            'View mode',
+            ['Sunburst (drill-down)', 'Treemap', 'Node-link tree'],
+            horizontal=True, index=0,
+            help='Sunburst best for large hierarchies [Taxonomizer]. '
+                 'Node-link best for inspecting structure at moderate depth.')
+    with vc2:
+        # Passed to the sunburst and node-link views; the treemap ignores it.
+        display_depth = st.slider('Depth (Level of Detail)', 1, 8, 4, 1,
+                                  help='How many levels to reveal at once.')
+    with vc3:
+        # Only the node-link view reads this flag.
+        show_leaf_labels = st.checkbox('Leaf labels', value=False,
+                                       help='Show variable names on the node-link tree.')
+    st.divider()
+    # Render the chart matching the selected view mode.
+    if viz_mode == 'Sunburst (drill-down)':
+        st.plotly_chart(plot_sunburst(nodes, max_depth=display_depth),
+                        width='stretch')
+        st.caption('Green = Baseline. Click a sector to drill down; click the centre to go back.')
+    elif viz_mode == 'Treemap':
+        st.plotly_chart(plot_treemap(nodes), width='stretch')
+    else:
+        st.plotly_chart(plot_node_link(nodes, max_depth=display_depth,
+                                       show_leaf_labels=show_leaf_labels),
+                        width='stretch')
+with tabs[1]:
+    # Node detail: pick an internal node and list the variables beneath it.
+    nm = _nmap(nodes)
+    agg_nodes = [n for n in nodes if n['type'] in ('aggregation', 'root')]
+    options   = [f'{n["name"]}  [{len(_leaf_ids(nodes, int(n["id"])))} vars]'
+                 for n in agg_nodes]
+    if options:
+        # Select by position rather than parsing the label back into a name:
+        # cluster labels are not unique, so a name lookup returned the first
+        # node with that label, and a name containing "  [" was cut short.
+        sel_idx  = st.selectbox('Select a node', list(range(len(agg_nodes))),
+                                format_func=lambda i: options[i])
+        sel_node = agg_nodes[sel_idx] if sel_idx is not None else None
+        if sel_node:
+            lids = _leaf_ids(nodes, int(sel_node['id']))
+            # Translate attribute node ids into the canonical table's leaf ids.
+            leaf_ids_set = {nm[i]['metadata']['leaf_id']
+                            for i in lids if i in nm and 'metadata' in nm[i]}
+            sub = can[can['_leaf_id'].isin(leaf_ids_set)]
+            st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
+            st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
+                         width='stretch')
+with tabs[2]:
+    # Canonical table, shown without the `_group_path` column when present.
+    st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), width='stretch')
+with tabs[3]:
+    # Export: browser downloads plus an optional save into the project tree.
+    _base = safe_name(project_name)
+    col1, col2 = st.columns(2)
+    with col1:
+        st.download_button(
+            'Hierarchy JSON',
+            data=json.dumps(nodes, indent=2, ensure_ascii=False).encode('utf-8'),
+            file_name=f'{_base}_baseline_hierarchy.json',
+            mime='application/json',
+            width='stretch',
+        )
+    with col2:
+        st.download_button(
+            'Canonical CSV',
+            data=can.to_csv(index=False).encode('utf-8'),
+            file_name=f'{_base}_baseline_canonical.csv',
+            mime='text/csv',
+            width='stretch',
+        )
+    st.divider()
+    # ── Save directly into the project's outputs/baseline/ folder ──────────────
+    # Resolved relative to this file: two directory levels up, then
+    # outputs/baseline.
+    _out_dir = Path(__file__).resolve().parent.parent / 'outputs' / 'baseline'
+    st.markdown('### Save to project folder')
+    st.caption(
+        "The download buttons above go to your browser's Downloads folder (a browser "
+        f'restriction). This button instead writes the files into `{_out_dir}` with the '
+        'dataset name — convenient for `evaluate_all.py`.'
+    )
+    if st.button('Save all to outputs/baseline/', type='primary',
+                 width='stretch'):
+        # Best-effort save: any failure is reported in the UI instead of
+        # aborting the page.
+        try:
+            _out_dir.mkdir(parents=True, exist_ok=True)
+            (_out_dir / f'{_base}_baseline_hierarchy.json').write_text(
+                json.dumps(nodes, indent=2, ensure_ascii=False), encoding='utf-8')
+            can.to_csv(_out_dir / f'{_base}_baseline_canonical.csv', index=False)
+            st.success(f'Saved to `{_out_dir}`:\n\n'
+                       f'- {_base}_baseline_hierarchy.json\n'
+                       f'- {_base}_baseline_canonical.csv')
+        except Exception as _e:
+            st.error(f'Could not save: {_e}')
+with tabs[4]:
+    # Evaluation: all metrics come from the hierarchy_eval module and are
+    # computed on every rerun of this script.
+    import hierarchy_eval as he
+    st.subheader('Hierarchy Quality Evaluation')
+    st.caption(
+        'No manually curated reference taxonomy is available for this experiment. '
+        'The metrics below are reference-free: they assess hierarchy structure, '
+        'label coherence and interpretability directly.'
+    )
+    with st.spinner('Computing reference-free metrics…'):
+        # tm is read below via 'pc_coherence', 'sibling_diversity' and 'encoder'.
+        tm = he.traco_metrics(nodes)
+        npmi = he.npmi_coherence(nodes, can['_text'].tolist())
+    # ── PRIMARY: reference-free hierarchy quality ─────────────────────────────
+    st.markdown('#### Primary — reference-free hierarchy quality')
+    p1, p2, p3 = st.columns(3)
+    p1.metric('Parent–child coherence', tm['pc_coherence'],
+              help='TraCo (Wu et al., AAAI 2024). Mean similarity of each node to its parent. '
+                   'Higher = children correctly nest under their parent theme.')
+    p2.metric('Sibling diversity', tm['sibling_diversity'],
+              help='TraCo (Wu et al., AAAI 2024). Mean distance between sibling nodes. '
+                   'Higher = siblings are distinct (LOW = redundant/repeated siblings).')
+    p3.metric('NPMI label coherence', npmi,
+              help='Lau et al., EACL 2014. Whether node-label terms genuinely co-occur in the '
+                   'data. Higher = meaningful labels, not arbitrary term salads.')
+    st.caption(f'Embedding backend: **{tm["encoder"]}**.  '
+               'Coherence & diversity ∈ [−1, 1]; NPMI ∈ ≈[−1, 1].')
+    # ── Label-quality proxies (interpretability) ──────────────────────────────
+    st.markdown('#### Label quality *(interpretability — reference-free)*')
+    lq = he.label_quality(nodes)
+    l1, l2, l3 = st.columns(3)
+    l1.metric('Concept-valid labels', f"{lq['concept_label_pct']}%",
+              help='% of internal labels that read as a real concept (short noun '
+                   'phrase, WordNet head) rather than a "/"-joined term fragment.')
+    l2.metric('Sibling label redundancy', f"{lq['redundancy_pct']}%",
+              help='% of internal labels duplicating a sibling label (lower is better).')
+    l3.metric('Avg label words', lq['avg_label_words'],
+              help='Mean label length in words (shorter = more name-like).')
+    # ── Structural metrics ────────────────────────────────────────────────────
+    # These come from hierarchy_eval.structural_stats, not from the local
+    # _structural_stats that feeds the headline metrics above the tabs.
+    st.markdown('#### Structural statistics')
+    sm = he.structural_stats(nodes)
+    s1, s2, s3, s4, s5 = st.columns(5)
+    s1.metric('Aggregation nodes', sm['n_aggregation_nodes'])
+    s2.metric('Max leaf depth',    sm['max_depth'])
+    s3.metric('Avg leaf depth',    sm['avg_leaf_depth'])
+    s4.metric('Avg branching',     sm['avg_branching_factor'])
+    s5.metric('Singleton nodes',   f"{sm['singleton_nodes_%']}%",
+              help='Aggregation nodes with a single child (sparse-hierarchy indicator)')

version2/views/viewer.py ADDED Viewed

	@@ -0,0 +1,661 @@

+"""
+Metadata Hierarchy Explorer — TFM 2026
+Pre-built results viewer for Baseline, Approach 1, and Approach 2.
+Rendering faithfully replicates each app's display pipeline:
+  - Baseline    : raw tree, Greens, Sunburst + Treemap
+  - Approach 1  : raw tree, Blues,  Sunburst + Treemap + Node-link + Facets
+  - Approach 2  : compress one-child chains, Viridis, Sunburst + Treemap + Node-link
+Level-of-Detail controls (depth, leaf labels, hidden nodes, compress chains)
+match the controls in the individual apps.
+"""
+from __future__ import annotations
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+import numpy as np
+import plotly.graph_objects as go
+import streamlit as st
+# Shared method names / descriptions / display config (single source of truth).
+import sys
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+import methods  # noqa: E402
+# Page config is set by the navigation router (demo.py).
+# Directory holding the saved build outputs: `outputs/` one level above this
+# file's directory.
+ROOT = Path(__file__).resolve().parent.parent / "outputs"
+# Default value for the `max_depth` parameter of the plot functions below.
+DEFAULT_DEPTH = 7
+# ─────────────────────────────────────────────────────────────────────────────
+# PRE-BUILT OUTPUT PATHS
+# ─────────────────────────────────────────────────────────────────────────────
+# Layout: method name → dataset name → {artifact kind: file path}.  Every entry
+# has a "hierarchy" file; Approach 1 additionally has a "facets" file, and
+# Approach 2's hierarchy points at its `_lod.json` output.
+PREBUILT = {
+    "Baseline": {
+        "AI-MIND": {"hierarchy": ROOT / "baseline" / "ai-mind-variable-descriptions_in__baseline_hierarchy.json"},
+        "HCP":     {"hierarchy": ROOT / "baseline" / "HCP_S1200_DataDictionary_Oct_30_2023_baseline_hierarchy.json"},
+    },
+    "Approach 1": {
+        "AI-MIND": {
+            "hierarchy": ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_hierarchy.json",
+            "facets":    ROOT / "approach_1" / "ai-mind-variable-descriptions_in__approach1_facets.json",
+        },
+        "HCP": {
+            "hierarchy": ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_hierarchy.json",
+            "facets":    ROOT / "approach_1" / "HCP_S1200_DataDictionary_Oct_30_2023_approach1_facets.json",
+        },
+    },
+    "Approach 2": {
+        "AI-MIND": {"hierarchy": ROOT / "approach_2" / "ai-mind-variable-descriptions_in__approach2_lod.json"},
+        "HCP":     {"hierarchy": ROOT / "approach_2" / "HCP_S1200_DataDictionary_Oct_30_2023_approach2_lod.json"},
+    },
+}
+# Per-approach rendering config + descriptions now live in methods.py
+# (methods.METHODS[key] carries color / compress / node_link / title / tag / about).
+# ─────────────────────────────────────────────────────────────────────────────
+# TREE TRANSFORMS  (copied from approach_2.py — display-only, exact behaviour)
+# ─────────────────────────────────────────────────────────────────────────────
+def _filter_dissolved(nodes: list) -> list:
+    """Drop hidden nodes and every reference to them.
+
+    A node is hidden when its type is "dissolved" or its ``isShown`` flag is
+    exactly ``False``.  With nothing hidden the input list is returned as is;
+    otherwise the result holds shallow copies whose ``related`` lists contain
+    only the surviving child ids, as ints.
+    """
+    hidden = set()
+    for node in nodes:
+        if node.get("type") == "dissolved" or node.get("isShown") is False:
+            hidden.add(int(node["id"]))
+    if not hidden:
+        return nodes
+
+    def _without_hidden(node: dict) -> dict:
+        clone = dict(node)
+        survivors = []
+        for raw_child in node.get("related", []):
+            child_id = int(raw_child)
+            if child_id not in hidden:
+                survivors.append(child_id)
+        clone["related"] = survivors
+        return clone
+
+    return [_without_hidden(node) for node in nodes
+            if int(node["id"]) not in hidden]
+def compress_one_child_chains(nodes: list) -> list:
+    """Merge every aggregation node with its only child when that child is
+    also an aggregation node.
+
+    The merged node keeps the parent's id and takes over the child's other
+    fields; name and description are joined, e.g. 'DMS' over 'DMS Recommended
+    Standard' becomes 'DMS / DMS Recommended Standard'.  Hidden nodes are
+    filtered out first, and merging repeats until no such pair remains.
+    """
+    by_id = {int(node["id"]): dict(node) for node in _filter_dissolved(nodes)}
+
+    def _first_chain_parent():
+        # First aggregation node (in dict order) whose single child is an
+        # aggregation node, or None when there is none left.
+        for parent_id, parent in by_id.items():
+            if parent.get("type") != "aggregation":
+                continue
+            kids = parent.get("related", [])
+            if len(kids) != 1:
+                continue
+            if by_id.get(int(kids[0]), {}).get("type") == "aggregation":
+                return parent_id
+        return None
+
+    parent_id = _first_chain_parent()
+    while parent_id is not None:
+        parent = by_id[parent_id]
+        child_id = int(parent["related"][0])
+        child = by_id[child_id]
+        merged = dict(child)
+        merged["id"] = parent_id
+        merged["name"] = f"{parent['name']} / {child['name']}"
+        merged["desc"] = f"{parent.get('desc', '')} | {child.get('desc', '')}"
+        by_id[parent_id] = merged
+        if child_id in by_id:
+            del by_id[child_id]
+        # Re-point any remaining reference to the removed child at the merged
+        # node.
+        for node in by_id.values():
+            node["related"] = [parent_id if int(ref) == child_id else int(ref)
+                               for ref in node.get("related", [])]
+        parent_id = _first_chain_parent()
+    return list(by_id.values())
+# ─────────────────────────────────────────────────────────────────────────────
+# RENDER HELPERS  (DAG-safe value map — copied from approach_2.py)
+# ─────────────────────────────────────────────────────────────────────────────
+def _leaf_ids(nodes: list, nid: int) -> list:
+    """Return the ids of all attribute (leaf) nodes reachable from *nid*.
+
+    ``related`` links are followed depth-first in list order and each leaf id
+    appears once, at its first occurrence.  Every node is expanded at most
+    once, so shared sub-branches are not re-walked and a cyclic ``related``
+    link cannot cause unbounded recursion.  Ids missing from *nodes* are
+    skipped; an unknown *nid* yields an empty list.
+    """
+    by_id = {int(n["id"]): n for n in nodes}
+    leaves = []
+    seen = set()
+    # Explicit stack instead of recursion: deep hierarchies cannot hit the
+    # interpreter's recursion limit.
+    stack = [int(nid)]
+    while stack:
+        current = stack.pop()
+        if current in seen:
+            continue
+        seen.add(current)
+        node = by_id.get(current)
+        if not node:
+            continue
+        if node.get("type") == "attribute":
+            leaves.append(current)
+            continue
+        # Push children reversed so they are popped in their original order.
+        stack.extend(reversed([int(c) for c in node.get("related", [])]))
+    return leaves
+def _parent_map(nodes: list) -> dict:
+    """Map each child id to the id of the first node that lists it in ``related``.
+
+    Nodes are scanned in list order; a child that appears under several
+    parents keeps the first parent encountered.
+    """
+    first_parent = {}
+    for parent in nodes:
+        for child in parent.get("related", []):
+            child_id = int(child)
+            if child_id in first_parent:
+                continue
+            first_parent[child_id] = int(parent["id"])
+    return first_parent
+def _tree_value_map(nodes: list, pm: dict) -> dict:
+    """Compute a size value for every node from the child -> parent map *pm*.
+
+    Attribute nodes and nodes without children in *pm* count as 1; any other
+    node is the sum of its children's values (never less than 1).  A node is
+    provisionally sized 1 while its subtree is being evaluated, so a loop in
+    *pm* terminates instead of recursing forever.
+
+    Returns ``{node id: value}`` for the ids present in *nodes*.
+    """
+    children_of = {}
+    for child_id, parent_id in pm.items():
+        children_of.setdefault(int(parent_id), []).append(int(child_id))
+    by_id = {int(node["id"]): node for node in nodes}
+    sizes = {}
+    def size_of(node_id: int) -> int:
+        cached = sizes.get(node_id)
+        if cached is not None:
+            return cached
+        # Provisional value: doubles as the answer for leaves and as the
+        # re-entry guard while this node's children are being summed.
+        sizes[node_id] = 1
+        node = by_id.get(node_id)
+        if node is not None and node.get("type") == "attribute":
+            return 1
+        branch = children_of.get(node_id, [])
+        if branch:
+            sizes[node_id] = max(1, sum(size_of(child) for child in branch))
+        return sizes[node_id]
+    return {node_id: size_of(node_id) for node_id in by_id}
+def _wrap_hover(text: str, width: int = 80) -> str:
+    """Word-wrap *text* for a Plotly hover box, joining lines with ``<br>``.
+
+    Falsy input yields an empty string.  Existing newlines are kept as line
+    breaks (a blank line stays a blank line); each line is wrapped to *width*
+    characters.
+    """
+    import textwrap
+    content = str(text or "")
+    if content == "":
+        return ""
+    wrapped = [
+        piece
+        for source_line in content.split("\n")
+        for piece in (textwrap.wrap(source_line, width=width) or [""])
+    ]
+    return "<br>".join(wrapped)
+def plot_sunburst(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
+    """Build the drill-down sunburst figure for a hierarchy.
+
+    Args:
+        nodes: hierarchy node dicts (``id``, ``name``, ``type``, ``related``,
+            optional ``desc``); dissolved / hidden nodes are removed first.
+        color: Plotly colorscale passed to the sector marker.
+        max_depth: number of levels drawn initially (Plotly ``maxdepth``).
+
+    Returns:
+        A ``go.Figure`` containing one ``go.Sunburst`` trace.
+    """
+    nodes = _filter_dissolved(nodes)
+    pm = _parent_map(nodes)
+    # Nodes that no surviving parent lists are drawn directly under the root
+    # (id 0).  Register that link *before* sizing so the root's value includes
+    # them: with branchvalues="total" Plotly does not draw a chart in which a
+    # parent's value is smaller than the sum of its children.
+    for n in nodes:
+        if int(n["id"]) != 0:
+            pm.setdefault(int(n["id"]), 0)
+    vm = _tree_value_map(nodes, pm)
+    ids, labels, parents, values, hover = [], [], [], [], []
+    for n in nodes:
+        nid = int(n["id"])
+        # Hover shows the number of leaf variables reachable from this node.
+        lc = len(_leaf_ids(nodes, nid))
+        ids.append(str(nid))
+        labels.append(str(n.get("name", ""))[:40])  # keep sector labels short
+        parents.append("" if nid == 0 else str(pm[nid]))
+        values.append(vm.get(nid, 1))
+        hover.append(f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
+                     f"Variables: {lc}<br><br>{_wrap_hover(n.get('desc', ''))}")
+    fig = go.Figure(go.Sunburst(
+        ids=ids, labels=labels, parents=parents, values=values,
+        branchvalues="total", hovertext=hover, hoverinfo="text",
+        maxdepth=max_depth, insidetextorientation="radial",
+        marker=dict(colorscale=color, line=dict(width=1, color="white"))))
+    fig.update_layout(height=700, margin=dict(l=10, r=10, t=40, b=10),
+                      title=dict(text="Click sector to drill down — click centre to go back",
+                                 font=dict(size=13), x=0.5))
+    return fig
+def plot_treemap(nodes: list, color: str, max_depth: int = DEFAULT_DEPTH):
+    """Build the treemap figure for a hierarchy.
+
+    Args:
+        nodes: hierarchy node dicts (``id``, ``name``, ``type``, ``related``,
+            optional ``desc``); dissolved / hidden nodes are removed first.
+        color: Plotly colorscale passed to the tile marker.
+        max_depth: number of levels drawn initially (Plotly ``maxdepth``).
+
+    Returns:
+        A ``go.Figure`` containing one ``go.Treemap`` trace.
+    """
+    nodes = _filter_dissolved(nodes)
+    pm = _parent_map(nodes)
+    # Nodes that no surviving parent lists are drawn directly under the root
+    # (id 0).  Register that link *before* sizing so the root's value includes
+    # them: with branchvalues="total" Plotly does not draw a chart in which a
+    # parent's value is smaller than the sum of its children.
+    for n in nodes:
+        if int(n["id"]) != 0:
+            pm.setdefault(int(n["id"]), 0)
+    vm = _tree_value_map(nodes, pm)
+    ids, labels, parents, values, hover = [], [], [], [], []
+    for n in nodes:
+        nid = int(n["id"])
+        # Hover shows the number of leaf variables reachable from this node.
+        lc = len(_leaf_ids(nodes, nid))
+        ids.append(str(nid))
+        labels.append(str(n.get("name", ""))[:40])  # keep tile labels short
+        parents.append("" if nid == 0 else str(pm[nid]))
+        values.append(vm.get(nid, 1))
+        hover.append(f"<b>{n.get('name', '')}</b><br>Variables: {lc}<br>"
+                     f"{_wrap_hover(n.get('desc', ''))}")
+    fig = go.Figure(go.Treemap(
+        ids=ids, labels=labels, parents=parents, values=values,
+        branchvalues="total", hovertext=hover, hoverinfo="text",
+        textinfo="label+value", maxdepth=max_depth,
+        marker=dict(colorscale=color, line=dict(width=1, color="white"))))
+    fig.update_layout(height=700, margin=dict(l=10, r=10, t=10, b=10))
+    return fig
+# ─────────────────────────────────────────────────────────────────────────────
+# NODE-LINK TREE  (Reingold-Tilford layout — copied from approach_2.py)
+# ─────────────────────────────────────────────────────────────────────────────
+def _node_color(n: dict) -> str:
+    """Return the marker colour for a node, chosen by its ``type`` field.
+
+    Root, attribute and collapsed nodes have dedicated colours; every other
+    type (or a missing type) gets the neutral grey.
+    """
+    kind = n.get("type", "")
+    palette = (
+        ("root", "#c44e52"),
+        ("attribute", "#4C72B0"),
+        ("collapsed", "#bbbbbb"),
+    )
+    for known_kind, colour in palette:
+        if kind == known_kind:
+            return colour
+    return "#8C8C8C"
+def _display_graph(nodes: list, max_depth: int, show_hidden: bool):
+    """Select the nodes and edges to draw in the node-link view.
+
+    Walks ``related`` links from the node with id 0.  Unless *show_hidden* is
+    true, nodes whose ``isShown`` is exactly ``False`` are skipped (the start
+    node is always kept).  A node reached at *max_depth* that still has
+    children is not expanded; it gets one synthetic "collapsed" child that
+    reports how many leaf variables sit below it.
+
+    Returns:
+        ``(display_nodes, edges)`` — the node dicts to draw (real nodes plus
+        synthetic placeholders) and a list of ``(parent_id, child_id)`` pairs.
+    """
+    m = {int(n["id"]): n for n in nodes}
+    dnodes: dict = {}
+    edges: list = []
+    # Synthetic placeholder ids are numbered upward from 10**9 + 1.
+    # NOTE(review): presumably chosen to stay clear of real node ids — confirm.
+    counter = 10 ** 9
+    def rec(nid, depth):
+        nonlocal counter
+        n = m.get(int(nid))
+        if not n:
+            return
+        if not show_hidden and n.get("isShown") is False and depth > 0:
+            return
+        dnodes[int(nid)] = n
+        if depth >= max_depth and n.get("related"):
+            # Depth limit reached: stand in for the whole subtree with a
+            # single placeholder child and stop descending.
+            counter += 1
+            cid = counter
+            n_leaves = len(_leaf_ids(nodes, nid))
+            dnodes[cid] = {"id": cid, "name": f"… {n_leaves} variables",
+                           "type": "collapsed", "related": [],
+                           "desc": f"Collapsed: {n.get('name')}"}
+            edges.append((int(nid), cid))
+            return
+        for c in n.get("related", []):
+            ch = m.get(int(c))
+            if not ch:
+                # Child id is not present in ``nodes``: draw no edge to it.
+                continue
+            if not show_hidden and ch.get("isShown") is False:
+                continue
+            edges.append((int(nid), int(c)))
+            rec(int(c), depth + 1)
+    rec(0, 0)
+    return list(dnodes.values()), edges
+def _positions(edges: list):
+    """Assign ``(x, y)`` coordinates to every node reachable from id 0.
+
+    ``x`` grows with depth.  Nodes without children are stacked on consecutive
+    rows in visiting order; a node with children sits at the mean ``y`` of its
+    children.
+
+    Returns ``{node id: (x, y)}``.
+    """
+    H_SCALE, V_SPACE = 3.0, 1.8
+    kids_of: dict = defaultdict(list)
+    for parent, child in edges:
+        kids_of[parent].append(child)
+    coords: dict = {}
+    next_row = 0
+    def place(node_id, level):
+        nonlocal next_row
+        kids = kids_of.get(node_id, [])
+        if kids:
+            # Internal node: centre it vertically over its children.
+            y = float(np.mean([place(kid, level + 1) for kid in kids]))
+        else:
+            # Childless node: take the next free row.
+            y = next_row * V_SPACE
+            next_row += 1
+        coords[node_id] = (level * H_SCALE, y)
+        return y
+    place(0, 0)
+    return coords
+def plot_node_link(nodes: list, max_depth: int, show_hidden: bool, show_leaf_labels: bool):
+    """Build the node-link tree figure (three scatter traces: edges, leaves,
+    internal nodes).
+
+    Args:
+        nodes: hierarchy node dicts; dissolved / hidden nodes are removed first.
+        max_depth: depth at which subtrees are replaced by a collapsed placeholder.
+        show_hidden: forwarded to ``_display_graph``.
+        show_leaf_labels: draw text labels next to attribute (leaf) markers.
+
+    Returns:
+        A ``go.Figure`` whose height grows with the number of plotted markers.
+    """
+    # NOTE(review): _filter_dissolved already drops every node whose isShown is
+    # False, so ``show_hidden`` cannot bring any node back at this point —
+    # confirm whether hidden nodes should survive this filter when it is set.
+    nodes = _filter_dissolved(nodes)
+    dnodes, edges = _display_graph(nodes, max_depth, show_hidden)
+    pos = _positions(edges)
+    # Edge trace: each link is an elbow (horizontal, vertical, horizontal);
+    # the trailing None breaks the line between consecutive links.
+    ex, ey = [], []
+    for p, c in edges:
+        if p not in pos or c not in pos:
+            continue
+        x0, y0 = pos[p]
+        x1, y1 = pos[c]
+        xm = (x0 + x1) / 2
+        ex += [x0, xm, xm, x1, None]
+        ey += [y0, y0, y1, y1, None]
+    traces = [go.Scatter(x=ex, y=ey, mode="lines",
+                         line=dict(width=1, color="#c8c8c8"),
+                         hoverinfo="skip", showlegend=False)]
+    # Leaves and internal nodes go into separate traces so they can use
+    # different marker sizes and label settings.
+    agg_x, agg_y, agg_lab, agg_col, agg_hov = [], [], [], [], []
+    lf_x, lf_y, lf_lab, lf_col, lf_hov = [], [], [], [], []
+    for n in dnodes:
+        nid = int(n["id"])
+        if nid not in pos:
+            continue
+        x, y = pos[nid]
+        lc = len(_leaf_ids(nodes, nid))
+        lab = str(n.get("name", ""))[:32]  # truncate long names for the label
+        hov = (f"<b>{n.get('name', '')}</b><br>Type: {n.get('type', '')}<br>"
+               f"Variables: {lc}")
+        if n.get("type") == "attribute":
+            lf_x.append(x); lf_y.append(y); lf_col.append(_node_color(n))
+            lf_lab.append(lab if show_leaf_labels else "")
+            lf_hov.append(hov)
+        else:
+            agg_x.append(x); agg_y.append(y); agg_col.append(_node_color(n))
+            agg_lab.append(lab); agg_hov.append(hov)
+    traces.append(go.Scatter(
+        x=lf_x, y=lf_y, mode="markers+text" if show_leaf_labels else "markers",
+        text=lf_lab, textposition="middle right", textfont=dict(size=9),
+        marker=dict(size=7, color=lf_col, line=dict(width=0.5, color="white")),
+        hovertext=lf_hov, hoverinfo="text", showlegend=False))
+    traces.append(go.Scatter(
+        x=agg_x, y=agg_y, mode="markers+text", text=agg_lab,
+        textposition="middle right", textfont=dict(size=10),
+        marker=dict(size=13, color=agg_col, line=dict(width=1, color="white")),
+        hovertext=agg_hov, hoverinfo="text", showlegend=False))
+    # Scale the figure height with the larger of the two marker groups,
+    # never below 600 px.
+    n_rows = max(len(lf_y), len(agg_y), 1)
+    fig = go.Figure(traces)
+    fig.update_layout(
+        height=max(600, n_rows * 16),
+        margin=dict(l=10, r=140, t=10, b=10),
+        xaxis=dict(visible=False), yaxis=dict(visible=False),
+        plot_bgcolor="white",
+    )
+    return fig
+# ─────────────────────────────────────────────────────────────────────────────
+# STATS / SAFE RENDERING
+# ─────────────────────────────────────────────────────────────────────────────
+def _tree_depth(nodes: list) -> int:
+    """Length of the longest chain of ``related`` links starting at the root
+    (id 0), counting the root as depth 0.
+
+    Dissolved / hidden nodes are ignored and child ids missing from *nodes*
+    are skipped.  Each node's height is computed once, and a link back to a
+    node on the current path is ignored, so shared sub-branches are not
+    re-walked and a cyclic link cannot cause unbounded recursion.
+    """
+    nodes = _filter_dissolved(nodes)
+    m = {int(n["id"]): n for n in nodes}
+    height: dict = {}
+    on_path: set = set()
+    def rec(nid: int) -> int:
+        if nid in height:
+            return height[nid]
+        on_path.add(nid)
+        best = 0
+        for c in m.get(nid, {}).get("related", []):
+            cid = int(c)
+            # Skip unknown ids and back-links to an ancestor on this path.
+            if cid in m and cid not in on_path:
+                best = max(best, 1 + rec(cid))
+        on_path.discard(nid)
+        height[nid] = best
+        return best
+    return rec(0)
+def safe_render_depth(nodes: list, requested: int) -> int:
+    """Cap the *initial* sunburst/treemap depth for large hierarchies.
+
+    Plotly silently blanks these charts when asked to draw too many sectors at
+    once (large hierarchies like HCP).  More than 400 visible nodes caps the
+    depth at 3, more than 150 at 4; otherwise *requested* is returned as is.
+    The chart stays fully drillable by clicking, so no data is lost.
+    """
+    visible = len(_filter_dissolved(nodes))
+    # (node-count threshold, depth cap), largest threshold first.
+    for threshold, cap in ((400, 3), (150, 4)):
+        if visible > threshold:
+            return min(requested, cap)
+    return requested
+# ─────────────────────────────────────────────────────────────────────────────
+# IO
+# ─────────────────────────────────────────────────────────────────────────────
+@st.cache_data(show_spinner=False)
+def _load_json(path_str: str):
+    """Read and parse the UTF-8 JSON file at *path_str* (wrapped by ``st.cache_data``)."""
+    return json.loads(Path(path_str).read_text(encoding="utf-8"))
+def _read_bytes(path_str: str) -> bytes:
+    """Return the raw contents of the file at *path_str*."""
+    return Path(path_str).read_bytes()
+@st.cache_data(show_spinner=False)
+def _outputs_zip(root_str: str) -> bytes:
+    """Zip every file under *root_str* for one-click download.
+
+    Archive member names are POSIX paths relative to the folder's parent, so
+    the top-level folder name is kept inside the archive.  Returns the ZIP as
+    bytes.
+    """
+    import io
+    import zipfile
+    base = Path(root_str)
+    files = sorted(entry for entry in base.rglob("*") if entry.is_file())
+    archive = io.BytesIO()
+    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as bundle:
+        for file_path in files:
+            member_name = file_path.relative_to(base.parent).as_posix()
+            bundle.write(file_path, arcname=member_name)
+    return archive.getvalue()
+def count_nodes(nodes: list) -> tuple[int, int]:
+    """Return ``(leaf_count, aggregation_count)`` for the visible hierarchy.
+
+    Dissolved / hidden nodes are excluded; nodes of any other type (for
+    example the root) are counted in neither number.
+    """
+    leaves = aggs = 0
+    for node in _filter_dissolved(nodes):
+        kind = node.get("type")
+        if kind == "attribute":
+            leaves += 1
+        elif kind == "aggregation":
+            aggs += 1
+    return leaves, aggs
+def concept_aligned_pct(nodes: list) -> float | None:
+    """% of aggregation nodes that carry a concept/provenance label (Approach 1).
+
+    A node counts as aligned when any of its provenance-style fields is
+    truthy.  Returns ``None`` when there are no visible aggregation nodes or
+    when none of them is aligned.
+    """
+    provenance_keys = (
+        "concept_provenance",
+        "label_provenance",
+        "structure_provenance",
+        "provenance",
+        "concept",
+        "source_evidence",
+    )
+    aggregations = [node for node in _filter_dissolved(nodes)
+                    if node.get("type") == "aggregation"]
+    if not aggregations:
+        return None
+    aligned = sum(1 for node in aggregations
+                  if any(node.get(key) for key in provenance_keys))
+    if not aligned:
+        return None
+    return 100.0 * aligned / len(aggregations)
+def label_source_counts(nodes: list) -> Counter:
+    """Count visible aggregation nodes by the source of their label.
+
+    The source is read from ``label_provenance`` (falling back to
+    ``concept_provenance``): its ``label_source`` or ``source`` entry.  Nodes
+    with a provenance record but no source are counted as "other"; nodes with
+    no record at all as "not recorded".
+    """
+    tally = Counter()
+    aggregations = (node for node in _filter_dissolved(nodes)
+                    if node.get("type") == "aggregation")
+    for node in aggregations:
+        prov = node.get("label_provenance") or node.get("concept_provenance") or {}
+        source = prov.get("label_source") or prov.get("source")
+        if not source:
+            source = "other" if prov else "not recorded"
+        tally[str(source)] += 1
+    return tally
+def structure_route_counts(nodes: list) -> Counter:
+    """Count visible aggregation nodes by the route recorded in
+    ``structure_provenance``.
+
+    The route is the first truthy value among ``route``, ``route_used`` and
+    ``aspect_method``.  Nodes with a provenance record but no route are
+    counted as "other"; nodes with no record at all as "not recorded".
+    """
+    tally = Counter()
+    aggregations = (node for node in _filter_dissolved(nodes)
+                    if node.get("type") == "aggregation")
+    for node in aggregations:
+        prov = node.get("structure_provenance") or {}
+        route = prov.get("route") or prov.get("route_used") or prov.get("aspect_method")
+        if not route:
+            route = "other" if prov else "not recorded"
+        tally[str(route)] += 1
+    return tally
+@st.cache_data(show_spinner=False)
+def _leaf_counts_by_dataset(dataset_name: str) -> dict:
+    """Count visible leaf variables per method for one dataset.
+
+    Looks up every ``PREBUILT`` entry's hierarchy file for *dataset_name* and
+    skips entries whose file is not configured or does not exist.
+
+    Returns ``{PREBUILT key: leaf count}``.
+    """
+    totals = {}
+    for method_key, per_dataset in PREBUILT.items():
+        hierarchy_path = per_dataset.get(dataset_name, {}).get("hierarchy")
+        if not hierarchy_path or not hierarchy_path.exists():
+            continue
+        visible = _filter_dissolved(_load_json(str(hierarchy_path)))
+        totals[method_key] = sum(1 for node in visible
+                                 if node.get("type") == "attribute")
+    return totals
+def output_manifest(paths: dict, nodes: list) -> list[dict]:
+    """Describe which artifacts the loaded output provides.
+
+    Args:
+        paths: file paths for the selected output; only ``facets`` is read.
+        nodes: hierarchy node dicts exactly as loaded (no filtering applied).
+
+    Returns:
+        One row per artifact with ``Artifact``, ``Status`` and ``Purpose`` keys.
+    """
+    def _meta(node) -> dict:
+        return node.get("metadata") or {}
+    leaf_nodes = [node for node in nodes if node.get("type") == "attribute"]
+    agg_nodes = [node for node in nodes if node.get("type") == "aggregation"]
+    n_leaves = len(leaf_nodes)
+    with_leaf_id = sum(1 for node in leaf_nodes if _meta(node).get("leaf_id"))
+    with_row_index = sum(1 for node in leaf_nodes
+                         if _meta(node).get("row_index") is not None)
+    label_prov = any(node.get("label_provenance") or node.get("concept_provenance")
+                     for node in agg_nodes)
+    struct_prov = any(node.get("structure_provenance") for node in agg_nodes)
+    # Stable leaf ids take precedence over bare row indices.
+    if with_leaf_id:
+        leaf_status = f"present for {with_leaf_id}/{n_leaves} leaves"
+    elif with_row_index:
+        leaf_status = f"row-index only for {with_row_index}/{n_leaves} leaves"
+    else:
+        leaf_status = "not recorded"
+    rows = [
+        ("Hierarchy JSON", "present",
+         "Tree topology, labels, leaf metadata"),
+        ("Stable leaf IDs", leaf_status,
+         "Cross-method matching and downstream evaluation scripts"),
+        ("Facet JSON", "present" if paths.get("facets") else "not applicable",
+         "Approach 1 parallel Castanet views"),
+        ("Label provenance", "present" if label_prov else "not recorded",
+         "Audit how internal labels were produced"),
+        ("Structure provenance", "present" if struct_prov else "not recorded",
+         "Audit how internal branches were produced"),
+        ("Canonical CSV", "not bundled in demo outputs",
+         "Available by rerunning a builder and exporting"),
+        ("Manual reference taxonomy", "not available",
+         "No accuracy claim is made"),
+    ]
+    return [{"Artifact": artifact, "Status": status, "Purpose": purpose}
+            for artifact, status, purpose in rows]
+# ─────────────────────────────────────────────────────────────────────────────
+# SIDEBAR
+# ─────────────────────────────────────────────────────────────────────────────
+# Sidebar holds only context + repo link — selection lives in the main area.
+with st.sidebar:
+    # Short note on where the displayed results come from and how to run new data.
+    st.caption("Results are pre-built from the thesis experiments. To run on your "
+               "own data, open a Build page and upload a CSV.")
+    # Markdown link to the project repository.
+    st.markdown("[GitHub Repository]"
+                "(https://github.com/RoophaSharon/tfm_metadata_hierarchy_2026)")
+# ─────────────────────────────────────────────────────────────────────────────
+# MAIN
+# ─────────────────────────────────────────────────────────────────────────────
+# Method + dataset selection sit in the MAIN area (professor's UX note:
+# configuration belongs front-and-centre, not buried in the sidebar).
+title_options = [methods.title(k) for k in methods.METHOD_ORDER]
+sc1, sc2 = st.columns([3, 2])
+with sc1:
+    selected_title = st.selectbox("Select method", title_options, index=0)
+with sc2:
+    dataset = st.radio("Select dataset", ["AI-MIND", "HCP"], index=0, horizontal=True)
+# Translate the displayed title back to the method key used by METHODS / PREBUILT.
+approach = methods.TITLE_TO_KEY[selected_title]
+cfg = methods.METHODS[approach]
+color = cfg["color"]
+st.title(f"{cfg['title']} — {dataset}")
+paths = PREBUILT[approach][dataset]
+hier_path = paths.get("hierarchy")
+# Without a hierarchy file there is nothing to show: report it and end this run.
+if hier_path is None or not hier_path.exists():
+    st.error(f"Pre-built result not found: `{hier_path}`")
+    st.stop()
+raw_nodes = _load_json(str(hier_path))
+leaves, aggs = count_nodes(raw_nodes)
+c1, c2, c3 = st.columns(3)
+c1.metric("Leaf Variables", leaves)
+c2.metric("Aggregation Nodes", aggs)
+c3.metric("Total Nodes", leaves + aggs)
+# Coverage check: compare this output's leaf count with the largest leaf count
+# any method has for the same dataset, and warn when this one is smaller.
+dataset_counts = _leaf_counts_by_dataset(dataset)
+max_leaves = max(dataset_counts.values(), default=leaves)
+if leaves < max_leaves:
+    st.warning(
+        f"This pre-built {cfg['title']} result contains {leaves}/{max_leaves} "
+        f"{dataset} variables. Treat cross-method comparisons for this dataset "
+        "as coverage-aware unless the output is regenerated with the same row cap."
+    )
+# ── Build summary (collapsed) ────────────────────────────────────────────────
+facet_path = paths.get("facets")
+# Number of facets in the facet file, or None when there is no readable file.
+n_facets = None
+if facet_path is not None and facet_path.exists():
+    try:
+        n_facets = len(_load_json(str(facet_path)))
+    except Exception:
+        # Best effort: a facet file that cannot be read only blanks the metric.
+        n_facets = None
+with st.expander("Build summary", expanded=False):
+    bs1, bs2, bs3, bs4 = st.columns(4)
+    bs1.metric("Variables", leaves)
+    bs2.metric("Internal nodes", aggs)
+    bs3.metric("Tree depth", _tree_depth(raw_nodes))
+    bs4.metric("Facets", n_facets if n_facets is not None else "—")
+    # concept_aligned_pct returns None when there is nothing to report.
+    pct = concept_aligned_pct(raw_nodes)
+    if pct is not None:
+        st.caption(f"Concept-aligned aggregation nodes: **{pct:.1f}%**")
+    st.caption(
+        f"Source file: `{hier_path.name}` · "
+        f"Approach: **{approach}** · Dataset: **{dataset}**. "
+        "Tree topology and labels are reproduced exactly from the pre-built "
+        "thesis output (the algorithms are not re-run in this viewer)."
+    )
+    # Table of which artifacts this output provides.
+    st.dataframe(output_manifest(paths, raw_nodes), width="stretch", hide_index=True)
+# ── Provenance / traceability (method-aware) ─────────────────────────────────
+# Approach 2 records the richest provenance; Approach 1 records concept-alignment
+# coverage; the Baseline records none — and that contrast is itself a finding.
+with st.expander("Label & structure provenance", expanded=False):
+    if approach == "Baseline":
+        st.caption(
+            "The baseline records **no provenance**: node labels are unsupervised "
+            "contrastive terms derived from each cluster, not traceable to a concept "
+            "source or a generation route. Traceability increases across the methods "
+            "(Baseline → Approach 1 → Approach 2) — itself a comparison point."
+        )
+    else:
+        pct = concept_aligned_pct(raw_nodes)
+        if pct is not None:
+            st.caption(f"Aggregation nodes carrying a concept / label source: **{pct:.1f}%**")
+        label_counts = label_source_counts(raw_nodes)
+        route_counts = structure_route_counts(raw_nodes)
+        # Show the two tables only when at least one node has something other
+        # than the "not recorded" placeholder.
+        has_detail = (any(k != "not recorded" for k in label_counts)
+                      or any(k != "not recorded" for k in route_counts))
+        if has_detail:
+            pc1, pc2 = st.columns(2)
+            with pc1:
+                st.markdown("**Label sources**")
+                st.dataframe(
+                    [{"Source": k, "Nodes": v} for k, v in label_counts.most_common()],
+                    width="stretch", hide_index=True,
+                )
+            with pc2:
+                st.markdown("**Structure routes**")
+                st.dataframe(
+                    [{"Route": k, "Nodes": v} for k, v in route_counts.most_common()],
+                    width="stretch", hide_index=True,
+                )
+        elif pct is None:
+            # Neither a percentage nor any detail was found.
+            st.info("No provenance fields were recorded in this output JSON.")
+# ── Downloads ────────────────────────────────────────────────────────────────
+d1, d2 = st.columns(2)
+with d1:
+    # The hierarchy file exactly as stored on disk.
+    st.download_button("Hierarchy JSON", data=_read_bytes(str(hier_path)),
+                       file_name=hier_path.name, mime="application/json",
+                       width='stretch')
+with d2:
+    # Every file under ROOT, zipped by _outputs_zip.
+    st.download_button("All outputs (ZIP)", data=_outputs_zip(str(ROOT)),
+                       file_name="metadata_hierarchy_outputs.zip",
+                       mime="application/zip", width='stretch')
+st.markdown("---")
+# ── Level-of-Detail controls (above chart — matches the apps) ────────────────
+view_options = ["Sunburst (drill-down)", "Treemap"]
+# The node-link view is offered only for methods whose config enables it.
+if cfg["node_link"]:
+    view_options.append("Node-link tree")
+# A fifth column (the "Compress chains" checkbox) exists only for methods
+# whose config enables chain compression.
+if cfg["compress"]:
+    vc1, vc2, vc3, vc4, vc5 = st.columns([2.4, 2, 1, 1, 1.2])
+else:
+    vc1, vc2, vc3, vc4 = st.columns([2.4, 2, 1, 1])
+    vc5 = None
+with vc1:
+    viz_mode = st.radio("View mode", view_options, horizontal=True, index=0,
+                        help="Sunburst best for large hierarchies [Taxonomizer]. "
+                             "Node-link best for moderate-depth structure inspection.")
+with vc2:
+    depth = st.slider("Depth (Level of Detail)", 1, 9, DEFAULT_DEPTH, 1,
+                      help="Maximum tree levels shown. Set high to see the whole "
+                           "hierarchy, lower to peel back to the interior.")
+with vc3:
+    show_leaf_labels = st.checkbox("Leaf labels", value=False)
+with vc4:
+    show_hidden = st.checkbox("Hidden nodes", value=False)
+if vc5 is not None:
+    with vc5:
+        compress_chains = st.checkbox("Compress chains", value=True,
+                                      help="Merge one-child aggregation chains "
+                                           '(e.g. "DMS → DMS Recommended Standard") for '
+                                           "display. Export JSON keeps original structure.")
+else:
+    compress_chains = False
+st.divider()
+# Compression changes only what is drawn; raw_nodes itself is left as loaded.
+display_nodes = compress_one_child_chains(raw_nodes) if compress_chains else raw_nodes
+# Sunburst / treemap may get a lower initial depth on large hierarchies;
+# the node-link view below uses the slider value unchanged.
+render_depth = safe_render_depth(display_nodes, depth)
+if render_depth < depth and viz_mode in {"Sunburst (drill-down)", "Treemap"}:
+    st.caption(
+        f"Initial render capped at depth {render_depth} for performance; "
+        "the chart remains drillable."
+    )
+if viz_mode == "Sunburst (drill-down)":
+    st.plotly_chart(plot_sunburst(display_nodes, color, render_depth), width='stretch')
+elif viz_mode == "Treemap":
+    st.plotly_chart(plot_treemap(display_nodes, color, render_depth), width='stretch')
+else:
+    st.plotly_chart(plot_node_link(display_nodes, depth, show_hidden, show_leaf_labels),
+                    width='stretch')
+# ── Facets (Approach 1 only) ─────────────────────────────────────────────────
+# Shown only when the selected output ships a facet file.  The file is read as
+# a mapping of facet name -> node list, each drawn with the same plot helpers
+# as the main hierarchy.
+if facet_path is not None and facet_path.exists():
+    st.markdown("---")
+    st.subheader("Parallel facets")
+    facets = _load_json(str(facet_path))
+    names = list(facets.keys())
+    if not names:
+        st.info("No facets available for this dataset.")
+    else:
+        sel = st.selectbox("Select facet", names)
+        fnodes = facets[sel]
+        # Both tabs follow the depth slider and get the same large-hierarchy
+        # cap as the main chart, so a big facet cannot render blank and the
+        # two views of one facet always show the same number of levels.
+        facet_depth = safe_render_depth(fnodes, depth)
+        ft1, ft2 = st.tabs(["Sunburst", "Treemap"])
+        with ft1:
+            st.plotly_chart(plot_sunburst(fnodes, color, facet_depth), width='stretch')
+        with ft2:
+            st.plotly_chart(plot_treemap(fnodes, color, facet_depth), width='stretch')

views/run_baseline.py CHANGED Viewed

@@ -5,7 +5,7 @@
 #
 # Pipeline:
 #   1. Load metadata file (CSV / TSV / XLSX / JSON)
-#   2. Detect column roles (leaf / group / text / meta) — same as Approach 1 / 2
 #   3. Build canonical schema (incl. _semantic_text = description values only)
 #   4. Embed each variable (code + description) via Word2Vec skip-gram and build
 #      the cosine-distance semantic space [TAX §3.2]
@@ -28,7 +28,7 @@
 #        the bare code goes out-of-vocabulary (a limitation the paper flags,
 #        e.g. "BP").  Taxonomizer embeds the NAME ("a few words"), not a
 #        paragraph; using the short name (not the full description prose) keeps
-#        task-distinctive words from being diluted by shared explanatory text.
 #     3. Fully-automatic labels — the paper's labelling is semi-automatic
 #        (human picks from suggestions); a baseline must be non-interactive, so
 #        we use data-driven contrastive terms from each cluster's members.
@@ -186,7 +186,7 @@ def detect_roles(df: pd.DataFrame) -> tuple:
     meta  = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
              .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
     # Representation columns (decimal/precision/unit/type/format/…) must never
-    # become structural levels — force them out of group and into metadata. [GON][TAX]
     _META_SUBSTR_BLOCK = {
         'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
         'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
@@ -306,9 +306,9 @@ def attribute_name(text: str) -> str:
     paragraph.  Descriptions here are formatted '<name>: <full sentence>' (some
     prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
     that is not a pure all-caps marker.  Embedding this short name — rather than
-    the full description prose — keeps the task-distinctive words from being
-    diluted by shared explanatory text, so the taxonomy groups far more by theme
-    (e.g. DMS / PAL / SWM) without ever touching the group column.
     """
     text = str(text)
     for clause in re.split(r'[:\n]', text):
@@ -470,7 +470,7 @@ def build_hierarchy(can: pd.DataFrame, w2v_model, project: str = 'project',
     average) — the name clause of the description, as Taxonomizer specifies.
     Recursively clusters via balanced Ward linkage — the semantic-space
     dendrogram.  Labels each internal node with the contrastive content terms of
-    its members (data-driven, fully automatic).  No group column, no hardcoding.
     """
     # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
     nodes: list = [{'id': 0, 'name': project, 'type': 'root',
@@ -807,8 +807,8 @@ with st.sidebar:
     max_items     = st.slider('Maximum variables', 25, 1200, 900, 25,
                               help='Cap on variables included (lower only to speed up very large files). '
                                    'Default keeps full datasets like HCP (813).')
-    group_filter  = st.text_input('Group filter (optional)', value='',
-                                  help='Filter rows whose group path contains this text')
 # ─────────────────────────────────────────────────────────────────────────────
 # MAIN
@@ -829,8 +829,9 @@ if not uploaded:
     | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
     | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* |
-    The group column is **not** used for construction, so the recovered taxonomy
-    can be fairly evaluated against it (NMI / ARI / Purity in the Evaluation tab).
     **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
@@ -853,7 +854,7 @@ st.subheader('Step 1 — File preview')
 with st.expander(f'{uploaded.name}  ({len(df):,} rows, {len(df.columns)} columns)',
                  expanded=False):
     st.dataframe(df.head(10), width='stretch')
-    score_cols = [c for c in ['column', 'leaf_score', 'group_score', 'text_score', 'metadata_score']
                   if c in prof.columns]
     st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
                  width='stretch')
@@ -869,8 +870,9 @@ with st.expander('Column configuration', expanded=True):
     with left:
         leaf_cols = st.multiselect('Leaf variable column(s)', cols,
             default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
-        group_cols = st.multiselect('Group/task column(s)', cols,
-            default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}')
     with right:
         text_cols = st.multiselect('Text/description column(s)', cols,
             default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
@@ -982,11 +984,11 @@ with tabs[1]:
                             for i in lids if i in nm and 'metadata' in nm[i]}
             sub = can[can['_leaf_id'].isin(leaf_ids_set)]
             st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
-            st.dataframe(sub[['_leaf_label', '_group_path', '_text']].reset_index(drop=True),
                          width='stretch')
 with tabs[2]:
-    st.dataframe(can, width='stretch')
 with tabs[3]:
     _base = safe_name(project_name)
@@ -1035,9 +1037,9 @@ with tabs[4]:
     st.subheader('Hierarchy Quality Evaluation')
     st.caption(
-        'The group column is a *construction input* (Gonçalves text object), so it '
-        'cannot serve as ground truth. The primary metrics below are **reference-free** '
-        '— they assess the hierarchy itself, with no gold standard.'
     )
     with st.spinner('Computing reference-free metrics…'):
@@ -1082,14 +1084,3 @@ with tabs[4]:
     s5.metric('Singleton nodes',   f"{sm['singleton_nodes_%']}%",
               help='Aggregation nodes with a single child (sparse-hierarchy indicator)')
-    # ── Held-out group recovery (VALID — group column not used in construction) ─
-    st.markdown('#### Held-out group recovery *(valid — group column not used)*')
-    st.caption(
-        'The baseline never uses the group column (it embeds only attribute '
-        'names), so this is a **valid held-out** recovery score. ARI and AMI are '
-        'chance-corrected; NMI and Purity are omitted as inflated by over-splitting.'
-    )
-    gp = he.group_preservation(nodes, can)
-    g1, g2 = st.columns(2)
-    g1.metric('ARI', gp['ARI'], help='Adjusted Rand Index (chance-corrected).')
-    g2.metric('AMI', gp['AMI'], help='Adjusted Mutual Information (chance-corrected).')

 #
 # Pipeline:
 #   1. Load metadata file (CSV / TSV / XLSX / JSON)
+#   2. Detect column roles (leaf / context / text / meta) — same as Approach 1 / 2
 #   3. Build canonical schema (incl. _semantic_text = description values only)
 #   4. Embed each variable (code + description) via Word2Vec skip-gram and build
 #      the cosine-distance semantic space [TAX §3.2]
 #        the bare code goes out-of-vocabulary (a limitation the paper flags,
 #        e.g. "BP").  Taxonomizer embeds the NAME ("a few words"), not a
 #        paragraph; using the short name (not the full description prose) keeps
+#        domain-specific words from being diluted by shared explanatory text.
 #     3. Fully-automatic labels — the paper's labelling is semi-automatic
 #        (human picks from suggestions); a baseline must be non-interactive, so
 #        we use data-driven contrastive terms from each cluster's members.
     meta  = (prof[(prof.metadata_score >= 4) & (~prof.column.isin(text + leaf + group))]
              .sort_values('metadata_score', ascending=False)['column'].head(5).tolist())
     # Representation columns (decimal/precision/unit/type/format/…) must never
+    # become structural levels; prefer them as metadata. [GON][TAX]
     _META_SUBSTR_BLOCK = {
         'decimal', 'precision', 'unit', 'dtype', 'type', 'format', 'scale',
         'values', 'range', 'min', 'max', 'coding', 'codebook', 'missing',
     paragraph.  Descriptions here are formatted '<name>: <full sentence>' (some
     prefixed with a marker like 'KEY: <name>: …'), so we take the first clause
     that is not a pure all-caps marker.  Embedding this short name — rather than
+    the full description prose — keeps the domain-specific words from being
+    diluted by shared explanatory text, so the taxonomy clusters more by theme
+    (e.g. DMS / PAL / SWM).
     """
     text = str(text)
     for clause in re.split(r'[:\n]', text):
     average) — the name clause of the description, as Taxonomizer specifies.
     Recursively clusters via balanced Ward linkage — the semantic-space
     dendrogram.  Labels each internal node with the contrastive content terms of
+    its members (data-driven, fully automatic). No hardcoding.
     """
     # ── leaf attribute nodes (ids 1..N) ──────────────────────────────────────
     nodes: list = [{'id': 0, 'name': project, 'type': 'root',
     max_items     = st.slider('Maximum variables', 25, 1200, 900, 25,
                               help='Cap on variables included (lower only to speed up very large files). '
                                    'Default keeps full datasets like HCP (813).')
+    group_filter  = st.text_input('Row filter (optional)', value='',
+                                  help='Filter rows by contextual path text before building')
 # ─────────────────────────────────────────────────────────────────────────────
 # MAIN
     | Hierarchy construction | Agglomerative clustering (cosine, average-linkage), k by silhouette → dendrogram | Taxonomizer §4.2 |
     | Internal node labelling | **Data-driven contrastive terms** (paper's labelling is semi-automatic) | Taxonomizer §4.3 *(adapted)* |
+    This page is the pure Taxonomizer-style semantic-space reference method:
+    variable meanings are embedded and recursively clustered into a hierarchy,
+    with node labels generated from contrastive terms.
     **Approach 1** adds SBERT embeddings + Wikidata/BioPortal enrichment + HiExpan refinement.
 with st.expander(f'{uploaded.name}  ({len(df):,} rows, {len(df.columns)} columns)',
                  expanded=False):
     st.dataframe(df.head(10), width='stretch')
+    score_cols = [c for c in ['column', 'leaf_score', 'text_score', 'metadata_score']
                   if c in prof.columns]
     st.dataframe(prof[score_cols].sort_values('leaf_score', ascending=False),
                  width='stretch')
     with left:
         leaf_cols = st.multiselect('Leaf variable column(s)', cols,
             default=[c for c in auto_cfg.get('leaf_cols', []) if c in cols], key=f'leaf_{_fk}')
+        group_cols = st.multiselect('Context column(s) (optional)', cols,
+            default=[c for c in auto_cfg.get('group_cols', []) if c in cols], key=f'group_{_fk}',
+            help='Optional contextual columns for display/filtering.')
     with right:
         text_cols = st.multiselect('Text/description column(s)', cols,
             default=[c for c in auto_cfg.get('text_cols', []) if c in cols], key=f'text_{_fk}')
                             for i in lids if i in nm and 'metadata' in nm[i]}
             sub = can[can['_leaf_id'].isin(leaf_ids_set)]
             st.write(f'**{len(lids)} variables** under "{sel_node["name"]}"')
+            st.dataframe(sub[['_leaf_label', '_text']].reset_index(drop=True),
                          width='stretch')
 with tabs[2]:
+    st.dataframe(can.drop(columns=['_group_path'], errors='ignore'), width='stretch')
 with tabs[3]:
     _base = safe_name(project_name)
     st.subheader('Hierarchy Quality Evaluation')
     st.caption(
+        'No manually curated reference taxonomy is available for this experiment. '
+        'The metrics below are reference-free: they assess hierarchy structure, '
+        'label coherence and interpretability directly.'
     )
     with st.spinner('Computing reference-free metrics…'):
     s5.metric('Singleton nodes',   f"{sm['singleton_nodes_%']}%",
               help='Aggregation nodes with a single child (sparse-hierarchy indicator)')