cy0307 committed 28 days ago

Commit

eea471e

verified ·

1 Parent(s): 4496d29

Publish Ropedia minimal task baseline weights

Browse files

Files changed (49) hide show

.gitattributes +1 -0
README.md +107 -0
artifacts/episode_task_suite/available_modalities.json +83 -0
artifacts/episode_task_suite/caption_grounding/metrics.json +15 -0
artifacts/episode_task_suite/caption_grounding/model.npz +3 -0
artifacts/episode_task_suite/contact_prediction/metrics.json +19 -0
artifacts/episode_task_suite/contact_prediction/model.npz +3 -0
artifacts/episode_task_suite/cross_modal_retrieval/metrics.json +15 -0
artifacts/episode_task_suite/cross_modal_retrieval/model.npz +3 -0
artifacts/episode_task_suite/feature_manifest.json +104 -0
artifacts/episode_task_suite/hand_trajectory_forecast/metrics.json +15 -0
artifacts/episode_task_suite/misalignment_detection/metrics.json +19 -0
artifacts/episode_task_suite/misalignment_detection/model.npz +3 -0
artifacts/episode_task_suite/modality_reconstruction/metrics.json +12 -0
artifacts/episode_task_suite/next_action/metrics.json +24 -0
artifacts/episode_task_suite/next_action/model.npz +3 -0
artifacts/episode_task_suite/object_relevance/metrics.json +14 -0
artifacts/episode_task_suite/object_relevance/model.npz +3 -0
artifacts/episode_task_suite/temporal_order/metrics.json +19 -0
artifacts/episode_task_suite/temporal_order/model.npz +3 -0
artifacts/episode_task_suite/timeline_action/metrics.json +24 -0
artifacts/episode_task_suite/timeline_action/model.npz +3 -0
artifacts/episode_task_suite/timeline_subtask/metrics.json +24 -0
artifacts/episode_task_suite/timeline_subtask/model.npz +3 -0
artifacts/episode_task_suite/transition_detection/metrics.json +26 -0
artifacts/episode_task_suite/transition_detection/model.npz +3 -0
artifacts/min_action_model/metrics.json +11 -0
artifacts/min_action_model/model.npz +3 -0
artifacts/min_all_modalities_action_model/available_modalities.json +83 -0
artifacts/min_all_modalities_action_model/feature_manifest.json +104 -0
artifacts/min_all_modalities_action_model/metrics.json +13 -0
artifacts/min_all_modalities_action_model/model.npz +3 -0
artifacts/min_all_modalities_subtask_model/available_modalities.json +83 -0
artifacts/min_all_modalities_subtask_model/feature_manifest.json +104 -0
artifacts/min_all_modalities_subtask_model/metrics.json +13 -0
artifacts/min_all_modalities_subtask_model/model.npz +3 -0
artifacts/min_subtask_model/metrics.json +11 -0
artifacts/min_subtask_model/model.npz +3 -0
assets/task_architectures.svg +216 -0
assets/task_suite_infographic.png +3 -0
notes/all_modalities_model.md +148 -0
notes/episode_task_suite.md +176 -0
notes/min_action_model.md +85 -0
notes/reproducibility_audit.md +124 -0
scripts/episode_task_suite.py +776 -0
scripts/generate_visualizations.py +474 -0
scripts/render_task_suite_infographic.py +378 -0
scripts/train_all_modalities_model.py +582 -0
scripts/train_min_action_model.py +531 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/task_suite_infographic.png filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,107 @@

+---
+license: other
+library_name: numpy
+tags:
+  - robotics
+  - embodied-ai
+  - multimodal
+  - ropedia
+  - xperience-10m
+  - baseline
+  - linear-model
+  - retrieval
+metrics:
+  - accuracy
+  - f1
+  - mean-reciprocal-rank
+  - mean-squared-error
+model-index:
+  - name: Ropedia Minimal Task Baselines
+    results:
+      - task:
+          type: robotics
+          name: Cross-modal retrieval
+        dataset:
+          type: ropedia-ai/xperience-10m-sample
+          name: Xperience-10M public sample episode
+        metrics:
+          - type: top_5_accuracy
+            value: 0.3764
+            name: top-5 retrieval accuracy
+          - type: mrr
+            value: 0.2634
+            name: mean reciprocal rank
+      - task:
+          type: robotics
+          name: Transition detection
+        dataset:
+          type: ropedia-ai/xperience-10m-sample
+          name: Xperience-10M public sample episode
+        metrics:
+          - type: f1
+            value: 0.6552
+            name: macro-F1
+---
+# Ropedia Minimal Task Baselines
+This repo stores the minimal baseline weights and metrics for the 12-task Ropedia episode suite.
+These are intentionally small, transparent baselines:
+- z-score + linear softmax classifiers,
+- dual ridge regression/projection heads,
+- sigmoid multi-label logistic regression,
+- cosine ranking for retrieval tasks.
+They are not deep robot policies or foundation models. Their purpose is to make every input/output contract auditable before scaling to many episodes.
+## Included
+- `artifacts/**/model.npz`: minimal baseline weights, scalers, and labels
+- `artifacts/**/metrics.json`: committed metrics
+- `artifacts/**/feature_manifest.json`: feature block boundaries where relevant
+- `scripts/*.py`: training and visualization scripts
+- `notes/*.md`: interpretation and reproducibility notes
+The companion artifact dataset repo stores CSV/JSON predictions and dashboard assets:
+https://huggingface.co/datasets/cy0307/ropedia-episode-task-suite-artifacts
+The public visual dashboard is here:
+https://huggingface.co/spaces/cy0307/ropedia-episode-task-suite
+## Minimal Architecture
+![Minimal 12-task architecture](assets/task_architectures.svg)
+## Metrics Snapshot
+| Task | Minimal head | Main metric |
+| --- | --- | ---: |
+| `timeline_action` | linear softmax | 0.0500 macro-F1 |
+| `timeline_subtask` | linear softmax | 0.0495 macro-F1 |
+| `transition_detection` | linear softmax | 0.6552 macro-F1 |
+| `next_action` | linear softmax | 0.0593 macro-F1 |
+| `hand_trajectory_forecast` | ridge regression | 0.8223 MPJPE |
+| `contact_prediction` | linear softmax | 1.0000 macro-F1 (degenerate: only one class present) |
+| `object_relevance` | multi-label logistic | 0.1839 micro-F1 |
+| `caption_grounding` | ridge + cosine rank | 0.0172 MRR |
+| `cross_modal_retrieval` | ridge + cosine rank | 0.3764 top-5 |
+| `modality_reconstruction` | ridge regression | -0.0160 R2 |
+| `temporal_order` | binary softmax | 0.5487 F1 |
+| `misalignment_detection` | binary softmax | 0.4866 F1 |
+## Data Notice
+This repo does not redistribute raw Ropedia videos or raw `annotation.hdf5`. Download the original sample from Ropedia / Hugging Face and follow the dataset terms:
+- https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample
+- https://ropedia.com/dataset
+## Source
+GitHub:
+https://github.com/ChaoYue0307/ropedia-episode-task-suite

artifacts/episode_task_suite/available_modalities.json ADDED Viewed

	@@ -0,0 +1,83 @@

+[
+  {
+    "modality": "depth_confidence",
+    "shape": [
+      5821,
+      140
+    ]
+  },
+  {
+    "modality": "video/fisheye_cam0",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam1",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam2",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam3",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/stereo_left",
+    "path": "data/sample/xperience-10m-sample/stereo_left.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/stereo_right",
+    "path": "data/sample/xperience-10m-sample/stereo_right.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "caption_text",
+    "shape": [
+      5821,
+      128
+    ],
+    "fields": "objects,interaction"
+  },
+  {
+    "modality": "slam_point_cloud_static",
+    "shape": [
+      22
+    ]
+  },
+  {
+    "modality": "calibration_static",
+    "shape": [
+      117
+    ]
+  }
+]

artifacts/episode_task_suite/caption_grounding/metrics.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "mrr": 0.017183946083791223,
+  "median_rank": 167.0,
+  "mean_rank": 174.39367816091954,
+  "num_queries": 348,
+  "top1_accuracy": 0.0028735632183908046,
+  "top5_accuracy": 0.011494252873563218,
+  "top10_accuracy": 0.017241379310344827,
+  "task": "caption_grounding",
+  "input": "caption objects/interaction text query + candidate sensor windows",
+  "output": "matching time window",
+  "split": "chronological",
+  "num_train_windows": 813,
+  "num_test_windows": 348
+}

artifacts/episode_task_suite/caption_grounding/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:032da1fd5b5142b449e758a13bf5a450bb9ac22afde032bebf194987f97c1341
+size 14459176

artifacts/episode_task_suite/contact_prediction/metrics.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "accuracy": 1.0,
+  "balanced_accuracy": 1.0,
+  "macro_f1": 1.0,
+  "weighted_f1": 1.0,
+  "num_eval_windows": 348,
+  "num_classes": 1,
+  "task": "contact_prediction",
+  "input": "all non-contact/non-caption-label modalities -> any body contact",
+  "split": "chronological",
+  "num_windows": 1161,
+  "num_train_windows": 813,
+  "num_test_windows": 348,
+  "feature_dim": 7335,
+  "majority_baseline_accuracy": 1.0,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.0005947681493125856,
+  "unseen_test_classes": []
+}

artifacts/episode_task_suite/contact_prediction/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:050d2139076c55b251c2c23b62d6c58023cc7fb1c0431ded6795e775c9300a7b
+size 82797

artifacts/episode_task_suite/cross_modal_retrieval/metrics.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "mrr": 0.26335984006618296,
+  "median_rank": 12.5,
+  "mean_rank": 43.33045977011494,
+  "num_queries": 348,
+  "top1_accuracy": 0.14942528735632185,
+  "top5_accuracy": 0.3764367816091954,
+  "top10_accuracy": 0.47413793103448276,
+  "task": "cross_modal_retrieval",
+  "input": "motion/IMU/camera query",
+  "output": "matching depth/video window",
+  "split": "chronological",
+  "num_train_windows": 813,
+  "num_test_windows": 348
+}

artifacts/episode_task_suite/cross_modal_retrieval/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc5b2d0bc4350c4348be1e6098f9793a8ed5e479bad9ee20351bf2991c71347a
+size 41310574

artifacts/episode_task_suite/feature_manifest.json ADDED Viewed

	@@ -0,0 +1,104 @@

+[
+  {
+    "name": "hand_left_joints",
+    "start": 0,
+    "end": 441,
+    "dim": 441
+  },
+  {
+    "name": "hand_right_joints",
+    "start": 441,
+    "end": 882,
+    "dim": 441
+  },
+  {
+    "name": "body_joints",
+    "start": 882,
+    "end": 1974,
+    "dim": 1092
+  },
+  {
+    "name": "body_contacts",
+    "start": 1974,
+    "end": 2121,
+    "dim": 147
+  },
+  {
+    "name": "camera_translation",
+    "start": 2121,
+    "end": 2142,
+    "dim": 21
+  },
+  {
+    "name": "camera_rotation_matrix",
+    "start": 2142,
+    "end": 2205,
+    "dim": 63
+  },
+  {
+    "name": "imu_accel_gyro",
+    "start": 2205,
+    "end": 2247,
+    "dim": 42
+  },
+  {
+    "name": "depth_confidence",
+    "start": 2247,
+    "end": 3227,
+    "dim": 980
+  },
+  {
+    "name": "video_fisheye_cam0",
+    "start": 3227,
+    "end": 3913,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam1",
+    "start": 3913,
+    "end": 4599,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam2",
+    "start": 4599,
+    "end": 5285,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam3",
+    "start": 5285,
+    "end": 5971,
+    "dim": 686
+  },
+  {
+    "name": "video_stereo_left",
+    "start": 5971,
+    "end": 6657,
+    "dim": 686
+  },
+  {
+    "name": "video_stereo_right",
+    "start": 6657,
+    "end": 7343,
+    "dim": 686
+  },
+  {
+    "name": "caption_objects_interaction_text",
+    "start": 7343,
+    "end": 8239,
+    "dim": 896
+  },
+  {
+    "name": "slam_point_cloud",
+    "start": 8239,
+    "end": 8261,
+    "dim": 22
+  },
+  {
+    "name": "calibration",
+    "start": 8261,
+    "end": 8378,
+    "dim": 117
+  }
+]

artifacts/episode_task_suite/hand_trajectory_forecast/metrics.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "mse": 11.323140144348145,
+  "mae": 0.40246668457984924,
+  "r2": -1334.788993815828,
+  "task": "hand_trajectory_forecast",
+  "input": "all modalities at t -> future left/right hand 3D joints",
+  "split": "chronological",
+  "num_windows": 1159,
+  "num_train_windows": 811,
+  "num_test_windows": 348,
+  "forecast_frames": 10,
+  "mpjpe": 0.8222644925117493,
+  "final_frame_mpjpe": 1.0649521350860596,
+  "target_dim": 1260
+}

artifacts/episode_task_suite/misalignment_detection/metrics.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "accuracy": 0.5028901734104047,
+  "precision": 0.5030864197530864,
+  "recall": 0.47109826589595377,
+  "f1": 0.4865671641791045,
+  "tp": 163,
+  "tn": 185,
+  "fp": 161,
+  "fn": 183,
+  "positive_rate_true": 0.5,
+  "positive_rate_pred": 0.4682080924855491,
+  "task": "misalignment_detection",
+  "input": "motion+visual pair -> aligned vs shifted by 8 windows",
+  "split": "chronological",
+  "num_samples": 2306,
+  "num_train_samples": 1614,
+  "num_test_samples": 692,
+  "train_final_accuracy": 0.5018587360594795
+}

artifacts/episode_task_suite/misalignment_detection/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:375daf8e2d5e8e926970c457eff3c48ab402608c02cc564135f367b133609063
+size 110186

artifacts/episode_task_suite/modality_reconstruction/metrics.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "mse": 1359.1639404296875,
+  "mae": 0.31084805727005005,
+  "r2": -0.016022846771134747,
+  "task": "modality_reconstruction",
+  "input": "motion/IMU/camera",
+  "output": "depth/video feature vector",
+  "split": "chronological",
+  "num_train_windows": 813,
+  "num_test_windows": 348,
+  "target_dim": 5096
+}

artifacts/episode_task_suite/next_action/metrics.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "accuracy": 0.034482758620689655,
+  "balanced_accuracy": 0.04,
+  "macro_f1": 0.05925925925925927,
+  "weighted_f1": 0.05108556832694764,
+  "num_eval_windows": 348,
+  "num_classes": 18,
+  "task": "next_action",
+  "input": "all modalities at t -> action at t+20 frames",
+  "split": "chronological",
+  "num_windows": 1161,
+  "num_train_windows": 813,
+  "num_test_windows": 348,
+  "feature_dim": 8378,
+  "majority_baseline_accuracy": 0.0,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.017629079520702362,
+  "unseen_test_classes": [
+    "Place item on table",
+    "Pour coffee",
+    "Pour milk into coffee",
+    "Wait/Prepare for pouring"
+  ]
+}

artifacts/episode_task_suite/next_action/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fcfa0e624694a7b07fecac33d9385c54f5aeb1faf4517d11fbf6db3b973292d
+size 620530

artifacts/episode_task_suite/object_relevance/metrics.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "micro_f1": 0.18393030009680542,
+  "macro_f1": 0.06427052187996415,
+  "exact_match": 0.005747126436781609,
+  "precision": 0.16360505166475317,
+  "recall": 0.21002210759027265,
+  "task": "object_relevance",
+  "input": "all non-caption modalities -> current relevant object set",
+  "split": "chronological",
+  "num_windows": 1161,
+  "num_train_windows": 813,
+  "num_test_windows": 348,
+  "num_objects": 34
+}

artifacts/episode_task_suite/object_relevance/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aca088062b23a8fa8b05b261cf698c50c00b11be238eb4b9260f7609da70ff11
+size 1002718

artifacts/episode_task_suite/temporal_order/metrics.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "accuracy": 0.46120689655172414,
+  "precision": 0.4720496894409938,
+  "recall": 0.6551724137931034,
+  "f1": 0.5487364620938628,
+  "tp": 228,
+  "tn": 93,
+  "fp": 255,
+  "fn": 120,
+  "positive_rate_true": 0.5,
+  "positive_rate_pred": 0.6939655172413793,
+  "task": "temporal_order",
+  "input": "two adjacent windows -> whether order is correct",
+  "split": "chronological",
+  "num_samples": 2320,
+  "num_train_samples": 1624,
+  "num_test_samples": 696,
+  "train_final_accuracy": 0.5104679802955665
+}

artifacts/episode_task_suite/temporal_order/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04330ca7fe354ecb592f366d27764a538e2b51fd6d23f66d618ea86d33c34f4e
+size 335170

artifacts/episode_task_suite/timeline_action/metrics.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "accuracy": 0.029154518950437316,
+  "balanced_accuracy": 0.03125,
+  "macro_f1": 0.05,
+  "weighted_f1": 0.04664723032069971,
+  "num_eval_windows": 343,
+  "num_classes": 18,
+  "task": "timeline_action",
+  "input": "all modalities -> current action label",
+  "split": "chronological",
+  "num_windows": 1144,
+  "num_train_windows": 801,
+  "num_test_windows": 343,
+  "feature_dim": 8378,
+  "majority_baseline_accuracy": 0.0,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.01664665900170803,
+  "unseen_test_classes": [
+    "Place item on table",
+    "Pour coffee",
+    "Pour milk into coffee",
+    "Wait/Prepare for pouring"
+  ]
+}

artifacts/episode_task_suite/timeline_action/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3052fc9442607895eb6dc5ca81d5a1c28f4cdf9e1f9a4931e6ef78403283a7c
+size 620781

artifacts/episode_task_suite/timeline_subtask/metrics.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "accuracy": 0.05813953488372093,
+  "balanced_accuracy": 0.05376979652090881,
+  "macro_f1": 0.04954121121178666,
+  "weighted_f1": 0.06731304264454903,
+  "num_eval_windows": 344,
+  "num_classes": 14,
+  "task": "timeline_subtask",
+  "input": "all modalities -> current subtask label",
+  "split": "chronological",
+  "num_windows": 1147,
+  "num_train_windows": 803,
+  "num_test_windows": 344,
+  "feature_dim": 8378,
+  "majority_baseline_accuracy": 0.0,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.014040183275938034,
+  "unseen_test_classes": [
+    "Move bottle to coffee equipment",
+    "Pour coffee",
+    "Pour milk into coffee",
+    "Prepare for pouring"
+  ]
+}

artifacts/episode_task_suite/timeline_subtask/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39dace29541e90a947e902a7ba7afd39b7a2c1d3123ed513653d0704d45d2ad1
+size 496518

artifacts/episode_task_suite/transition_detection/metrics.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "accuracy": 0.9252873563218391,
+  "balanced_accuracy": 0.6931475903614458,
+  "macro_f1": 0.6551829268292684,
+  "weighted_f1": 0.9323030557891787,
+  "num_eval_windows": 348,
+  "num_classes": 2,
+  "task": "transition_detection",
+  "input": "all modalities -> action boundary/steady",
+  "split": "chronological",
+  "num_windows": 1161,
+  "num_train_windows": 813,
+  "num_test_windows": 348,
+  "feature_dim": 8378,
+  "majority_baseline_accuracy": 0.9540229885057471,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.007071746978908777,
+  "unseen_test_classes": [],
+  "boundary_precision": 0.125,
+  "boundary_recall": 0.75,
+  "boundary_f1": 0.21428571428571427,
+  "matched_boundaries": 3,
+  "true_boundaries": 4,
+  "predicted_boundaries": 24,
+  "mean_abs_timing_error_frames": 2.6666666666666665
+}

artifacts/episode_task_suite/transition_detection/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f179e3278c2b0e6563ed0bfe14a42faae28a5a0a0aa4a0b056113fc345aa4a27
+size 122843

artifacts/min_action_model/metrics.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "accuracy": 0.9828178694158075,
+  "balanced_accuracy": 0.9643518518518519,
+  "macro_f1": 0.96884342657456,
+  "weighted_f1": 0.9824311468352843,
+  "num_eval_windows": 291,
+  "num_classes": 18,
+  "majority_baseline_accuracy": 0.13745704467353953,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.019042566418647766
+}

artifacts/min_action_model/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b143a74aa94c882e08279adabfcf5806348ccb37c70c9192c8def206fda97895
+size 163871

artifacts/min_all_modalities_action_model/available_modalities.json ADDED Viewed

	@@ -0,0 +1,83 @@

+[
+  {
+    "modality": "depth_confidence",
+    "shape": [
+      5821,
+      140
+    ]
+  },
+  {
+    "modality": "video/fisheye_cam0",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam1",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam2",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam3",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/stereo_left",
+    "path": "data/sample/xperience-10m-sample/stereo_left.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/stereo_right",
+    "path": "data/sample/xperience-10m-sample/stereo_right.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "caption_text",
+    "shape": [
+      5821,
+      128
+    ],
+    "fields": "objects,interaction"
+  },
+  {
+    "modality": "slam_point_cloud_static",
+    "shape": [
+      22
+    ]
+  },
+  {
+    "modality": "calibration_static",
+    "shape": [
+      117
+    ]
+  }
+]

artifacts/min_all_modalities_action_model/feature_manifest.json ADDED Viewed

	@@ -0,0 +1,104 @@

+[
+  {
+    "name": "hand_left_joints",
+    "start": 0,
+    "end": 441,
+    "dim": 441
+  },
+  {
+    "name": "hand_right_joints",
+    "start": 441,
+    "end": 882,
+    "dim": 441
+  },
+  {
+    "name": "body_joints",
+    "start": 882,
+    "end": 1974,
+    "dim": 1092
+  },
+  {
+    "name": "body_contacts",
+    "start": 1974,
+    "end": 2121,
+    "dim": 147
+  },
+  {
+    "name": "camera_translation",
+    "start": 2121,
+    "end": 2142,
+    "dim": 21
+  },
+  {
+    "name": "camera_rotation_matrix",
+    "start": 2142,
+    "end": 2205,
+    "dim": 63
+  },
+  {
+    "name": "imu_accel_gyro",
+    "start": 2205,
+    "end": 2247,
+    "dim": 42
+  },
+  {
+    "name": "depth_confidence",
+    "start": 2247,
+    "end": 3227,
+    "dim": 980
+  },
+  {
+    "name": "video_fisheye_cam0",
+    "start": 3227,
+    "end": 3913,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam1",
+    "start": 3913,
+    "end": 4599,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam2",
+    "start": 4599,
+    "end": 5285,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam3",
+    "start": 5285,
+    "end": 5971,
+    "dim": 686
+  },
+  {
+    "name": "video_stereo_left",
+    "start": 5971,
+    "end": 6657,
+    "dim": 686
+  },
+  {
+    "name": "video_stereo_right",
+    "start": 6657,
+    "end": 7343,
+    "dim": 686
+  },
+  {
+    "name": "caption_objects_interaction_text",
+    "start": 7343,
+    "end": 8239,
+    "dim": 896
+  },
+  {
+    "name": "slam_point_cloud",
+    "start": 8239,
+    "end": 8261,
+    "dim": 22
+  },
+  {
+    "name": "calibration",
+    "start": 8261,
+    "end": 8378,
+    "dim": 117
+  }
+]

artifacts/min_all_modalities_action_model/metrics.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "accuracy": 0.9828178694158075,
+  "balanced_accuracy": 0.9800925925925925,
+  "macro_f1": 0.9791023658779895,
+  "weighted_f1": 0.98276563540562,
+  "num_eval_windows": 291,
+  "num_classes": 18,
+  "majority_baseline_accuracy": 0.13745704467353953,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.014624637551605701,
+  "feature_dim": 8378,
+  "num_windows": 1144
+}

artifacts/min_all_modalities_action_model/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:228cda0f036f86a7a1cb44e67d5c7112747bfc5cc27bf91c90516c6ba8322c81
+size 621786

artifacts/min_all_modalities_subtask_model/available_modalities.json ADDED Viewed

	@@ -0,0 +1,83 @@

+[
+  {
+    "modality": "depth_confidence",
+    "shape": [
+      5821,
+      140
+    ]
+  },
+  {
+    "modality": "video/fisheye_cam0",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam1",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam2",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/fisheye_cam3",
+    "path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/stereo_left",
+    "path": "data/sample/xperience-10m-sample/stereo_left.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "video/stereo_right",
+    "path": "data/sample/xperience-10m-sample/stereo_right.mp4",
+    "shape": [
+      5821,
+      98
+    ],
+    "exists": true
+  },
+  {
+    "modality": "caption_text",
+    "shape": [
+      5821,
+      128
+    ],
+    "fields": "objects,interaction"
+  },
+  {
+    "modality": "slam_point_cloud_static",
+    "shape": [
+      22
+    ]
+  },
+  {
+    "modality": "calibration_static",
+    "shape": [
+      117
+    ]
+  }
+]

artifacts/min_all_modalities_subtask_model/feature_manifest.json ADDED Viewed

	@@ -0,0 +1,104 @@

+[
+  {
+    "name": "hand_left_joints",
+    "start": 0,
+    "end": 441,
+    "dim": 441
+  },
+  {
+    "name": "hand_right_joints",
+    "start": 441,
+    "end": 882,
+    "dim": 441
+  },
+  {
+    "name": "body_joints",
+    "start": 882,
+    "end": 1974,
+    "dim": 1092
+  },
+  {
+    "name": "body_contacts",
+    "start": 1974,
+    "end": 2121,
+    "dim": 147
+  },
+  {
+    "name": "camera_translation",
+    "start": 2121,
+    "end": 2142,
+    "dim": 21
+  },
+  {
+    "name": "camera_rotation_matrix",
+    "start": 2142,
+    "end": 2205,
+    "dim": 63
+  },
+  {
+    "name": "imu_accel_gyro",
+    "start": 2205,
+    "end": 2247,
+    "dim": 42
+  },
+  {
+    "name": "depth_confidence",
+    "start": 2247,
+    "end": 3227,
+    "dim": 980
+  },
+  {
+    "name": "video_fisheye_cam0",
+    "start": 3227,
+    "end": 3913,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam1",
+    "start": 3913,
+    "end": 4599,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam2",
+    "start": 4599,
+    "end": 5285,
+    "dim": 686
+  },
+  {
+    "name": "video_fisheye_cam3",
+    "start": 5285,
+    "end": 5971,
+    "dim": 686
+  },
+  {
+    "name": "video_stereo_left",
+    "start": 5971,
+    "end": 6657,
+    "dim": 686
+  },
+  {
+    "name": "video_stereo_right",
+    "start": 6657,
+    "end": 7343,
+    "dim": 686
+  },
+  {
+    "name": "caption_objects_interaction_text",
+    "start": 7343,
+    "end": 8239,
+    "dim": 896
+  },
+  {
+    "name": "slam_point_cloud",
+    "start": 8239,
+    "end": 8261,
+    "dim": 22
+  },
+  {
+    "name": "calibration",
+    "start": 8261,
+    "end": 8378,
+    "dim": 117
+  }
+]

artifacts/min_all_modalities_subtask_model/metrics.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "accuracy": 0.9827586206896551,
+  "balanced_accuracy": 0.9505102040816327,
+  "macro_f1": 0.9307645963773675,
+  "weighted_f1": 0.9837987833808578,
+  "num_eval_windows": 290,
+  "num_classes": 14,
+  "majority_baseline_accuracy": 0.14482758620689656,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.012823422439396381,
+  "feature_dim": 8378,
+  "num_windows": 1147
+}

artifacts/min_all_modalities_subtask_model/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ec248d69f63d5acd00c83c024bbfe23cadf0ab0ba1b6c9ff3916d2b1d76ee94
+size 497409

artifacts/min_subtask_model/metrics.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "accuracy": 0.9758620689655172,
+  "balanced_accuracy": 0.9783924095954172,
+  "macro_f1": 0.9528048001232955,
+  "weighted_f1": 0.9778836359351952,
+  "num_eval_windows": 290,
+  "num_classes": 14,
+  "majority_baseline_accuracy": 0.14482758620689656,
+  "train_final_accuracy": 1.0,
+  "train_final_loss": 0.02664567530155182
+}

artifacts/min_subtask_model/model.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:226b91679611e731abf36ec55f1181ad2748b25e9e84c6f09e35b00dd43a863f
+size 131612

assets/task_architectures.svg ADDED Viewed

assets/task_suite_infographic.png ADDED Viewed

Git LFS Details

SHA256: 38ba0968f53333b74069e36bec35382cb9c97568da8be528536acc2d69fdb168
Pointer size: 132 Bytes
Size of remote file: 1.32 MB

notes/all_modalities_model.md ADDED Viewed

	@@ -0,0 +1,148 @@

+# All-Modality Minimal Model
+Script:
+```text
+scripts/train_all_modalities_model.py
+```
+This extends the first minimal model by using every major sample modality in a lightweight way.
+## Modalities Used
+Dynamic sensor/action modalities:
+- `hand_mocap/left_joints_3d`
+- `hand_mocap/right_joints_3d`
+- `full_body_mocap/keypoints`
+- `full_body_mocap/contacts`
+- `slam/trans_xyz`
+- `slam/quat_wxyz` converted by the toolkit into camera rotation matrices
+- `imu/accel_xyz`
+- `imu/gyro_xyz`
+- `depth/depth`
+- `depth/confidence`
+- `fisheye_cam0.mp4`
+- `fisheye_cam1.mp4`
+- `fisheye_cam2.mp4`
+- `fisheye_cam3.mp4`
+- `stereo_left.mp4`
+- `stereo_right.mp4`
+Static/context modalities:
+- `slam/point_cloud`
+- `calibration/*`
+- caption objects
+- caption interaction text
+By default, the script does **not** include `action_label`, `Sub Task`, or action-description text as input, because those are too close to the prediction target. You can force their inclusion with `--include-label-text`, but such a run should be treated as a leakage/debug run, not a fair action-recognition experiment.
+## Feature Design
+The model is still intentionally small:
+```text
+raw modality -> per-frame or static handcrafted features -> window temporal statistics -> softmax classifier
+```
+For each 20-frame window:
+- Motion signals use mean/std/min/max/delta/velocity statistics.
+- Depth uses global depth stats plus a small normalized depth grid and confidence grid.
+- Each video stream uses color stats, color histograms, a small grayscale grid, and simple edge stats.
+- Text uses a hashed bag-of-words vector from objects and interaction text.
+- Point cloud and calibration are included as static episode-level features.
+Current feature blocks:
+```text
+hand_left_joints:                  441
+hand_right_joints:                 441
+body_joints:                      1092
+body_contacts:                     147
+camera_translation:                 21
+camera_rotation_matrix:             63
+imu_accel_gyro:                     42
+depth_confidence:                  980
+video_fisheye_cam0:                686
+video_fisheye_cam1:                686
+video_fisheye_cam2:                686
+video_fisheye_cam3:                686
+video_stereo_left:                 686
+video_stereo_right:                686
+caption_objects_interaction_text:  896
+slam_point_cloud:                   22
+calibration:                       117
+total:                            8378
+```
+## Run Commands
+Action prediction:
+```bash
+cd /path/to/Ropedia
+source .venv/bin/activate
+python scripts/train_all_modalities_model.py
+```
+Subtask prediction:
+```bash
+python scripts/train_all_modalities_model.py --target subtask
+```
+The first run builds reusable caches in:
+```text
+outputs/feature_cache/
+```
+## Current Results
+Action-label model:
+```text
+outputs/min_all_modalities_action_model/
+accuracy:          0.9828
+balanced_accuracy: 0.9801
+macro_f1:          0.9791
+weighted_f1:       0.9828
+majority_baseline: 0.1375
+classes:           18
+feature_dim:       8378
+test_windows:      291
+```
+Subtask-label model:
+```text
+outputs/min_all_modalities_subtask_model/
+accuracy:          0.9828
+balanced_accuracy: 0.9505
+macro_f1:          0.9308
+weighted_f1:       0.9838
+majority_baseline: 0.1448
+classes:           14
+feature_dim:       8378
+test_windows:      290
+```
+## How To Interpret This
+This shows that the full sample can be converted into a complete supervised-learning pipeline that runs end to end on a single local machine (here, a Mac).
+It does **not** prove real generalization, because the public sample is one episode and the split is random windows from that same episode. Neighboring windows are correlated.
+For a serious embodied-AI experiment:
+```text
+many episodes
+-> cache features per episode
+-> split by episode or task instance
+-> train on some episodes
+-> test on unseen episodes
+```
+The next useful upgrade is not a bigger classifier. It is a better split and more episodes.

notes/episode_task_suite.md ADDED Viewed

	@@ -0,0 +1,176 @@

+# Episode Task Suite
+Script:
+```text
+scripts/episode_task_suite.py
+```
+This script turns the single public Ropedia sample episode into many end-to-end tasks. It is designed for learning, debugging, and task design. It is **not** a generalization benchmark because the data is still one episode.
+Run:
+```bash
+cd /path/to/Ropedia
+source .venv/bin/activate
+python scripts/episode_task_suite.py
+```
+Output:
+```text
+outputs/episode_task_suite/
+```
+Shared setup:
+```text
+sample episode: 5821 frames
+windows:        1161
+window size:    20 frames
+stride:         5 frames
+feature dim:    8378
+split:          chronological, first 70% train and last 30% test
+```
+## Implemented Tasks
+| Task | Input | Output | Main artifact |
+|---|---|---|---|
+| `timeline_action` | all modality window | current action label | `timeline_action/metrics.json` |
+| `timeline_subtask` | all modality window | current subtask label | `timeline_subtask/metrics.json` |
+| `transition_detection` | all modality window | steady vs action boundary | `transition_detection/metrics.json` |
+| `next_action` | current all modality window | action 20 frames later | `next_action/metrics.json` |
+| `hand_trajectory_forecast` | current all modality window | future 10-frame left/right hand joints | `hand_trajectory_forecast/predictions.npz` |
+| `contact_prediction` | non-contact modalities | any body contact in window | `contact_prediction/metrics.json` |
+| `object_relevance` | non-caption modalities | relevant object set | `object_relevance/predictions.csv` |
+| `caption_grounding` | caption objects/interaction query + sensor candidates | matching time window | `caption_grounding/metrics.json` |
+| `cross_modal_retrieval` | motion/IMU/camera query | matching depth/video window | `cross_modal_retrieval/metrics.json` |
+| `modality_reconstruction` | motion/IMU/camera | depth/video feature vector | `modality_reconstruction/predictions.npz` |
+| `temporal_order` | two adjacent windows | whether order is correct | `temporal_order/metrics.json` |
+| `misalignment_detection` | motion+visual pair | aligned vs shifted | `misalignment_detection/metrics.json` |
+## Minimal Model Architectures
+All tasks share the same window builder unless a task explicitly removes a
+feature block to avoid label leakage.
+```text
+raw sample episode
+  -> 20-frame sliding windows, stride 5
+  -> all-modality feature vector X_all, 8,378 dimensions
+  -> chronological split, first 70% train and last 30% test
+  -> train-only z-score scaler
+  -> task-specific minimal head
+```
+The task suite intentionally uses simple heads:
+| Family | Formula | Tasks |
+|---|---|---|
+| Linear softmax | `softmax(z(X)W + b)`, cross-entropy, L2 | `timeline_action`, `timeline_subtask`, `transition_detection`, `next_action`, `contact_prediction`, `temporal_order`, `misalignment_detection` |
+| Ridge regression/projection | dual ridge regression with L2=10 on z-scored X/Y | `hand_trajectory_forecast`, `caption_grounding`, `cross_modal_retrieval`, `modality_reconstruction` |
+| Multi-label logistic | `sigmoid(z(X)W + b)`, weighted object heads | `object_relevance` |
+Task-specific architecture details:
+| Task | Input tensor/vector | Minimal head | Output target |
+|---|---|---|---|
+| `timeline_action` | `X_all`, 8,378d | class-weighted linear softmax | current action label |
+| `timeline_subtask` | `X_all`, 8,378d | class-weighted linear softmax | current subtask label |
+| `transition_detection` | `X_all`, 8,378d | class-weighted linear softmax | steady vs transition near action boundary |
+| `next_action` | `X_all(t)`, 8,378d | class-weighted linear softmax | action at `t+20` frames |
+| `hand_trajectory_forecast` | `X_all(t)`, 8,378d | ridge regression | future 10 frames of left/right hand joints, 1,260d |
+| `contact_prediction` | all features except `body_contacts` and caption text, 7,335d | linear softmax on observed labels | any body contact in window |
+| `object_relevance` | all features except caption text, 7,482d | multi-label logistic regression | 34-object multi-hot vector |
+| `caption_grounding` | sensor features, 7,482d, projected into 896d text space | ridge projection plus cosine ranking | matching time window for a text query |
+| `cross_modal_retrieval` | motion/IMU/camera, 2,247d, projected into 5,096d visual space | ridge projection plus cosine ranking | matching depth/video window |
+| `modality_reconstruction` | motion/IMU/camera, 2,247d | ridge regression | depth/video feature vector, 5,096d |
+| `temporal_order` | `[x_t, x_{t+1}, x_{t+1} - x_t]`, 25,134d (3 × 8,378) | binary linear softmax | correct vs reversed order |
+| `misalignment_detection` | motion plus visual pair, 7,343d | binary linear softmax | aligned vs shifted by 8 windows |
+Diagram:
+```text
+docs/assets/task_architectures.svg
+```
+## Current Results
+```text
+timeline_action:
+  accuracy: 0.0292
+  macro_f1: 0.0500
+  note: future test region contains unseen action classes
+timeline_subtask:
+  accuracy: 0.0581
+  macro_f1: 0.0495
+  note: future test region contains unseen subtask classes
+transition_detection:
+  accuracy: 0.9253
+  macro_f1: 0.6552
+  boundary_f1: 0.2143
+next_action:
+  accuracy: 0.0345
+  macro_f1: 0.0593
+  note: same unseen-future-class problem as timeline_action
+hand_trajectory_forecast:
+  MPJPE: 0.8223
+  final-frame MPJPE: 1.0650
+contact_prediction:
+  accuracy: 1.0000
+  note: degenerate on this sample because the binary contact label has only one class
+object_relevance:
+  micro_f1: 0.1839
+  macro_f1: 0.0643
+caption_grounding:
+  top1: 0.0029
+  top5: 0.0115
+  MRR: 0.0172
+cross_modal_retrieval:
+  top1: 0.1494
+  top5: 0.3764
+  top10: 0.4741
+  MRR: 0.2634
+modality_reconstruction:
+  R2: -0.0160
+temporal_order:
+  accuracy: 0.4612
+  f1: 0.5487
+misalignment_detection:
+  accuracy: 0.5029
+  f1: 0.4866
+```
+## How To Read These Results
+Low scores are useful here. They show which tasks are not learnable from this one chronological sample with this minimal model.
+The strongest signal is `cross_modal_retrieval`: motion/IMU/camera features can retrieve the matching depth/video window better than random. That means the modalities are synchronized and contain shared temporal structure.
+The supervised timeline tasks score lowest mainly because of the split, not because the features are uninformative. The last 30% of a single ordered episode contains actions/subtasks not present in the first 70%, so a classifier trained on the first part cannot predict labels it never saw.
+For serious research, keep the same task code but change the dataset unit:
+```text
+many episodes -> train episodes -> test unseen episodes
+```
+For single-episode learning, these tasks are best used as:
+- data pipeline tests
+- modality ablations
+- label-alignment checks
+- self-supervised retrieval experiments
+- debugging templates before scaling to many episodes

notes/min_action_model.md ADDED Viewed

	@@ -0,0 +1,85 @@

+# Minimal Action Model
+This is the first modeling baseline for the Ropedia/Xperience sample.
+The script is:
+```text
+scripts/train_min_action_model.py
+```
+It trains a small NumPy-only softmax classifier:
+```text
+annotation.hdf5
+  -> hand/body/IMU/camera/contact windows
+  -> action or subtask labels from captions
+  -> stratified train/test split
+  -> multinomial logistic regression
+  -> metrics and predictions
+```
+Run:
+```bash
+cd /path/to/Ropedia
+source .venv/bin/activate
+python scripts/train_min_action_model.py
+```
+Default output:
+```text
+outputs/min_action_model/
+```
+Important artifacts:
+- `metrics.json`: accuracy, balanced accuracy, macro-F1, weighted-F1, majority baseline.
+- `per_class_metrics.csv`: precision/recall/F1 per action class.
+- `confusion_matrix.csv`: true label vs predicted label matrix.
+- `predictions.csv`: one row per test window.
+- `feature_dataset.npz`: processed numeric features and labels.
+- `model.npz`: fitted scaler and softmax weights.
+This is a learning baseline, not a publishable benchmark. The public sample is only one episode, so stratified windows from one episode are correlated. For serious evaluation, use many episodes and split by held-out episodes or held-out task instances.
+## Current Sample Results
+Action-label model:
+```text
+outputs/min_action_model/
+accuracy:          0.9828
+balanced_accuracy: 0.9644
+macro_f1:          0.9688
+weighted_f1:       0.9824
+majority_baseline: 0.1375
+classes:           18
+test_windows:      291
+```
+Subtask-label model:
+```text
+outputs/min_subtask_model/
+accuracy:          0.9759
+balanced_accuracy: 0.9784
+macro_f1:          0.9528
+weighted_f1:       0.9779
+majority_baseline: 0.1448
+classes:           14
+test_windows:      290
+```
+Why the numbers are high:
+- This is one public sample episode.
+- Windows are stratified randomly, so train/test windows can be close in time.
+- The result proves the pipeline works; it does not prove cross-episode generalization.
+Next serious evaluation:
+```text
+many episodes -> split by episode -> train on some episodes -> test on unseen episodes
+```

notes/reproducibility_audit.md ADDED Viewed

	@@ -0,0 +1,124 @@

+# Reproducibility Audit
+Audit date: 2026-05-30 Asia/Singapore.
+Purpose: verify that the committed Ropedia Episode Task Suite artifacts are
+real outputs from the scripts, not placeholder or fabricated metrics.
+## Raw Inputs Checked
+The audit used the local public sample episode:
+```text
+data/sample/xperience-10m-sample/
+  annotation.hdf5
+  fisheye_cam0.mp4
+  fisheye_cam1.mp4
+  fisheye_cam2.mp4
+  fisheye_cam3.mp4
+  stereo_left.mp4
+  stereo_right.mp4
+```
+`annotation.hdf5` contains 5,821 aligned frames with depth, hand mocap, body
+mocap, IMU, SLAM, calibration, and caption metadata. The video feature cache was
+rebuilt from all six video files during the audit.
+## Commands Re-run
+All audit outputs were written outside the repo:
+```bash
+AUDIT=/private/tmp/ropedia-audit
+WORKSPACE=/path/to/Ropedia
+ANN=$WORKSPACE/data/sample/xperience-10m-sample/annotation.hdf5
+PY=$WORKSPACE/.venv/bin/python
+$PY -B scripts/train_min_action_model.py \
+  --workspace $WORKSPACE \
+  --annotation $ANN \
+  --output-dir $AUDIT/min_action_model \
+  --target action
+$PY -B scripts/train_min_action_model.py \
+  --workspace $WORKSPACE \
+  --annotation $ANN \
+  --output-dir $AUDIT/min_subtask_model \
+  --target subtask
+$PY -B scripts/train_all_modalities_model.py \
+  --workspace $WORKSPACE \
+  --annotation $ANN \
+  --output-dir $AUDIT/min_all_modalities_action_model \
+  --cache-dir $AUDIT/cache \
+  --target action
+$PY -B scripts/train_all_modalities_model.py \
+  --workspace $WORKSPACE \
+  --annotation $ANN \
+  --output-dir $AUDIT/min_all_modalities_subtask_model \
+  --cache-dir $AUDIT/cache \
+  --target subtask
+$PY -B scripts/episode_task_suite.py \
+  --workspace $WORKSPACE \
+  --annotation $ANN \
+  --output-dir $AUDIT/episode_task_suite \
+  --cache-dir $AUDIT/cache
+```
+## Exact Match Checks
+The regenerated files matched the committed files:
+```text
+min_action_model/metrics.json: MATCH
+min_subtask_model/metrics.json: MATCH
+min_all_modalities_action_model/metrics.json: MATCH
+min_all_modalities_subtask_model/metrics.json: MATCH
+episode_task_suite/summary_report.json: MATCH
+episode_task_suite/feature_manifest.json: MATCH
+episode_task_suite/available_modalities.json: MATCH
+```
+Every per-task `metrics.json` also matched:
+```text
+caption_grounding/metrics.json: MATCH
+contact_prediction/metrics.json: MATCH
+cross_modal_retrieval/metrics.json: MATCH
+hand_trajectory_forecast/metrics.json: MATCH
+misalignment_detection/metrics.json: MATCH
+modality_reconstruction/metrics.json: MATCH
+next_action/metrics.json: MATCH
+object_relevance/metrics.json: MATCH
+temporal_order/metrics.json: MATCH
+timeline_action/metrics.json: MATCH
+timeline_subtask/metrics.json: MATCH
+transition_detection/metrics.json: MATCH
+```
+## Fresh Cache Evidence
+The all-modality audit rebuilt a fresh feature cache:
+```text
+depth_n5821_grid8.npz: shape=(5821, 140), nonzero=809107
+video_fisheye_cam0_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570458
+video_fisheye_cam1_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570400
+video_fisheye_cam2_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570458
+video_fisheye_cam3_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=568723
+video_stereo_left_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570249
+video_stereo_right_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570430
+```
+This confirms the committed metrics are reproducible from the raw sample and
+that the all-modality pipeline reads real depth/video files instead of using
+empty placeholder features.
+## Caveats
+The scripts contain a zero-feature fallback if a video file is missing. That is
+not the path used in this audit: all six videos existed and produced nonzero
+features. The repo remains a single-episode learning and pipeline-validation
+project, not evidence of cross-episode generalization.

scripts/episode_task_suite.py ADDED Viewed

	@@ -0,0 +1,776 @@

+#!/usr/bin/env python3
+"""
+End-to-end task suite for one Ropedia/Xperience episode.
+The purpose is not to prove generalization from one sample episode. It is to
+turn the episode into multiple meaningful supervised/self-supervised learning
+problems and write reproducible artifacts for each one.
+"""
+from __future__ import annotations
+import argparse
+import csv
+import json
+import math
+import sys
+from collections import Counter, OrderedDict
+from pathlib import Path
+import numpy as np
+from train_all_modalities_model import (
+    extract_all_window_features,
+    prepare_modalities,
+)
+from train_min_action_model import (
+    add_toolkit_to_path,
+    compute_metrics,
+    encode_labels,
+    fit_scaler,
+    frame_label,
+    majority_label,
+    predict,
+    portable_path,
+    softmax,
+    train_softmax_classifier,
+)
# Canonical task identifiers. `selected_tasks` validates the `--tasks` CLI
# value against this list and returns it when the value is "all".
TASKS = [
    "timeline_action",
    "timeline_subtask",
    "transition_detection",
    "next_action",
    "hand_trajectory_forecast",
    "contact_prediction",
    "object_relevance",
    "caption_grounding",
    "cross_modal_retrieval",
    "modality_reconstruction",
    "temporal_order",
    "misalignment_detection",
]
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the episode task suite.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``parse_args()`` callers behave
            exactly as before. Passing a list allows programmatic use.

    Returns:
        The populated ``argparse.Namespace``. Path defaults are resolved
        relative to the parent of the directory that contains this script.
    """
    workspace_default = Path(__file__).resolve().parents[1]
    annotation_default = workspace_default / "data/sample/xperience-10m-sample/annotation.hdf5"
    parser = argparse.ArgumentParser(description="Run an end-to-end task suite on one Ropedia episode.")

    # Locations of inputs, outputs and the reusable feature cache.
    parser.add_argument("--workspace", type=Path, default=workspace_default)
    parser.add_argument("--annotation", type=Path, default=annotation_default)
    parser.add_argument("--output-dir", type=Path, default=workspace_default / "outputs/episode_task_suite")
    parser.add_argument("--cache-dir", type=Path, default=workspace_default / "outputs/feature_cache")

    # Windowing and train/test split.
    parser.add_argument("--window-frames", type=int, default=20)
    parser.add_argument("--stride-frames", type=int, default=5)
    parser.add_argument("--min-label-fraction", type=float, default=0.6)
    parser.add_argument("--test-fraction", type=float, default=0.30)

    # Optimizer / regularization settings for the minimal heads.
    parser.add_argument("--epochs", type=int, default=400)
    parser.add_argument("--learning-rate", type=float, default=0.12)
    parser.add_argument("--l2", type=float, default=2e-3)
    parser.add_argument("--ridge-l2", type=float, default=10.0)
    parser.add_argument("--seed", type=int, default=7)

    # Task-specific knobs.
    parser.add_argument("--future-frames", type=int, default=20, help="Future offset for next-action prediction.")
    parser.add_argument("--forecast-frames", type=int, default=10, help="Future hand trajectory length.")
    parser.add_argument("--boundary-tolerance-frames", type=int, default=10)
    parser.add_argument("--misalignment-shift-windows", type=int, default=8)
    parser.add_argument("--tasks", default="all", help="Comma-separated task list or 'all'.")

    # Match train_all_modalities_model defaults used by prepare_modalities.
    parser.add_argument("--force-rebuild-cache", action="store_true")
    parser.add_argument("--video-image-size", type=int, default=32)
    parser.add_argument("--video-grid-size", type=int, default=8)
    parser.add_argument("--video-hist-bins", type=int, default=8)
    parser.add_argument("--depth-grid-size", type=int, default=8)
    parser.add_argument("--text-hash-dim", type=int, default=128)
    parser.add_argument("--include-label-text", action="store_true")
    parser.add_argument("--no-class-weights", action="store_true")
    return parser.parse_args(argv)
def selected_tasks(spec: str) -> list[str]:
    """Resolve the ``--tasks`` value into an ordered list of task names.

    Args:
        spec: Either ``"all"`` (case-insensitive, surrounding whitespace
            ignored) or a comma-separated list of task names.

    Returns:
        A new list of task names. For ``"all"`` this is a copy of ``TASKS``;
        otherwise it is the requested names in first-seen order, with blank
        entries and duplicates removed.

    Raises:
        ValueError: If any requested name is not in ``TASKS``.
    """
    if spec.strip().lower() == "all":
        # Return a copy so callers cannot mutate the module-level TASKS list.
        return list(TASKS)
    names = (part.strip() for part in spec.split(","))
    # dict.fromkeys keeps first-seen order and drops repeats, so a task named
    # twice on the command line is not run twice.
    chosen = list(dict.fromkeys(name for name in names if name))
    unknown = [name for name in chosen if name not in TASKS]
    if unknown:
        raise ValueError(f"Unknown tasks: {unknown}. Valid tasks: {TASKS}")
    return chosen
def write_json(path: Path, data: dict | list) -> None:
    """Write ``data`` to ``path`` as UTF-8 JSON with 2-space indentation.

    Missing parent directories are created first.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(payload)
def write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as a UTF-8 CSV with a header row.

    Missing parent directories are created first. Column order follows
    ``fieldnames``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding="utf-8") as handle:
        table = csv.DictWriter(handle, fieldnames=fieldnames)
        table.writeheader()
        for record in rows:
            table.writerow(record)
def write_confusion(path: Path, cm: np.ndarray, class_names: list[str]) -> None:
    """Write a confusion matrix to ``path`` as CSV.

    The header row is ``true\\pred`` followed by the class names. Each
    following row is a class name followed by row ``i`` of ``cm`` cast to
    ``int``. Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(["true\\pred", *class_names])
        for row_id, label in enumerate(class_names):
            out.writerow([label, *map(int, cm[row_id])])
def chronological_split_indices(n: int, test_fraction: float) -> tuple[np.ndarray, np.ndarray]:
    """Split ``range(n)`` into a leading train part and a trailing test part.

    The cut point is ``round(n * (1 - test_fraction))``, clamped so that both
    parts contain at least one index.

    Returns:
        ``(train_idx, test_idx)`` as ``int64`` arrays: ``[0, cut)`` and
        ``[cut, n)``.

    Raises:
        ValueError: If ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError("Need at least two samples for train/test split.")
    cut = int(round(n * (1.0 - test_fraction)))
    # Clamp into [1, n - 1] so neither side is empty.
    if cut > n - 1:
        cut = n - 1
    if cut < 1:
        cut = 1
    train_idx = np.arange(0, cut, dtype=np.int64)
    test_idx = np.arange(cut, n, dtype=np.int64)
    return train_idx, test_idx
def build_windows(args: argparse.Namespace, ann: dict, extras: dict):
    """Slide a fixed-size window over the episode and featurize each window.

    Args:
        args: Parsed options; reads ``window_frames``, ``stride_frames`` and
            ``min_label_fraction``.
        ann: Annotation dict; reads ``caption_frame_info_map`` (per-frame
            caption info keyed by frame index) and ``img_names`` (its length
            is used as the frame count).
        extras: Passed through unchanged to ``extract_all_window_features``.

    Returns:
        ``(X, rows, feature_manifest)`` where ``X`` is a float32 array with
        one feature vector per window, ``rows`` holds per-window metadata
        (frame span, majority action/subtask label and its fraction), and
        ``feature_manifest`` lists each feature block with its column range.

    Raises:
        ValueError: If the episode is too short to produce any window.
    """
    frame_info = ann["caption_frame_info_map"]
    n_frames = len(ann["img_names"])
    rows = []
    X = []
    feature_manifest = None
    for start in range(0, n_frames - args.window_frames + 1, args.stride_frames):
        end = start + args.window_frames  # exclusive
        action_labels = [frame_label(frame_info.get(i, {}), "action") for i in range(start, end)]
        subtask_labels = [frame_label(frame_info.get(i, {}), "subtask") for i in range(start, end)]
        action, action_frac = majority_label(action_labels, args.min_label_fraction)
        subtask, subtask_frac = majority_label(subtask_labels, args.min_label_fraction)
        if feature_manifest is None:
            # Only the first window asks for the block layout; it is turned
            # into cumulative [start, end) column offsets.
            vec, blocks = extract_all_window_features(ann, extras, start, end, return_blocks=True)
            offset = 0
            feature_manifest = []
            for name, dim in blocks:
                feature_manifest.append({"name": name, "start": offset, "end": offset + dim, "dim": dim})
                offset += dim
        else:
            vec = extract_all_window_features(ann, extras, start, end)
        X.append(vec)
        rows.append({
            "window_index": len(rows),
            "start_frame": start,
            "end_frame": end - 1,  # inclusive last frame
            "center_frame": (start + end - 1) // 2,
            "action_label": action,
            "action_fraction": action_frac,
            "subtask_label": subtask,
            "subtask_fraction": subtask_frac,
        })
    if not X:
        # np.stack([]) would fail with an opaque "need at least one array"
        # message; report the actual cause instead.
        raise ValueError(
            f"No windows could be built: episode has {n_frames} frames but "
            f"window_frames={args.window_frames}."
        )
    return np.stack(X).astype(np.float32), rows, feature_manifest or []
def block_indices(feature_manifest: list[dict], include: list[str] | None = None, exclude: list[str] | None = None) -> np.ndarray:
    """Collect the feature-column indices of the selected manifest blocks.

    A block matches a pattern when its name starts with that pattern (an
    exact match is a special case). With a non-empty ``include``, only
    matching blocks are kept; blocks matching ``exclude`` are always dropped.

    Returns:
        An ``int64`` array of column indices in manifest order.
    """
    wanted = tuple(include or ())
    blocked = tuple(exclude or ())
    columns: list[int] = []
    for entry in feature_manifest:
        label = entry["name"]
        keep = (not wanted or label.startswith(wanted)) and not (blocked and label.startswith(blocked))
        if keep:
            columns.extend(range(int(entry["start"]), int(entry["end"])))
    return np.asarray(columns, dtype=np.int64)
def label_array(rows: list[dict], key: str) -> np.ndarray:
    """Return ``row[key]`` for every row as an object array of strings.

    Missing or falsy values (``None``, ``""``, ``0``) become ``""``.
    """
    values = []
    for row in rows:
        raw = row.get(key, "")
        values.append(str(raw) if raw else "")
    return np.asarray(values, dtype=object)
def classification_task(
    out_dir: Path,
    X: np.ndarray,
    labels: np.ndarray,
    rows: list[dict],
    args: argparse.Namespace,
    task_name: str,
    input_description: str,
) -> dict:
    """Train and evaluate a softmax classifier on a chronological split.

    Windows whose label is falsy (e.g. an empty string) are dropped. The
    remaining windows are split in time order, the scaler is fit on the
    training part only, and a softmax classifier is trained and scored on the
    held-out tail.

    Args:
        out_dir: Directory receiving ``metrics.json``,
            ``per_class_metrics.csv``, ``confusion_matrix.csv``,
            ``predictions.csv`` and ``model.npz``.
        X: Feature matrix with one row per entry in ``rows``.
        labels: One label per window; falsy entries mark unlabeled windows.
        rows: Per-window metadata (``window_index``, ``start_frame``,
            ``end_frame``, ``center_frame``) copied into the predictions file.
        args: Reads ``test_fraction``, ``epochs``, ``learning_rate``, ``l2``,
            ``no_class_weights`` and ``seed``.
        task_name: Stored under ``"task"`` in the metrics.
        input_description: Stored under ``"input"`` in the metrics.

    Returns:
        The metrics dict that was written to ``metrics.json``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    valid = np.asarray([bool(x) for x in labels])
    valid_idx = np.flatnonzero(valid)
    Xv = X[valid_idx]
    labelv = labels[valid_idx]
    rowv = [rows[int(i)] for i in valid_idx]
    y, class_names = encode_labels(labelv)
    train_local, test_local = chronological_split_indices(len(y), args.test_fraction)

    # Classes that occur only in the held-out tail cannot be learned; they are
    # reported so that low scores can be interpreted.
    train_classes = set(int(x) for x in y[train_local])
    test_classes = set(int(x) for x in y[test_local])
    unseen_test_classes = sorted(class_names[i] for i in (test_classes - train_classes))

    # Fit the scaler on training windows only, then apply it to all windows.
    mean, std = fit_scaler(Xv[train_local])
    Xs = (Xv - mean) / std
    W, b, history = train_softmax_classifier(
        Xs[train_local],
        y[train_local],
        n_classes=len(class_names),
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )
    pred, probs = predict(Xs[test_local], W, b)
    metrics, per_class, cm = compute_metrics(y[test_local], pred, class_names)
    # Baseline: always predict the most frequent training class.
    majority = Counter(y[train_local]).most_common(1)[0][0]
    metrics.update({
        "task": task_name,
        "input": input_description,
        "split": "chronological",
        "num_windows": int(len(y)),
        "num_train_windows": int(len(train_local)),
        "num_test_windows": int(len(test_local)),
        "num_classes": int(len(class_names)),
        "feature_dim": int(X.shape[1]),
        "majority_baseline_accuracy": float(np.mean(y[test_local] == majority)),
        "train_final_accuracy": float(history[-1]["train_accuracy"]),
        "train_final_loss": float(history[-1]["loss"]),
        "unseen_test_classes": unseen_test_classes,
    })
    pred_rows = []
    # `probs` is aligned with `test_local`, so the enumerate position is the
    # row to read. This replaces a per-row list(...).index() lookup.
    for test_pos, (local_pos, raw_pred) in enumerate(zip(test_local, pred)):
        row = rowv[int(local_pos)]
        true_id = int(y[int(local_pos)])
        pred_id = int(raw_pred)
        pred_rows.append({
            "window_index": row["window_index"],
            "start_frame": row["start_frame"],
            "end_frame": row["end_frame"],
            "center_frame": row["center_frame"],
            "true_label": class_names[true_id],
            "predicted_label": class_names[pred_id],
            "confidence": float(probs[test_pos, pred_id]),
            "correct": int(true_id == pred_id),
        })
    write_json(out_dir / "metrics.json", metrics)
    write_csv(out_dir / "per_class_metrics.csv", per_class, ["class_id", "class_name", "support", "predicted", "precision", "recall", "f1"])
    write_confusion(out_dir / "confusion_matrix.csv", cm, class_names)
    write_csv(out_dir / "predictions.csv", pred_rows, ["window_index", "start_frame", "end_frame", "center_frame", "true_label", "predicted_label", "confidence", "correct"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b, class_names=np.asarray(class_names, dtype=object))
    return metrics
def binary_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute confusion counts and derived scores for 0/1 labels.

    Both arrays are cast to ``int64``; label ``1`` is the positive class.
    Precision, recall and F1 fall back to ``0.0`` when their denominator is
    zero, and accuracy is ``0.0`` for empty input.
    """
    truth = y_true.astype(np.int64)
    guess = y_pred.astype(np.int64)
    truth_pos, truth_neg = truth == 1, truth == 0
    guess_pos, guess_neg = guess == 1, guess == 0

    tp = int(np.count_nonzero(truth_pos & guess_pos))
    tn = int(np.count_nonzero(truth_neg & guess_neg))
    fp = int(np.count_nonzero(truth_neg & guess_pos))
    fn = int(np.count_nonzero(truth_pos & guess_neg))

    predicted_pos = tp + fp
    actual_pos = tp + fn
    precision = tp / predicted_pos if predicted_pos else 0.0
    recall = tp / actual_pos if actual_pos else 0.0
    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum else 0.0

    total = max(len(truth), 1)  # avoid dividing by zero on empty input
    report = {
        "accuracy": float((tp + tn) / total),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "positive_rate_true": float(np.mean(truth)) if len(truth) else 0.0,
        "positive_rate_pred": float(np.mean(guess)) if len(guess) else 0.0,
    }
    return report
def boundary_f1(true_frames: list[int], pred_frames: list[int], tolerance: int) -> dict:
    """Greedily match predicted boundaries to true ones and score the result.

    Predictions are processed in order. Each one claims the closest
    still-unmatched true boundary within ``tolerance`` frames, with ties
    going to the earlier entry of ``true_frames``. Every true boundary can be
    matched at most once.

    Returns:
        Precision, recall and F1 over boundaries, the raw counts, and the
        mean absolute timing error of the matches (``None`` if none matched).
    """
    claimed = set()
    timing_errors = []
    for guess in pred_frames:
        best_pos = None
        best_gap = None
        for pos, truth in enumerate(true_frames):
            if pos in claimed:
                continue
            gap = abs(guess - truth)
            if gap > tolerance:
                continue
            # Strict "<" keeps the earliest index among equal gaps.
            if best_gap is None or gap < best_gap:
                best_pos, best_gap = pos, gap
        if best_pos is None:
            continue
        claimed.add(best_pos)
        timing_errors.append(best_gap)

    matches = len(timing_errors)
    precision = matches / len(pred_frames) if pred_frames else 0.0
    recall = matches / len(true_frames) if true_frames else 0.0
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom else 0.0
    mean_error = float(np.mean(timing_errors)) if timing_errors else None
    return {
        "boundary_precision": precision,
        "boundary_recall": recall,
        "boundary_f1": f1,
        "matched_boundaries": matches,
        "true_boundaries": len(true_frames),
        "predicted_boundaries": len(pred_frames),
        "mean_abs_timing_error_frames": mean_error,
    }
def task_transition_detection(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Classify windows as ``steady`` or ``transition`` and score boundaries.

    A true boundary is a frame whose action label is non-empty and differs
    from the non-empty label of the previous frame. A window is labeled
    ``transition`` when its center frame lies within
    ``args.boundary_tolerance_frames`` of any true boundary. After the shared
    classification run, predicted transition windows are matched against the
    true boundaries inside the test span, and the boundary scores are merged
    into ``metrics.json``. All true boundaries are written to
    ``true_boundaries.csv``.
    """
    info_by_frame = ann["caption_frame_info_map"]
    total_frames = len(ann["img_names"])
    frame_actions = [frame_label(info_by_frame.get(idx, {}), "action") for idx in range(total_frames)]

    true_boundaries = []
    for idx, (previous, current) in enumerate(zip(frame_actions, frame_actions[1:]), start=1):
        if current and previous and current != previous:
            true_boundaries.append(idx)

    tolerance = args.boundary_tolerance_frames
    names = []
    for row in rows:
        center = int(row["center_frame"])
        near_boundary = any(abs(center - b) <= tolerance for b in true_boundaries)
        names.append("transition" if near_boundary else "steady")
    labels = np.asarray(names, dtype=object)

    metrics = classification_task(out_dir, X, labels, rows, args, "transition_detection", "all modalities -> action boundary/steady")

    # classification_task returns only metrics, so the per-window predictions
    # are read back from the CSV it just wrote.
    with (out_dir / "predictions.csv").open("r", encoding="utf-8") as fp:
        pred_rows = list(csv.DictReader(fp))
    centers = [int(r["center_frame"]) for r in pred_rows]
    pred_frames = [c for c, r in zip(centers, pred_rows) if r["predicted_label"] == "transition"]

    # Only boundaries inside the span of test-window centers are scored.
    test_start = min(centers, default=0)
    test_end = max(centers, default=0)
    true_test = [b for b in true_boundaries if test_start <= b <= test_end]

    metrics.update(boundary_f1(true_test, pred_frames, tolerance))
    write_json(out_dir / "metrics.json", metrics)
    write_csv(out_dir / "true_boundaries.csv", [{"frame": x} for x in true_boundaries], ["frame"])
    return metrics
def task_next_action(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Predict the action label ``args.future_frames`` after each window ends.

    The target frame is ``end_frame + future_frames``, clamped to the last
    frame of the episode. The label array is then handed to
    ``classification_task``.
    """
    info_by_frame = ann["caption_frame_info_map"]
    last_frame = len(ann["img_names"]) - 1

    def _future_label(row: dict):
        target = min(last_frame, int(row["end_frame"]) + args.future_frames)
        return frame_label(info_by_frame.get(target, {}), "action")

    future_labels = np.asarray([_future_label(row) for row in rows], dtype=object)
    description = f"all modalities at t -> action at t+{args.future_frames} frames"
    return classification_task(out_dir, X, future_labels, rows, args, "next_action", description)
def ridge_fit_predict(X_train: np.ndarray, Y_train: np.ndarray, X_test: np.ndarray, l2: float):
    """Fit ridge regression in dual form and predict for ``X_test``.

    Inputs are scaled with the training statistics from ``fit_scaler``.
    Targets are z-scored per column, with near-constant columns (std below
    1e-6) left unscaled. A constant column is appended to the inputs, so the
    bias is learned (and regularized) together with the weights. The solve is
    done on the train-by-train Gram matrix.

    Returns:
        ``(pred, model)``: float32 predictions in the original target units,
        and a dict with ``x_mean``, ``x_std``, ``y_mean``, ``y_std`` and the
        weight matrix ``W`` (bias in the last row).
    """
    def _scaled_with_bias(block: np.ndarray) -> np.ndarray:
        scaled = (block - x_mean) / x_std
        bias = np.ones((len(scaled), 1), dtype=np.float32)
        return np.concatenate([scaled, bias], axis=1)

    x_mean, x_std = fit_scaler(X_train)
    y_mean = Y_train.mean(axis=0)
    raw_y_std = Y_train.std(axis=0)
    y_std = np.where(raw_y_std < 1e-6, 1.0, raw_y_std)

    train_design = _scaled_with_bias(X_train)
    test_design = _scaled_with_bias(X_test)
    targets = (Y_train - y_mean) / y_std

    gram = train_design @ train_design.T
    penalty = l2 * np.eye(gram.shape[0], dtype=np.float32)
    dual_coef = np.linalg.solve(gram + penalty, targets)
    weights = train_design.T @ dual_coef

    prediction = (test_design @ weights) * y_std + y_mean
    fitted = {
        "x_mean": x_mean,
        "x_std": x_std,
        "y_mean": y_mean.astype(np.float32),
        "y_std": y_std.astype(np.float32),
        "W": weights.astype(np.float32),
    }
    return prediction.astype(np.float32), fitted
def regression_metrics(Y_true: np.ndarray, Y_pred: np.ndarray) -> dict:
    """Return MSE, MAE and pooled R² between targets and predictions.

    R² is ``1 - SS_res / SS_tot`` with ``SS_tot`` taken around the per-column
    mean of ``Y_true``. It is reported as ``0.0`` when ``SS_tot`` is zero.
    """
    residual = Y_true - Y_pred
    squared = residual ** 2
    mse = float(np.mean(squared))
    mae = float(np.mean(np.abs(residual)))
    ss_res = float(np.sum(squared))
    centered = Y_true - Y_true.mean(axis=0)
    ss_tot = float(np.sum(centered ** 2))
    if ss_tot > 0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        r2 = 0.0
    return {"mse": mse, "mae": mae, "r2": r2}
def task_hand_forecast(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Forecast future left/right hand joints from each window with ridge regression.

    For every window with ``args.forecast_frames`` frames available after its
    end, the target is the concatenated left and right hand joints over those
    frames, flattened to one vector. If ``smplh_body_joints`` is present and
    covers the span, the first body joint is subtracted from every hand joint
    (presumably the body root; confirm against the annotation schema).

    Writes ``metrics.json`` and ``predictions.npz`` into ``out_dir``.

    Returns:
        The metrics dict (MSE/MAE/R², MPJPE, final-frame MPJPE, window counts).

    Raises:
        ValueError: If hand joints are missing, or if no window has enough
            future frames to build a target.
    """
    left = ann.get("hand_left_joints")
    right = ann.get("hand_right_joints")
    body = ann.get("smplh_body_joints")
    if left is None or right is None:
        raise ValueError("Hand joints not available.")
    valid_idx, Y = [], []
    joint_shape = None  # per-frame target shape, e.g. (n_joints, n_coords)
    n_frames = len(left)
    for i, row in enumerate(rows):
        future_start = int(row["end_frame"]) + 1
        future_end = future_start + args.forecast_frames
        if future_end > n_frames:
            # Not enough future frames left in the episode for this window.
            continue
        hand = np.concatenate([left[future_start:future_end], right[future_start:future_end]], axis=1)
        if body is not None and future_end <= len(body):
            # Keep the joint axis (":1") so the subtraction broadcasts.
            root = body[future_start:future_end, :1, :]
            hand = hand - root
        joint_shape = hand.shape[1:]
        valid_idx.append(i)
        Y.append(hand.reshape(-1))
    if not Y:
        # np.stack([]) would otherwise fail with an opaque message.
        raise ValueError("No window has enough future frames for hand trajectory forecasting.")
    valid_idx = np.asarray(valid_idx, dtype=np.int64)
    Y = np.stack(Y).astype(np.float32)
    train, test = chronological_split_indices(len(valid_idx), args.test_fraction)
    pred, model = ridge_fit_predict(X[valid_idx[train]], Y[train], X[valid_idx[test]], args.ridge_l2)
    metrics = regression_metrics(Y[test], pred)
    # Restore (window, frame, joint, coord) using the observed joint layout
    # instead of a hard-coded joint count.
    traj_shape = (len(test), args.forecast_frames, *joint_shape)
    true_hand = Y[test].reshape(traj_shape)
    pred_hand = pred.reshape(traj_shape)
    # Mean per-joint position error: Euclidean distance over the last axis.
    mpjpe = np.linalg.norm(true_hand - pred_hand, axis=-1).mean()
    final_error = np.linalg.norm(true_hand[:, -1] - pred_hand[:, -1], axis=-1).mean()
    metrics.update({
        "task": "hand_trajectory_forecast",
        "input": "all modalities at t -> future left/right hand 3D joints",
        "split": "chronological",
        "num_windows": int(len(valid_idx)),
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "forecast_frames": int(args.forecast_frames),
        "mpjpe": float(mpjpe),
        "final_frame_mpjpe": float(final_error),
        "target_dim": int(Y.shape[1]),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "predictions.npz", y_true=Y[test], y_pred=pred, test_window_indices=valid_idx[test], **model)
    return metrics
def task_contact_prediction(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Classify each window as containing any contact or none.

    A window is labelled ``"contact"`` when any contact value inside its
    inclusive ``[start_frame, end_frame]`` range is positive. The contact and
    caption feature blocks are removed from the inputs before delegating to
    ``classification_task``.

    Raises:
        ValueError: if ``ann`` has no ``"contacts"`` entry.
    """
    contacts = ann.get("contacts")
    if contacts is None:
        raise ValueError("Contacts not available.")
    labels = [
        "contact" if np.any(contacts[int(row["start_frame"]):int(row["end_frame"]) + 1] > 0) else "no_contact"
        for row in rows
    ]
    # Drop blocks that would leak the label into the features.
    feature_cols = block_indices(manifest, exclude=["body_contacts", "caption_objects_interaction_text"])
    return classification_task(
        out_dir,
        X[:, feature_cols],
        np.asarray(labels, dtype=object),
        rows,
        args,
        "contact_prediction",
        "all non-contact/non-caption-label modalities -> any body contact",
    )
def extract_objects(info: dict) -> list[str]:
    """Return the cleaned object names stored under ``info["objects"]``.

    A list or tuple is treated as a collection of names; any other truthy
    value is treated as a single name. Each name is converted with ``str``
    and stripped, and names that end up empty are dropped in both cases.
    A missing or falsy entry yields an empty list.
    """
    objects = info.get("objects")
    if not objects:
        return []
    if not isinstance(objects, (list, tuple)):
        # A scalar goes through the same cleaning, so a blank string is
        # dropped rather than returned as an empty label.
        objects = [objects]
    return [name for item in objects if (name := str(item).strip())]
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Return the element-wise logistic function of ``z``.

    Inputs are clipped to [-40, 40] before exponentiation.
    """
    clipped = np.clip(z, -40, 40)
    return 1.0 / (1.0 + np.exp(-clipped))
def train_multilabel_logistic(X: np.ndarray, Y: np.ndarray, epochs: int, lr: float, l2: float, seed: int):
    """Fit one logistic output per label column with full-batch gradient descent.

    Positive entries are up-weighted by the negative/positive count ratio of
    their label, clipped to [1, 20]. Returns ``(W, b, history)``, where
    ``history`` holds training metrics sampled at the first epoch, the last
    epoch, and roughly every fifth of the run.
    """
    rng = np.random.default_rng(seed)
    num_samples, num_features = X.shape
    num_labels = Y.shape[1]
    W = rng.normal(0, 0.01, size=(num_features, num_labels)).astype(np.float32)
    b = np.zeros(num_labels, dtype=np.float32)

    positives = Y.sum(axis=0)
    pos_weight = (num_samples - positives) / np.maximum(positives, 1.0)
    pos_weight = np.clip(pos_weight, 1.0, 20.0).astype(np.float32)
    # Neither the labels nor the class weights change, so build the weight matrix once.
    sample_weights = np.where(Y > 0, pos_weight[None, :], 1.0)

    log_every = max(1, epochs // 5)
    history = []
    for epoch in range(1, epochs + 1):
        P = sigmoid(X @ W + b)
        grad_logits = (P - Y) * sample_weights / num_samples
        W -= lr * (X.T @ grad_logits + l2 * W)
        b -= lr * grad_logits.sum(axis=0)
        if epoch in (1, epochs) or epoch % log_every == 0:
            # Metrics describe the parameters from before this epoch's update.
            hard = (P >= 0.5).astype(np.float32)
            history.append({"epoch": epoch, **multilabel_metrics(Y, hard)})
    return W.astype(np.float32), b.astype(np.float32), history
def multilabel_metrics(Y: np.ndarray, P: np.ndarray) -> dict:
    """Compute micro/macro F1, exact match, precision and recall for 0/1 label matrices.

    ``Y`` holds the true labels and ``P`` the hard predictions; both are cast
    to integers first. Any ratio with a zero denominator is reported as 0.0.
    """
    Y = Y.astype(np.int64)
    P = P.astype(np.int64)
    true_pos, true_neg = Y == 1, Y == 0
    pred_pos, pred_neg = P == 1, P == 0

    def safe_ratio(num: np.ndarray, den: np.ndarray) -> np.ndarray:
        # Element-wise num / den, leaving 0.0 wherever den is zero.
        num = num.astype(np.float64)
        den = den.astype(np.float64)
        return np.divide(num, den, out=np.zeros(den.shape, dtype=np.float64), where=den != 0)

    # Per-label confusion counts (one entry per column).
    tp_c = (true_pos & pred_pos).sum(axis=0)
    fp_c = (true_neg & pred_pos).sum(axis=0)
    fn_c = (true_pos & pred_neg).sum(axis=0)

    tp, fp, fn = int(tp_c.sum()), int(fp_c.sum()), int(fn_c.sum())
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        micro_f1 = 2 * precision * recall / (precision + recall)
    else:
        micro_f1 = 0.0

    p_c = safe_ratio(tp_c, tp_c + fp_c)
    r_c = safe_ratio(tp_c, tp_c + fn_c)
    f1_c = safe_ratio(2 * p_c * r_c, p_c + r_c)

    exact = float(np.mean(np.all(Y == P, axis=1)))
    return {
        "micro_f1": float(micro_f1),
        "macro_f1": float(np.mean(f1_c)),
        "exact_match": exact,
        "precision": precision,
        "recall": recall,
    }
def task_object_relevance(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Predict the set of caption objects present in each window (multi-label).

    Object names are gathered from ``ann["caption_frame_info_map"]`` for every
    frame in a window's inclusive ``[start_frame, end_frame]`` range; the
    vocabulary is built in order of first appearance. The caption text block
    is removed from the features, a weighted multi-label logistic model is
    trained on the chronological train split, and metrics, vocabulary,
    per-window predictions and model weights are written under ``out_dir``.

    Raises:
        ValueError: if the caption frame map is missing or no object label
            is found in any window.
    """
    frame_info = ann.get("caption_frame_info_map")
    if frame_info is None:
        # Match the sibling tasks: report a readable error, not a bare KeyError.
        raise ValueError("Caption frame info not available.")
    vocab = OrderedDict()
    labels = []
    for row in rows:
        counts = Counter()
        for frame in range(int(row["start_frame"]), int(row["end_frame"]) + 1):
            counts.update(extract_objects(frame_info.get(frame, {})))
        # Every counted key occurred at least once; insertion order is first appearance.
        objects = list(counts)
        for obj in objects:
            if obj not in vocab:
                vocab[obj] = len(vocab)
        labels.append(objects)
    if not vocab:
        raise ValueError("No object labels found.")
    Y = np.zeros((len(rows), len(vocab)), dtype=np.float32)
    for i, objects in enumerate(labels):
        for obj in objects:
            Y[i, vocab[obj]] = 1.0
    # Exclude the caption block so the label source is not part of the input.
    keep = block_indices(manifest, exclude=["caption_objects_interaction_text"])
    Xo = X[:, keep]
    train, test = chronological_split_indices(len(rows), args.test_fraction)
    mean, std = fit_scaler(Xo[train])
    Xs = (Xo - mean) / std
    W, b, history = train_multilabel_logistic(Xs[train], Y[train], args.epochs, 0.05, args.l2, args.seed)
    prob = sigmoid(Xs[test] @ W + b)
    pred = (prob >= 0.5).astype(np.float32)
    # Ensure at least one object is emitted per row: fall back to the most probable one.
    empty = np.where(pred.sum(axis=1) == 0)[0]
    if len(empty):
        pred[empty, np.argmax(prob[empty], axis=1)] = 1
    metrics = multilabel_metrics(Y[test], pred)
    metrics.update({
        "task": "object_relevance",
        "input": "all non-caption modalities -> current relevant object set",
        "split": "chronological",
        "num_windows": int(len(rows)),
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "num_objects": int(len(vocab)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    names = list(vocab.keys())
    write_json(out_dir / "object_vocab.json", names)
    rows_out = []
    for local_i, global_i in enumerate(test):
        true_objs = [names[j] for j in np.flatnonzero(Y[global_i] > 0)]
        pred_objs = [names[j] for j in np.flatnonzero(pred[local_i] > 0)]
        rows_out.append({
            "window_index": int(global_i),
            "start_frame": rows[int(global_i)]["start_frame"],
            "end_frame": rows[int(global_i)]["end_frame"],
            "true_objects": "|".join(true_objs),
            "predicted_objects": "|".join(pred_objs),
        })
    write_csv(out_dir / "predictions.csv", rows_out, ["window_index", "start_frame", "end_frame", "true_objects", "predicted_objects"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b, object_vocab=np.asarray(names, dtype=object), history=np.asarray(history, dtype=object))
    return metrics
def normalize_rows(A: np.ndarray) -> np.ndarray:
    """Scale every row of ``A`` to unit Euclidean length.

    Row norms are floored at 1e-8, so an all-zero row stays zero instead of
    dividing by zero.
    """
    lengths = np.linalg.norm(A, axis=1, keepdims=True)
    safe_lengths = np.maximum(lengths, 1e-8)
    return A / safe_lengths
def retrieval_metrics(query: np.ndarray, candidates: np.ndarray, positive_indices: np.ndarray, topks=(1, 5, 10)) -> dict:
    """Rank candidates by cosine similarity and summarise where the positives land.

    ``positive_indices[i]`` is the candidate index that is correct for query
    ``i``. Ranks are 1-based. Returns MRR, median and mean rank, the query
    count, and one ``top{k}_accuracy`` entry per value in ``topks``.
    """
    sims = normalize_rows(query) @ normalize_rows(candidates).T
    ranks = np.asarray([
        # Position of the positive in the descending-similarity ordering.
        int(np.where(np.argsort(-sims[i]) == pos)[0][0]) + 1
        for i, pos in enumerate(positive_indices)
    ])
    out = {
        "mrr": float(np.mean(1.0 / ranks)),
        "median_rank": float(np.median(ranks)),
        "mean_rank": float(np.mean(ranks)),
        "num_queries": int(len(ranks)),
    }
    out.update({f"top{k}_accuracy": float(np.mean(ranks <= k)) for k in topks})
    return out
def task_caption_grounding(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Ground caption text in time: retrieve the window that matches a text query.

    A ridge model maps the non-caption feature columns into the caption
    feature columns on the train split. Test caption features are then used
    as queries against the projected test windows. Writes ``metrics.json``
    and ``model.npz`` under ``out_dir``.
    """
    text_idx = block_indices(manifest, include=["caption_objects_interaction_text"])
    sensor_idx = block_indices(manifest, exclude=["caption_objects_interaction_text"])
    train, test = chronological_split_indices(len(X), args.test_fraction)
    X_train, X_test = X[train], X[test]
    pred_text, model = ridge_fit_predict(
        X_train[:, sensor_idx],
        X_train[:, text_idx],
        X_test[:, sensor_idx],
        args.ridge_l2,
    )
    # Query is true text; candidates are sensor windows projected into text space.
    same_window = np.arange(len(test))
    metrics = retrieval_metrics(X_test[:, text_idx], pred_text, same_window)
    metrics.update({
        "task": "caption_grounding",
        "input": "caption objects/interaction text query + candidate sensor windows",
        "output": "matching time window",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "model.npz", **model)
    return metrics
def task_cross_modal_retrieval(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Retrieve the matching visual window from a motion-feature query.

    A ridge model trained on the chronological train split projects the
    motion feature columns into the visual feature columns. The projected
    test windows are the queries and the true test visual features are the
    candidates. Writes ``metrics.json`` and ``model.npz`` under ``out_dir``.
    """
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    train, test = chronological_split_indices(len(X), args.test_fraction)
    X_train, X_test = X[train], X[test]
    pred_visual, model = ridge_fit_predict(
        X_train[:, motion_idx],
        X_train[:, visual_idx],
        X_test[:, motion_idx],
        args.ridge_l2,
    )
    same_window = np.arange(len(test))
    metrics = retrieval_metrics(pred_visual, X_test[:, visual_idx], same_window)
    metrics.update({
        "task": "cross_modal_retrieval",
        "input": "motion/IMU/camera query",
        "output": "matching depth/video window",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "model.npz", **model)
    return metrics
def task_modality_reconstruction(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Reconstruct the visual feature columns from the motion feature columns.

    Fits ridge regression on the chronological train split and scores the
    test predictions with ``regression_metrics``. Writes ``metrics.json`` and
    ``predictions.npz`` (targets, predictions, model parameters) under
    ``out_dir``.
    """
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    train, test = chronological_split_indices(len(X), args.test_fraction)
    X_train, X_test = X[train], X[test]
    pred, model = ridge_fit_predict(
        X_train[:, motion_idx],
        X_train[:, visual_idx],
        X_test[:, motion_idx],
        args.ridge_l2,
    )
    visual_true = X_test[:, visual_idx]
    metrics = regression_metrics(visual_true, pred)
    metrics.update({
        "task": "modality_reconstruction",
        "input": "motion/IMU/camera",
        "output": "depth/video feature vector",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "target_dim": int(len(visual_idx)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "predictions.npz", y_true=visual_true, y_pred=pred, **model)
    return metrics
def binary_classification_from_arrays(out_dir: Path, X: np.ndarray, y: np.ndarray, args: argparse.Namespace, task: str, input_desc: str) -> dict:
    """Train and evaluate a two-class softmax classifier on pre-built samples.

    Samples are split chronologically, features are standardised with
    statistics fitted on the train split, and a class-weighted softmax
    classifier is trained. Writes ``metrics.json``, ``predictions.csv`` and
    ``model.npz`` under ``out_dir`` and returns the metrics dict.
    """
    train, test = chronological_split_indices(len(y), args.test_fraction)
    mean, std = fit_scaler(X[train])
    Xs = (X - mean) / std
    W, b, history = train_softmax_classifier(
        Xs[train],
        y[train].astype(np.int64),
        n_classes=2,
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=True,
        seed=args.seed,
    )
    pred, prob = predict(Xs[test], W, b)

    metrics = binary_metrics(y[test], pred)
    run_info = {
        "task": task,
        "input": input_desc,
        "split": "chronological",
        "num_samples": int(len(y)),
        "num_train_samples": int(len(train)),
        "num_test_samples": int(len(test)),
        "train_final_accuracy": float(history[-1]["train_accuracy"]),
    }
    metrics.update(run_info)

    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    # One CSV row per test sample; column 1 of prob is used as the positive-class probability.
    pred_rows = [
        {
            "sample_index": int(sample),
            "true": int(y[sample]),
            "predicted": int(pred[pos]),
            "prob_positive": float(prob[pos, 1]),
        }
        for pos, sample in enumerate(test)
    ]
    write_csv(out_dir / "predictions.csv", pred_rows, ["sample_index", "true", "predicted", "prob_positive"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b)
    return metrics
def task_temporal_order(out_dir: Path, X: np.ndarray, args: argparse.Namespace) -> dict:
    """Classify whether two adjacent windows are presented in the correct order.

    Each adjacent pair yields two samples: ``[earlier, later, later - earlier]``
    labelled 1 and the reversed ``[later, earlier, earlier - later]`` labelled 0.
    """
    samples, labels = [], []
    for earlier, later in zip(X[:-1], X[1:]):
        samples.append(np.concatenate([earlier, later, later - earlier]))
        labels.append(1)
        samples.append(np.concatenate([later, earlier, earlier - later]))
        labels.append(0)
    features = np.stack(samples).astype(np.float32)
    targets = np.asarray(labels, dtype=np.int64)
    return binary_classification_from_arrays(
        out_dir,
        features,
        targets,
        args,
        "temporal_order",
        "two adjacent windows -> whether order is correct",
    )
def task_misalignment(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Classify whether motion and visual features come from the same window.

    For every window ``i`` with a valid partner, a positive sample pairs its
    motion features with its own visual features, and a negative sample pairs
    them with the visual features of window
    ``i + args.misalignment_shift_windows``.

    Raises:
        ValueError: if the shift is not in ``1 .. len(X) - 1``.
    """
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    shift = args.misalignment_shift_windows
    # A shift of 0 makes negatives identical to positives, a negative shift wraps
    # or overruns the index, and a shift >= len(X) leaves no pairs to stack.
    if shift <= 0 or shift >= len(X):
        raise ValueError(f"misalignment shift must be between 1 and {len(X) - 1} windows, got {shift}.")
    pairs, y = [], []
    for i in range(len(X) - shift):
        motion = X[i, motion_idx]
        pairs.append(np.concatenate([motion, X[i, visual_idx]]))
        y.append(1)
        pairs.append(np.concatenate([motion, X[i + shift, visual_idx]]))
        y.append(0)
    return binary_classification_from_arrays(out_dir, np.stack(pairs).astype(np.float32), np.asarray(y, dtype=np.int64), args, "misalignment_detection", f"motion+visual pair -> aligned vs shifted by {shift} windows")
def main() -> int:
    """Build the shared windows, run every selected task, and write the suite report.

    Shared artifacts (modalities, feature manifest, windows) are written to
    ``args.output_dir``. Each task then runs in its own sub-directory. A
    failing task is recorded in the summary and in ``<task>/error.json``
    without stopping the remaining tasks. Always returns 0.
    """
    args = parse_args()
    add_toolkit_to_path(args.workspace)
    # NOTE(review): imported after add_toolkit_to_path, presumably because the
    # toolkit is only importable once the path is extended — confirm.
    from data_loader import load_from_annotation_hdf5
    args.output_dir.mkdir(parents=True, exist_ok=True)
    tasks = selected_tasks(args.tasks)
    print(f"Loading annotation: {args.annotation}")
    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=True)
    extras, available_modalities = prepare_modalities(args, ann)
    print("Building shared all-modality windows")
    X, rows, manifest = build_windows(args, ann, extras)
    write_json(args.output_dir / "available_modalities.json", available_modalities)
    write_json(args.output_dir / "feature_manifest.json", manifest)
    write_csv(args.output_dir / "windows.csv", rows, ["window_index", "start_frame", "end_frame", "center_frame", "action_label", "action_fraction", "subtask_label", "subtask_fraction"])
    np.savez_compressed(args.output_dir / "shared_windows.npz", X=X, starts=np.asarray([r["start_frame"] for r in rows]), ends=np.asarray([r["end_frame"] for r in rows]))
    summary = {
        "annotation": portable_path(args.annotation, args.workspace),
        "num_frames": int(len(ann["img_names"])),
        "num_windows": int(len(rows)),
        "feature_dim": int(X.shape[1]),
        "window_frames": int(args.window_frames),
        "stride_frames": int(args.stride_frames),
        "tasks": {},
    }
    print(f"Windows: {len(rows)}, feature_dim: {X.shape[1]}")

    # Task name -> runner; each runner receives the task's output directory.
    runners = {
        "timeline_action": lambda out: classification_task(out, X, label_array(rows, "action_label"), rows, args, "timeline_action", "all modalities -> current action label"),
        "timeline_subtask": lambda out: classification_task(out, X, label_array(rows, "subtask_label"), rows, args, "timeline_subtask", "all modalities -> current subtask label"),
        "transition_detection": lambda out: task_transition_detection(out, X, rows, ann, args),
        "next_action": lambda out: task_next_action(out, X, rows, ann, args),
        "hand_trajectory_forecast": lambda out: task_hand_forecast(out, X, rows, ann, args),
        "contact_prediction": lambda out: task_contact_prediction(out, X, rows, ann, manifest, args),
        "object_relevance": lambda out: task_object_relevance(out, X, rows, ann, manifest, args),
        "caption_grounding": lambda out: task_caption_grounding(out, X, manifest, args),
        "cross_modal_retrieval": lambda out: task_cross_modal_retrieval(out, X, manifest, args),
        "modality_reconstruction": lambda out: task_modality_reconstruction(out, X, manifest, args),
        "temporal_order": lambda out: task_temporal_order(out, X, args),
        "misalignment_detection": lambda out: task_misalignment(out, X, manifest, args),
    }
    headline_keys = ("accuracy", "macro_f1", "f1", "mpjpe", "mrr", "r2", "micro_f1")

    for task in tasks:
        print(f"\nRunning task: {task}")
        out = args.output_dir / task
        try:
            runner = runners.get(task)
            if runner is None:
                raise ValueError(task)
            metrics = runner(out)
            summary["tasks"][task] = metrics
            key_metrics = {k: metrics[k] for k in headline_keys if k in metrics}
            print(f"  done: {key_metrics}")
        except Exception as exc:
            # Deliberate best-effort boundary: one failing task must not abort the suite.
            summary["tasks"][task] = {"error": str(exc)}
            # Tasks can raise before creating their directory; make sure it
            # exists so recording the error cannot itself crash the run.
            out.mkdir(parents=True, exist_ok=True)
            write_json(out / "error.json", {"task": task, "error": str(exc)})
            print(f"  error: {exc}")
    write_json(args.output_dir / "summary_report.json", summary)
    print(f"\nSuite artifacts written to: {args.output_dir}")
    return 0
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)

scripts/generate_visualizations.py ADDED Viewed

	@@ -0,0 +1,474 @@

+#!/usr/bin/env python3
+"""
+Generate static SVG visualizations and website data for the Ropedia task suite.
+No plotting dependencies are required; this uses only the Python standard
+library so the repo stays easy to run.
+The polished GitHub Pages homepage in docs/index.html is hand-curated and is
+not overwritten by this script. This script refreshes docs/assets/*.svg,
+docs/assets/charts/*.svg, and docs/data/summary_metrics.json.
+"""
+from __future__ import annotations
+import html
+import json
+import textwrap
+from pathlib import Path
# Repository layout, anchored two levels above this script's resolved location.
ROOT = Path(__file__).resolve().parents[1]
RESULTS = ROOT.joinpath("results")
DOCS = ROOT.joinpath("docs")
ASSETS = DOCS.joinpath("assets")
CHARTS = ASSETS.joinpath("charts")
def read_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def svg_bar_chart(path: Path, title: str, rows: list[tuple[str, float]], x_label: str = "score", max_value: float | None = None) -> None:
    """Write a horizontal bar chart as a standalone SVG file.

    ``rows`` is a list of ``(label, value)`` pairs drawn top to bottom. When
    ``max_value`` is omitted, the axis ends at the largest value or 1.0,
    whichever is greater. Bars are clamped to the plot area. Parent
    directories of ``path`` are created as needed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    width, row_h, top, left, right = 1100, 34, 78, 310, 70
    height = top + row_h * len(rows) + 70
    plot_w = width - left - right
    if max_value is None:
        max_value = max([value for _, value in rows] + [1.0])
    # Guard against a zero-length axis before dividing by it.
    max_value = max(max_value, 1e-9)
    palette = ["#2563eb", "#059669", "#ea580c", "#7b5d12", "#0891b2", "#dc2626"]

    svg = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        f'<text x="32" y="42" font-family="Arial, sans-serif" font-size="26" font-weight="700" fill="#111827">{html.escape(title)}</text>',
        f'<text x="{left}" y="{height - 24}" font-family="Arial, sans-serif" font-size="13" fill="#6b7280">{html.escape(x_label)}</text>',
    ]

    # Six evenly spaced vertical grid lines with their axis values.
    for tick in range(6):
        grid_x = left + plot_w * tick / 5
        tick_value = max_value * tick / 5
        svg.extend([
            f'<line x1="{grid_x:.1f}" y1="{top - 18}" x2="{grid_x:.1f}" y2="{height - 50}" stroke="#e5e7eb" stroke-width="1"/>',
            f'<text x="{grid_x:.1f}" y="{height - 30}" text-anchor="middle" font-family="Arial, sans-serif" font-size="12" fill="#6b7280">{tick_value:.2f}</text>',
        ])

    # One label, bar and numeric annotation per row.
    for index, (label, value) in enumerate(rows):
        row_y = top + index * row_h
        fraction = max(0.0, min(value / max_value, 1.0))
        bar_w = fraction * plot_w
        fill = palette[index % len(palette)]
        svg.extend([
            f'<text x="{left - 14}" y="{row_y + 21}" text-anchor="end" font-family="Arial, sans-serif" font-size="14" fill="#111827">{html.escape(label)}</text>',
            f'<rect x="{left}" y="{row_y + 5}" width="{bar_w:.1f}" height="20" rx="4" fill="{fill}"/>',
            f'<text x="{left + bar_w + 8:.1f}" y="{row_y + 21}" font-family="Arial, sans-serif" font-size="13" fill="#374151">{value:.4f}</text>',
        ])

    svg.append("</svg>")
    path.write_text("\n".join(svg), encoding="utf-8")
def svg_feature_blocks(path: Path, feature_manifest: list[dict]) -> None:
    """Write a bar chart of the dimension of every feature block in the manifest.

    Each manifest entry contributes its ``"name"`` and ``"dim"``. The axis is
    extended 8% beyond the largest block so the longest bar leaves room for
    its value label. An empty manifest produces an empty chart instead of
    raising.
    """
    rows = [(block["name"], float(block["dim"])) for block in feature_manifest]
    # default= keeps max() from raising ValueError when the manifest is empty.
    largest = max((dim for _, dim in rows), default=0.0)
    svg_bar_chart(path, "All-Modality Feature Blocks", rows, x_label="feature dimensions", max_value=largest * 1.08)
def svg_pipeline_diagram(path: Path, summary: dict) -> None:
    """Write the seven-stage pipeline overview diagram as an SVG file.

    Frame, window and feature counts shown in the boxes are read from
    ``summary["suite"]``; all remaining text is fixed. Parent directories of
    ``path`` are created as needed.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    suite = summary["suite"]
    task_count = len(suite["tasks"])
    width, height = 1400, 760

    # (x, y, width, height, title, body lines, accent colour) for each stage box.
    stages = [
        (60, 110, 250, 132, "1. Raw public sample", [
            "annotation.hdf5",
            "6 video files",
            f"{suite['num_frames']:,} aligned frames",
        ], "#1f63e9"),
        (365, 110, 250, 132, "2. HOMIE loader", [
            "mocap, IMU, depth",
            "caption map",
            "SLAM and calibration",
        ], "#008b9a"),
        (670, 110, 250, 132, "3. Window builder", [
            f"{suite['window_frames']}-frame windows",
            f"{suite['stride_frames']}-frame stride",
            f"{suite['num_windows']:,} windows",
        ], "#0a7f55"),
        (975, 110, 300, 132, "4. Feature vector", [
            f"{suite['feature_dim']:,} dimensions",
            "17 named feature blocks",
            "stored manifest",
        ], "#b65b04"),
        (60, 380, 360, 168, "5. Baseline models", [
            "motion-only action/subtask",
            "all-modality action/subtask",
            "numpy softmax classifier",
            "metrics and predictions",
        ], "#1f63e9"),
        (520, 380, 360, 168, "6. Episode task suite", [
            f"{task_count} supervised/self-supervised tasks",
            "chronological split",
            "retrieval, forecast, alignment",
            "per-task artifacts",
        ], "#008b9a"),
        (980, 380, 300, 168, "7. Published artifacts", [
            "results/**/*.json/csv/npz",
            "docs/data/summary_metrics.json",
            "GitHub Pages dashboard",
            "reproducibility audit",
        ], "#0a7f55"),
    ]
    # (x1, y1, x2, y2) for the connectors between stages.
    connectors = [
        (310, 176, 365, 176),
        (615, 176, 670, 176),
        (920, 176, 975, 176),
        (215, 242, 240, 380),
        (1095, 242, 700, 380),
        (420, 464, 520, 464),
        (880, 464, 980, 464),
    ]
    footnotes = [
        "Audit check: rerunning scripts to /private/tmp reproduced committed metrics exactly.",
        "Video/depth check: fresh cache read depth plus fisheye_cam0/1/2/3 and stereo_left/right from raw files.",
        "Scope check: this validates one public sample episode, not cross-episode generalization.",
    ]

    # The arrow-marker definition sits directly after the opening tag.
    svg = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
        '<defs><marker id="arrow" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M 0 0 L 10 5 L 0 10 z" fill="#cbd5e1"/></marker></defs>',
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        '<rect x="0" y="0" width="1400" height="760" fill="#ffffff"/>',
        '<text x="60" y="58" font-family="Arial, sans-serif" font-size="32" font-weight="700" fill="#10141f">Verified Ropedia Episode Pipeline</text>',
        '<text x="60" y="88" font-family="Arial, sans-serif" font-size="16" fill="#5b6475">Generated from committed scripts and metrics; no conceptual placeholder stages.</text>',
    ]
    svg.extend(
        f'<line x1="{x1}" y1="{y1}" x2="{x2}" y2="{y2}" stroke="#cbd5e1" stroke-width="3" marker-end="url(#arrow)"/>'
        for x1, y1, x2, y2 in connectors
    )
    for box_x, box_y, box_w, box_h, heading, body, accent in stages:
        svg.append(f'<rect x="{box_x}" y="{box_y}" width="{box_w}" height="{box_h}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>')
        svg.append(f'<rect x="{box_x}" y="{box_y}" width="8" height="{box_h}" rx="4" fill="{accent}"/>')
        svg.append(f'<text x="{box_x + 24}" y="{box_y + 34}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="#10141f">{html.escape(heading)}</text>')
        svg.extend(
            f'<text x="{box_x + 24}" y="{box_y + 66 + row * 22}" font-family="Arial, sans-serif" font-size="14" fill="#394255">{html.escape(text)}</text>'
            for row, text in enumerate(body)
        )
    svg.append('<rect x="60" y="620" width="1220" height="96" rx="8" fill="#f8fafc" stroke="#dce2ec"/>')
    svg.extend(
        f'<text x="84" y="{650 + row * 24}" font-family="Arial, sans-serif" font-size="15" fill="#273143">{html.escape(text)}</text>'
        for row, text in enumerate(footnotes)
    )
    svg.append("</svg>")
    path.write_text("\n".join(svg), encoding="utf-8")
def feature_dim(feature_manifest: list[dict], include: list[str] | None = None, exclude: list[str] | None = None) -> int:
    """Sum the ``dim`` of the manifest blocks selected by name prefix.

    A block is counted when its name starts with any ``include`` prefix (or
    ``include`` is empty) and with no ``exclude`` prefix.
    """
    wanted = tuple(include or [])
    unwanted = tuple(exclude or [])
    total = 0
    for block in feature_manifest:
        name = block["name"]
        # str.startswith accepts a tuple of prefixes; an empty filter is skipped.
        selected = name.startswith(wanted) if wanted else True
        rejected = name.startswith(unwanted) if unwanted else False
        if selected and not rejected:
            total += int(block["dim"])
    return total
def metric_text(task_name: str, metrics: dict) -> str:
    """Return the headline metric of a task formatted as ``"<label> <value>"``.

    Tasks with a dedicated headline metric use it. Any other task falls back
    to macro-F1, then accuracy, then a pointer to the summary report.
    """
    # task name -> (display label, key in metrics)
    dedicated = {
        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
        "caption_grounding": ("MRR", "mrr"),
        "object_relevance": ("micro-F1", "micro_f1"),
        "modality_reconstruction": ("R2", "r2"),
        "temporal_order": ("F1", "f1"),
        "misalignment_detection": ("F1", "f1"),
    }
    choice = dedicated.get(task_name)
    if choice is None:
        for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
            if key in metrics:
                choice = (label, key)
                break
        else:
            return "metric in summary_report.json"
    label, key = choice
    return f"{label} {metrics[key]:.4f}"
def draw_text_block(parts: list[str], x: int, y: int, lines: list[str], size: int = 13, color: str = "#394255", weight: str = "500", max_chars: int = 42, line_h: int = 18) -> int:
    """Append word-wrapped SVG ``<text>`` elements to ``parts``.

    Each entry in ``lines`` is wrapped to ``max_chars`` characters; an entry
    that wraps to nothing still takes up one blank row. Returns the y
    position immediately below the last emitted row.
    """
    # Flatten all wrapped rows up front, keeping a placeholder for empty entries.
    wrapped_rows = [
        piece
        for line in lines
        for piece in (textwrap.wrap(line, width=max_chars) or [""])
    ]
    baseline = y
    for piece in wrapped_rows:
        parts.append(f'<text x="{x}" y="{baseline}" font-family="Arial, sans-serif" font-size="{size}" font-weight="{weight}" fill="{color}">{html.escape(piece)}</text>')
        baseline += line_h
    return baseline
def task_architecture_rows(summary: dict) -> list[dict]:
    """Describe every task in the suite as one row for the architecture table.

    Each row carries the task name, model family, input description, model
    head, output description and a formatted headline metric. Feature
    dimensions are derived from ``summary["feature_manifest"]`` and metrics
    from ``summary["suite"]["tasks"]``.
    """
    suite = summary["suite"]
    tasks = suite["tasks"]
    manifest = summary["feature_manifest"]

    caption_block = "caption_objects_interaction_text"
    motion_blocks = ["hand_", "body_joints", "body_contacts", "camera_", "imu_"]
    visual_blocks = ["depth_confidence", "video_"]

    all_dim = int(suite["feature_dim"])
    no_contact_text_dim = feature_dim(manifest, exclude=["body_contacts", caption_block])
    no_text_dim = feature_dim(manifest, exclude=[caption_block])
    sensor_dim = no_text_dim
    text_dim = feature_dim(manifest, include=[caption_block])
    motion_dim = feature_dim(manifest, include=motion_blocks)
    visual_dim = feature_dim(manifest, include=visual_blocks)
    pair_dim = all_dim * 3
    align_dim = motion_dim + visual_dim

    # Head descriptions shared by several rows.
    weighted_softmax = "z-score -> linear softmax, class-weighted CE + L2"
    binary_softmax = "z-score -> binary linear softmax, CE + L2"
    dual_ridge = "z-score X/Y -> dual ridge regression, L2=10"
    ridge_rank = "ridge projection, then cosine ranking"

    def entry(task: str, family: str, input_desc: str, head: str, output: str, metric: str | None = None) -> dict:
        # Build one table row; the metric defaults to the task's headline metric.
        return {
            "task": task,
            "family": family,
            "input": input_desc,
            "head": head,
            "output": output,
            "metric": metric_text(task, tasks[task]) if metric is None else metric,
        }

    return [
        entry(
            "timeline_action",
            "softmax",
            f"X_all window, {all_dim:,}d",
            weighted_softmax,
            f"current action class, {tasks['timeline_action']['num_classes']} classes",
        ),
        entry(
            "timeline_subtask",
            "softmax",
            f"X_all window, {all_dim:,}d",
            weighted_softmax,
            f"current subtask class, {tasks['timeline_subtask']['num_classes']} classes",
        ),
        entry(
            "transition_detection",
            "softmax",
            f"X_all window, {all_dim:,}d",
            weighted_softmax,
            "steady vs transition near action boundary",
            f"{metric_text('transition_detection', tasks['transition_detection'])}; boundary-F1 {tasks['transition_detection']['boundary_f1']:.4f}",
        ),
        entry(
            "next_action",
            "softmax",
            f"X_all at time t, {all_dim:,}d",
            weighted_softmax,
            f"action at t+{tasks['next_action'].get('future_frames', 20)} frames",
        ),
        entry(
            "hand_trajectory_forecast",
            "ridge",
            f"X_all at time t, {all_dim:,}d",
            dual_ridge,
            f"future hand joints, {tasks['hand_trajectory_forecast']['target_dim']}d",
        ),
        entry(
            "contact_prediction",
            "softmax",
            f"X without contact/text leakage, {no_contact_text_dim:,}d",
            "z-score -> linear softmax on observed labels",
            "any body contact in window; degenerate one-class sample",
        ),
        entry(
            "object_relevance",
            "multilabel",
            f"X without caption text, {no_text_dim:,}d",
            "z-score -> sigmoid multi-label logistic, weighted",
            f"multi-hot object set, {tasks['object_relevance']['num_objects']} objects",
        ),
        entry(
            "caption_grounding",
            "ridge+rank",
            f"sensor {sensor_dim:,}d -> text space {text_dim:,}d",
            ridge_rank,
            "text query retrieves matching time window",
        ),
        entry(
            "cross_modal_retrieval",
            "ridge+rank",
            f"motion/IMU/camera {motion_dim:,}d -> visual {visual_dim:,}d",
            ridge_rank,
            "retrieve matching depth/video window",
        ),
        entry(
            "modality_reconstruction",
            "ridge",
            f"motion/IMU/camera {motion_dim:,}d",
            dual_ridge,
            f"depth/video feature vector, {visual_dim:,}d",
        ),
        entry(
            "temporal_order",
            "softmax",
            f"concat[x_t, x_t+1, diff], {pair_dim:,}d",
            binary_softmax,
            "correct vs reversed adjacent windows",
        ),
        entry(
            "misalignment_detection",
            "softmax",
            f"concat[motion_t, visual_t/visual_t+8], {align_dim:,}d",
            binary_softmax,
            "aligned vs shifted by 8 windows",
        ),
    ]
+def svg_task_architectures(path: Path, summary: dict) -> None:
+    """Write the "minimal architectures" overview SVG to ``path``.
+    The drawing has four stacked regions: a title, four setup boxes joined by
+    arrows, four head-family description boxes, and one card per row returned
+    by ``task_architecture_rows(summary)`` laid out three cards per line,
+    followed by a two-line interpretation note.
+    Args:
+        path: Destination SVG file; its parent directory is created if missing.
+        summary: Collected summary; ``summary["suite"]`` must provide
+            ``num_frames``, ``num_windows``, ``window_frames``,
+            ``stride_frames`` and ``feature_dim``.
+    Raises:
+        KeyError: If a row's ``family`` has no entry in ``family_colors`` or a
+            required summary key is missing.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    suite = summary["suite"]
+    rows = task_architecture_rows(summary)
+    # Accent colour per head family; every row["family"] must appear here.
+    family_colors = {
+        "softmax": "#1f63e9",
+        "ridge": "#0a7f55",
+        "ridge+rank": "#008b9a",
+        "multilabel": "#b65b04",
+    }
+    # Fixed canvas size; the card grid and the note box below use absolute coordinates inside it.
+    width, height = 1500, 1840
+    parts = [
+        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
+        '<defs><marker id="arrow2" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M 0 0 L 10 5 L 0 10 z" fill="#cbd5e1"/></marker></defs>',
+        '<rect width="100%" height="100%" fill="#ffffff"/>',
+        '<text x="60" y="56" font-family="Arial, sans-serif" font-size="34" font-weight="700" fill="#10141f">Minimal Architectures for the 12 Ropedia Episode Tasks</text>',
+        '<text x="60" y="88" font-family="Arial, sans-serif" font-size="16" fill="#5b6475">Generated from scripts/episode_task_suite.py semantics and committed summary metrics. These are minimal baselines, not deep foundation models.</text>',
+    ]
+    # Setup boxes as (x, y, width, height, title, body lines, accent colour).
+    setup = [
+        (60, 122, 310, 110, "Shared episode windows", [
+            f"{suite['num_frames']:,} frames -> {suite['num_windows']:,} windows",
+            f"{suite['window_frames']}-frame window, {suite['stride_frames']}-frame stride",
+            "chronological 70/30 split",
+        ], "#1f63e9"),
+        (410, 122, 310, 110, "Feature vector", [
+            f"X_all = {suite['feature_dim']:,} dimensions",
+            "17 named modality blocks",
+            "mean/std fit on train only",
+        ], "#008b9a"),
+        (760, 122, 320, 110, "Reusable heads", [
+            "linear softmax classifier",
+            "dual ridge regression/projection",
+            "multi-label logistic + cosine rank",
+        ], "#0a7f55"),
+        (1120, 122, 320, 110, "Artifacts", [
+            "metrics.json, predictions.csv/npz",
+            "model.npz with scaler and weights",
+            "summary_report.json source of numbers",
+        ], "#b65b04"),
+    ]
+    # Connector arrows run from the right edge of each setup box to the left edge of the next one.
+    for i in range(len(setup) - 1):
+        x1 = setup[i][0] + setup[i][2]
+        x2 = setup[i + 1][0]
+        y = setup[i][1] + 55
+        parts.append(f'<line x1="{x1 + 12}" y1="{y}" x2="{x2 - 14}" y2="{y}" stroke="#cbd5e1" stroke-width="3" marker-end="url(#arrow2)"/>')
+    # Each setup box: outline, coloured left stripe, bold title, then wrapped body text.
+    for x, y, w, h, title, lines, color in setup:
+        parts.append(f'<rect x="{x}" y="{y}" width="{w}" height="{h}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>')
+        parts.append(f'<rect x="{x}" y="{y}" width="8" height="{h}" rx="4" fill="{color}"/>')
+        parts.append(f'<text x="{x + 24}" y="{y + 31}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="#10141f">{html.escape(title)}</text>')
+        draw_text_block(parts, x + 24, y + 58, lines, size=13, color="#394255", max_chars=34, line_h=18)
+    # Head-family legend entries as (title, description, colour, x, y).
+    families = [
+        ("Softmax classifier", "logits = z(X)W + b; CE + L2; class weights for classifiers", "#1f63e9", 60, 270),
+        ("Ridge regression/projection", "closed-form dual ridge on z(X), z(Y); used for forecast and reconstruction", "#0a7f55", 780, 270),
+        ("Ridge + cosine ranking", "project one modality into another feature space, then rank candidates by cosine", "#008b9a", 60, 394),
+        ("Multi-label logistic", "sigmoid heads for object vocabulary; threshold 0.5 with top-1 fallback", "#b65b04", 780, 394),
+    ]
+    for title, desc, color, x, y in families:
+        parts.append(f'<rect x="{x}" y="{y}" width="660" height="100" rx="8" fill="#f8fafc" stroke="#dce2ec"/>')
+        parts.append(f'<text x="{x + 18}" y="{y + 33}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="{color}">{html.escape(title)}</text>')
+        draw_text_block(parts, x + 18, y + 60, [desc], size=13, color="#394255", max_chars=76, line_h=18)
+    # Task cards: three columns, rows added as needed, starting below the legend.
+    card_w, card_h = 440, 248
+    gap_x, gap_y = 30, 30
+    start_x, start_y = 60, 540
+    for idx, row in enumerate(rows):
+        col, card_row = idx % 3, idx // 3
+        x = start_x + col * (card_w + gap_x)
+        y = start_y + card_row * (card_h + gap_y)
+        color = family_colors[row["family"]]
+        parts.append(f'<rect x="{x}" y="{y}" width="{card_w}" height="{card_h}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>')
+        parts.append(f'<rect x="{x}" y="{y}" width="8" height="{card_h}" rx="4" fill="{color}"/>')
+        parts.append(f'<rect x="{x + 20}" y="{y + 18}" width="96" height="24" rx="6" fill="#f8fafc" stroke="{color}"/>')
+        parts.append(f'<text x="{x + 68}" y="{y + 35}" text-anchor="middle" font-family="Arial, sans-serif" font-size="11" font-weight="700" fill="{color}">{html.escape(row["family"])}</text>')
+        parts.append(f'<text x="{x + 20}" y="{y + 72}" font-family="Arial, sans-serif" font-size="20" font-weight="700" fill="#10141f">{html.escape(row["task"])}</text>')
+        cursor = y + 104
+        for label in ("input", "head", "output", "metric"):
+            parts.append(f'<text x="{x + 20}" y="{cursor}" font-family="Arial, sans-serif" font-size="12" font-weight="700" fill="{color}">{label.upper()}</text>')
+            # draw_text_block's return value becomes the next y position (presumably the line after the wrapped text; confirm in its definition).
+            cursor = draw_text_block(parts, x + 92, cursor, [row[label]], size=13, color="#394255", max_chars=41, line_h=17)
+            cursor += 8
+    notes = [
+        "Interpretation: this suite tests whether each input/output contract is wired correctly before scaling to many episodes.",
+        "Research-grade claims need held-out episode splits and stronger sequence/vision-language/robot-policy models.",
+    ]
+    # The note box sits at a fixed y, so it does not move if the number of card rows changes.
+    parts.append('<rect x="60" y="1688" width="1380" height="72" rx="8" fill="#f8fafc" stroke="#dce2ec"/>')
+    for i, line in enumerate(notes):
+        parts.append(f'<text x="84" y="{1718 + i * 24}" font-family="Arial, sans-serif" font-size="15" fill="#273143">{html.escape(line)}</text>')
+    parts.append("</svg>")
+    path.write_text("\n".join(parts), encoding="utf-8")
+def collect_summary() -> dict:
+    all_action = read_json(RESULTS / "min_all_modalities_action_model/metrics.json")
+    all_subtask = read_json(RESULTS / "min_all_modalities_subtask_model/metrics.json")
+    min_action = read_json(RESULTS / "min_action_model/metrics.json")
+    min_subtask = read_json(RESULTS / "min_subtask_model/metrics.json")
+    suite = read_json(RESULTS / "episode_task_suite/summary_report.json")
+    manifest = read_json(RESULTS / "episode_task_suite/feature_manifest.json")
+    return {
+        "models": {
+            "motion_action": min_action,
+            "motion_subtask": min_subtask,
+            "all_modalities_action": all_action,
+            "all_modalities_subtask": all_subtask,
+        },
+        "suite": suite,
+        "feature_manifest": manifest,
+    }
+def generate_charts(summary: dict) -> None:
+    CHARTS.mkdir(parents=True, exist_ok=True)
+    svg_pipeline_diagram(ASSETS / "pipeline_diagram.svg", summary)
+    svg_task_architectures(ASSETS / "task_architectures.svg", summary)
+    model_rows = [
+        ("Motion-only action macro-F1", summary["models"]["motion_action"]["macro_f1"]),
+        ("All-modality action macro-F1", summary["models"]["all_modalities_action"]["macro_f1"]),
+        ("Motion-only subtask macro-F1", summary["models"]["motion_subtask"]["macro_f1"]),
+        ("All-modality subtask macro-F1", summary["models"]["all_modalities_subtask"]["macro_f1"]),
+    ]
+    svg_bar_chart(CHARTS / "model_macro_f1.svg", "Minimal Model Macro-F1 Comparison", model_rows, max_value=1.0)
+    suite = summary["suite"]["tasks"]
+    task_rows = []
+    for task_name, metrics in suite.items():
+        score = metrics.get("macro_f1", metrics.get("f1", metrics.get("micro_f1", metrics.get("top5_accuracy", metrics.get("r2", 0.0)))))
+        if score is None:
+            score = 0.0
+        score = max(float(score), 0.0)
+        task_rows.append((task_name, score))
+    svg_bar_chart(CHARTS / "episode_task_scores.svg", "Episode Task Suite: Main Scores", task_rows, max_value=1.0)
+    svg_feature_blocks(CHARTS / "feature_blocks.svg", summary["feature_manifest"])
+    retrieval = suite["cross_modal_retrieval"]
+    retrieval_rows = [
+        ("top1", retrieval["top1_accuracy"]),
+        ("top5", retrieval["top5_accuracy"]),
+        ("top10", retrieval["top10_accuracy"]),
+        ("MRR", retrieval["mrr"]),
+    ]
+    svg_bar_chart(CHARTS / "cross_modal_retrieval.svg", "Cross-Modal Retrieval", retrieval_rows, max_value=1.0)
+def write_summary_data(summary: dict) -> None:
+    DOCS.mkdir(parents=True, exist_ok=True)
+    (DOCS / "data").mkdir(parents=True, exist_ok=True)
+    (DOCS / "data/summary_metrics.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")
+def main() -> int:
+    summary = collect_summary()
+    generate_charts(summary)
+    write_summary_data(summary)
+    print(f"Wrote pipeline diagram: {ASSETS / 'pipeline_diagram.svg'}")
+    print(f"Wrote task architectures diagram: {ASSETS / 'task_architectures.svg'}")
+    print(f"Wrote charts: {CHARTS}")
+    print(f"Wrote data: {DOCS / 'data/summary_metrics.json'}")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

scripts/render_task_suite_infographic.py ADDED Viewed

	@@ -0,0 +1,378 @@

+#!/usr/bin/env python3
+"""
+Render a ChatGPT-image-backed 12-task infographic.
+The background bitmap is AI-generated. The task names, inputs, and metrics are
+read from results/episode_task_suite/summary_report.json so the published image
+does not rely on image-model text generation.
+"""
+from __future__ import annotations
+import argparse
+import html
+import json
+import subprocess
+import tempfile
+from pathlib import Path
+# Repository root: one directory above the folder that contains this script.
+ROOT = Path(__file__).resolve().parents[1]
+# JSON report that supplies every number shown on the infographic.
+SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
+# Default background bitmap and default rendered output.
+DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
+DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"
+# Four columns of three task cards each. "left", "width" and "color" are read by
+# task_html when positioning a card; "name", "color" and "tasks" are read by
+# build_html. Each task entry is (task name in the summary report, badge text).
+# NOTE(review): "top" is not read by the code in this script as shown; card rows
+# take their y position from build_html's own row_tops list.
+GROUPS = [
+    {
+        "name": "Label + State",
+        "color": "#008b9a",
+        "left": 94,
+        "top": 374,
+        "width": 246,
+        "tasks": [
+            ("timeline_action", "supervised"),
+            ("timeline_subtask", "supervised"),
+            ("next_action", "supervised"),
+        ],
+    },
+    {
+        "name": "Prediction + Reconstruction",
+        "color": "#1f63e9",
+        "left": 472,
+        "top": 374,
+        "width": 248,
+        "tasks": [
+            ("hand_trajectory_forecast", "forecast"),
+            ("modality_reconstruction", "forecast"),
+            ("contact_prediction", "supervised"),
+        ],
+    },
+    {
+        "name": "Grounding + Retrieval",
+        "color": "#b65b04",
+        "left": 848,
+        "top": 374,
+        "width": 220,
+        "tasks": [
+            ("caption_grounding", "retrieval"),
+            ("cross_modal_retrieval", "retrieval"),
+            ("object_relevance", "supervised"),
+        ],
+    },
+    {
+        "name": "Temporal Diagnostics",
+        "color": "#b42318",
+        "left": 1202,
+        "top": 374,
+        "width": 244,
+        "tasks": [
+            ("transition_detection", "diagnostic"),
+            ("temporal_order", "diagnostic"),
+            ("misalignment_detection", "diagnostic"),
+        ],
+    },
+]
+def load_summary() -> dict:
+    return json.loads(SUMMARY_PATH.read_text(encoding="utf-8"))
+def fmt(value: float) -> str:
+    return f"{float(value):.4f}"
+def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
+    if task_name == "hand_trajectory_forecast":
+        return "MPJPE", fmt(metrics["mpjpe"])
+    if task_name == "cross_modal_retrieval":
+        return "top-5", fmt(metrics["top5_accuracy"])
+    if task_name == "caption_grounding":
+        return "MRR", fmt(metrics["mrr"])
+    if task_name == "object_relevance":
+        return "micro-F1", fmt(metrics["micro_f1"])
+    if task_name == "modality_reconstruction":
+        return "R2", fmt(metrics["r2"])
+    if task_name in {"temporal_order", "misalignment_detection"}:
+        return "F1", fmt(metrics["f1"])
+    if "macro_f1" in metrics:
+        return "macro-F1", fmt(metrics["macro_f1"])
+    if "accuracy" in metrics:
+        return "accuracy", fmt(metrics["accuracy"])
+    raise KeyError(f"No main metric configured for {task_name}")
+def short_io(task_name: str, metrics: dict) -> str:
+    custom = {
+        "timeline_action": "all modalities -> action label",
+        "timeline_subtask": "all modalities -> subtask label",
+        "transition_detection": "all modalities -> boundary / steady",
+        "next_action": "window at t -> action at t+20",
+        "hand_trajectory_forecast": "all modalities -> future hand joints",
+        "contact_prediction": "non-contact modalities -> contact",
+        "object_relevance": "non-caption modalities -> object set",
+        "caption_grounding": "text query -> matching window",
+        "cross_modal_retrieval": "motion / IMU / camera -> depth / video",
+        "modality_reconstruction": "motion / IMU / camera -> depth / video vec",
+        "temporal_order": "two windows -> correct order?",
+        "misalignment_detection": "motion + visual -> aligned / shifted",
+    }
+    return custom.get(task_name, metrics.get("input", ""))
+def task_html(task_name: str, kind: str, metrics: dict, top: int, group: dict) -> str:
+    label, value = metric_for(task_name, metrics)
+    io = short_io(task_name, metrics)
+    name_size = 17 if len(task_name) > 22 else 18
+    return f"""
+      <section class="task" style="left:{group['left']}px;top:{top}px;width:{group['width']}px;--accent:{group['color']};">
+        <div class="kind">{html.escape(kind)}</div>
+        <div class="task-name" style="font-size:{name_size}px;">{html.escape(task_name)}</div>
+        <div class="io">{html.escape(io)}</div>
+        <div class="metric"><span>{html.escape(label)}</span><strong>{html.escape(value)}</strong></div>
+      </section>
+    """
+def build_html(summary: dict, base_image: Path) -> str:
+    """Compose the full 1536x1024 overlay page as an HTML string.
+    Args:
+        summary: Parsed summary report; must contain ``tasks`` (metrics keyed
+            by task name) plus ``num_frames``, ``num_windows`` and
+            ``feature_dim``.
+        base_image: Background bitmap; referenced from CSS by absolute file URI.
+    Returns:
+        A complete HTML document with group headers, one card per task listed
+        in ``GROUPS``, a stats row and a footer.
+    Raises:
+        KeyError: If a task listed in ``GROUPS`` is missing from ``summary["tasks"]``.
+    """
+    suite = summary["tasks"]
+    task_count = len(suite)
+    group_headers = []
+    cards = []
+    # Pixel y of the three card rows and pixel x of the four column headers.
+    row_tops = [374, 552, 730]
+    header_lefts = [38, 417, 792, 1143]
+    # zip stops at the shorter sequence, so only groups with a header position are drawn.
+    for group, header_left in zip(GROUPS, header_lefts):
+        group_headers.append(
+            f'<div class="group-title" style="left:{header_left}px;top:333px;color:{group["color"]};">{html.escape(group["name"])}</div>'
+        )
+        for row_idx, (task_name, kind) in enumerate(group["tasks"]):
+            cards.append(task_html(task_name, kind, suite[task_name], row_tops[row_idx], group))
+    # Pills under the title; the counts come from the report, the last entry is fixed text.
+    stats = [
+        f"{summary['num_frames']:,} frames",
+        f"{summary['num_windows']:,} windows",
+        f"{summary['feature_dim']:,} features",
+        f"{task_count} tasks",
+        "chronological split",
+    ]
+    stat_html = "".join(f"<span>{html.escape(item)}</span>" for item in stats)
+    base_uri = base_image.resolve().as_uri()
+    # In the template below, doubled braces are literal CSS braces; single braces are f-string fields.
+    return f"""<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=1536, initial-scale=1">
+  <title>Ropedia 12-Task Episode Suite Infographic</title>
+  <style>
+    * {{ box-sizing: border-box; }}
+    html, body {{ margin: 0; width: 1536px; height: 1024px; background: #ffffff; }}
+    body {{
+      font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
+      color: #10141f;
+    }}
+    .canvas {{
+      position: relative;
+      width: 1536px;
+      height: 1024px;
+      overflow: hidden;
+      background-image: url("{base_uri}");
+      background-size: 1536px 1024px;
+      background-repeat: no-repeat;
+    }}
+    .title {{
+      position: absolute;
+      left: 330px;
+      top: 42px;
+      width: 876px;
+      text-align: center;
+    }}
+    h1 {{
+      margin: 0;
+      font-size: 38px;
+      line-height: 1.05;
+      letter-spacing: 0;
+      font-weight: 820;
+    }}
+    .subtitle {{
+      margin-top: 8px;
+      color: #425067;
+      font-size: 15px;
+      line-height: 1.35;
+      font-weight: 520;
+    }}
+    .stats {{
+      margin-top: 12px;
+      display: flex;
+      justify-content: center;
+      gap: 8px;
+    }}
+    .stats span {{
+      display: inline-flex;
+      align-items: center;
+      height: 24px;
+      padding: 0 10px;
+      border: 1px solid #cdd8e8;
+      background: rgba(255, 255, 255, 0.82);
+      border-radius: 999px;
+      color: #253046;
+      font-size: 12px;
+      font-weight: 720;
+    }}
+    .modality {{
+      position: absolute;
+      top: 256px;
+      width: 180px;
+      text-align: center;
+      font-size: 12px;
+      color: #536074;
+      font-weight: 720;
+      text-transform: uppercase;
+      letter-spacing: 0;
+    }}
+    .group-title {{
+      position: absolute;
+      width: 322px;
+      text-align: center;
+      font-size: 18px;
+      line-height: 1;
+      font-weight: 830;
+      letter-spacing: 0;
+    }}
+    .task {{
+      position: absolute;
+      padding: 0;
+    }}
+    .kind {{
+      display: inline-flex;
+      align-items: center;
+      height: 22px;
+      padding: 0 8px;
+      border-radius: 6px;
+      border: 1px solid color-mix(in srgb, var(--accent) 35%, #ffffff);
+      color: var(--accent);
+      background: rgba(255, 255, 255, 0.76);
+      text-transform: uppercase;
+      font-size: 10px;
+      line-height: 1;
+      font-weight: 840;
+      letter-spacing: 0;
+    }}
+    .task-name {{
+      margin-top: 7px;
+      color: #111827;
+      line-height: 1.05;
+      font-weight: 850;
+      letter-spacing: 0;
+      white-space: nowrap;
+    }}
+    .io {{
+      margin-top: 8px;
+      min-height: 36px;
+      color: #475569;
+      font-size: 13.5px;
+      line-height: 1.28;
+      font-weight: 570;
+    }}
+    .metric {{
+      display: inline-flex;
+      align-items: center;
+      gap: 9px;
+      margin-top: 8px;
+      height: 30px;
+      padding: 0 10px;
+      border-radius: 7px;
+      border: 1px solid color-mix(in srgb, var(--accent) 36%, #ffffff);
+      background: rgba(255, 255, 255, 0.90);
+      box-shadow: 0 7px 20px rgba(16, 20, 31, 0.07);
+    }}
+    .metric span {{
+      color: #64748b;
+      font-size: 12px;
+      font-weight: 760;
+    }}
+    .metric strong {{
+      color: var(--accent);
+      font-size: 16px;
+      line-height: 1;
+      font-weight: 860;
+    }}
+    .footer {{
+      position: absolute;
+      left: 360px;
+      top: 932px;
+      width: 816px;
+      text-align: center;
+      color: #536074;
+      font-size: 14px;
+      font-weight: 650;
+    }}
+  </style>
+</head>
+<body>
+  <main class="canvas" aria-label="Ropedia 12-task episode suite infographic">
+    <div class="title">
+      <h1>Ropedia 12-Task Episode Suite</h1>
+      <div class="subtitle">All labels and metrics are overlaid from the verified single-episode results.</div>
+      <div class="stats">{stat_html}</div>
+    </div>
+    <div class="modality" style="left:50px;">fisheye video</div>
+    <div class="modality" style="left:270px;">depth</div>
+    <div class="modality" style="left:530px;">3D / SLAM</div>
+    <div class="modality" style="left:770px;">IMU</div>
+    <div class="modality" style="left:1030px;">hands</div>
+    <div class="modality" style="left:1278px;">text / objects</div>
+    {''.join(group_headers)}
+    {''.join(cards)}
+    <div class="footer">Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</div>
+  </main>
+</body>
+</html>
+"""
+def render_html(html_path: Path, output_path: Path) -> None:
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    subprocess.run(
+        [
+            "npx",
+            "--yes",
+            "playwright",
+            "screenshot",
+            "--full-page",
+            "--viewport-size=1536,1024",
+            html_path.resolve().as_uri(),
+            str(output_path),
+        ],
+        check=True,
+    )
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
+    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
+    parser.add_argument("--html", type=Path)
+    parser.add_argument("--no-export", action="store_true", help="Only write the HTML overlay.")
+    args = parser.parse_args()
+    summary = load_summary()
+    html_text = build_html(summary, args.base_image)
+    if args.html is None:
+        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
+            handle.write(html_text)
+            html_path = Path(handle.name)
+    else:
+        html_path = args.html
+        html_path.parent.mkdir(parents=True, exist_ok=True)
+        html_path.write_text(html_text, encoding="utf-8")
+    if not args.no_export:
+        render_html(html_path, args.output)
+        print(f"Wrote image: {args.output}")
+    print(f"Wrote overlay HTML: {html_path}")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

scripts/train_all_modalities_model.py ADDED Viewed

	@@ -0,0 +1,582 @@

+#!/usr/bin/env python3
+"""
+All-modality lightweight baseline for a Ropedia/Xperience episode.
+This intentionally stays small enough for a MacBook:
+  - no deep video training
+  - no CUDA
+  - no PyTorch dependency
+Each modality is compressed into window-level statistics, then the same
+Numpy softmax classifier from train_min_action_model.py is used.
+"""
+from __future__ import annotations
+import argparse
+import csv
+import hashlib
+import json
+import re
+import sys
+from collections import Counter, OrderedDict
+from pathlib import Path
+import cv2
+import h5py
+import numpy as np
+from train_min_action_model import (
+    add_toolkit_to_path,
+    center_by_body_root,
+    compute_metrics,
+    encode_labels,
+    fit_scaler,
+    frame_label,
+    majority_label,
+    predict,
+    portable_path,
+    safe_window,
+    save_artifacts,
+    stratified_split,
+    temporal_stats,
+    train_softmax_classifier,
+)
+# Ordered mapping of video stream name -> file name: four fisheye cameras, then the stereo pair.
+# NOTE(review): presumably resolved relative to the episode directory and iterated in this
+# order when stacking per-stream features; confirm at the use site.
+VIDEO_FILES = OrderedDict([
+    ("fisheye_cam0", "fisheye_cam0.mp4"),
+    ("fisheye_cam1", "fisheye_cam1.mp4"),
+    ("fisheye_cam2", "fisheye_cam2.mp4"),
+    ("fisheye_cam3", "fisheye_cam3.mp4"),
+    ("stereo_left", "stereo_left.mp4"),
+    ("stereo_right", "stereo_right.mp4"),
+])
+def parse_args() -> argparse.Namespace:
+    workspace_default = Path(__file__).resolve().parents[1]
+    annotation_default = workspace_default / "data/sample/xperience-10m-sample/annotation.hdf5"
+    parser = argparse.ArgumentParser(description="Train a lightweight all-modality Ropedia classifier.")
+    parser.add_argument("--workspace", type=Path, default=workspace_default, help="Ropedia workspace root.")
+    parser.add_argument("--annotation", type=Path, default=annotation_default, help="Path to annotation.hdf5.")
+    parser.add_argument("--output-dir", type=Path, default=None, help="Output artifact directory.")
+    parser.add_argument("--cache-dir", type=Path, default=None, help="Feature cache directory.")
+    parser.add_argument("--target", choices=["action", "subtask"], default="action", help="Prediction target.")
+    parser.add_argument("--window-frames", type=int, default=20, help="Frames per training window.")
+    parser.add_argument("--stride-frames", type=int, default=5, help="Stride between windows.")
+    parser.add_argument("--min-label-fraction", type=float, default=0.6, help="Minimum majority-label fraction.")
+    parser.add_argument("--test-fraction", type=float, default=0.25, help="Stratified test fraction.")
+    parser.add_argument("--epochs", type=int, default=800, help="Training epochs.")
+    parser.add_argument("--learning-rate", type=float, default=0.12, help="Softmax learning rate.")
+    parser.add_argument("--l2", type=float, default=2e-3, help="L2 weight decay.")
+    parser.add_argument("--seed", type=int, default=7, help="Random seed.")
+    parser.add_argument("--no-class-weights", action="store_true", help="Disable inverse-frequency class weighting.")
+    parser.add_argument("--force-rebuild-cache", action="store_true", help="Recompute cached depth/video features.")
+    parser.add_argument("--video-image-size", type=int, default=32, help="Resize video frames before visual features.")
+    parser.add_argument("--video-grid-size", type=int, default=8, help="Small grayscale grid per video frame.")
+    parser.add_argument("--video-hist-bins", type=int, default=8, help="Color histogram bins per channel.")
+    parser.add_argument("--depth-grid-size", type=int, default=8, help="Small depth/confidence grid per frame.")
+    parser.add_argument("--text-hash-dim", type=int, default=128, help="Hashed bag-of-words dimension.")
+    parser.add_argument(
+        "--include-label-text",
+        action="store_true",
+        help="Also include action/subtask/action-description text as input. This leaks target semantics.",
+    )
+    args = parser.parse_args()
+    if args.output_dir is None:
+        name = "min_all_modalities_action_model" if args.target == "action" else "min_all_modalities_subtask_model"
+        args.output_dir = args.workspace / "outputs" / name
+    if args.cache_dir is None:
+        args.cache_dir = args.workspace / "outputs/feature_cache"
+    return args
+def numeric_array(value) -> np.ndarray | None:
+    try:
+        arr = np.asarray(value, dtype=np.float32)
+    except (TypeError, ValueError):
+        return None
+    if arr.size == 0:
+        return None
+    return np.nan_to_num(arr.reshape(-1), nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
+def calibration_features(calib_data: dict | None) -> np.ndarray:
+    if not calib_data:
+        return np.zeros(0, dtype=np.float32)
+    chunks: list[np.ndarray] = []
+    for cam_id in sorted(calib_data):
+        cam = calib_data.get(cam_id, {})
+        if not isinstance(cam, dict):
+            continue
+        for key in sorted(cam):
+            arr = numeric_array(cam.get(key))
+            if arr is not None:
+                chunks.append(arr)
+    if not chunks:
+        return np.zeros(0, dtype=np.float32)
+    return np.concatenate(chunks).astype(np.float32)
+def point_cloud_features(points: np.ndarray | None) -> np.ndarray:
+    if points is None:
+        return np.zeros(0, dtype=np.float32)
+    pts = np.asarray(points, dtype=np.float32)
+    if pts.ndim != 2 or pts.shape[1] != 3 or len(pts) == 0:
+        return np.zeros(0, dtype=np.float32)
+    pts = np.nan_to_num(pts, nan=0.0, posinf=0.0, neginf=0.0)
+    stats = [
+        pts.mean(axis=0),
+        pts.std(axis=0),
+        pts.min(axis=0),
+        pts.max(axis=0),
+        np.percentile(pts, 10, axis=0),
+        np.percentile(pts, 50, axis=0),
+        np.percentile(pts, 90, axis=0),
+        np.asarray([np.log1p(len(pts))], dtype=np.float32),
+    ]
+    return np.concatenate(stats).astype(np.float32)
+def video_frame_features(frame: np.ndarray, image_size: int, grid_size: int, hist_bins: int) -> np.ndarray:
+    small = cv2.resize(frame, (image_size, image_size), interpolation=cv2.INTER_AREA)
+    rgb = cv2.cvtColor(small, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
+    mean = rgb.reshape(-1, 3).mean(axis=0)
+    std = rgb.reshape(-1, 3).std(axis=0)
+    hists = []
+    for channel in range(3):
+        hist, _ = np.histogram(rgb[:, :, channel], bins=hist_bins, range=(0.0, 1.0))
+        hist = hist.astype(np.float32)
+        hist /= max(float(hist.sum()), 1.0)
+        hists.append(hist)
+    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
+    grid = cv2.resize(gray, (grid_size, grid_size), interpolation=cv2.INTER_AREA).reshape(-1)
+    gy, gx = np.gradient(gray)
+    edge = np.asarray([np.abs(gx).mean(), np.abs(gy).mean(), np.abs(gx).std(), np.abs(gy).std()], dtype=np.float32)
+    return np.concatenate([mean, std, *hists, grid, edge]).astype(np.float32)
+def read_video_feature_cache(
+    path: Path,
+    n_frames: int,
+    cache_dir: Path,
+    image_size: int,
+    grid_size: int,
+    hist_bins: int,
+    force: bool,
+) -> np.ndarray:
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    cache_path = cache_dir / f"video_{path.stem}_n{n_frames}_img{image_size}_grid{grid_size}_hist{hist_bins}.npz"
+    if cache_path.exists() and not force:
+        return np.load(cache_path)["features"].astype(np.float32)
+    dummy_dim = 6 + 3 * hist_bins + grid_size * grid_size + 4
+    features = np.zeros((n_frames, dummy_dim), dtype=np.float32)
+    if not path.exists():
+        np.savez_compressed(cache_path, features=features)
+        return features
+    cap = cv2.VideoCapture(str(path))
+    if not cap.isOpened():
+        np.savez_compressed(cache_path, features=features)
+        return features
+    last = np.zeros(dummy_dim, dtype=np.float32)
+    for idx in range(n_frames):
+        ok, frame = cap.read()
+        if ok:
+            last = video_frame_features(frame, image_size, grid_size, hist_bins)
+        features[idx] = last
+        if idx and idx % 1000 == 0:
+            print(f"    {path.name}: {idx}/{n_frames} frames")
+    cap.release()
+    np.savez_compressed(cache_path, features=features)
+    return features
def depth_frame_features(depth: np.ndarray, confidence: np.ndarray | None, depth_min: float, depth_max: float, grid_size: int) -> np.ndarray:
    """Summarise one depth frame and its optional confidence map as a flat vector.

    The result is the concatenation, in order, of:
      * 8 depth statistics over valid pixels (finite and > 0): mean, std, min,
        max, 10th/50th/90th percentile, and the fraction of valid pixels
        (all zeros when no pixel is valid);
      * ``grid_size * grid_size`` cells of the depth map normalised to [0, 1]
        with ``depth_min``/``depth_max`` and area-resized;
      * 4 confidence statistics (mean, std, min, max);
      * ``grid_size * grid_size`` cells of the area-resized confidence map.
    The confidence parts are all zeros when ``confidence`` is None.

    Returns:
        A 1-D float32 array of length ``8 + 4 + 2 * grid_size * grid_size``.
    """
    depth_map = np.asarray(depth, dtype=np.float32)
    usable = np.isfinite(depth_map) & (depth_map > 0)
    if not usable.any():
        depth_summary = np.zeros(8, dtype=np.float32)
    else:
        samples = depth_map[usable]
        depth_summary = np.asarray(
            [
                samples.mean(),
                samples.std(),
                samples.min(),
                samples.max(),
                np.percentile(samples, 10),
                np.percentile(samples, 50),
                np.percentile(samples, 90),
                usable.mean(),
            ],
            dtype=np.float32,
        )
    # Guard against a zero or negative depth range before normalising.
    span = max(depth_max - depth_min, 1e-6)
    normalised = np.clip((np.nan_to_num(depth_map, nan=0.0) - depth_min) / span, 0.0, 1.0)
    depth_cells = cv2.resize(normalised, (grid_size, grid_size), interpolation=cv2.INTER_AREA).reshape(-1).astype(np.float32)
    if confidence is not None:
        conf_map = np.asarray(confidence, dtype=np.float32)
        # Any value above 1.0 is taken to mean the map is on a 0..255 scale.
        divisor = 255.0 if conf_map.max(initial=0) > 1.0 else 1.0
        conf_map = np.clip(conf_map / divisor, 0.0, 1.0)
        conf_summary = np.asarray(
            [conf_map.mean(), conf_map.std(), conf_map.min(initial=0), conf_map.max(initial=0)],
            dtype=np.float32,
        )
        conf_cells = cv2.resize(conf_map, (grid_size, grid_size), interpolation=cv2.INTER_AREA).reshape(-1).astype(np.float32)
    else:
        conf_summary = np.zeros(4, dtype=np.float32)
        conf_cells = np.zeros(grid_size * grid_size, dtype=np.float32)
    return np.concatenate([depth_summary, depth_cells, conf_summary, conf_cells]).astype(np.float32)
def read_depth_feature_cache(annotation: Path, n_frames: int, cache_dir: Path, grid_size: int, force: bool) -> np.ndarray:
    """Return per-frame depth/confidence features, computing and caching them on demand.

    The cache file lives in ``cache_dir`` and is named after ``n_frames`` and
    ``grid_size`` only.
    NOTE(review): the cache name does not identify ``annotation``, so two
    episodes with the same frame count sharing one ``cache_dir`` would reuse
    each other's cache - confirm that callers use one cache dir per episode.

    Args:
        annotation: HDF5 file that may contain ``depth/depth``,
            ``depth/confidence``, ``depth/depth_min`` and ``depth/depth_max``.
        n_frames: Number of rows in the returned table.
        cache_dir: Directory for the ``.npz`` cache (created if missing).
        grid_size: Side length of the downsampled grids in each feature row.
        force: When True, ignore an existing cache file and rebuild it.

    Returns:
        A float32 array of shape ``(n_frames, 12 + 2 * grid_size**2)``. Rows
        beyond the stored depth frames, or all rows when the file has no
        ``depth/depth`` dataset, are zeros.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = cache_dir / f"depth_n{n_frames}_grid{grid_size}.npz"
    if cache_path.exists() and not force:
        # np.load returns an NpzFile that holds the archive open; close it
        # deterministically instead of waiting for garbage collection.
        with np.load(cache_path) as cached:
            return cached["features"].astype(np.float32)
    feature_dim = 8 + grid_size * grid_size + 4 + grid_size * grid_size
    features = np.zeros((n_frames, feature_dim), dtype=np.float32)
    with h5py.File(annotation, "r") as f:
        if "depth/depth" not in f:
            # Cache the all-zero table too, so the file is not reopened next run.
            np.savez_compressed(cache_path, features=features)
            return features
        depth_ds = f["depth/depth"]
        conf_ds = f["depth/confidence"] if "depth/confidence" in f else None
        # Normalisation range falls back to 0.0 .. 4.0 when the file does not store one.
        depth_min = float(np.asarray(f["depth/depth_min"][()]).flat[0]) if "depth/depth_min" in f else 0.0
        depth_max = float(np.asarray(f["depth/depth_max"][()]).flat[0]) if "depth/depth_max" in f else 4.0
        limit = min(n_frames, depth_ds.shape[0])
        for idx in range(limit):
            confidence = conf_ds[idx] if conf_ds is not None else None
            features[idx] = depth_frame_features(depth_ds[idx], confidence, depth_min, depth_max, grid_size)
            if idx and idx % 1000 == 0:
                print(f"    depth: {idx}/{limit} frames")
    np.savez_compressed(cache_path, features=features)
    return features
# Tokens are maximal runs of ASCII letters, digits and underscores.
TOKEN_RE = re.compile(r"[a-zA-Z0-9_]+")


def hashed_text(text: str, dim: int) -> np.ndarray:
    """Embed ``text`` as a signed, L2-normalised hashed bag of tokens.

    Each lower-cased token is hashed with BLAKE2b; the first four digest bytes
    choose the bucket and the lowest bit of the fifth byte chooses the sign.

    Returns:
        A float32 vector of length ``dim``; all zeros when ``text`` has no tokens.
    """
    buckets = np.zeros(dim, dtype=np.float32)
    for word in TOKEN_RE.findall(text.lower()):
        raw = hashlib.blake2b(word.encode("utf-8"), digest_size=8).digest()
        slot = int.from_bytes(raw[:4], "little") % dim
        buckets[slot] += 1.0 if raw[4] & 1 else -1.0
    length = np.linalg.norm(buckets)
    if length > 0:
        buckets /= length
    return buckets
def text_for_frame(info: dict, include_label_text: bool) -> str:
    """Join the caption fields of one frame into a single space-separated string.

    Always uses ``objects`` (each list element, or the value itself when it is
    a truthy non-list) followed by ``interaction``. When ``include_label_text``
    is true, the truthy values of ``theme``, ``action_label`` and
    ``action_desc`` are appended in that order.
    """
    objects = info.get("objects")
    if isinstance(objects, list):
        pieces = [str(item) for item in objects]
    elif objects:
        pieces = [str(objects)]
    else:
        pieces = []
    interaction = info.get("interaction")
    if interaction:
        pieces.append(str(interaction))
    if include_label_text:
        label_keys = ("theme", "action_label", "action_desc")
        pieces.extend(str(info[key]) for key in label_keys if info.get(key))
    return " ".join(pieces)
def build_text_features(frame_info_map: dict, n_frames: int, dim: int, include_label_text: bool) -> np.ndarray:
    """Build a ``(n_frames, dim)`` float32 table of hashed caption text, one row per frame.

    Frames missing from ``frame_info_map`` are treated as having an empty
    info dict.
    """
    table = np.zeros((n_frames, dim), dtype=np.float32)
    for frame_idx in range(n_frames):
        frame_text = text_for_frame(frame_info_map.get(frame_idx, {}), include_label_text)
        table[frame_idx, :] = hashed_text(frame_text, dim)
    return table
def prepare_modalities(args: argparse.Namespace, ann: dict) -> tuple[dict, list[dict]]:
    """Build the per-frame and static feature tables for every extra modality.

    Reads ``args.annotation``, ``args.cache_dir``, ``args.force_rebuild_cache``,
    ``args.depth_grid_size``, ``args.video_image_size``, ``args.video_grid_size``,
    ``args.video_hist_bins``, ``args.workspace``, ``args.text_hash_dim`` and
    ``args.include_label_text``.

    Args:
        args: Parsed command-line options (fields listed above).
        ann: Loaded annotation dict; ``img_names`` fixes the frame count.

    Returns:
        ``(extras, available)``. ``extras`` has the keys ``"video"``
        (name -> per-frame features), ``"depth"``, ``"text"`` and ``"static"``
        (name -> vector). ``available`` is a list of JSON-serialisable
        descriptions, one per modality that was prepared.
    """
    data_root = args.annotation.parent
    n_frames = len(ann["img_names"])
    extras: dict = {
        "video": OrderedDict(),
        "depth": None,
        "text": None,
        "static": OrderedDict(),
    }
    available = []
    print("Preparing all-modality feature caches")
    print("  depth/confidence")
    depth = read_depth_feature_cache(args.annotation, n_frames, args.cache_dir, args.depth_grid_size, args.force_rebuild_cache)
    extras["depth"] = depth
    available.append({"modality": "depth_confidence", "shape": list(depth.shape)})
    print("  videos")
    for name, filename in VIDEO_FILES.items():
        path = data_root / filename
        feats = read_video_feature_cache(
            path,
            n_frames,
            args.cache_dir,
            args.video_image_size,
            args.video_grid_size,
            args.video_hist_bins,
            args.force_rebuild_cache,
        )
        extras["video"][name] = feats
        available.append({
            "modality": f"video/{name}",
            "path": portable_path(path, args.workspace),
            "shape": list(feats.shape),
            "exists": path.exists(),
        })
    print("  caption objects/interaction text")
    # A missing or None caption map would otherwise fail here with a
    # KeyError/AttributeError; fall back to an empty map (all-zero text rows)
    # so the failure surfaces later as an explicit dataset-building error.
    frame_info_map = ann.get("caption_frame_info_map") or {}
    text = build_text_features(
        frame_info_map,
        n_frames,
        args.text_hash_dim,
        args.include_label_text,
    )
    extras["text"] = text
    available.append({
        "modality": "caption_text",
        "shape": list(text.shape),
        "fields": "objects,interaction" + (",theme,action_label,action_desc" if args.include_label_text else ""),
    })
    # Static (per-episode) vectors are only registered when non-empty.
    pc = point_cloud_features(ann.get("slam_point_cloud"))
    if len(pc):
        extras["static"]["slam_point_cloud"] = pc
        available.append({"modality": "slam_point_cloud_static", "shape": [int(len(pc))]})
    calib = calibration_features(ann.get("calib_data"))
    if len(calib):
        extras["static"]["calibration"] = calib
        available.append({"modality": "calibration_static", "shape": [int(len(calib))]})
    return extras, available
def extract_all_window_features(ann: dict, extras: dict, start: int, end: int, return_blocks: bool = False):
    """Build the concatenated feature vector for frames ``[start, end)``.

    Blocks are appended in a fixed order: hand joints (left, right), body
    joints, contacts, camera translation, camera rotation, IMU, depth, each
    video stream, caption text, then the static vectors. A modality that is
    absent (None) or yields an empty vector is skipped.

    Args:
        ann: Loaded annotation dict.
        extras: Output of ``prepare_modalities`` (keys ``depth``, ``video``,
            ``text``, ``static``).
        start: First frame of the window (inclusive).
        end: End frame of the window (exclusive).
        return_blocks: When True, also return ``[(block_name, length), ...]``.

    Returns:
        The float32 feature vector, or ``(vector, blocks)`` when
        ``return_blocks`` is True.

    Raises:
        ValueError: if no modality produced any features.
    """
    named_blocks: list[tuple[str, np.ndarray]] = []

    def push(label: str, values: np.ndarray | None) -> None:
        """Record a flattened copy of ``values`` with non-finite entries zeroed."""
        if values is None:
            return
        flat = np.asarray(values, dtype=np.float32).reshape(-1)
        if flat.size == 0:
            return
        named_blocks.append((label, np.nan_to_num(flat, nan=0.0, posinf=0.0, neginf=0.0)))

    def window(key: str):
        """Slice ``ann[key]`` to the current window (None when unavailable)."""
        return safe_window(ann.get(key), start, end)

    body = window("smplh_body_joints")
    left = window("hand_left_joints")
    right = window("hand_right_joints")
    contacts = window("contacts")
    cam_t = window("t_c2w_all")
    cam_R = window("R_c2w_all")

    for label, hand in (("hand_left_joints", left), ("hand_right_joints", right)):
        if hand is not None:
            push(label, temporal_stats(center_by_body_root(hand, body)))
    if body is not None:
        # Express body joints relative to joint 0 of each frame when the array is 3-D.
        root = body[:, :1, :] if body.ndim == 3 else 0.0
        push("body_joints", temporal_stats(body - root))
    if contacts is not None:
        push("body_contacts", temporal_stats(contacts))
    if cam_t is not None:
        # Translation is measured relative to the first frame of the window.
        push("camera_translation", temporal_stats(cam_t - cam_t[:1]))
    if cam_R is not None:
        push("camera_rotation_matrix", temporal_stats(cam_R))

    imu_accel = ann.get("imu_accel_xyz")
    imu_gyro = ann.get("imu_gyro_xyz")
    imu_keyframes = ann.get("imu_keyframe_indices")
    imu_missing = imu_accel is None or imu_gyro is None or imu_keyframes is None
    if not imu_missing and len(imu_keyframes) > end - 1:
        # Map the frame window to a sample range via the keyframe indices,
        # always keeping at least one sample.
        imu_start = int(max(0, imu_keyframes[start]))
        imu_end = int(min(len(imu_accel), max(imu_start + 1, imu_keyframes[end - 1] + 1)))
        imu = np.concatenate([imu_accel[imu_start:imu_end], imu_gyro[imu_start:imu_end]], axis=1)
        push("imu_accel_gyro", temporal_stats(imu))

    depth_table = extras.get("depth")
    if depth_table is not None:
        push("depth_confidence", temporal_stats(depth_table[start:end]))
    for name, feats in extras.get("video", {}).items():
        push(f"video_{name}", temporal_stats(feats[start:end]))
    text_table = extras.get("text")
    if text_table is not None:
        push("caption_objects_interaction_text", temporal_stats(text_table[start:end]))
    for name, vec in extras.get("static", {}).items():
        push(name, vec)

    if not named_blocks:
        raise ValueError("No usable modalities found.")
    full = np.concatenate([vec for _, vec in named_blocks]).astype(np.float32)
    if not return_blocks:
        return full
    return full, [(name, int(len(vec))) for name, vec in named_blocks]
def build_feature_dataset(ann: dict, extras: dict, target: str, window_frames: int, stride_frames: int, min_label_fraction: float):
    """Slide a window over the episode and build the labelled all-modality dataset.

    A window is kept only when ``majority_label`` returns a non-empty label
    for its per-frame labels. The feature manifest (name/start/end/dim per
    block) is derived from the first kept window.

    Returns:
        ``(X, labels, start_frames, end_frames, label_fractions, manifest)``
        where ``end_frames`` holds the last frame index of each window
        (inclusive).

    Raises:
        ValueError: if the annotation has no ``caption_frame_info_map`` or no
            window received a label.
    """
    frame_info = ann.get("caption_frame_info_map")
    if frame_info is None:
        raise ValueError("No caption_frame_info_map found in annotation.")
    n_frames = len(ann["img_names"])
    records = []
    manifest = None
    for first in range(0, n_frames - window_frames + 1, stride_frames):
        stop = first + window_frames
        votes = [frame_label(frame_info.get(frame, {}), target) for frame in range(first, stop)]
        label, frac = majority_label(votes, min_label_fraction)
        if not label:
            continue
        if manifest is not None:
            vec = extract_all_window_features(ann, extras, first, stop)
        else:
            # First kept window: also record where each block sits in the vector.
            vec, blocks = extract_all_window_features(ann, extras, first, stop, return_blocks=True)
            manifest = []
            cursor = 0
            for name, length in blocks:
                manifest.append({"name": name, "start": cursor, "end": cursor + length, "dim": length})
                cursor += length
        records.append((vec, label, first, stop - 1, frac))
    if not records:
        raise ValueError("No labeled windows were created. Try lowering --min-label-fraction.")
    vectors, labels, starts, ends, fracs = zip(*records)
    return (
        np.stack(vectors).astype(np.float32),
        np.asarray(labels, dtype=object),
        np.asarray(starts, dtype=np.int64),
        np.asarray(ends, dtype=np.int64),
        np.asarray(fracs, dtype=np.float32),
        manifest or [],
    )
def write_extra_reports(output_dir: Path, feature_manifest: list[dict], available_modalities: list[dict], args: argparse.Namespace) -> None:
    """Write the feature manifest, modality list and a short README to ``output_dir``.

    Files written: ``feature_manifest.json``, ``available_modalities.json``,
    ``feature_manifest.csv`` and ``README_model.txt``.

    Args:
        output_dir: Destination directory; created if it does not exist.
        feature_manifest: Rows with the keys ``name``, ``start``, ``end``, ``dim``.
        available_modalities: JSON-serialisable modality descriptions.
        args: Parsed options; only ``include_label_text`` is read.
    """
    # Do not rely on another function having created the directory first.
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "feature_manifest.json").write_text(json.dumps(feature_manifest, indent=2), encoding="utf-8")
    (output_dir / "available_modalities.json").write_text(json.dumps(available_modalities, indent=2), encoding="utf-8")
    with (output_dir / "feature_manifest.csv").open("w", newline="", encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, fieldnames=["name", "start", "end", "dim"])
        writer.writeheader()
        writer.writerows(feature_manifest)
    notes = [
        "This is an all-modality lightweight baseline.",
        "RGB/stereo/fisheye/depth/point-cloud/calibration/text are compressed into handcrafted features.",
        "It is not a deep multimodal model.",
        "Do not treat random windows from one episode as a final generalization benchmark.",
    ]
    # Record whether label text was fed to the model, since that leaks the target.
    if args.include_label_text:
        notes.append("WARNING: --include-label-text was used, so language input leaks target semantics.")
    else:
        notes.append("Label text was not included as input; only objects and interaction text were used.")
    (output_dir / "README_model.txt").write_text("\n".join(notes) + "\n", encoding="utf-8")
def main() -> int:
    """Run the all-modality baseline: load, featurise, train, evaluate, save.

    Returns:
        ``0`` once all artifacts have been written.

    Raises:
        FileNotFoundError: if ``args.annotation`` does not exist.
        ValueError: if the split produces no test windows.
    """
    args = parse_args()
    add_toolkit_to_path(args.workspace)
    # Imported only after the toolkit path has been registered above
    # (presumably data_loader lives in that toolkit directory).
    from data_loader import load_from_annotation_hdf5

    if not args.annotation.exists():
        raise FileNotFoundError(f"annotation.hdf5 not found: {args.annotation}")
    print(f"Loading annotation: {args.annotation}")
    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=True)
    extras, available_modalities = prepare_modalities(args, ann)

    print("Building all-modality windowed feature dataset")
    dataset = build_feature_dataset(
        ann,
        extras,
        target=args.target,
        window_frames=args.window_frames,
        stride_frames=args.stride_frames,
        min_label_fraction=args.min_label_fraction,
    )
    X, y_labels, starts, ends, label_fracs, feature_manifest = dataset
    y, class_names = encode_labels(y_labels)
    train_idx, test_idx = stratified_split(y, args.test_fraction, args.seed)
    if not len(test_idx):
        raise ValueError("No test windows available. Lower --test-fraction or use more data.")

    # The scaler is fitted on training windows only, then applied to both splits.
    mean, std = fit_scaler(X[train_idx])
    X_train = (X[train_idx] - mean) / std
    X_test = (X[test_idx] - mean) / std

    print(f"Windows: {len(y)} total, {len(train_idx)} train, {len(test_idx)} test")
    print(f"Features: {X.shape[1]}, classes: {len(class_names)}")
    print("Feature blocks:")
    for block in feature_manifest:
        print(f"  {block['dim']:5d}  {block['name']}")
    for name, count in Counter(y_labels).most_common():
        print(f"  {count:4d} windows  {name}")

    print("Training softmax classifier")
    W, b, history = train_softmax_classifier(
        X_train,
        y[train_idx],
        n_classes=len(class_names),
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )
    y_pred, probs = predict(X_test, W, b)
    metrics, per_class_rows, cm = compute_metrics(y[test_idx], y_pred, class_names)

    # Reference point: always predicting the most frequent training class.
    majority_class = Counter(y[train_idx]).most_common(1)[0][0]
    last_report = history[-1] if history else None
    metrics.update({
        "majority_baseline_accuracy": float(np.mean(y[test_idx] == majority_class)),
        "train_final_accuracy": last_report["train_accuracy"] if last_report else float("nan"),
        "train_final_loss": last_report["loss"] if last_report else float("nan"),
        "feature_dim": int(X.shape[1]),
        "num_windows": int(len(y)),
    })

    save_artifacts(
        args.output_dir, X, y, y_labels, starts, ends, label_fracs,
        train_idx, test_idx, class_names, mean, std, W, b, history,
        metrics, per_class_rows, cm, y_pred, probs, args,
    )
    write_extra_reports(args.output_dir, feature_manifest, available_modalities, args)

    print("\nEvaluation")
    print(f"  accuracy:          {metrics['accuracy']:.4f}")
    print(f"  balanced_accuracy: {metrics['balanced_accuracy']:.4f}")
    print(f"  macro_f1:          {metrics['macro_f1']:.4f}")
    print(f"  weighted_f1:       {metrics['weighted_f1']:.4f}")
    print(f"  majority_baseline: {metrics['majority_baseline_accuracy']:.4f}")
    print(f"\nArtifacts written to: {args.output_dir}")
    return 0
# Script entry point: main()'s integer return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

scripts/train_min_action_model.py ADDED Viewed

	@@ -0,0 +1,531 @@

+#!/usr/bin/env python3
+"""
+Minimal end-to-end action-recognition pipeline for a Ropedia/Xperience episode.
+Input:
+  annotation.hdf5
+Features:
+  hand joints, body joints, contacts, camera trajectory, IMU summary statistics.
+Target:
+  caption action_label by default. Use --target subtask for Sub Task labels.
+Model:
+  Numpy-only multinomial logistic regression.
+Outputs:
+  metrics.json, per_class_metrics.csv, confusion_matrix.csv, predictions.csv,
+  feature_dataset.npz, model.npz.
+"""
+from __future__ import annotations
+import argparse
+import csv
+import json
+import math
+import sys
+from collections import Counter, OrderedDict
+from pathlib import Path
+import numpy as np
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the minimal action-model trainer.

    Path defaults are derived from the directory two levels above this file
    (``parents[1]`` of its resolved path).
    """
    root = Path(__file__).resolve().parents[1]
    cli = argparse.ArgumentParser(description="Train a minimal action classifier on Ropedia annotation.hdf5.")

    path_options = (
        ("--workspace", root, "Ropedia workspace root."),
        ("--annotation", root / "data/sample/xperience-10m-sample/annotation.hdf5", "Path to annotation.hdf5."),
        ("--output-dir", root / "outputs/min_action_model", "Output artifact directory."),
    )
    for flag, default, help_text in path_options:
        cli.add_argument(flag, type=Path, default=default, help=help_text)

    cli.add_argument("--target", choices=["action", "subtask"], default="action", help="Prediction target.")

    numeric_options = (
        ("--window-frames", int, 20, "Frames per training window."),
        ("--stride-frames", int, 5, "Stride between windows."),
        ("--min-label-fraction", float, 0.6, "Minimum majority-label fraction in a window."),
        ("--test-fraction", float, 0.25, "Stratified test fraction."),
        ("--epochs", int, 800, "Training epochs."),
        ("--learning-rate", float, 0.2, "Softmax learning rate."),
        ("--l2", float, 1e-3, "L2 weight decay."),
        ("--seed", int, 7, "Random seed."),
    )
    for flag, kind, default, help_text in numeric_options:
        cli.add_argument(flag, type=kind, default=default, help=help_text)

    cli.add_argument("--no-class-weights", action="store_true", help="Disable inverse-frequency class weighting.")
    return cli.parse_args()
def add_toolkit_to_path(workspace: Path) -> None:
    """Put ``<workspace>/HOMIE-toolkit`` at the front of ``sys.path``.

    Raises:
        FileNotFoundError: if that directory does not exist.
    """
    toolkit = workspace / "HOMIE-toolkit"
    if toolkit.exists():
        sys.path.insert(0, str(toolkit))
        return
    raise FileNotFoundError(f"HOMIE-toolkit not found: {toolkit}")
+def portable_path(path: Path, workspace: Path | None = None) -> str:
+    roots = [workspace, Path.cwd()]
+    for root in roots:
+        if root is None:
+            continue
+        try:
+            return path.resolve().relative_to(Path(root).resolve()).as_posix()
+        except (FileNotFoundError, ValueError):
+            continue
+    return path.name
def temporal_stats(arr: np.ndarray) -> np.ndarray:
    """Collapse the time axis of ``arr`` (shaped ``(T, ...)``) into fixed statistics.

    Trailing axes are flattened to ``C`` channels and non-finite values are
    replaced by 0. The result concatenates, per channel: mean, std, min, max,
    last-minus-first, and the mean and std of frame-to-frame differences
    (zeros when ``T == 1``).

    Returns:
        A float32 vector of length ``7 * C``.

    Raises:
        ValueError: if the time axis is empty.
    """
    data = np.asarray(arr, dtype=np.float32)
    if data.ndim == 0:
        data = data.reshape(1, 1)
    elif data.ndim == 1:
        # A 1-D input is one channel sampled over time.
        data = data[:, None]
    series = np.nan_to_num(data.reshape(data.shape[0], -1), nan=0.0, posinf=0.0, neginf=0.0)
    n_steps, n_channels = series.shape
    if n_steps == 0:
        raise ValueError("temporal_stats received an empty time axis")
    if n_steps > 1:
        steps = np.diff(series, axis=0)
        step_mean = steps.mean(axis=0)
        step_std = steps.std(axis=0)
    else:
        step_mean = np.zeros(n_channels, dtype=np.float32)
        step_std = np.zeros(n_channels, dtype=np.float32)
    summary = [
        series.mean(axis=0),
        series.std(axis=0),
        series.min(axis=0),
        series.max(axis=0),
        series[-1] - series[0],
        step_mean,
        step_std,
    ]
    return np.concatenate(summary).astype(np.float32)
+def safe_window(arr: np.ndarray | None, start: int, end: int) -> np.ndarray | None:
+    if arr is None:
+        return None
+    if start >= len(arr):
+        return None
+    return np.asarray(arr[start:min(end, len(arr))])
+def center_by_body_root(values: np.ndarray, body: np.ndarray | None) -> np.ndarray:
+    if body is None or len(body) != len(values) or body.ndim < 3 or body.shape[-1] != 3:
+        return values
+    root = body[:, :1, :]
+    return values - root
def extract_window_features(ann: dict, start: int, end: int) -> np.ndarray:
    """Build the numeric feature vector for frames ``[start, end)``.

    Temporal statistics are concatenated in this order for every modality
    present in ``ann``: left hand, right hand, body joints, contacts, camera
    translation and IMU (accelerometer + gyroscope).

    Raises:
        ValueError: if none of the modalities is available.
    """
    def window(key: str):
        """Slice ``ann[key]`` to the current window (None when unavailable)."""
        return safe_window(ann.get(key), start, end)

    body = window("smplh_body_joints")
    left = window("hand_left_joints")
    right = window("hand_right_joints")
    contacts = window("contacts")
    cam_t = window("t_c2w_all")

    parts: list[np.ndarray] = []
    for hand in (left, right):
        if hand is not None:
            parts.append(temporal_stats(center_by_body_root(hand, body)))
    if body is not None:
        # Express body joints relative to joint 0 of each frame when the array is 3-D.
        root = body[:, :1, :] if body.ndim == 3 else 0.0
        parts.append(temporal_stats(body - root))
    if contacts is not None:
        parts.append(temporal_stats(contacts))
    if cam_t is not None:
        # Translation is measured relative to the first frame of the window.
        parts.append(temporal_stats(cam_t - cam_t[:1]))

    imu_accel = ann.get("imu_accel_xyz")
    imu_gyro = ann.get("imu_gyro_xyz")
    imu_keyframes = ann.get("imu_keyframe_indices")
    imu_missing = imu_accel is None or imu_gyro is None or imu_keyframes is None
    if not imu_missing and len(imu_keyframes) > end - 1:
        # Map the frame window to a sample range via the keyframe indices,
        # always keeping at least one sample.
        imu_start = int(max(0, imu_keyframes[start]))
        imu_end = int(min(len(imu_accel), max(imu_start + 1, imu_keyframes[end - 1] + 1)))
        imu = np.concatenate([imu_accel[imu_start:imu_end], imu_gyro[imu_start:imu_end]], axis=1)
        parts.append(temporal_stats(imu))

    if not parts:
        raise ValueError("No usable numeric modalities found in annotation.")
    return np.concatenate(parts).astype(np.float32)
def frame_label(info: dict, target: str) -> str:
    """Return the cleaned label of one frame, or ``""`` when it is unlabeled.

    Args:
        info: Caption info of the frame.
        target: ``"subtask"`` reads the ``theme`` field; any other value reads
            ``action_label``.

    Returns:
        The stripped label text. Missing, None, blank and ``N/A``
        (case-insensitive) values all yield ``""``.
    """
    key = "theme" if target == "subtask" else "action_label"
    label = info.get(key, "")
    if label is None:
        # str(None) would otherwise produce the bogus class name "None".
        return ""
    label = str(label).strip()
    if not label or label.upper() == "N/A":
        return ""
    return label
def majority_label(labels: list[str], min_fraction: float) -> tuple[str, float]:
    """Pick the most common non-empty label and report its share.

    The share is computed over the non-empty labels only, not over the whole
    input list.

    Returns:
        ``(label, fraction)``. ``label`` is ``""`` when there are no non-empty
        labels (fraction ``0.0``) or when the winner's fraction is below
        ``min_fraction``.
    """
    present = [item for item in labels if item]
    if not present:
        return "", 0.0
    winner, votes = Counter(present).most_common(1)[0]
    share = votes / len(present)
    return ("" if share < min_fraction else winner), share
def build_feature_dataset(ann: dict, target: str, window_frames: int, stride_frames: int, min_label_fraction: float):
    """Slide a window over the episode and build the labelled feature dataset.

    A window is kept only when ``majority_label`` returns a non-empty label
    for its per-frame labels.

    Returns:
        ``(X, labels, start_frames, end_frames, label_fractions)`` where
        ``end_frames`` holds the last frame index of each window (inclusive).

    Raises:
        ValueError: if the annotation has no ``caption_frame_info_map`` or no
            window received a label.
    """
    frame_info = ann.get("caption_frame_info_map")
    if frame_info is None:
        raise ValueError("No caption_frame_info_map found in annotation.")
    n_frames = len(ann["img_names"])
    records = []
    for first in range(0, n_frames - window_frames + 1, stride_frames):
        stop = first + window_frames
        votes = [frame_label(frame_info.get(frame, {}), target) for frame in range(first, stop)]
        label, frac = majority_label(votes, min_label_fraction)
        if label:
            records.append((extract_window_features(ann, first, stop), label, first, stop - 1, frac))
    if not records:
        raise ValueError("No labeled windows were created. Try lowering --min-label-fraction.")
    vectors, labels, starts, ends, fracs = zip(*records)
    return (
        np.stack(vectors).astype(np.float32),
        np.asarray(labels, dtype=object),
        np.asarray(starts, dtype=np.int64),
        np.asarray(ends, dtype=np.int64),
        np.asarray(fracs, dtype=np.float32),
    )
def encode_labels(y_labels: np.ndarray) -> tuple[np.ndarray, list[str]]:
    """Map labels to integer ids numbered in order of first appearance.

    Returns:
        ``(ids, class_names)`` where ``ids`` is an int64 array aligned with
        ``y_labels`` and ``class_names[i]`` is the label with id ``i``.
    """
    index_of: dict = {}
    for label in y_labels:
        # setdefault only assigns a new id the first time a label is seen.
        index_of.setdefault(label, len(index_of))
    ids = np.asarray([index_of[label] for label in y_labels], dtype=np.int64)
    return ids, list(index_of)
def stratified_split(y: np.ndarray, test_fraction: float, seed: int) -> tuple[np.ndarray, np.ndarray]:
    """Split sample indices into train and test sets class by class.

    Every class with at least two samples contributes between 1 and
    ``count - 1`` test samples (``round(count * test_fraction)`` clamped to
    that range). Classes with a single sample go entirely to train. Both
    index lists are shuffled with a generator seeded by ``seed``.

    Returns:
        ``(train_idx, test_idx)`` as int64 arrays.
    """
    rng = np.random.default_rng(seed)
    train_members: list[int] = []
    test_members: list[int] = []
    for cls in np.unique(y):
        members = np.flatnonzero(y == cls)
        rng.shuffle(members)
        size = len(members)
        if size < 2:
            held_out = 0
        else:
            held_out = max(1, min(int(round(size * test_fraction)), size - 1))
        test_members.extend(members[:held_out].tolist())
        train_members.extend(members[held_out:].tolist())
    rng.shuffle(train_members)
    rng.shuffle(test_members)
    return np.asarray(train_members, dtype=np.int64), np.asarray(test_members, dtype=np.int64)
def fit_scaler(X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Compute per-column mean and standard deviation for standardisation.

    Standard deviations below ``1e-6`` are replaced by 1.0 so that dividing by
    them leaves (near-)constant columns unscaled. Both outputs are float32.
    """
    column_mean = np.mean(X, axis=0)
    column_std = np.std(X, axis=0)
    safe_std = np.where(column_std < 1e-6, 1.0, column_std)
    return column_mean.astype(np.float32), safe_std.astype(np.float32)
def softmax(logits: np.ndarray) -> np.ndarray:
    """Row-wise softmax of a 2-D array of logits.

    The row maximum is subtracted before exponentiating so large logits do
    not overflow.
    """
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=1, keepdims=True)
def train_softmax_classifier(
    X: np.ndarray,
    y: np.ndarray,
    n_classes: int,
    epochs: int,
    lr: float,
    l2: float,
    use_class_weights: bool,
    seed: int,
) -> tuple[np.ndarray, np.ndarray, list[dict]]:
    """Fit a multinomial logistic regression with full-batch gradient descent.

    Args:
        X: Training features, shape ``(n, d)``.
        y: Integer class ids aligned with the rows of ``X``.
        n_classes: Number of output classes.
        epochs: Number of gradient steps.
        lr: Step size.
        l2: L2 penalty coefficient applied to the weights (not the bias).
        use_class_weights: Weight samples by inverse class frequency.
        seed: Seed for the weight initialisation.

    Returns:
        ``(W, b, history)`` with float32 ``W`` of shape ``(d, n_classes)``,
        float32 ``b`` of shape ``(n_classes,)`` and a list of
        ``{"epoch", "loss", "train_accuracy"}`` dicts recorded at the first
        epoch, the last epoch and every ``epochs // 10`` epochs.
    """
    rng = np.random.default_rng(seed)
    n, d = X.shape
    W = rng.normal(0.0, 0.01, size=(d, n_classes)).astype(np.float32)
    b = np.zeros(n_classes, dtype=np.float32)
    onehot = np.eye(n_classes, dtype=np.float32)[y]

    if not use_class_weights:
        sample_weights = np.ones(n, dtype=np.float32)
    else:
        counts = np.bincount(y, minlength=n_classes).astype(np.float32)
        weights_by_class = n / np.maximum(counts, 1.0) / n_classes
        sample_weights = weights_by_class[y]
    # Rescale so the weights average to 1.
    sample_weights = sample_weights / sample_weights.mean()

    history = []
    report_every = max(1, epochs // 10)
    row_index = np.arange(n)
    for epoch in range(1, epochs + 1):
        probs = softmax(X @ W + b)
        weighted_diff = (probs - onehot) * sample_weights[:, None] / n
        grad_W = X.T @ weighted_diff + l2 * W
        grad_b = weighted_diff.sum(axis=0)
        W -= lr * grad_W
        b -= lr * grad_b
        if epoch in (1, epochs) or epoch % report_every == 0:
            # The data term uses the probabilities computed before this step;
            # the L2 term uses the weights after it.
            p_true = np.clip(probs[row_index, y], 1e-9, 1.0)
            loss = float(-(sample_weights * np.log(p_true)).mean() + 0.5 * l2 * float(np.sum(W * W)))
            acc = float(np.mean(np.argmax(probs, axis=1) == y))
            history.append({"epoch": epoch, "loss": loss, "train_accuracy": acc})
    return W.astype(np.float32), b.astype(np.float32), history
def predict(X: np.ndarray, W: np.ndarray, b: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Apply the linear softmax model and return ``(predicted_ids, probabilities)``."""
    class_probs = softmax(X @ W + b)
    best = np.argmax(class_probs, axis=1)
    return best, class_probs
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray, class_names: list[str]) -> tuple[dict, list[dict], np.ndarray]:
    """Compute the confusion matrix, per-class rows and summary metrics.

    Classes with zero support still get a per-class row but are left out of
    the macro-F1 and balanced-accuracy averages.

    Returns:
        ``(metrics, rows, cm)``: the summary dict, one dict per class, and the
        int64 confusion matrix indexed ``[true, predicted]``.
    """
    n_classes = len(class_names)
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    for true_id, pred_id in zip(y_true, y_pred):
        cm[int(true_id), int(pred_id)] += 1

    row_totals = cm.sum(axis=1)
    column_totals = cm.sum(axis=0)
    support_total = int(cm.sum())

    rows = []
    recalls = []
    f1s = []
    weighted_f1_total = 0.0
    for class_id, class_name in enumerate(class_names):
        tp = int(cm[class_id, class_id])
        support = int(row_totals[class_id])
        pred_count = int(column_totals[class_id])
        precision = tp / pred_count if pred_count else 0.0
        recall = tp / support if support else 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        if support:
            recalls.append(recall)
            f1s.append(f1)
            weighted_f1_total += f1 * support
        rows.append({
            "class_id": class_id,
            "class_name": class_name,
            "support": support,
            "predicted": pred_count,
            "precision": precision,
            "recall": recall,
            "f1": f1,
        })

    metrics = {
        "accuracy": float(np.mean(y_true == y_pred)) if len(y_true) else 0.0,
        "balanced_accuracy": float(np.mean(recalls)) if recalls else 0.0,
        "macro_f1": float(np.mean(f1s)) if f1s else 0.0,
        "weighted_f1": float(weighted_f1_total / support_total) if support_total else 0.0,
        "num_eval_windows": int(len(y_true)),
        "num_classes": n_classes,
    }
    return metrics, rows, cm
def write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV with a header of ``fieldnames``."""
    with path.open("w", newline="", encoding="utf-8") as handle:
        table = csv.DictWriter(handle, fieldnames=fieldnames)
        table.writeheader()
        for row in rows:
            table.writerow(row)
def save_artifacts(
    output_dir: Path,
    X: np.ndarray,
    y: np.ndarray,
    y_labels: np.ndarray,
    starts: np.ndarray,
    ends: np.ndarray,
    label_fracs: np.ndarray,
    train_idx: np.ndarray,
    test_idx: np.ndarray,
    class_names: list[str],
    mean: np.ndarray,
    std: np.ndarray,
    W: np.ndarray,
    b: np.ndarray,
    history: list[dict],
    metrics: dict,
    per_class_rows: list[dict],
    cm: np.ndarray,
    y_pred: np.ndarray,
    probs: np.ndarray,
    args: argparse.Namespace,
) -> None:
    """Write the dataset, model, metadata, metrics and predictions to ``output_dir``.

    Files written: ``feature_dataset.npz``, ``model.npz``, ``metadata.json``,
    ``metrics.json``, ``per_class_metrics.csv``, ``confusion_matrix.csv`` and
    ``predictions.csv``. The directory is created if it does not exist.

    ``y_pred`` and ``probs`` are aligned with ``test_idx`` (row ``k`` belongs
    to window ``test_idx[k]``); the other per-window arrays are indexed by
    window.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Store class names as a unicode array, as ``labels`` already is, instead
    # of dtype=object: object arrays are pickled inside the archive and cannot
    # be read back with np.load's default allow_pickle=False.
    class_name_array = np.asarray(class_names, dtype=str)
    np.savez_compressed(
        output_dir / "feature_dataset.npz",
        X=X,
        y=y,
        labels=y_labels.astype(str),
        start_frame=starts,
        end_frame=ends,
        label_fraction=label_fracs,
        train_idx=train_idx,
        test_idx=test_idx,
        class_names=class_name_array,
    )
    np.savez_compressed(output_dir / "model.npz", mean=mean, std=std, W=W, b=b, class_names=class_name_array)
    metadata = {
        "annotation": portable_path(args.annotation, args.workspace),
        "target": args.target,
        "window_frames": args.window_frames,
        "stride_frames": args.stride_frames,
        "min_label_fraction": args.min_label_fraction,
        "test_fraction": args.test_fraction,
        "epochs": args.epochs,
        "learning_rate": args.learning_rate,
        "l2": args.l2,
        "class_weights": not args.no_class_weights,
        "num_windows": int(len(y)),
        "num_features": int(X.shape[1]),
        "num_train_windows": int(len(train_idx)),
        "num_test_windows": int(len(test_idx)),
        "classes": class_names,
        "history": history,
    }
    (output_dir / "metadata.json").write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")
    write_csv(
        output_dir / "per_class_metrics.csv",
        per_class_rows,
        ["class_id", "class_name", "support", "predicted", "precision", "recall", "f1"],
    )
    with (output_dir / "confusion_matrix.csv").open("w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        writer.writerow(["true\\pred"] + class_names)
        for i, name in enumerate(class_names):
            writer.writerow([name] + [int(v) for v in cm[i]])
    pred_rows = []
    # Row k of y_pred/probs belongs to window test_idx[k].
    for k, idx in enumerate(test_idx):
        idx = int(idx)
        pred_id = int(y_pred[k])
        true_id = int(y[idx])
        pred_rows.append({
            "window_index": idx,
            "start_frame": int(starts[idx]),
            "end_frame": int(ends[idx]),
            "true_label": class_names[true_id],
            "predicted_label": class_names[pred_id],
            "confidence": float(probs[k, pred_id]),
            "correct": int(pred_id == true_id),
            "label_fraction": float(label_fracs[idx]),
        })
    write_csv(
        output_dir / "predictions.csv",
        pred_rows,
        ["window_index", "start_frame", "end_frame", "true_label", "predicted_label", "confidence", "correct", "label_fraction"],
    )
+def main() -> int:
+    args = parse_args()
+    add_toolkit_to_path(args.workspace)
+    from data_loader import load_from_annotation_hdf5
+    if not args.annotation.exists():
+        raise FileNotFoundError(f"annotation.hdf5 not found: {args.annotation}")
+    print(f"Loading annotation: {args.annotation}")
+    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=False)
+    print("Building windowed feature dataset")
+    X, y_labels, starts, ends, label_fracs = build_feature_dataset(
+        ann,
+        target=args.target,
+        window_frames=args.window_frames,
+        stride_frames=args.stride_frames,
+        min_label_fraction=args.min_label_fraction,
+    )
+    y, class_names = encode_labels(y_labels)
+    train_idx, test_idx = stratified_split(y, args.test_fraction, args.seed)
+    if len(test_idx) == 0:
+        raise ValueError("No test windows available. Lower --test-fraction or use more data.")
+    mean, std = fit_scaler(X[train_idx])
+    X_scaled = (X - mean) / std
+    print(f"Windows: {len(y)} total, {len(train_idx)} train, {len(test_idx)} test")
+    print(f"Features: {X.shape[1]}, classes: {len(class_names)}")
+    for name, count in Counter(y_labels).most_common():
+        print(f"  {count:4d} windows  {name}")
+    print("Training softmax classifier")
+    W, b, history = train_softmax_classifier(
+        X_scaled[train_idx],
+        y[train_idx],
+        n_classes=len(class_names),
+        epochs=args.epochs,
+        lr=args.learning_rate,
+        l2=args.l2,
+        use_class_weights=not args.no_class_weights,
+        seed=args.seed,
+    )
+    y_pred, probs = predict(X_scaled[test_idx], W, b)
+    metrics, per_class_rows, cm = compute_metrics(y[test_idx], y_pred, class_names)
+    majority_class = Counter(y[train_idx]).most_common(1)[0][0]
+    metrics["majority_baseline_accuracy"] = float(np.mean(y[test_idx] == majority_class))
+    metrics["train_final_accuracy"] = history[-1]["train_accuracy"] if history else math.nan
+    metrics["train_final_loss"] = history[-1]["loss"] if history else math.nan
+    save_artifacts(
+        args.output_dir,
+        X,
+        y,
+        y_labels,
+        starts,
+        ends,
+        label_fracs,
+        train_idx,
+        test_idx,
+        class_names,
+        mean,
+        std,
+        W,
+        b,
+        history,
+        metrics,
+        per_class_rows,
+        cm,
+        y_pred,
+        probs,
+        args,
+    )
+    print("\nEvaluation")
+    print(f"  accuracy:          {metrics['accuracy']:.4f}")
+    print(f"  balanced_accuracy: {metrics['balanced_accuracy']:.4f}")
+    print(f"  macro_f1:          {metrics['macro_f1']:.4f}")
+    print(f"  weighted_f1:       {metrics['weighted_f1']:.4f}")
+    print(f"  majority_baseline: {metrics['majority_baseline_accuracy']:.4f}")
+    print(f"\nArtifacts written to: {args.output_dir}")
+    return 0
+# Script entry point: run main() only when executed directly (not on import)
+# and hand its integer return value to the interpreter as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())