cy0307 committed on
Commit
eea471e
·
verified ·
1 Parent(s): 4496d29

Publish Ropedia minimal task baseline weights

Browse files
Files changed (49) hide show
  1. .gitattributes +1 -0
  2. README.md +107 -0
  3. artifacts/episode_task_suite/available_modalities.json +83 -0
  4. artifacts/episode_task_suite/caption_grounding/metrics.json +15 -0
  5. artifacts/episode_task_suite/caption_grounding/model.npz +3 -0
  6. artifacts/episode_task_suite/contact_prediction/metrics.json +19 -0
  7. artifacts/episode_task_suite/contact_prediction/model.npz +3 -0
  8. artifacts/episode_task_suite/cross_modal_retrieval/metrics.json +15 -0
  9. artifacts/episode_task_suite/cross_modal_retrieval/model.npz +3 -0
  10. artifacts/episode_task_suite/feature_manifest.json +104 -0
  11. artifacts/episode_task_suite/hand_trajectory_forecast/metrics.json +15 -0
  12. artifacts/episode_task_suite/misalignment_detection/metrics.json +19 -0
  13. artifacts/episode_task_suite/misalignment_detection/model.npz +3 -0
  14. artifacts/episode_task_suite/modality_reconstruction/metrics.json +12 -0
  15. artifacts/episode_task_suite/next_action/metrics.json +24 -0
  16. artifacts/episode_task_suite/next_action/model.npz +3 -0
  17. artifacts/episode_task_suite/object_relevance/metrics.json +14 -0
  18. artifacts/episode_task_suite/object_relevance/model.npz +3 -0
  19. artifacts/episode_task_suite/temporal_order/metrics.json +19 -0
  20. artifacts/episode_task_suite/temporal_order/model.npz +3 -0
  21. artifacts/episode_task_suite/timeline_action/metrics.json +24 -0
  22. artifacts/episode_task_suite/timeline_action/model.npz +3 -0
  23. artifacts/episode_task_suite/timeline_subtask/metrics.json +24 -0
  24. artifacts/episode_task_suite/timeline_subtask/model.npz +3 -0
  25. artifacts/episode_task_suite/transition_detection/metrics.json +26 -0
  26. artifacts/episode_task_suite/transition_detection/model.npz +3 -0
  27. artifacts/min_action_model/metrics.json +11 -0
  28. artifacts/min_action_model/model.npz +3 -0
  29. artifacts/min_all_modalities_action_model/available_modalities.json +83 -0
  30. artifacts/min_all_modalities_action_model/feature_manifest.json +104 -0
  31. artifacts/min_all_modalities_action_model/metrics.json +13 -0
  32. artifacts/min_all_modalities_action_model/model.npz +3 -0
  33. artifacts/min_all_modalities_subtask_model/available_modalities.json +83 -0
  34. artifacts/min_all_modalities_subtask_model/feature_manifest.json +104 -0
  35. artifacts/min_all_modalities_subtask_model/metrics.json +13 -0
  36. artifacts/min_all_modalities_subtask_model/model.npz +3 -0
  37. artifacts/min_subtask_model/metrics.json +11 -0
  38. artifacts/min_subtask_model/model.npz +3 -0
  39. assets/task_architectures.svg +216 -0
  40. assets/task_suite_infographic.png +3 -0
  41. notes/all_modalities_model.md +148 -0
  42. notes/episode_task_suite.md +176 -0
  43. notes/min_action_model.md +85 -0
  44. notes/reproducibility_audit.md +124 -0
  45. scripts/episode_task_suite.py +776 -0
  46. scripts/generate_visualizations.py +474 -0
  47. scripts/render_task_suite_infographic.py +378 -0
  48. scripts/train_all_modalities_model.py +582 -0
  49. scripts/train_min_action_model.py +531 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/task_suite_infographic.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: numpy
4
+ tags:
5
+ - robotics
6
+ - embodied-ai
7
+ - multimodal
8
+ - ropedia
9
+ - xperience-10m
10
+ - baseline
11
+ - linear-model
12
+ - retrieval
13
+ metrics:
14
+ - accuracy
15
+ - f1
16
+ - mean-reciprocal-rank
17
+ - mean-squared-error
18
+ model-index:
19
+ - name: Ropedia Minimal Task Baselines
20
+ results:
21
+ - task:
22
+ type: robotics
23
+ name: Cross-modal retrieval
24
+ dataset:
25
+ type: ropedia-ai/xperience-10m-sample
26
+ name: Xperience-10M public sample episode
27
+ metrics:
28
+ - type: top_5_accuracy
29
+ value: 0.3764
30
+ name: top-5 retrieval accuracy
31
+ - type: mrr
32
+ value: 0.2634
33
+ name: mean reciprocal rank
34
+ - task:
35
+ type: robotics
36
+ name: Transition detection
37
+ dataset:
38
+ type: ropedia-ai/xperience-10m-sample
39
+ name: Xperience-10M public sample episode
40
+ metrics:
41
+ - type: f1
42
+ value: 0.6552
43
+ name: macro-F1
44
+ ---
45
+
46
+ # Ropedia Minimal Task Baselines
47
+
48
+ This repo stores the minimal baseline weights and metrics for the 12-task Ropedia episode suite.
49
+
50
+ These are intentionally small, transparent baselines:
51
+
52
+ - z-score + linear softmax classifiers,
53
+ - dual ridge regression/projection heads,
54
+ - sigmoid multi-label logistic regression,
55
+ - cosine ranking for retrieval tasks.
56
+
57
+ They are not deep robot policies or foundation models. Their purpose is to make every input/output contract auditable before scaling to many episodes.
58
+
59
+ ## Included
60
+
61
+ - `artifacts/**/model.npz`: minimal baseline weights, scalers, and labels
62
+ - `artifacts/**/metrics.json`: committed metrics
63
+ - `artifacts/**/feature_manifest.json`: feature block boundaries where relevant
64
+ - `scripts/*.py`: training and visualization scripts
65
+ - `notes/*.md`: interpretation and reproducibility notes
66
+
67
+ The companion artifact dataset repo stores CSV/JSON predictions and dashboard assets:
68
+
69
+ https://huggingface.co/datasets/cy0307/ropedia-episode-task-suite-artifacts
70
+
71
+ The public visual dashboard is here:
72
+
73
+ https://huggingface.co/spaces/cy0307/ropedia-episode-task-suite
74
+
75
+ ## Minimal Architecture
76
+
77
+ ![Minimal 12-task architecture](assets/task_architectures.svg)
78
+
79
+ ## Metrics Snapshot
80
+
81
+ | Task | Minimal head | Main metric |
82
+ | --- | --- | ---: |
83
+ | `timeline_action` | linear softmax | 0.0500 macro-F1 |
84
+ | `timeline_subtask` | linear softmax | 0.0495 macro-F1 |
85
+ | `transition_detection` | linear softmax | 0.6552 macro-F1 |
86
+ | `next_action` | linear softmax | 0.0593 macro-F1 |
87
+ | `hand_trajectory_forecast` | ridge regression | 0.8223 MPJPE |
88
+ | `contact_prediction` | linear softmax | 1.0000 macro-F1 (degenerate: only one class in the sample, majority baseline is also 1.0) |
89
+ | `object_relevance` | multi-label logistic | 0.1839 micro-F1 |
90
+ | `caption_grounding` | ridge + cosine rank | 0.0172 MRR |
91
+ | `cross_modal_retrieval` | ridge + cosine rank | 0.3764 top-5 |
92
+ | `modality_reconstruction` | ridge regression | -0.0160 R2 |
93
+ | `temporal_order` | binary softmax | 0.5487 F1 |
94
+ | `misalignment_detection` | binary softmax | 0.4866 F1 |
95
+
96
+ ## Data Notice
97
+
98
+ This repo does not redistribute raw Ropedia videos or raw `annotation.hdf5`. Download the original sample from Ropedia / Hugging Face and follow the dataset terms:
99
+
100
+ - https://huggingface.co/datasets/ropedia-ai/xperience-10m-sample
101
+ - https://ropedia.com/dataset
102
+
103
+ ## Source
104
+
105
+ GitHub:
106
+
107
+ https://github.com/ChaoYue0307/ropedia-episode-task-suite
artifacts/episode_task_suite/available_modalities.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "modality": "depth_confidence",
4
+ "shape": [
5
+ 5821,
6
+ 140
7
+ ]
8
+ },
9
+ {
10
+ "modality": "video/fisheye_cam0",
11
+ "path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
12
+ "shape": [
13
+ 5821,
14
+ 98
15
+ ],
16
+ "exists": true
17
+ },
18
+ {
19
+ "modality": "video/fisheye_cam1",
20
+ "path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
21
+ "shape": [
22
+ 5821,
23
+ 98
24
+ ],
25
+ "exists": true
26
+ },
27
+ {
28
+ "modality": "video/fisheye_cam2",
29
+ "path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
30
+ "shape": [
31
+ 5821,
32
+ 98
33
+ ],
34
+ "exists": true
35
+ },
36
+ {
37
+ "modality": "video/fisheye_cam3",
38
+ "path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
39
+ "shape": [
40
+ 5821,
41
+ 98
42
+ ],
43
+ "exists": true
44
+ },
45
+ {
46
+ "modality": "video/stereo_left",
47
+ "path": "data/sample/xperience-10m-sample/stereo_left.mp4",
48
+ "shape": [
49
+ 5821,
50
+ 98
51
+ ],
52
+ "exists": true
53
+ },
54
+ {
55
+ "modality": "video/stereo_right",
56
+ "path": "data/sample/xperience-10m-sample/stereo_right.mp4",
57
+ "shape": [
58
+ 5821,
59
+ 98
60
+ ],
61
+ "exists": true
62
+ },
63
+ {
64
+ "modality": "caption_text",
65
+ "shape": [
66
+ 5821,
67
+ 128
68
+ ],
69
+ "fields": "objects,interaction"
70
+ },
71
+ {
72
+ "modality": "slam_point_cloud_static",
73
+ "shape": [
74
+ 22
75
+ ]
76
+ },
77
+ {
78
+ "modality": "calibration_static",
79
+ "shape": [
80
+ 117
81
+ ]
82
+ }
83
+ ]
artifacts/episode_task_suite/caption_grounding/metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mrr": 0.017183946083791223,
3
+ "median_rank": 167.0,
4
+ "mean_rank": 174.39367816091954,
5
+ "num_queries": 348,
6
+ "top1_accuracy": 0.0028735632183908046,
7
+ "top5_accuracy": 0.011494252873563218,
8
+ "top10_accuracy": 0.017241379310344827,
9
+ "task": "caption_grounding",
10
+ "input": "caption objects/interaction text query + candidate sensor windows",
11
+ "output": "matching time window",
12
+ "split": "chronological",
13
+ "num_train_windows": 813,
14
+ "num_test_windows": 348
15
+ }
artifacts/episode_task_suite/caption_grounding/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:032da1fd5b5142b449e758a13bf5a450bb9ac22afde032bebf194987f97c1341
3
+ size 14459176
artifacts/episode_task_suite/contact_prediction/metrics.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 1.0,
3
+ "balanced_accuracy": 1.0,
4
+ "macro_f1": 1.0,
5
+ "weighted_f1": 1.0,
6
+ "num_eval_windows": 348,
7
+ "num_classes": 1,
8
+ "task": "contact_prediction",
9
+ "input": "all non-contact/non-caption-label modalities -> any body contact",
10
+ "split": "chronological",
11
+ "num_windows": 1161,
12
+ "num_train_windows": 813,
13
+ "num_test_windows": 348,
14
+ "feature_dim": 7335,
15
+ "majority_baseline_accuracy": 1.0,
16
+ "train_final_accuracy": 1.0,
17
+ "train_final_loss": 0.0005947681493125856,
18
+ "unseen_test_classes": []
19
+ }
artifacts/episode_task_suite/contact_prediction/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:050d2139076c55b251c2c23b62d6c58023cc7fb1c0431ded6795e775c9300a7b
3
+ size 82797
artifacts/episode_task_suite/cross_modal_retrieval/metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mrr": 0.26335984006618296,
3
+ "median_rank": 12.5,
4
+ "mean_rank": 43.33045977011494,
5
+ "num_queries": 348,
6
+ "top1_accuracy": 0.14942528735632185,
7
+ "top5_accuracy": 0.3764367816091954,
8
+ "top10_accuracy": 0.47413793103448276,
9
+ "task": "cross_modal_retrieval",
10
+ "input": "motion/IMU/camera query",
11
+ "output": "matching depth/video window",
12
+ "split": "chronological",
13
+ "num_train_windows": 813,
14
+ "num_test_windows": 348
15
+ }
artifacts/episode_task_suite/cross_modal_retrieval/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc5b2d0bc4350c4348be1e6098f9793a8ed5e479bad9ee20351bf2991c71347a
3
+ size 41310574
artifacts/episode_task_suite/feature_manifest.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "hand_left_joints",
4
+ "start": 0,
5
+ "end": 441,
6
+ "dim": 441
7
+ },
8
+ {
9
+ "name": "hand_right_joints",
10
+ "start": 441,
11
+ "end": 882,
12
+ "dim": 441
13
+ },
14
+ {
15
+ "name": "body_joints",
16
+ "start": 882,
17
+ "end": 1974,
18
+ "dim": 1092
19
+ },
20
+ {
21
+ "name": "body_contacts",
22
+ "start": 1974,
23
+ "end": 2121,
24
+ "dim": 147
25
+ },
26
+ {
27
+ "name": "camera_translation",
28
+ "start": 2121,
29
+ "end": 2142,
30
+ "dim": 21
31
+ },
32
+ {
33
+ "name": "camera_rotation_matrix",
34
+ "start": 2142,
35
+ "end": 2205,
36
+ "dim": 63
37
+ },
38
+ {
39
+ "name": "imu_accel_gyro",
40
+ "start": 2205,
41
+ "end": 2247,
42
+ "dim": 42
43
+ },
44
+ {
45
+ "name": "depth_confidence",
46
+ "start": 2247,
47
+ "end": 3227,
48
+ "dim": 980
49
+ },
50
+ {
51
+ "name": "video_fisheye_cam0",
52
+ "start": 3227,
53
+ "end": 3913,
54
+ "dim": 686
55
+ },
56
+ {
57
+ "name": "video_fisheye_cam1",
58
+ "start": 3913,
59
+ "end": 4599,
60
+ "dim": 686
61
+ },
62
+ {
63
+ "name": "video_fisheye_cam2",
64
+ "start": 4599,
65
+ "end": 5285,
66
+ "dim": 686
67
+ },
68
+ {
69
+ "name": "video_fisheye_cam3",
70
+ "start": 5285,
71
+ "end": 5971,
72
+ "dim": 686
73
+ },
74
+ {
75
+ "name": "video_stereo_left",
76
+ "start": 5971,
77
+ "end": 6657,
78
+ "dim": 686
79
+ },
80
+ {
81
+ "name": "video_stereo_right",
82
+ "start": 6657,
83
+ "end": 7343,
84
+ "dim": 686
85
+ },
86
+ {
87
+ "name": "caption_objects_interaction_text",
88
+ "start": 7343,
89
+ "end": 8239,
90
+ "dim": 896
91
+ },
92
+ {
93
+ "name": "slam_point_cloud",
94
+ "start": 8239,
95
+ "end": 8261,
96
+ "dim": 22
97
+ },
98
+ {
99
+ "name": "calibration",
100
+ "start": 8261,
101
+ "end": 8378,
102
+ "dim": 117
103
+ }
104
+ ]
artifacts/episode_task_suite/hand_trajectory_forecast/metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse": 11.323140144348145,
3
+ "mae": 0.40246668457984924,
4
+ "r2": -1334.788993815828,
5
+ "task": "hand_trajectory_forecast",
6
+ "input": "all modalities at t -> future left/right hand 3D joints",
7
+ "split": "chronological",
8
+ "num_windows": 1159,
9
+ "num_train_windows": 811,
10
+ "num_test_windows": 348,
11
+ "forecast_frames": 10,
12
+ "mpjpe": 0.8222644925117493,
13
+ "final_frame_mpjpe": 1.0649521350860596,
14
+ "target_dim": 1260
15
+ }
artifacts/episode_task_suite/misalignment_detection/metrics.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.5028901734104047,
3
+ "precision": 0.5030864197530864,
4
+ "recall": 0.47109826589595377,
5
+ "f1": 0.4865671641791045,
6
+ "tp": 163,
7
+ "tn": 185,
8
+ "fp": 161,
9
+ "fn": 183,
10
+ "positive_rate_true": 0.5,
11
+ "positive_rate_pred": 0.4682080924855491,
12
+ "task": "misalignment_detection",
13
+ "input": "motion+visual pair -> aligned vs shifted by 8 windows",
14
+ "split": "chronological",
15
+ "num_samples": 2306,
16
+ "num_train_samples": 1614,
17
+ "num_test_samples": 692,
18
+ "train_final_accuracy": 0.5018587360594795
19
+ }
artifacts/episode_task_suite/misalignment_detection/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:375daf8e2d5e8e926970c457eff3c48ab402608c02cc564135f367b133609063
3
+ size 110186
artifacts/episode_task_suite/modality_reconstruction/metrics.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mse": 1359.1639404296875,
3
+ "mae": 0.31084805727005005,
4
+ "r2": -0.016022846771134747,
5
+ "task": "modality_reconstruction",
6
+ "input": "motion/IMU/camera",
7
+ "output": "depth/video feature vector",
8
+ "split": "chronological",
9
+ "num_train_windows": 813,
10
+ "num_test_windows": 348,
11
+ "target_dim": 5096
12
+ }
artifacts/episode_task_suite/next_action/metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.034482758620689655,
3
+ "balanced_accuracy": 0.04,
4
+ "macro_f1": 0.05925925925925927,
5
+ "weighted_f1": 0.05108556832694764,
6
+ "num_eval_windows": 348,
7
+ "num_classes": 18,
8
+ "task": "next_action",
9
+ "input": "all modalities at t -> action at t+20 frames",
10
+ "split": "chronological",
11
+ "num_windows": 1161,
12
+ "num_train_windows": 813,
13
+ "num_test_windows": 348,
14
+ "feature_dim": 8378,
15
+ "majority_baseline_accuracy": 0.0,
16
+ "train_final_accuracy": 1.0,
17
+ "train_final_loss": 0.017629079520702362,
18
+ "unseen_test_classes": [
19
+ "Place item on table",
20
+ "Pour coffee",
21
+ "Pour milk into coffee",
22
+ "Wait/Prepare for pouring"
23
+ ]
24
+ }
artifacts/episode_task_suite/next_action/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fcfa0e624694a7b07fecac33d9385c54f5aeb1faf4517d11fbf6db3b973292d
3
+ size 620530
artifacts/episode_task_suite/object_relevance/metrics.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "micro_f1": 0.18393030009680542,
3
+ "macro_f1": 0.06427052187996415,
4
+ "exact_match": 0.005747126436781609,
5
+ "precision": 0.16360505166475317,
6
+ "recall": 0.21002210759027265,
7
+ "task": "object_relevance",
8
+ "input": "all non-caption modalities -> current relevant object set",
9
+ "split": "chronological",
10
+ "num_windows": 1161,
11
+ "num_train_windows": 813,
12
+ "num_test_windows": 348,
13
+ "num_objects": 34
14
+ }
artifacts/episode_task_suite/object_relevance/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aca088062b23a8fa8b05b261cf698c50c00b11be238eb4b9260f7609da70ff11
3
+ size 1002718
artifacts/episode_task_suite/temporal_order/metrics.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.46120689655172414,
3
+ "precision": 0.4720496894409938,
4
+ "recall": 0.6551724137931034,
5
+ "f1": 0.5487364620938628,
6
+ "tp": 228,
7
+ "tn": 93,
8
+ "fp": 255,
9
+ "fn": 120,
10
+ "positive_rate_true": 0.5,
11
+ "positive_rate_pred": 0.6939655172413793,
12
+ "task": "temporal_order",
13
+ "input": "two adjacent windows -> whether order is correct",
14
+ "split": "chronological",
15
+ "num_samples": 2320,
16
+ "num_train_samples": 1624,
17
+ "num_test_samples": 696,
18
+ "train_final_accuracy": 0.5104679802955665
19
+ }
artifacts/episode_task_suite/temporal_order/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04330ca7fe354ecb592f366d27764a538e2b51fd6d23f66d618ea86d33c34f4e
3
+ size 335170
artifacts/episode_task_suite/timeline_action/metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.029154518950437316,
3
+ "balanced_accuracy": 0.03125,
4
+ "macro_f1": 0.05,
5
+ "weighted_f1": 0.04664723032069971,
6
+ "num_eval_windows": 343,
7
+ "num_classes": 18,
8
+ "task": "timeline_action",
9
+ "input": "all modalities -> current action label",
10
+ "split": "chronological",
11
+ "num_windows": 1144,
12
+ "num_train_windows": 801,
13
+ "num_test_windows": 343,
14
+ "feature_dim": 8378,
15
+ "majority_baseline_accuracy": 0.0,
16
+ "train_final_accuracy": 1.0,
17
+ "train_final_loss": 0.01664665900170803,
18
+ "unseen_test_classes": [
19
+ "Place item on table",
20
+ "Pour coffee",
21
+ "Pour milk into coffee",
22
+ "Wait/Prepare for pouring"
23
+ ]
24
+ }
artifacts/episode_task_suite/timeline_action/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3052fc9442607895eb6dc5ca81d5a1c28f4cdf9e1f9a4931e6ef78403283a7c
3
+ size 620781
artifacts/episode_task_suite/timeline_subtask/metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.05813953488372093,
3
+ "balanced_accuracy": 0.05376979652090881,
4
+ "macro_f1": 0.04954121121178666,
5
+ "weighted_f1": 0.06731304264454903,
6
+ "num_eval_windows": 344,
7
+ "num_classes": 14,
8
+ "task": "timeline_subtask",
9
+ "input": "all modalities -> current subtask label",
10
+ "split": "chronological",
11
+ "num_windows": 1147,
12
+ "num_train_windows": 803,
13
+ "num_test_windows": 344,
14
+ "feature_dim": 8378,
15
+ "majority_baseline_accuracy": 0.0,
16
+ "train_final_accuracy": 1.0,
17
+ "train_final_loss": 0.014040183275938034,
18
+ "unseen_test_classes": [
19
+ "Move bottle to coffee equipment",
20
+ "Pour coffee",
21
+ "Pour milk into coffee",
22
+ "Prepare for pouring"
23
+ ]
24
+ }
artifacts/episode_task_suite/timeline_subtask/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39dace29541e90a947e902a7ba7afd39b7a2c1d3123ed513653d0704d45d2ad1
3
+ size 496518
artifacts/episode_task_suite/transition_detection/metrics.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.9252873563218391,
3
+ "balanced_accuracy": 0.6931475903614458,
4
+ "macro_f1": 0.6551829268292684,
5
+ "weighted_f1": 0.9323030557891787,
6
+ "num_eval_windows": 348,
7
+ "num_classes": 2,
8
+ "task": "transition_detection",
9
+ "input": "all modalities -> action boundary/steady",
10
+ "split": "chronological",
11
+ "num_windows": 1161,
12
+ "num_train_windows": 813,
13
+ "num_test_windows": 348,
14
+ "feature_dim": 8378,
15
+ "majority_baseline_accuracy": 0.9540229885057471,
16
+ "train_final_accuracy": 1.0,
17
+ "train_final_loss": 0.007071746978908777,
18
+ "unseen_test_classes": [],
19
+ "boundary_precision": 0.125,
20
+ "boundary_recall": 0.75,
21
+ "boundary_f1": 0.21428571428571427,
22
+ "matched_boundaries": 3,
23
+ "true_boundaries": 4,
24
+ "predicted_boundaries": 24,
25
+ "mean_abs_timing_error_frames": 2.6666666666666665
26
+ }
artifacts/episode_task_suite/transition_detection/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f179e3278c2b0e6563ed0bfe14a42faae28a5a0a0aa4a0b056113fc345aa4a27
3
+ size 122843
artifacts/min_action_model/metrics.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.9828178694158075,
3
+ "balanced_accuracy": 0.9643518518518519,
4
+ "macro_f1": 0.96884342657456,
5
+ "weighted_f1": 0.9824311468352843,
6
+ "num_eval_windows": 291,
7
+ "num_classes": 18,
8
+ "majority_baseline_accuracy": 0.13745704467353953,
9
+ "train_final_accuracy": 1.0,
10
+ "train_final_loss": 0.019042566418647766
11
+ }
artifacts/min_action_model/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b143a74aa94c882e08279adabfcf5806348ccb37c70c9192c8def206fda97895
3
+ size 163871
artifacts/min_all_modalities_action_model/available_modalities.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "modality": "depth_confidence",
4
+ "shape": [
5
+ 5821,
6
+ 140
7
+ ]
8
+ },
9
+ {
10
+ "modality": "video/fisheye_cam0",
11
+ "path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
12
+ "shape": [
13
+ 5821,
14
+ 98
15
+ ],
16
+ "exists": true
17
+ },
18
+ {
19
+ "modality": "video/fisheye_cam1",
20
+ "path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
21
+ "shape": [
22
+ 5821,
23
+ 98
24
+ ],
25
+ "exists": true
26
+ },
27
+ {
28
+ "modality": "video/fisheye_cam2",
29
+ "path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
30
+ "shape": [
31
+ 5821,
32
+ 98
33
+ ],
34
+ "exists": true
35
+ },
36
+ {
37
+ "modality": "video/fisheye_cam3",
38
+ "path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
39
+ "shape": [
40
+ 5821,
41
+ 98
42
+ ],
43
+ "exists": true
44
+ },
45
+ {
46
+ "modality": "video/stereo_left",
47
+ "path": "data/sample/xperience-10m-sample/stereo_left.mp4",
48
+ "shape": [
49
+ 5821,
50
+ 98
51
+ ],
52
+ "exists": true
53
+ },
54
+ {
55
+ "modality": "video/stereo_right",
56
+ "path": "data/sample/xperience-10m-sample/stereo_right.mp4",
57
+ "shape": [
58
+ 5821,
59
+ 98
60
+ ],
61
+ "exists": true
62
+ },
63
+ {
64
+ "modality": "caption_text",
65
+ "shape": [
66
+ 5821,
67
+ 128
68
+ ],
69
+ "fields": "objects,interaction"
70
+ },
71
+ {
72
+ "modality": "slam_point_cloud_static",
73
+ "shape": [
74
+ 22
75
+ ]
76
+ },
77
+ {
78
+ "modality": "calibration_static",
79
+ "shape": [
80
+ 117
81
+ ]
82
+ }
83
+ ]
artifacts/min_all_modalities_action_model/feature_manifest.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "hand_left_joints",
4
+ "start": 0,
5
+ "end": 441,
6
+ "dim": 441
7
+ },
8
+ {
9
+ "name": "hand_right_joints",
10
+ "start": 441,
11
+ "end": 882,
12
+ "dim": 441
13
+ },
14
+ {
15
+ "name": "body_joints",
16
+ "start": 882,
17
+ "end": 1974,
18
+ "dim": 1092
19
+ },
20
+ {
21
+ "name": "body_contacts",
22
+ "start": 1974,
23
+ "end": 2121,
24
+ "dim": 147
25
+ },
26
+ {
27
+ "name": "camera_translation",
28
+ "start": 2121,
29
+ "end": 2142,
30
+ "dim": 21
31
+ },
32
+ {
33
+ "name": "camera_rotation_matrix",
34
+ "start": 2142,
35
+ "end": 2205,
36
+ "dim": 63
37
+ },
38
+ {
39
+ "name": "imu_accel_gyro",
40
+ "start": 2205,
41
+ "end": 2247,
42
+ "dim": 42
43
+ },
44
+ {
45
+ "name": "depth_confidence",
46
+ "start": 2247,
47
+ "end": 3227,
48
+ "dim": 980
49
+ },
50
+ {
51
+ "name": "video_fisheye_cam0",
52
+ "start": 3227,
53
+ "end": 3913,
54
+ "dim": 686
55
+ },
56
+ {
57
+ "name": "video_fisheye_cam1",
58
+ "start": 3913,
59
+ "end": 4599,
60
+ "dim": 686
61
+ },
62
+ {
63
+ "name": "video_fisheye_cam2",
64
+ "start": 4599,
65
+ "end": 5285,
66
+ "dim": 686
67
+ },
68
+ {
69
+ "name": "video_fisheye_cam3",
70
+ "start": 5285,
71
+ "end": 5971,
72
+ "dim": 686
73
+ },
74
+ {
75
+ "name": "video_stereo_left",
76
+ "start": 5971,
77
+ "end": 6657,
78
+ "dim": 686
79
+ },
80
+ {
81
+ "name": "video_stereo_right",
82
+ "start": 6657,
83
+ "end": 7343,
84
+ "dim": 686
85
+ },
86
+ {
87
+ "name": "caption_objects_interaction_text",
88
+ "start": 7343,
89
+ "end": 8239,
90
+ "dim": 896
91
+ },
92
+ {
93
+ "name": "slam_point_cloud",
94
+ "start": 8239,
95
+ "end": 8261,
96
+ "dim": 22
97
+ },
98
+ {
99
+ "name": "calibration",
100
+ "start": 8261,
101
+ "end": 8378,
102
+ "dim": 117
103
+ }
104
+ ]
artifacts/min_all_modalities_action_model/metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.9828178694158075,
3
+ "balanced_accuracy": 0.9800925925925925,
4
+ "macro_f1": 0.9791023658779895,
5
+ "weighted_f1": 0.98276563540562,
6
+ "num_eval_windows": 291,
7
+ "num_classes": 18,
8
+ "majority_baseline_accuracy": 0.13745704467353953,
9
+ "train_final_accuracy": 1.0,
10
+ "train_final_loss": 0.014624637551605701,
11
+ "feature_dim": 8378,
12
+ "num_windows": 1144
13
+ }
artifacts/min_all_modalities_action_model/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:228cda0f036f86a7a1cb44e67d5c7112747bfc5cc27bf91c90516c6ba8322c81
3
+ size 621786
artifacts/min_all_modalities_subtask_model/available_modalities.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "modality": "depth_confidence",
4
+ "shape": [
5
+ 5821,
6
+ 140
7
+ ]
8
+ },
9
+ {
10
+ "modality": "video/fisheye_cam0",
11
+ "path": "data/sample/xperience-10m-sample/fisheye_cam0.mp4",
12
+ "shape": [
13
+ 5821,
14
+ 98
15
+ ],
16
+ "exists": true
17
+ },
18
+ {
19
+ "modality": "video/fisheye_cam1",
20
+ "path": "data/sample/xperience-10m-sample/fisheye_cam1.mp4",
21
+ "shape": [
22
+ 5821,
23
+ 98
24
+ ],
25
+ "exists": true
26
+ },
27
+ {
28
+ "modality": "video/fisheye_cam2",
29
+ "path": "data/sample/xperience-10m-sample/fisheye_cam2.mp4",
30
+ "shape": [
31
+ 5821,
32
+ 98
33
+ ],
34
+ "exists": true
35
+ },
36
+ {
37
+ "modality": "video/fisheye_cam3",
38
+ "path": "data/sample/xperience-10m-sample/fisheye_cam3.mp4",
39
+ "shape": [
40
+ 5821,
41
+ 98
42
+ ],
43
+ "exists": true
44
+ },
45
+ {
46
+ "modality": "video/stereo_left",
47
+ "path": "data/sample/xperience-10m-sample/stereo_left.mp4",
48
+ "shape": [
49
+ 5821,
50
+ 98
51
+ ],
52
+ "exists": true
53
+ },
54
+ {
55
+ "modality": "video/stereo_right",
56
+ "path": "data/sample/xperience-10m-sample/stereo_right.mp4",
57
+ "shape": [
58
+ 5821,
59
+ 98
60
+ ],
61
+ "exists": true
62
+ },
63
+ {
64
+ "modality": "caption_text",
65
+ "shape": [
66
+ 5821,
67
+ 128
68
+ ],
69
+ "fields": "objects,interaction"
70
+ },
71
+ {
72
+ "modality": "slam_point_cloud_static",
73
+ "shape": [
74
+ 22
75
+ ]
76
+ },
77
+ {
78
+ "modality": "calibration_static",
79
+ "shape": [
80
+ 117
81
+ ]
82
+ }
83
+ ]
artifacts/min_all_modalities_subtask_model/feature_manifest.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "name": "hand_left_joints",
4
+ "start": 0,
5
+ "end": 441,
6
+ "dim": 441
7
+ },
8
+ {
9
+ "name": "hand_right_joints",
10
+ "start": 441,
11
+ "end": 882,
12
+ "dim": 441
13
+ },
14
+ {
15
+ "name": "body_joints",
16
+ "start": 882,
17
+ "end": 1974,
18
+ "dim": 1092
19
+ },
20
+ {
21
+ "name": "body_contacts",
22
+ "start": 1974,
23
+ "end": 2121,
24
+ "dim": 147
25
+ },
26
+ {
27
+ "name": "camera_translation",
28
+ "start": 2121,
29
+ "end": 2142,
30
+ "dim": 21
31
+ },
32
+ {
33
+ "name": "camera_rotation_matrix",
34
+ "start": 2142,
35
+ "end": 2205,
36
+ "dim": 63
37
+ },
38
+ {
39
+ "name": "imu_accel_gyro",
40
+ "start": 2205,
41
+ "end": 2247,
42
+ "dim": 42
43
+ },
44
+ {
45
+ "name": "depth_confidence",
46
+ "start": 2247,
47
+ "end": 3227,
48
+ "dim": 980
49
+ },
50
+ {
51
+ "name": "video_fisheye_cam0",
52
+ "start": 3227,
53
+ "end": 3913,
54
+ "dim": 686
55
+ },
56
+ {
57
+ "name": "video_fisheye_cam1",
58
+ "start": 3913,
59
+ "end": 4599,
60
+ "dim": 686
61
+ },
62
+ {
63
+ "name": "video_fisheye_cam2",
64
+ "start": 4599,
65
+ "end": 5285,
66
+ "dim": 686
67
+ },
68
+ {
69
+ "name": "video_fisheye_cam3",
70
+ "start": 5285,
71
+ "end": 5971,
72
+ "dim": 686
73
+ },
74
+ {
75
+ "name": "video_stereo_left",
76
+ "start": 5971,
77
+ "end": 6657,
78
+ "dim": 686
79
+ },
80
+ {
81
+ "name": "video_stereo_right",
82
+ "start": 6657,
83
+ "end": 7343,
84
+ "dim": 686
85
+ },
86
+ {
87
+ "name": "caption_objects_interaction_text",
88
+ "start": 7343,
89
+ "end": 8239,
90
+ "dim": 896
91
+ },
92
+ {
93
+ "name": "slam_point_cloud",
94
+ "start": 8239,
95
+ "end": 8261,
96
+ "dim": 22
97
+ },
98
+ {
99
+ "name": "calibration",
100
+ "start": 8261,
101
+ "end": 8378,
102
+ "dim": 117
103
+ }
104
+ ]
artifacts/min_all_modalities_subtask_model/metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.9827586206896551,
3
+ "balanced_accuracy": 0.9505102040816327,
4
+ "macro_f1": 0.9307645963773675,
5
+ "weighted_f1": 0.9837987833808578,
6
+ "num_eval_windows": 290,
7
+ "num_classes": 14,
8
+ "majority_baseline_accuracy": 0.14482758620689656,
9
+ "train_final_accuracy": 1.0,
10
+ "train_final_loss": 0.012823422439396381,
11
+ "feature_dim": 8378,
12
+ "num_windows": 1147
13
+ }
artifacts/min_all_modalities_subtask_model/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ec248d69f63d5acd00c83c024bbfe23cadf0ab0ba1b6c9ff3916d2b1d76ee94
3
+ size 497409
artifacts/min_subtask_model/metrics.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accuracy": 0.9758620689655172,
3
+ "balanced_accuracy": 0.9783924095954172,
4
+ "macro_f1": 0.9528048001232955,
5
+ "weighted_f1": 0.9778836359351952,
6
+ "num_eval_windows": 290,
7
+ "num_classes": 14,
8
+ "majority_baseline_accuracy": 0.14482758620689656,
9
+ "train_final_accuracy": 1.0,
10
+ "train_final_loss": 0.02664567530155182
11
+ }
artifacts/min_subtask_model/model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:226b91679611e731abf36ec55f1181ad2748b25e9e84c6f09e35b00dd43a863f
3
+ size 131612
assets/task_architectures.svg ADDED
assets/task_suite_infographic.png ADDED

Git LFS Details

  • SHA256: 38ba0968f53333b74069e36bec35382cb9c97568da8be528536acc2d69fdb168
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
notes/all_modalities_model.md ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # All-Modality Minimal Model
2
+
3
+ Script:
4
+
5
+ ```text
6
+ scripts/train_all_modalities_model.py
7
+ ```
8
+
9
+ This extends the first minimal model by using every major sample modality in a lightweight way.
10
+
11
+ ## Modalities Used
12
+
13
+ Dynamic sensor/action modalities:
14
+
15
+ - `hand_mocap/left_joints_3d`
16
+ - `hand_mocap/right_joints_3d`
17
+ - `full_body_mocap/keypoints`
18
+ - `full_body_mocap/contacts`
19
+ - `slam/trans_xyz`
20
+ - `slam/quat_wxyz` converted by the toolkit into camera rotation matrices
21
+ - `imu/accel_xyz`
22
+ - `imu/gyro_xyz`
23
+ - `depth/depth`
24
+ - `depth/confidence`
25
+ - `fisheye_cam0.mp4`
26
+ - `fisheye_cam1.mp4`
27
+ - `fisheye_cam2.mp4`
28
+ - `fisheye_cam3.mp4`
29
+ - `stereo_left.mp4`
30
+ - `stereo_right.mp4`
31
+
32
+ Static/context modalities:
33
+
34
+ - `slam/point_cloud`
35
+ - `calibration/*`
36
+ - caption objects
37
+ - caption interaction text
38
+
39
+ By default, the script does **not** include `action_label`, `Sub Task`, or action-description text as input, because those are too close to the prediction target. You can force their inclusion with `--include-label-text`, but such a run should be treated as a leakage/debug run, not a fair action-recognition experiment.
40
+
41
+ ## Feature Design
42
+
43
+ The model is still intentionally small:
44
+
45
+ ```text
46
+ raw modality -> per-frame or static handcrafted features -> window temporal statistics -> softmax classifier
47
+ ```
48
+
49
+ For each 20-frame window:
50
+
51
+ - Motion signals use mean/std/min/max/delta/velocity statistics.
52
+ - Depth uses global depth stats plus a small normalized depth grid and confidence grid.
53
+ - Each video stream uses color stats, color histograms, a small grayscale grid, and simple edge stats.
54
+ - Text uses a hashed bag-of-words vector from objects and interaction text.
55
+ - Point cloud and calibration are included as static episode-level features.
56
+
57
+ Current feature blocks:
58
+
59
+ ```text
60
+ hand_left_joints: 441
61
+ hand_right_joints: 441
62
+ body_joints: 1092
63
+ body_contacts: 147
64
+ camera_translation: 21
65
+ camera_rotation_matrix: 63
66
+ imu_accel_gyro: 42
67
+ depth_confidence: 980
68
+ video_fisheye_cam0: 686
69
+ video_fisheye_cam1: 686
70
+ video_fisheye_cam2: 686
71
+ video_fisheye_cam3: 686
72
+ video_stereo_left: 686
73
+ video_stereo_right: 686
74
+ caption_objects_interaction_text: 896
75
+ slam_point_cloud: 22
76
+ calibration: 117
77
+ total: 8378
78
+ ```
79
+
80
+ ## Run Commands
81
+
82
+ Action prediction:
83
+
84
+ ```bash
85
+ cd /path/to/Ropedia
86
+ source .venv/bin/activate
87
+ python scripts/train_all_modalities_model.py
88
+ ```
89
+
90
+ Subtask prediction:
91
+
92
+ ```bash
93
+ python scripts/train_all_modalities_model.py --target subtask
94
+ ```
95
+
96
+ The first run builds reusable caches in:
97
+
98
+ ```text
99
+ outputs/feature_cache/
100
+ ```
101
+
102
+ ## Current Results
103
+
104
+ Action-label model:
105
+
106
+ ```text
107
+ outputs/min_all_modalities_action_model/
108
+ accuracy: 0.9828
109
+ balanced_accuracy: 0.9801
110
+ macro_f1: 0.9791
111
+ weighted_f1: 0.9828
112
+ majority_baseline: 0.1375
113
+ classes: 18
114
+ feature_dim: 8378
115
+ test_windows: 291
116
+ ```
117
+
118
+ Subtask-label model:
119
+
120
+ ```text
121
+ outputs/min_all_modalities_subtask_model/
122
+ accuracy: 0.9828
123
+ balanced_accuracy: 0.9505
124
+ macro_f1: 0.9308
125
+ weighted_f1: 0.9838
126
+ majority_baseline: 0.1448
127
+ classes: 14
128
+ feature_dim: 8378
129
+ test_windows: 290
130
+ ```
131
+
132
+ ## How To Interpret This
133
+
134
+ This proves that the full sample can be converted into a complete supervised learning pipeline on a local machine (a Mac, in this case).
135
+
136
+ It does **not** prove real generalization, because the public sample is one episode and the split is random windows from that same episode. Neighboring windows are correlated.
137
+
138
+ For a serious embodied-AI experiment:
139
+
140
+ ```text
141
+ many episodes
142
+ -> cache features per episode
143
+ -> split by episode or task instance
144
+ -> train on some episodes
145
+ -> test on unseen episodes
146
+ ```
147
+
148
+ The next useful upgrade is not a bigger classifier. It is a better split and more episodes.
notes/episode_task_suite.md ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Episode Task Suite
2
+
3
+ Script:
4
+
5
+ ```text
6
+ scripts/episode_task_suite.py
7
+ ```
8
+
9
+ This script turns the single public Ropedia sample episode into many end-to-end tasks. It is designed for learning, debugging, and task design. It is **not** a generalization benchmark because the data is still one episode.
10
+
11
+ Run:
12
+
13
+ ```bash
14
+ cd /path/to/Ropedia
15
+ source .venv/bin/activate
16
+ python scripts/episode_task_suite.py
17
+ ```
18
+
19
+ Output:
20
+
21
+ ```text
22
+ outputs/episode_task_suite/
23
+ ```
24
+
25
+ Shared setup:
26
+
27
+ ```text
28
+ sample episode: 5821 frames
29
+ windows: 1161
30
+ window size: 20 frames
31
+ stride: 5 frames
32
+ feature dim: 8378
33
+ split: chronological, first 70% train and last 30% test
34
+ ```
35
+
36
+ ## Implemented Tasks
37
+
38
+ | Task | Input | Output | Main artifact |
39
+ |---|---|---|---|
40
+ | `timeline_action` | all modality window | current action label | `timeline_action/metrics.json` |
41
+ | `timeline_subtask` | all modality window | current subtask label | `timeline_subtask/metrics.json` |
42
+ | `transition_detection` | all modality window | steady vs action boundary | `transition_detection/metrics.json` |
43
+ | `next_action` | current all modality window | action 20 frames later | `next_action/metrics.json` |
44
+ | `hand_trajectory_forecast` | current all modality window | future 10-frame left/right hand joints | `hand_trajectory_forecast/predictions.npz` |
45
+ | `contact_prediction` | non-contact modalities | any body contact in window | `contact_prediction/metrics.json` |
46
+ | `object_relevance` | non-caption modalities | relevant object set | `object_relevance/predictions.csv` |
47
+ | `caption_grounding` | caption objects/interaction query + sensor candidates | matching time window | `caption_grounding/metrics.json` |
48
+ | `cross_modal_retrieval` | motion/IMU/camera query | matching depth/video window | `cross_modal_retrieval/metrics.json` |
49
+ | `modality_reconstruction` | motion/IMU/camera | depth/video feature vector | `modality_reconstruction/predictions.npz` |
50
+ | `temporal_order` | two adjacent windows | whether order is correct | `temporal_order/metrics.json` |
51
+ | `misalignment_detection` | motion+visual pair | aligned vs shifted | `misalignment_detection/metrics.json` |
52
+
53
+ ## Minimal Model Architectures
54
+
55
+ All tasks share the same window builder unless a task explicitly removes a
56
+ feature block to avoid label leakage.
57
+
58
+ ```text
59
+ raw sample episode
60
+ -> 20-frame sliding windows, stride 5
61
+ -> all-modality feature vector X_all, 8,378 dimensions
62
+ -> chronological split, first 70% train and last 30% test
63
+ -> train-only z-score scaler
64
+ -> task-specific minimal head
65
+ ```
66
+
67
+ The task suite intentionally uses simple heads:
68
+
69
+ | Family | Formula | Tasks |
70
+ |---|---|---|
71
+ | Linear softmax | `softmax(z(X)W + b)`, cross-entropy, L2 | `timeline_action`, `timeline_subtask`, `transition_detection`, `next_action`, `contact_prediction`, `temporal_order`, `misalignment_detection` |
72
+ | Ridge regression/projection | dual ridge regression with L2=10 on z-scored X/Y | `hand_trajectory_forecast`, `caption_grounding`, `cross_modal_retrieval`, `modality_reconstruction` |
73
+ | Multi-label logistic | `sigmoid(z(X)W + b)`, weighted object heads | `object_relevance` |
74
+
75
+ Task-specific architecture details:
76
+
77
+ | Task | Input tensor/vector | Minimal head | Output target |
78
+ |---|---|---|---|
79
+ | `timeline_action` | `X_all`, 8,378d | class-weighted linear softmax | current action label |
80
+ | `timeline_subtask` | `X_all`, 8,378d | class-weighted linear softmax | current subtask label |
81
+ | `transition_detection` | `X_all`, 8,378d | class-weighted linear softmax | steady vs transition near action boundary |
82
+ | `next_action` | `X_all(t)`, 8,378d | class-weighted linear softmax | action at `t+20` frames |
83
+ | `hand_trajectory_forecast` | `X_all(t)`, 8,378d | ridge regression | future 10 frames of left/right hand joints, 1,260d |
84
+ | `contact_prediction` | all features except `body_contacts` and caption text, 7,335d | linear softmax on observed labels | any body contact in window |
85
+ | `object_relevance` | all features except caption text, 7,482d | multi-label logistic regression | 34-object multi-hot vector |
86
+ | `caption_grounding` | sensor features, 7,482d, projected into 896d text space | ridge projection plus cosine ranking | matching time window for a text query |
87
+ | `cross_modal_retrieval` | motion/IMU/camera, 2,247d, projected into 5,096d visual space | ridge projection plus cosine ranking | matching depth/video window |
88
+ | `modality_reconstruction` | motion/IMU/camera, 2,247d | ridge regression | depth/video feature vector, 5,096d |
89
+ | `temporal_order` | `[x_t, x_t+1, x_t+1-x_t]`, 25,134d | binary linear softmax | correct vs reversed order |
90
+ | `misalignment_detection` | motion plus visual pair, 7,343d | binary linear softmax | aligned vs shifted by 8 windows |
91
+
92
+ Diagram:
93
+
94
+ ```text
95
+ assets/task_architectures.svg
96
+ ```
97
+
98
+ ## Current Results
99
+
100
+ ```text
101
+ timeline_action:
102
+ accuracy: 0.0292
103
+ macro_f1: 0.0500
104
+ note: future test region contains unseen action classes
105
+
106
+ timeline_subtask:
107
+ accuracy: 0.0581
108
+ macro_f1: 0.0495
109
+ note: future test region contains unseen subtask classes
110
+
111
+ transition_detection:
112
+ accuracy: 0.9253
113
+ macro_f1: 0.6552
114
+ boundary_f1: 0.2143
115
+
116
+ next_action:
117
+ accuracy: 0.0345
118
+ macro_f1: 0.0593
119
+ note: same unseen-future-class problem as timeline_action
120
+
121
+ hand_trajectory_forecast:
122
+ MPJPE: 0.8223
123
+ final-frame MPJPE: 1.0650
124
+
125
+ contact_prediction:
126
+ accuracy: 1.0000
127
+ note: degenerate on this sample because the binary contact label has only one class
128
+
129
+ object_relevance:
130
+ micro_f1: 0.1839
131
+ macro_f1: 0.0643
132
+
133
+ caption_grounding:
134
+ top1: 0.0029
135
+ top5: 0.0115
136
+ MRR: 0.0172
137
+
138
+ cross_modal_retrieval:
139
+ top1: 0.1494
140
+ top5: 0.3764
141
+ top10: 0.4741
142
+ MRR: 0.2634
143
+
144
+ modality_reconstruction:
145
+ R2: -0.0160
146
+
147
+ temporal_order:
148
+ accuracy: 0.4612
149
+ f1: 0.5487
150
+
151
+ misalignment_detection:
152
+ accuracy: 0.5029
153
+ f1: 0.4866
154
+ ```
155
+
156
+ ## How To Read These Results
157
+
158
+ Low scores are useful here. They show which tasks are not learnable from this one chronological sample with this minimal model.
159
+
160
+ The strongest signal is `cross_modal_retrieval`: motion/IMU/camera features can retrieve the matching depth/video window better than random. This suggests that the modalities are synchronized and contain shared temporal structure.
161
+
162
+ The supervised timeline tasks score lowest mainly because of the split. The last 30% of a single ordered episode contains actions/subtasks not present in the first 70%, so a classifier trained on the first part cannot predict labels it never saw.
163
+
164
+ For serious research, keep the same task code but change the dataset unit:
165
+
166
+ ```text
167
+ many episodes -> train episodes -> test unseen episodes
168
+ ```
169
+
170
+ For single-episode learning, these tasks are best used as:
171
+
172
+ - data pipeline tests
173
+ - modality ablations
174
+ - label-alignment checks
175
+ - self-supervised retrieval experiments
176
+ - debugging templates before scaling to many episodes
notes/min_action_model.md ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Minimal Action Model
2
+
3
+ This is the first modeling baseline for the Ropedia/Xperience sample.
4
+
5
+ The script is:
6
+
7
+ ```text
8
+ scripts/train_min_action_model.py
9
+ ```
10
+
11
+ It trains a small NumPy-only softmax classifier:
12
+
13
+ ```text
14
+ annotation.hdf5
15
+ -> hand/body/IMU/camera/contact windows
16
+ -> action or subtask labels from captions
17
+ -> stratified train/test split
18
+ -> multinomial logistic regression
19
+ -> metrics and predictions
20
+ ```
21
+
22
+ Run:
23
+
24
+ ```bash
25
+ cd /path/to/Ropedia
26
+ source .venv/bin/activate
27
+ python scripts/train_min_action_model.py
28
+ ```
29
+
30
+ Default output:
31
+
32
+ ```text
33
+ outputs/min_action_model/
34
+ ```
35
+
36
+ Important artifacts:
37
+
38
+ - `metrics.json`: accuracy, balanced accuracy, macro-F1, weighted-F1, majority baseline.
39
+ - `per_class_metrics.csv`: precision/recall/F1 per action class.
40
+ - `confusion_matrix.csv`: true label vs predicted label matrix.
41
+ - `predictions.csv`: one row per test window.
42
+ - `feature_dataset.npz`: processed numeric features and labels.
43
+ - `model.npz`: fitted scaler and softmax weights.
44
+
45
+ This is a learning baseline, not a publishable benchmark. The public sample is only one episode, so stratified windows from one episode are correlated. For serious evaluation, use many episodes and split by held-out episodes or held-out task instances.
46
+
47
+ ## Current Sample Results
48
+
49
+ Action-label model:
50
+
51
+ ```text
52
+ outputs/min_action_model/
53
+ accuracy: 0.9828
54
+ balanced_accuracy: 0.9644
55
+ macro_f1: 0.9688
56
+ weighted_f1: 0.9824
57
+ majority_baseline: 0.1375
58
+ classes: 18
59
+ test_windows: 291
60
+ ```
61
+
62
+ Subtask-label model:
63
+
64
+ ```text
65
+ outputs/min_subtask_model/
66
+ accuracy: 0.9759
67
+ balanced_accuracy: 0.9784
68
+ macro_f1: 0.9528
69
+ weighted_f1: 0.9779
70
+ majority_baseline: 0.1448
71
+ classes: 14
72
+ test_windows: 290
73
+ ```
74
+
75
+ Why the numbers are high:
76
+
77
+ - This is one public sample episode.
78
+ - Windows are stratified randomly, so train/test windows can be close in time.
79
+ - The result proves the pipeline works; it does not prove cross-episode generalization.
80
+
81
+ Next serious evaluation:
82
+
83
+ ```text
84
+ many episodes -> split by episode -> train on some episodes -> test on unseen episodes
85
+ ```
notes/reproducibility_audit.md ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reproducibility Audit
2
+
3
+ Audit date: 2026-05-30 Asia/Singapore.
4
+
5
+ Purpose: verify that the committed Ropedia Episode Task Suite artifacts are
6
+ real outputs from the scripts, not placeholder or fabricated metrics.
7
+
8
+ ## Raw Inputs Checked
9
+
10
+ The audit used the local public sample episode:
11
+
12
+ ```text
13
+ data/sample/xperience-10m-sample/
14
+ annotation.hdf5
15
+ fisheye_cam0.mp4
16
+ fisheye_cam1.mp4
17
+ fisheye_cam2.mp4
18
+ fisheye_cam3.mp4
19
+ stereo_left.mp4
20
+ stereo_right.mp4
21
+ ```
22
+
23
+ `annotation.hdf5` contains 5,821 aligned frames with depth, hand mocap, body
24
+ mocap, IMU, SLAM, calibration, and caption metadata. The video feature cache was
25
+ rebuilt from all six video files during the audit.
26
+
27
+ ## Commands Re-run
28
+
29
+ All audit outputs were written outside the repo:
30
+
31
+ ```bash
32
+ AUDIT=/private/tmp/ropedia-audit
33
+ WORKSPACE=/path/to/Ropedia
34
+ ANN=$WORKSPACE/data/sample/xperience-10m-sample/annotation.hdf5
35
+ PY=$WORKSPACE/.venv/bin/python
36
+
37
+ $PY -B scripts/train_min_action_model.py \
38
+ --workspace $WORKSPACE \
39
+ --annotation $ANN \
40
+ --output-dir $AUDIT/min_action_model \
41
+ --target action
42
+
43
+ $PY -B scripts/train_min_action_model.py \
44
+ --workspace $WORKSPACE \
45
+ --annotation $ANN \
46
+ --output-dir $AUDIT/min_subtask_model \
47
+ --target subtask
48
+
49
+ $PY -B scripts/train_all_modalities_model.py \
50
+ --workspace $WORKSPACE \
51
+ --annotation $ANN \
52
+ --output-dir $AUDIT/min_all_modalities_action_model \
53
+ --cache-dir $AUDIT/cache \
54
+ --target action
55
+
56
+ $PY -B scripts/train_all_modalities_model.py \
57
+ --workspace $WORKSPACE \
58
+ --annotation $ANN \
59
+ --output-dir $AUDIT/min_all_modalities_subtask_model \
60
+ --cache-dir $AUDIT/cache \
61
+ --target subtask
62
+
63
+ $PY -B scripts/episode_task_suite.py \
64
+ --workspace $WORKSPACE \
65
+ --annotation $ANN \
66
+ --output-dir $AUDIT/episode_task_suite \
67
+ --cache-dir $AUDIT/cache
68
+ ```
69
+
70
+ ## Exact Match Checks
71
+
72
+ The regenerated files matched the committed files:
73
+
74
+ ```text
75
+ min_action_model/metrics.json: MATCH
76
+ min_subtask_model/metrics.json: MATCH
77
+ min_all_modalities_action_model/metrics.json: MATCH
78
+ min_all_modalities_subtask_model/metrics.json: MATCH
79
+ episode_task_suite/summary_report.json: MATCH
80
+ episode_task_suite/feature_manifest.json: MATCH
81
+ episode_task_suite/available_modalities.json: MATCH
82
+ ```
83
+
84
+ Every per-task `metrics.json` also matched:
85
+
86
+ ```text
87
+ caption_grounding/metrics.json: MATCH
88
+ contact_prediction/metrics.json: MATCH
89
+ cross_modal_retrieval/metrics.json: MATCH
90
+ hand_trajectory_forecast/metrics.json: MATCH
91
+ misalignment_detection/metrics.json: MATCH
92
+ modality_reconstruction/metrics.json: MATCH
93
+ next_action/metrics.json: MATCH
94
+ object_relevance/metrics.json: MATCH
95
+ temporal_order/metrics.json: MATCH
96
+ timeline_action/metrics.json: MATCH
97
+ timeline_subtask/metrics.json: MATCH
98
+ transition_detection/metrics.json: MATCH
99
+ ```
100
+
101
+ ## Fresh Cache Evidence
102
+
103
+ The all-modality audit rebuilt a fresh feature cache:
104
+
105
+ ```text
106
+ depth_n5821_grid8.npz: shape=(5821, 140), nonzero=809107
107
+ video_fisheye_cam0_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570458
108
+ video_fisheye_cam1_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570400
109
+ video_fisheye_cam2_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570458
110
+ video_fisheye_cam3_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=568723
111
+ video_stereo_left_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570249
112
+ video_stereo_right_n5821_img32_grid8_hist8.npz: shape=(5821, 98), nonzero=570430
113
+ ```
114
+
115
+ This confirms the committed metrics are reproducible from the raw sample and
116
+ that the all-modality pipeline reads real depth/video files instead of using
117
+ empty placeholder features.
118
+
119
+ ## Caveats
120
+
121
+ The scripts contain a zero-feature fallback if a video file is missing. That is
122
+ not the path used in this audit: all six videos existed and produced nonzero
123
+ features. The repo remains a single-episode learning and pipeline-validation
124
+ project, not evidence of cross-episode generalization.
scripts/episode_task_suite.py ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ End-to-end task suite for one Ropedia/Xperience episode.
4
+
5
+ The purpose is not to prove generalization from one sample episode. It is to
6
+ turn the episode into multiple meaningful supervised/self-supervised learning
7
+ problems and write reproducible artifacts for each one.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import csv
14
+ import json
15
+ import math
16
+ import sys
17
+ from collections import Counter, OrderedDict
18
+ from pathlib import Path
19
+
20
+ import numpy as np
21
+
22
+ from train_all_modalities_model import (
23
+ extract_all_window_features,
24
+ prepare_modalities,
25
+ )
26
+ from train_min_action_model import (
27
+ add_toolkit_to_path,
28
+ compute_metrics,
29
+ encode_labels,
30
+ fit_scaler,
31
+ frame_label,
32
+ majority_label,
33
+ predict,
34
+ portable_path,
35
+ softmax,
36
+ train_softmax_classifier,
37
+ )
38
+
39
+
40
# Names of every task in this suite. These are the values accepted by the
# --tasks option; selected_tasks() expands the special value "all" to this
# whole list and rejects any name that is not listed here.
TASKS = [
    "timeline_action",
    "timeline_subtask",
    "transition_detection",
    "next_action",
    "hand_trajectory_forecast",
    "contact_prediction",
    "object_relevance",
    "caption_grounding",
    "cross_modal_retrieval",
    "modality_reconstruction",
    "temporal_order",
    "misalignment_detection",
]
54
+
55
+
56
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the episode task suite.

    Path defaults are resolved relative to the directory one level above the
    directory that contains this script. Options are registered in a fixed
    order so the generated ``--help`` text stays stable.

    Returns:
        The namespace ``argparse`` builds from the process arguments.
    """
    repo_root = Path(__file__).resolve().parents[1]
    sample_annotation = repo_root / "data/sample/xperience-10m-sample/annotation.hdf5"

    # (flag, add_argument keyword arguments), in registration order.
    option_table = [
        # Input and output locations.
        ("--workspace", {"type": Path, "default": repo_root}),
        ("--annotation", {"type": Path, "default": sample_annotation}),
        ("--output-dir", {"type": Path, "default": repo_root / "outputs/episode_task_suite"}),
        ("--cache-dir", {"type": Path, "default": repo_root / "outputs/feature_cache"}),
        # Windowing and train/test split.
        ("--window-frames", {"type": int, "default": 20}),
        ("--stride-frames", {"type": int, "default": 5}),
        ("--min-label-fraction", {"type": float, "default": 0.6}),
        ("--test-fraction", {"type": float, "default": 0.30}),
        # Optimisation settings.
        ("--epochs", {"type": int, "default": 400}),
        ("--learning-rate", {"type": float, "default": 0.12}),
        ("--l2", {"type": float, "default": 2e-3}),
        ("--ridge-l2", {"type": float, "default": 10.0}),
        ("--seed", {"type": int, "default": 7}),
        # Task-specific knobs.
        ("--future-frames", {"type": int, "default": 20, "help": "Future offset for next-action prediction."}),
        ("--forecast-frames", {"type": int, "default": 10, "help": "Future hand trajectory length."}),
        ("--boundary-tolerance-frames", {"type": int, "default": 10}),
        ("--misalignment-shift-windows", {"type": int, "default": 8}),
        ("--tasks", {"default": "all", "help": "Comma-separated task list or 'all'."}),
        # Match train_all_modalities_model defaults used by prepare_modalities.
        ("--force-rebuild-cache", {"action": "store_true"}),
        ("--video-image-size", {"type": int, "default": 32}),
        ("--video-grid-size", {"type": int, "default": 8}),
        ("--video-hist-bins", {"type": int, "default": 8}),
        ("--depth-grid-size", {"type": int, "default": 8}),
        ("--text-hash-dim", {"type": int, "default": 128}),
        ("--include-label-text", {"action": "store_true"}),
        ("--no-class-weights", {"action": "store_true"}),
    ]

    cli = argparse.ArgumentParser(description="Run an end-to-end task suite on one Ropedia episode.")
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
89
+
90
+
91
def selected_tasks(spec: str) -> list[str]:
    """Resolve the ``--tasks`` option into a list of task names.

    Args:
        spec: Either ``"all"`` (case-insensitive, surrounding whitespace
            ignored) or a comma-separated list of task names. Blank entries
            are ignored and repeated names are kept once, in first-seen order.

    Returns:
        A new list of task names. For ``"all"`` this is a copy of ``TASKS``,
        so callers may modify the result without altering the module-level
        list. A spec with no names yields an empty list.

    Raises:
        ValueError: If any requested name is not in ``TASKS``.
    """
    if spec.strip().lower() == "all":
        # Copy: handing out TASKS itself would let a caller mutate the global.
        return list(TASKS)
    stripped = (part.strip() for part in spec.split(","))
    # dict.fromkeys drops duplicates while keeping the requested order.
    chosen = list(dict.fromkeys(name for name in stripped if name))
    unknown = [name for name in chosen if name not in TASKS]
    if unknown:
        raise ValueError(f"Unknown tasks: {unknown}. Valid tasks: {TASKS}")
    return chosen
99
+
100
+
101
def write_json(path: Path, data: dict | list) -> None:
    """Serialize *data* as 2-space-indented JSON into *path* (UTF-8).

    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Serialize before opening so a failed dump never truncates the target.
    serialized = json.dumps(data, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
104
+
105
+
106
def write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> None:
    """Write *rows* to *path* as a UTF-8 CSV with a header row.

    Args:
        path: Destination file; missing parent directories are created.
        rows: One mapping per CSV line.
        fieldnames: Column names, in output order.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        table = csv.DictWriter(handle, fieldnames=fieldnames)
        table.writeheader()
        for record in rows:
            table.writerow(record)
112
+
113
+
114
def write_confusion(path: Path, cm: np.ndarray, class_names: list[str]) -> None:
    """Write a confusion matrix to *path* as a UTF-8 CSV.

    The first row is a header whose corner cell is ``true\\pred`` followed by
    the class names. Each later row starts with a class name and lists the
    integer counts from the matching row of *cm*.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    header = ["true\\pred"] + class_names
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(
            [label] + [int(count) for count in cm[row_id]]
            for row_id, label in enumerate(class_names)
        )
121
+
122
+
123
def chronological_split_indices(n: int, test_fraction: float) -> tuple[np.ndarray, np.ndarray]:
    """Split ``range(n)`` into an earlier train part and a later test part.

    Args:
        n: Number of samples, in time order.
        test_fraction: Share of samples placed in the trailing test part.

    Returns:
        ``(train_indices, test_indices)`` as ``int64`` arrays. The cut point
        is clamped so that each part holds at least one sample.

    Raises:
        ValueError: If ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError("Need at least two samples for train/test split.")
    train_count = int(round(n * (1.0 - test_fraction)))
    # Clamp into [1, n - 1] so neither side of the split is empty.
    if train_count > n - 1:
        train_count = n - 1
    if train_count < 1:
        train_count = 1
    train_part = np.arange(0, train_count, dtype=np.int64)
    test_part = np.arange(train_count, n, dtype=np.int64)
    return train_part, test_part
129
+
130
+
131
def build_windows(args: argparse.Namespace, ann: dict, extras: dict):
    """Slide a fixed-size window over the episode and featurize each position.

    Windows start at frame 0 and advance by ``args.stride_frames``; only
    windows that fit entirely inside the episode are produced. Each window
    gets a majority action label and a majority subtask label (subject to
    ``args.min_label_fraction``) plus one feature vector.

    Returns:
        A tuple ``(X, rows, feature_manifest)``: the stacked ``float32``
        feature matrix, one metadata dict per window, and a list describing
        each feature block as ``{"name", "start", "end", "dim"}`` with
        half-open column offsets.
    """
    frame_info = ann["caption_frame_info_map"]
    total_frames = len(ann["img_names"])
    span = args.window_frames

    vectors = []
    records = []
    manifest = None

    for first in range(0, total_frames - span + 1, args.stride_frames):
        stop = first + span
        per_frame_action = [frame_label(frame_info.get(f, {}), "action") for f in range(first, stop)]
        per_frame_subtask = [frame_label(frame_info.get(f, {}), "subtask") for f in range(first, stop)]
        action, action_frac = majority_label(per_frame_action, args.min_label_fraction)
        subtask, subtask_frac = majority_label(per_frame_subtask, args.min_label_fraction)

        if manifest is not None:
            vec = extract_all_window_features(ann, extras, first, stop)
        else:
            # Only the first window asks for the block layout; it is then
            # turned into running column offsets.
            vec, blocks = extract_all_window_features(ann, extras, first, stop, return_blocks=True)
            manifest = []
            cursor = 0
            for block_name, width in blocks:
                manifest.append({"name": block_name, "start": cursor, "end": cursor + width, "dim": width})
                cursor += width

        record = {
            "window_index": len(records),
            "start_frame": first,
            "end_frame": stop - 1,
            "center_frame": (first + stop - 1) // 2,
            "action_label": action,
            "action_fraction": action_frac,
            "subtask_label": subtask,
            "subtask_fraction": subtask_frac,
        }
        vectors.append(vec)
        records.append(record)

    features = np.stack(vectors).astype(np.float32)
    return features, records, manifest or []
168
+
169
+
170
def block_indices(feature_manifest: list[dict], include: list[str] | None = None, exclude: list[str] | None = None) -> np.ndarray:
    """Return the feature-column indices of the blocks selected by name.

    Args:
        feature_manifest: Block descriptors with ``name``, ``start`` and
            ``end`` keys, where ``end`` is exclusive.
        include: Name prefixes to keep. When empty or ``None``, every block
            is a candidate.
        exclude: Name prefixes to drop; applied after ``include``.

    Returns:
        An ``int64`` array of column indices in manifest order (empty when no
        block is selected).
    """
    keep_prefixes = tuple(include or ())
    drop_prefixes = tuple(exclude or ())

    spans = []
    for entry in feature_manifest:
        block_name = entry["name"]
        # startswith with a tuple is true when any prefix matches; an exact
        # name match is simply the full-length prefix case.
        if keep_prefixes and not block_name.startswith(keep_prefixes):
            continue
        if drop_prefixes and block_name.startswith(drop_prefixes):
            continue
        spans.append(np.arange(int(entry["start"]), int(entry["end"]), dtype=np.int64))

    if not spans:
        return np.asarray([], dtype=np.int64)
    return np.concatenate(spans)
182
+
183
+
184
def label_array(rows: list[dict], key: str) -> np.ndarray:
    """Collect ``row[key]`` from every row into an object array of strings.

    Missing keys and falsy values (``None``, ``""``, ``0``) become the empty
    string; every other value is converted with ``str``.
    """
    labels = np.empty(len(rows), dtype=object)
    for position, record in enumerate(rows):
        value = record.get(key, "")
        labels[position] = str(value) if value else ""
    return labels
186
+
187
+
188
def classification_task(
    out_dir: Path,
    X: np.ndarray,
    labels: np.ndarray,
    rows: list[dict],
    args: argparse.Namespace,
    task_name: str,
    input_description: str,
) -> dict:
    """Train and evaluate one softmax classification task, writing artifacts.

    Windows with an empty label are dropped. The remaining windows are split
    chronologically, the scaler is fitted on the training part only, and a
    softmax classifier is trained and scored on the test part.

    Args:
        out_dir: Directory that receives ``metrics.json``,
            ``per_class_metrics.csv``, ``confusion_matrix.csv``,
            ``predictions.csv`` and ``model.npz``; created if missing.
        X: Feature matrix with one row per window.
        labels: One label per window; falsy entries mark unlabeled windows.
        rows: Per-window metadata, aligned with ``X`` and ``labels``.
        args: Parsed options; reads ``test_fraction``, ``epochs``,
            ``learning_rate``, ``l2``, ``no_class_weights`` and ``seed``.
        task_name: Stored under ``"task"`` in the metrics.
        input_description: Stored under ``"input"`` in the metrics.

    Returns:
        The metrics dictionary that was written to ``metrics.json``.

    Raises:
        ValueError: From ``chronological_split_indices`` when fewer than two
            labeled windows remain.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Keep only the windows that carry a non-empty label.
    valid = np.asarray([bool(x) for x in labels])
    valid_idx = np.flatnonzero(valid)
    Xv = X[valid_idx]
    labelv = labels[valid_idx]
    rowv = [rows[int(i)] for i in valid_idx]
    y, class_names = encode_labels(labelv)
    train_local, test_local = chronological_split_indices(len(y), args.test_fraction)

    # Classes that occur only in the later (test) part cannot be learned;
    # they are reported so that low scores can be interpreted.
    train_classes = set(int(x) for x in y[train_local])
    test_classes = set(int(x) for x in y[test_local])
    unseen_test_classes = sorted(class_names[i] for i in (test_classes - train_classes))

    # Scaler statistics come from the training windows only.
    mean, std = fit_scaler(Xv[train_local])
    Xs = (Xv - mean) / std
    W, b, history = train_softmax_classifier(
        Xs[train_local],
        y[train_local],
        n_classes=len(class_names),
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )
    pred, probs = predict(Xs[test_local], W, b)
    metrics, per_class, cm = compute_metrics(y[test_local], pred, class_names)
    # Baseline: always predict the most frequent training class.
    majority = Counter(y[train_local]).most_common(1)[0][0]
    metrics.update({
        "task": task_name,
        "input": input_description,
        "split": "chronological",
        "num_windows": int(len(y)),
        "num_train_windows": int(len(train_local)),
        "num_test_windows": int(len(test_local)),
        "num_classes": int(len(class_names)),
        "feature_dim": int(X.shape[1]),
        "majority_baseline_accuracy": float(np.mean(y[test_local] == majority)),
        "train_final_accuracy": float(history[-1]["train_accuracy"]),
        "train_final_loss": float(history[-1]["loss"]),
        "unseen_test_classes": unseen_test_classes,
    })

    pred_rows = []
    # ``probs`` is aligned with ``test_local``, so the enumerate position is
    # the probability row; this avoids a list search per test window.
    for test_pos, (local_pos, pred_id) in enumerate(zip(test_local, pred)):
        row = rowv[int(local_pos)]
        true_id = int(y[int(local_pos)])
        pred_id = int(pred_id)
        pred_rows.append({
            "window_index": row["window_index"],
            "start_frame": row["start_frame"],
            "end_frame": row["end_frame"],
            "center_frame": row["center_frame"],
            "true_label": class_names[true_id],
            "predicted_label": class_names[pred_id],
            "confidence": float(probs[test_pos, pred_id]),
            "correct": int(true_id == pred_id),
        })

    write_json(out_dir / "metrics.json", metrics)
    write_csv(out_dir / "per_class_metrics.csv", per_class, ["class_id", "class_name", "support", "predicted", "precision", "recall", "f1"])
    write_confusion(out_dir / "confusion_matrix.csv", cm, class_names)
    write_csv(out_dir / "predictions.csv", pred_rows, ["window_index", "start_frame", "end_frame", "center_frame", "true_label", "predicted_label", "confidence", "correct"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b, class_names=np.asarray(class_names, dtype=object))
    return metrics
261
+
262
+
263
def binary_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Summarize binary predictions as confusion counts and derived scores.

    Both inputs are cast to int64. Only the values 0 and 1 are counted, so any
    other value contributes to none of tp/tn/fp/fn.

    Returns:
        Dict with accuracy, precision, recall, f1, the four confusion counts,
        and the mean of the true and of the predicted labels. Any ratio whose
        denominator would be zero is reported as 0.0.
    """
    truth = y_true.astype(np.int64)
    guess = y_pred.astype(np.int64)

    truth_pos, truth_neg = truth == 1, truth == 0
    guess_pos, guess_neg = guess == 1, guess == 0
    tp = int(np.count_nonzero(truth_pos & guess_pos))
    tn = int(np.count_nonzero(truth_neg & guess_neg))
    fp = int(np.count_nonzero(truth_neg & guess_pos))
    fn = int(np.count_nonzero(truth_pos & guess_neg))

    precision = 0.0
    if tp + fp:
        precision = tp / (tp + fp)
    recall = 0.0
    if tp + fn:
        recall = tp / (tp + fn)
    f1 = 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)

    n_true, n_pred = len(truth), len(guess)
    return {
        "accuracy": float((tp + tn) / max(n_true, 1)),
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "positive_rate_true": float(np.mean(truth)) if n_true else 0.0,
        "positive_rate_pred": float(np.mean(guess)) if n_pred else 0.0,
    }
285
+
286
+
287
def boundary_f1(true_frames: list[int], pred_frames: list[int], tolerance: int) -> dict:
    """Score predicted boundary frames against true ones with a frame tolerance.

    Predictions are processed in the given order. Each one claims the closest
    still-unclaimed true boundary within ``tolerance`` frames (ties go to the
    earliest position in ``true_frames``); a true boundary is matched at most
    once.

    Returns:
        Dict with boundary precision/recall/F1, the match and boundary counts,
        and the mean absolute timing error of the matches (None if no match).
    """
    claimed = set()
    timing_errors = []
    for predicted in pred_frames:
        best = None  # (distance, index into true_frames) of the best free match
        for idx, actual in enumerate(true_frames):
            if idx in claimed:
                continue
            distance = abs(predicted - actual)
            if distance > tolerance:
                continue
            # Strict "<" keeps the earliest index when distances tie.
            if best is None or distance < best[0]:
                best = (distance, idx)
        if best is None:
            continue
        claimed.add(best[1])
        timing_errors.append(best[0])

    matches = len(timing_errors)
    precision = matches / len(pred_frames) if pred_frames else 0.0
    recall = matches / len(true_frames) if true_frames else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    mean_error = float(np.mean(timing_errors)) if timing_errors else None
    return {
        "boundary_precision": precision,
        "boundary_recall": recall,
        "boundary_f1": f1,
        "matched_boundaries": matches,
        "true_boundaries": len(true_frames),
        "predicted_boundaries": len(pred_frames),
        "mean_abs_timing_error_frames": mean_error,
    }
311
+
312
+
313
def task_transition_detection(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Train a transition/steady window classifier and add boundary-level scores.

    A true boundary is a frame whose action label differs from the previous
    frame's label, with both labels non-empty. A window is labelled
    "transition" when its center frame lies within
    ``args.boundary_tolerance_frames`` of any true boundary. After the shared
    classification task has written ``predictions.csv``, the predicted
    transition centers are scored against the true boundaries that fall inside
    the test span, and ``metrics.json`` is rewritten with the extra fields.
    """
    frame_info = ann["caption_frame_info_map"]
    n_frames = len(ann["img_names"])
    actions = [frame_label(frame_info.get(idx, {}), "action") for idx in range(n_frames)]

    true_boundaries = []
    for idx in range(1, n_frames):
        previous, current = actions[idx - 1], actions[idx]
        if current and previous and current != previous:
            true_boundaries.append(idx)

    names = []
    for row in rows:
        center = int(row["center_frame"])
        near_boundary = any(abs(center - b) <= args.boundary_tolerance_frames for b in true_boundaries)
        names.append("transition" if near_boundary else "steady")
    labels = np.asarray(names, dtype=object)
    metrics = classification_task(out_dir, X, labels, rows, args, "transition_detection", "all modalities -> action boundary/steady")

    # Re-read the per-window predictions written by classification_task.
    with (out_dir / "predictions.csv").open("r", encoding="utf-8") as fp:
        pred_rows = list(csv.DictReader(fp))
    pred_frames = [int(r["center_frame"]) for r in pred_rows if r["predicted_label"] == "transition"]
    centers = [int(r["center_frame"]) for r in pred_rows]
    test_start = min(centers, default=0)
    test_end = max(centers, default=0)
    # Only boundaries inside the test span can be recalled by test predictions.
    true_test = [b for b in true_boundaries if test_start <= b <= test_end]

    metrics.update(boundary_f1(true_test, pred_frames, args.boundary_tolerance_frames))
    write_json(out_dir / "metrics.json", metrics)
    write_csv(out_dir / "true_boundaries.csv", [{"frame": x} for x in true_boundaries], ["frame"])
    return metrics
339
+
340
+
341
def task_next_action(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Classify the action label found ``args.future_frames`` after each window.

    The look-ahead frame is clamped to the last frame index, so windows near
    the end of the episode are labelled with the final frame's action.
    """
    frame_info = ann["caption_frame_info_map"]

    def _future_label(row: dict):
        target = int(row["end_frame"]) + args.future_frames
        clamped = min(len(ann["img_names"]) - 1, target)
        return frame_label(frame_info.get(clamped, {}), "action")

    labels = np.asarray([_future_label(row) for row in rows], dtype=object)
    description = f"all modalities at t -> action at t+{args.future_frames} frames"
    return classification_task(out_dir, X, labels, rows, args, "next_action", description)
348
+
349
+
350
def ridge_fit_predict(X_train: np.ndarray, Y_train: np.ndarray, X_test: np.ndarray, l2: float):
    """Fit a ridge regressor on standardized data and predict the test targets.

    Inputs are scaled with the statistics from ``fit_scaler(X_train)``. Targets
    are standardized per column, and a standard deviation below 1e-6 is
    replaced by 1.0. A constant column is appended as the bias feature, so the
    bias is penalized along with the other weights. The system is solved on the
    train-by-train Gram matrix, and the result is mapped back to feature-space
    weights ``W``.

    Returns:
        ``(pred, model)``: ``pred`` is the float32 prediction in the original
        target units; ``model`` holds x_mean, x_std, y_mean, y_std and W.
    """
    x_mean, x_std = fit_scaler(X_train)
    y_mean = Y_train.mean(axis=0)
    y_std = Y_train.std(axis=0)
    y_std = np.where(y_std < 1e-6, 1.0, y_std)

    def _scaled_with_bias(M: np.ndarray) -> np.ndarray:
        scaled = (M - x_mean) / x_std
        bias = np.ones((len(scaled), 1), dtype=np.float32)
        return np.concatenate([scaled, bias], axis=1)

    train_aug = _scaled_with_bias(X_train)
    test_aug = _scaled_with_bias(X_test)
    targets = (Y_train - y_mean) / y_std

    gram = train_aug @ train_aug.T
    dual = np.linalg.solve(gram + l2 * np.eye(gram.shape[0], dtype=np.float32), targets)
    W = train_aug.T @ dual

    pred = (test_aug @ W) * y_std + y_mean
    model = {
        "x_mean": x_mean,
        "x_std": x_std,
        "y_mean": y_mean.astype(np.float32),
        "y_std": y_std.astype(np.float32),
        "W": W.astype(np.float32),
    }
    return pred.astype(np.float32), model
365
+
366
+
367
def regression_metrics(Y_true: np.ndarray, Y_pred: np.ndarray) -> dict:
    """Return MSE, MAE and R^2 pooled over every element of the target arrays.

    R^2 uses the per-column mean of ``Y_true`` as the baseline and is reported
    as 0.0 when the targets have no variance.
    """
    residual = Y_true - Y_pred
    squared = residual ** 2
    centered = Y_true - Y_true.mean(axis=0)

    ss_res = float(np.sum(squared))
    ss_tot = float(np.sum(centered ** 2))
    if ss_tot > 0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        r2 = 0.0
    return {
        "mse": float(np.mean(squared)),
        "mae": float(np.mean(np.abs(residual))),
        "r2": r2,
    }
374
+
375
+
376
def task_hand_forecast(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, args: argparse.Namespace) -> dict:
    """Forecast future left/right hand joints from each window with ridge regression.

    For every window, the target is the concatenated left+right hand joints for
    the ``args.forecast_frames`` frames that follow the window. Windows whose
    future horizon runs past the available hand data are dropped. When body
    joints cover the horizon, the first body joint is subtracted from the hand
    joints.

    The joint layout is taken from the data instead of assuming 42 joints, so
    hand annotations with a different joint count are scored correctly.

    Raises:
        ValueError: if either hand array is missing, or if no window has a
            complete future horizon.
    """
    left = ann.get("hand_left_joints")
    right = ann.get("hand_right_joints")
    body = ann.get("smplh_body_joints")
    if left is None or right is None:
        raise ValueError("Hand joints not available.")

    horizon = args.forecast_frames
    # Both hands must cover the horizon, otherwise the concatenate below fails.
    n_frames = min(len(left), len(right))
    valid_idx, targets = [], []
    joint_shape = None  # per-frame layout of the target, e.g. (joints, 3)
    for i, row in enumerate(rows):
        future_start = int(row["end_frame"]) + 1
        future_end = future_start + horizon
        if future_end > n_frames:
            continue
        hand = np.concatenate([left[future_start:future_end], right[future_start:future_end]], axis=1)
        # NOTE(review): if body is shorter than the hand arrays, late windows
        # stay in absolute coordinates while earlier ones are root-relative.
        if body is not None and future_end <= len(body):
            hand = hand - body[future_start:future_end, :1, :]
        if joint_shape is None:
            joint_shape = tuple(hand.shape[1:])
        valid_idx.append(i)
        targets.append(hand.reshape(-1))

    if not targets:
        raise ValueError(f"No windows have {horizon} future frames of hand joints available.")

    valid_idx = np.asarray(valid_idx, dtype=np.int64)
    Y = np.stack(targets).astype(np.float32)
    train, test = chronological_split_indices(len(valid_idx), args.test_fraction)
    pred, model = ridge_fit_predict(X[valid_idx[train]], Y[train], X[valid_idx[test]], args.ridge_l2)
    metrics = regression_metrics(Y[test], pred)

    # Restore (window, frame, joint, coordinate) layout for per-joint errors.
    trajectory_shape = (len(test), horizon) + joint_shape
    true_hand = Y[test].reshape(trajectory_shape)
    pred_hand = pred.reshape(trajectory_shape)
    mpjpe = np.linalg.norm(true_hand - pred_hand, axis=-1).mean()
    final_error = np.linalg.norm(true_hand[:, -1] - pred_hand[:, -1], axis=-1).mean()
    metrics.update({
        "task": "hand_trajectory_forecast",
        "input": "all modalities at t -> future left/right hand 3D joints",
        "split": "chronological",
        "num_windows": int(len(valid_idx)),
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "forecast_frames": int(horizon),
        "mpjpe": float(mpjpe),
        "final_frame_mpjpe": float(final_error),
        "target_dim": int(Y.shape[1]),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "predictions.npz", y_true=Y[test], y_pred=pred, test_window_indices=valid_idx[test], **model)
    return metrics
422
+
423
+
424
def task_contact_prediction(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Classify whether any contact value is positive inside each window.

    The "body_contacts" and "caption_objects_interaction_text" feature blocks
    are excluded from the input features.

    Raises:
        ValueError: if the annotation has no "contacts" entry.
    """
    contacts = ann.get("contacts")
    if contacts is None:
        raise ValueError("Contacts not available.")

    def _window_label(row: dict) -> str:
        first, last = int(row["start_frame"]), int(row["end_frame"])
        segment = contacts[first:last + 1]
        if np.any(segment > 0):
            return "contact"
        return "no_contact"

    labels = np.asarray([_window_label(row) for row in rows], dtype=object)
    keep = block_indices(manifest, exclude=["body_contacts", "caption_objects_interaction_text"])
    return classification_task(out_dir, X[:, keep], labels, rows, args, "contact_prediction", "all non-contact/non-caption-label modalities -> any body contact")
434
+
435
+
436
def extract_objects(info: dict) -> list[str]:
    """Return the cleaned object names stored under ``info["objects"]``.

    A list value yields one name per element; any other truthy value is treated
    as a single name. Every name is converted with ``str`` and stripped, and
    names that are empty after stripping are dropped. This now also applies to
    a scalar value, so a whitespace-only string no longer produces ``[""]``.
    """
    objects = info.get("objects")
    if isinstance(objects, list):
        candidates = objects
    elif objects:
        candidates = [objects]
    else:
        return []
    names = (str(item).strip() for item in candidates)
    return [name for name in names if name]
443
+
444
+
445
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Logistic function with the input clipped to [-40, 40] before exponentiation."""
    bounded = np.clip(z, -40, 40)
    return 1.0 / (1.0 + np.exp(-bounded))
447
+
448
+
449
def train_multilabel_logistic(X: np.ndarray, Y: np.ndarray, epochs: int, lr: float, l2: float, seed: int):
    """Train independent per-label logistic regressors with full-batch gradient descent.

    Positive entries are up-weighted per label by negatives/positives, clipped
    to [1, 20]. Metrics are logged at the first and last epoch and roughly
    every fifth of the run, using the predictions made before that epoch's
    weight update.

    Returns:
        ``(W, b, history)``: float32 weights of shape (features, labels), a
        float32 bias per label, and the list of logged metric dicts.
    """
    rng = np.random.default_rng(seed)
    n_samples, n_features = X.shape
    n_labels = Y.shape[1]
    W = rng.normal(0, 0.01, size=(n_features, n_labels)).astype(np.float32)
    b = np.zeros(n_labels, dtype=np.float32)

    positives = Y.sum(axis=0)
    pos_weight = np.clip((n_samples - positives) / np.maximum(positives, 1.0), 1.0, 20.0).astype(np.float32)
    # The per-entry weights depend only on Y, so they are built once up front.
    entry_weights = np.where(Y > 0, pos_weight[None, :], 1.0)

    log_every = max(1, epochs // 5)
    history = []
    for epoch in range(1, epochs + 1):
        P = sigmoid(X @ W + b)
        grad = (P - Y) * entry_weights / n_samples
        # In-place updates keep W and b in float32.
        W -= lr * (X.T @ grad + l2 * W)
        b -= lr * grad.sum(axis=0)
        if epoch in (1, epochs) or epoch % log_every == 0:
            hard = (P >= 0.5).astype(np.float32)
            history.append({"epoch": epoch, **multilabel_metrics(Y, hard)})
    return W.astype(np.float32), b.astype(np.float32), history
469
+
470
+
471
def multilabel_metrics(Y: np.ndarray, P: np.ndarray) -> dict:
    """Compute micro/macro F1, exact-match rate, and micro precision/recall.

    Both arrays are cast to int64 and compared as 0/1 indicator matrices with
    one row per sample and one column per label. Macro F1 is the unweighted
    mean of the per-column F1 scores; a column with no true or predicted
    positives scores 0.0.
    """
    truth = Y.astype(np.int64)
    guess = P.astype(np.int64)

    hits = (truth == 1) & (guess == 1)
    false_alarms = (truth == 0) & (guess == 1)
    misses = (truth == 1) & (guess == 0)

    tp = int(np.sum(hits))
    fp = int(np.sum(false_alarms))
    fn = int(np.sum(misses))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    micro_f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    per_f1 = []
    for col in range(truth.shape[1]):
        tp_c = np.sum(hits[:, col])
        fp_c = np.sum(false_alarms[:, col])
        fn_c = np.sum(misses[:, col])
        p_c = tp_c / (tp_c + fp_c) if tp_c + fp_c else 0.0
        r_c = tp_c / (tp_c + fn_c) if tp_c + fn_c else 0.0
        per_f1.append(2 * p_c * r_c / (p_c + r_c) if p_c + r_c else 0.0)

    exact = float(np.mean(np.all(truth == guess, axis=1)))
    return {
        "micro_f1": float(micro_f1),
        "macro_f1": float(np.mean(per_f1)),
        "exact_match": exact,
        "precision": precision,
        "recall": recall,
    }
490
+
491
+
492
def task_object_relevance(out_dir: Path, X: np.ndarray, rows: list[dict], ann: dict, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Predict the set of caption objects mentioned inside each window.

    The object vocabulary is built in first-seen order over all windows. Input
    features exclude the "caption_objects_interaction_text" block. A
    multi-label logistic model is trained on the chronological train split;
    test rows with no label above 0.5 are forced to emit their single most
    probable object. Writes metrics, vocabulary, predictions and the model
    under ``out_dir``.

    Raises:
        ValueError: if no object label is found in any window.
    """
    frame_info = ann["caption_frame_info_map"]
    vocab = OrderedDict()
    window_objects = []
    for row in rows:
        seen = Counter()
        for frame in range(int(row["start_frame"]), int(row["end_frame"]) + 1):
            seen.update(extract_objects(frame_info.get(frame, {})))
        present = [name for name, count in seen.items() if count > 0]
        for name in present:
            # len(vocab) is evaluated before insertion, giving the next free id.
            vocab.setdefault(name, len(vocab))
        window_objects.append(present)
    if not vocab:
        raise ValueError("No object labels found.")

    Y = np.zeros((len(rows), len(vocab)), dtype=np.float32)
    for i, present in enumerate(window_objects):
        for name in present:
            Y[i, vocab[name]] = 1.0

    keep = block_indices(manifest, exclude=["caption_objects_interaction_text"])
    features = X[:, keep]
    train, test = chronological_split_indices(len(rows), args.test_fraction)
    mean, std = fit_scaler(features[train])
    scaled = (features - mean) / std
    # NOTE(review): the learning rate is fixed at 0.05 here rather than taken
    # from args.learning_rate; confirm this is intentional.
    W, b, history = train_multilabel_logistic(scaled[train], Y[train], args.epochs, 0.05, args.l2, args.seed)

    prob = sigmoid(scaled[test] @ W + b)
    pred = (prob >= 0.5).astype(np.float32)
    # Ensure at least one object is emitted per row.
    empty = np.where(pred.sum(axis=1) == 0)[0]
    if len(empty):
        pred[empty, np.argmax(prob[empty], axis=1)] = 1

    metrics = multilabel_metrics(Y[test], pred)
    metrics.update({
        "task": "object_relevance",
        "input": "all non-caption modalities -> current relevant object set",
        "split": "chronological",
        "num_windows": int(len(rows)),
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "num_objects": int(len(vocab)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    write_json(out_dir / "object_vocab.json", list(vocab.keys()))

    names = list(vocab.keys())

    def _names_of(indicator: np.ndarray) -> str:
        return "|".join([names[j] for j in np.flatnonzero(indicator > 0)])

    rows_out = []
    for local_i, global_i in enumerate(test):
        source = rows[int(global_i)]
        true_text = _names_of(Y[global_i])
        pred_text = _names_of(pred[local_i])
        rows_out.append({
            "window_index": int(global_i),
            "start_frame": source["start_frame"],
            "end_frame": source["end_frame"],
            "true_objects": true_text,
            "predicted_objects": pred_text,
        })
    write_csv(out_dir / "predictions.csv", rows_out, ["window_index", "start_frame", "end_frame", "true_objects", "predicted_objects"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b, object_vocab=np.asarray(names, dtype=object), history=np.asarray(history, dtype=object))
    return metrics
552
+
553
+
554
def normalize_rows(A: np.ndarray) -> np.ndarray:
    """Scale each row of ``A`` to unit L2 norm, flooring the norm at 1e-8."""
    lengths = np.linalg.norm(A, axis=1, keepdims=True)
    safe_lengths = np.maximum(lengths, 1e-8)
    return A / safe_lengths
557
+
558
+
559
def retrieval_metrics(query: np.ndarray, candidates: np.ndarray, positive_indices: np.ndarray, topks=(1, 5, 10)) -> dict:
    """Rank candidates by cosine similarity and score each query's positive.

    ``positive_indices[i]`` is the candidate row that query ``i`` should
    retrieve. Ranks are 1-based positions in the descending similarity order.

    Returns:
        Dict with mrr, median_rank, mean_rank, num_queries and one
        ``top{k}_accuracy`` entry per value in ``topks``.
    """
    sims = normalize_rows(query) @ normalize_rows(candidates).T

    def _rank_of(row: int, target) -> int:
        ordering = np.argsort(-sims[row])
        return int(np.where(ordering == target)[0][0]) + 1

    ranks = np.asarray([_rank_of(i, pos) for i, pos in enumerate(positive_indices)])
    out = {
        "mrr": float(np.mean(1.0 / ranks)),
        "median_rank": float(np.median(ranks)),
        "mean_rank": float(np.mean(ranks)),
        "num_queries": int(len(ranks)),
    }
    out.update({f"top{k}_accuracy": float(np.mean(ranks <= k)) for k in topks})
    return out
578
+
579
+
580
def task_caption_grounding(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Retrieve the matching test window for each caption-text feature vector.

    A ridge model maps the non-caption feature blocks to the
    "caption_objects_interaction_text" block. On the test split, each window's
    true text features are the query, and the projected sensor features of all
    test windows are the candidates.
    """
    text_idx = block_indices(manifest, include=["caption_objects_interaction_text"])
    sensor_idx = block_indices(manifest, exclude=["caption_objects_interaction_text"])
    train, test = chronological_split_indices(len(X), args.test_fraction)

    train_block, test_block = X[train], X[test]
    pred_text, model = ridge_fit_predict(
        train_block[:, sensor_idx],
        train_block[:, text_idx],
        test_block[:, sensor_idx],
        args.ridge_l2,
    )
    # Query is true text; candidates are sensor windows projected into text space.
    positives = np.arange(len(test))
    metrics = retrieval_metrics(test_block[:, text_idx], pred_text, positives)
    metrics.update({
        "task": "caption_grounding",
        "input": "caption objects/interaction text query + candidate sensor windows",
        "output": "matching time window",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "model.npz", **model)
    return metrics
599
+
600
+
601
def task_cross_modal_retrieval(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Retrieve each test window's visual features from its motion features.

    A ridge model maps the motion/IMU/camera blocks to the depth/video blocks.
    The projected motion features are the queries, and the true visual features
    of all test windows are the candidates.
    """
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    train, test = chronological_split_indices(len(X), args.test_fraction)

    train_block, test_block = X[train], X[test]
    pred_visual, model = ridge_fit_predict(
        train_block[:, motion_idx],
        train_block[:, visual_idx],
        test_block[:, motion_idx],
        args.ridge_l2,
    )
    positives = np.arange(len(test))
    metrics = retrieval_metrics(pred_visual, test_block[:, visual_idx], positives)
    metrics.update({
        "task": "cross_modal_retrieval",
        "input": "motion/IMU/camera query",
        "output": "matching depth/video window",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "model.npz", **model)
    return metrics
619
+
620
+
621
def task_modality_reconstruction(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Regress the depth/video feature blocks from the motion/IMU/camera blocks.

    Fits ridge regression on the chronological train split, scores the test
    reconstruction with MSE/MAE/R^2, and stores the targets, predictions and
    model arrays in ``predictions.npz``.
    """
    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    train, test = chronological_split_indices(len(X), args.test_fraction)

    train_block, test_block = X[train], X[test]
    pred, model = ridge_fit_predict(
        train_block[:, motion_idx],
        train_block[:, visual_idx],
        test_block[:, motion_idx],
        args.ridge_l2,
    )
    metrics = regression_metrics(test_block[:, visual_idx], pred)
    metrics.update({
        "task": "modality_reconstruction",
        "input": "motion/IMU/camera",
        "output": "depth/video feature vector",
        "split": "chronological",
        "num_train_windows": int(len(train)),
        "num_test_windows": int(len(test)),
        "target_dim": int(len(visual_idx)),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)
    np.savez_compressed(out_dir / "predictions.npz", y_true=X[test][:, visual_idx], y_pred=pred, **model)
    return metrics
640
+
641
+
642
def binary_classification_from_arrays(out_dir: Path, X: np.ndarray, y: np.ndarray, args: argparse.Namespace, task: str, input_desc: str) -> dict:
    """Train and evaluate a two-class softmax classifier on ready-made samples.

    Features are standardized with statistics from the chronological train
    split, and class weighting is always enabled. Writes ``metrics.json``,
    ``predictions.csv`` (with the class-1 probability) and ``model.npz`` under
    ``out_dir``.
    """
    train, test = chronological_split_indices(len(y), args.test_fraction)
    mean, std = fit_scaler(X[train])
    scaled = (X - mean) / std
    W, b, history = train_softmax_classifier(
        scaled[train],
        y[train].astype(np.int64),
        n_classes=2,
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=True,
        seed=args.seed,
    )
    pred, prob = predict(scaled[test], W, b)

    metrics = binary_metrics(y[test], pred)
    metrics.update({
        "task": task,
        "input": input_desc,
        "split": "chronological",
        "num_samples": int(len(y)),
        "num_train_samples": int(len(train)),
        "num_test_samples": int(len(test)),
        "train_final_accuracy": float(history[-1]["train_accuracy"]),
    })
    out_dir.mkdir(parents=True, exist_ok=True)
    write_json(out_dir / "metrics.json", metrics)

    pred_rows = [
        {
            "sample_index": int(sample),
            "true": int(y[sample]),
            "predicted": int(pred[position]),
            "prob_positive": float(prob[position, 1]),
        }
        for position, sample in enumerate(test)
    ]
    write_csv(out_dir / "predictions.csv", pred_rows, ["sample_index", "true", "predicted", "prob_positive"])
    np.savez_compressed(out_dir / "model.npz", mean=mean, std=std, W=W, b=b)
    return metrics
675
+
676
+
677
def task_temporal_order(out_dir: Path, X: np.ndarray, args: argparse.Namespace) -> dict:
    """Classify whether two adjacent windows are presented in the correct order.

    Each adjacent pair yields two samples: (earlier, later, later - earlier)
    labelled 1, and the swapped version labelled 0.
    """
    samples, targets = [], []
    for earlier, later in zip(X[:-1], X[1:]):
        samples.append(np.concatenate([earlier, later, later - earlier]))
        targets.append(1)
        samples.append(np.concatenate([later, earlier, earlier - later]))
        targets.append(0)
    features = np.stack(samples).astype(np.float32)
    labels = np.asarray(targets, dtype=np.int64)
    return binary_classification_from_arrays(out_dir, features, labels, args, "temporal_order", "two adjacent windows -> whether order is correct")
686
+
687
+
688
def task_misalignment(out_dir: Path, X: np.ndarray, manifest: list[dict], args: argparse.Namespace) -> dict:
    """Classify whether a motion/visual feature pair comes from the same window.

    For each window ``i`` with a partner ``i + shift``, the aligned pair
    (motion_i, visual_i) is labelled 1 and the shifted pair
    (motion_i, visual_{i+shift}) is labelled 0, where ``shift`` is
    ``args.misalignment_shift_windows``.

    Raises:
        ValueError: if the shift is below 1 (negatives would equal positives or
            index out of range), or if there are not more windows than the
            shift, so no pair can be built.
    """
    shift = args.misalignment_shift_windows
    if shift < 1:
        raise ValueError(f"misalignment_shift_windows must be >= 1, got {shift}.")
    limit = len(X) - shift
    if limit < 1:
        raise ValueError(f"Need more than {shift} windows to build shifted pairs, got {len(X)}.")

    motion_idx = block_indices(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_idx = block_indices(manifest, include=["depth_confidence", "video_"])
    pairs, y = [], []
    for i in range(limit):
        motion = X[i, motion_idx]
        pairs.append(np.concatenate([motion, X[i, visual_idx]]))
        y.append(1)
        pairs.append(np.concatenate([motion, X[i + shift, visual_idx]]))
        y.append(0)
    return binary_classification_from_arrays(out_dir, np.stack(pairs).astype(np.float32), np.asarray(y, dtype=np.int64), args, "misalignment_detection", f"motion+visual pair -> aligned vs shifted by {shift} windows")
700
+
701
+
702
def main() -> int:
    """Run the selected episode tasks on one annotation and write all artifacts.

    Loads the annotation, builds the shared all-modality windows, stores the
    shared inputs (modalities, feature manifest, windows), then runs each
    selected task in its own sub-directory of ``args.output_dir``. A failing
    task is recorded in the summary and in ``<task>/error.json`` without
    stopping the remaining tasks.

    Returns:
        0 in all cases; per-task failures are reported only through the
        summary and the error files.
    """
    args = parse_args()
    add_toolkit_to_path(args.workspace)
    # Imported only after the toolkit directory has been added to sys.path.
    from data_loader import load_from_annotation_hdf5

    args.output_dir.mkdir(parents=True, exist_ok=True)
    tasks = selected_tasks(args.tasks)

    print(f"Loading annotation: {args.annotation}")
    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=True)
    extras, available_modalities = prepare_modalities(args, ann)
    print("Building shared all-modality windows")
    X, rows, manifest = build_windows(args, ann, extras)

    write_json(args.output_dir / "available_modalities.json", available_modalities)
    write_json(args.output_dir / "feature_manifest.json", manifest)
    write_csv(args.output_dir / "windows.csv", rows, ["window_index", "start_frame", "end_frame", "center_frame", "action_label", "action_fraction", "subtask_label", "subtask_fraction"])
    np.savez_compressed(args.output_dir / "shared_windows.npz", X=X, starts=np.asarray([r["start_frame"] for r in rows]), ends=np.asarray([r["end_frame"] for r in rows]))

    summary = {
        "annotation": portable_path(args.annotation, args.workspace),
        "num_frames": int(len(ann["img_names"])),
        "num_windows": int(len(rows)),
        "feature_dim": int(X.shape[1]),
        "window_frames": int(args.window_frames),
        "stride_frames": int(args.stride_frames),
        "tasks": {},
    }

    # One runner per task name; each takes the task's output directory.
    runners = {
        "timeline_action": lambda out: classification_task(out, X, label_array(rows, "action_label"), rows, args, "timeline_action", "all modalities -> current action label"),
        "timeline_subtask": lambda out: classification_task(out, X, label_array(rows, "subtask_label"), rows, args, "timeline_subtask", "all modalities -> current subtask label"),
        "transition_detection": lambda out: task_transition_detection(out, X, rows, ann, args),
        "next_action": lambda out: task_next_action(out, X, rows, ann, args),
        "hand_trajectory_forecast": lambda out: task_hand_forecast(out, X, rows, ann, args),
        "contact_prediction": lambda out: task_contact_prediction(out, X, rows, ann, manifest, args),
        "object_relevance": lambda out: task_object_relevance(out, X, rows, ann, manifest, args),
        "caption_grounding": lambda out: task_caption_grounding(out, X, manifest, args),
        "cross_modal_retrieval": lambda out: task_cross_modal_retrieval(out, X, manifest, args),
        "modality_reconstruction": lambda out: task_modality_reconstruction(out, X, manifest, args),
        "temporal_order": lambda out: task_temporal_order(out, X, args),
        "misalignment_detection": lambda out: task_misalignment(out, X, manifest, args),
    }
    headline_keys = ("accuracy", "macro_f1", "f1", "mpjpe", "mrr", "r2", "micro_f1")

    print(f"Windows: {len(rows)}, feature_dim: {X.shape[1]}")
    for task in tasks:
        print(f"\nRunning task: {task}")
        out = args.output_dir / task
        try:
            runner = runners.get(task)
            if runner is None:
                raise ValueError(task)
            metrics = runner(out)
            summary["tasks"][task] = metrics
            key_metrics = {k: metrics[k] for k in headline_keys if k in metrics}
            print(f"  done: {key_metrics}")
        except Exception as exc:
            # Best-effort suite: record the failure and continue with the rest.
            summary["tasks"][task] = {"error": str(exc)}
            # A task can fail before creating its directory; make sure it
            # exists so reporting the error cannot itself abort the suite.
            out.mkdir(parents=True, exist_ok=True)
            write_json(out / "error.json", {"task": task, "error": str(exc)})
            print(f"  error: {exc}")

    write_json(args.output_dir / "summary_report.json", summary)
    print(f"\nSuite artifacts written to: {args.output_dir}")
    return 0
773
+
774
+
775
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
scripts/generate_visualizations.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate static SVG visualizations and website data for the Ropedia task suite.
4
+
5
+ No plotting dependencies are required; this uses only the Python standard
6
+ library so the repo stays easy to run.
7
+
8
+ The polished GitHub Pages homepage in docs/index.html is hand-curated and is
9
+ not overwritten by this script. This script refreshes docs/assets/*.svg,
10
+ docs/assets/charts/*.svg, and docs/data/summary_metrics.json.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import html
16
+ import json
17
+ import textwrap
18
+ from pathlib import Path
19
+
20
+
21
# Repository root: the directory one level above this script's folder.
ROOT = Path(__file__).resolve().parents[1]
# Inputs are read from results/; generated files go under docs/.
RESULTS = ROOT.joinpath("results")
DOCS = ROOT.joinpath("docs")
ASSETS = DOCS.joinpath("assets")
CHARTS = ASSETS.joinpath("charts")
26
+
27
+
28
def read_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
30
+
31
+
32
def svg_bar_chart(path: Path, title: str, rows: list[tuple[str, float]], x_label: str = "score", max_value: float | None = None) -> None:
    """Write a horizontal bar chart as a standalone SVG file.

    Args:
        path: Output file; missing parent directories are created.
        title: Chart heading (HTML-escaped).
        rows: ``(label, value)`` pairs, drawn top to bottom in the given order.
        x_label: Caption for the value axis (HTML-escaped).
        max_value: Value mapped to the full bar width. Defaults to the largest
            row value, but never less than 1.0. Bars are clamped to the plot
            area, so negative values draw as zero width.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    canvas_w, band_h = 1100, 34
    margin_top, margin_left, margin_right = 78, 310, 70
    canvas_h = margin_top + band_h * len(rows) + 70
    plot_w = canvas_w - margin_left - margin_right

    if max_value is None:
        max_value = max([score for _, score in rows] + [1.0])
    # Guard the divisions below against a zero or negative scale.
    scale_max = max(max_value, 1e-9)

    palette = ["#2563eb", "#059669", "#ea580c", "#7b5d12", "#0891b2", "#dc2626"]
    svg = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{canvas_w}" height="{canvas_h}" viewBox="0 0 {canvas_w} {canvas_h}">',
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        f'<text x="32" y="42" font-family="Arial, sans-serif" font-size="26" font-weight="700" fill="#111827">{html.escape(title)}</text>',
        f'<text x="{margin_left}" y="{canvas_h - 24}" font-family="Arial, sans-serif" font-size="13" fill="#6b7280">{html.escape(x_label)}</text>',
    ]

    # Six evenly spaced vertical grid lines with their value labels.
    for step in range(6):
        grid_x = margin_left + plot_w * step / 5
        tick_value = scale_max * step / 5
        svg += [
            f'<line x1="{grid_x:.1f}" y1="{margin_top - 18}" x2="{grid_x:.1f}" y2="{canvas_h - 50}" stroke="#e5e7eb" stroke-width="1"/>',
            f'<text x="{grid_x:.1f}" y="{canvas_h - 30}" text-anchor="middle" font-family="Arial, sans-serif" font-size="12" fill="#6b7280">{tick_value:.2f}</text>',
        ]

    for index, (label, value) in enumerate(rows):
        band_y = margin_top + index * band_h
        fraction = max(0.0, min(value / scale_max, 1.0))
        bar_w = fraction * plot_w
        fill = palette[index % len(palette)]
        svg += [
            f'<text x="{margin_left - 14}" y="{band_y + 21}" text-anchor="end" font-family="Arial, sans-serif" font-size="14" fill="#111827">{html.escape(label)}</text>',
            f'<rect x="{margin_left}" y="{band_y + 5}" width="{bar_w:.1f}" height="20" rx="4" fill="{fill}"/>',
            f'<text x="{margin_left + bar_w + 8:.1f}" y="{band_y + 21}" font-family="Arial, sans-serif" font-size="13" fill="#374151">{value:.4f}</text>',
        ]

    svg.append("</svg>")
    path.write_text("\n".join(svg), encoding="utf-8")
64
+
65
+
66
def svg_feature_blocks(path: Path, feature_manifest: list[dict]) -> None:
    """Write a bar chart of the dimension of every feature block in the manifest.

    The axis is scaled to 108% of the largest block. An empty manifest no
    longer raises: the chart is drawn with no bars, using svg_bar_chart's
    default scale.
    """
    rows = [(block["name"], float(block["dim"])) for block in feature_manifest]
    # max() on an empty sequence raises ValueError, so let the chart fall back
    # to its own default scale when there are no blocks.
    max_value = max(dim for _, dim in rows) * 1.08 if rows else None
    svg_bar_chart(path, "All-Modality Feature Blocks", rows, x_label="feature dimensions", max_value=max_value)
69
+
70
+
71
def svg_pipeline_diagram(path: Path, summary: dict) -> None:
    """Write the seven-stage pipeline overview as a standalone SVG file.

    Frame, window, stride, feature-dimension and task counts come from
    ``summary["suite"]``; every other caption is fixed text. Missing parent
    directories of ``path`` are created.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    suite = summary["suite"]
    task_count = len(suite["tasks"])
    width, height = 1400, 760

    # (x, y, width, height, heading, bullet lines, accent color) per stage card.
    stage_cards = [
        (60, 110, 250, 132, "1. Raw public sample", [
            "annotation.hdf5",
            "6 video files",
            f"{suite['num_frames']:,} aligned frames",
        ], "#1f63e9"),
        (365, 110, 250, 132, "2. HOMIE loader", [
            "mocap, IMU, depth",
            "caption map",
            "SLAM and calibration",
        ], "#008b9a"),
        (670, 110, 250, 132, "3. Window builder", [
            f"{suite['window_frames']}-frame windows",
            f"{suite['stride_frames']}-frame stride",
            f"{suite['num_windows']:,} windows",
        ], "#0a7f55"),
        (975, 110, 300, 132, "4. Feature vector", [
            f"{suite['feature_dim']:,} dimensions",
            "17 named feature blocks",
            "stored manifest",
        ], "#b65b04"),
        (60, 380, 360, 168, "5. Baseline models", [
            "motion-only action/subtask",
            "all-modality action/subtask",
            "numpy softmax classifier",
            "metrics and predictions",
        ], "#1f63e9"),
        (520, 380, 360, 168, "6. Episode task suite", [
            f"{task_count} supervised/self-supervised tasks",
            "chronological split",
            "retrieval, forecast, alignment",
            "per-task artifacts",
        ], "#008b9a"),
        (980, 380, 300, 168, "7. Published artifacts", [
            "results/**/*.json/csv/npz",
            "docs/data/summary_metrics.json",
            "GitHub Pages dashboard",
            "reproducibility audit",
        ], "#0a7f55"),
    ]
    # (x1, y1, x2, y2) of each connector arrow between cards.
    connectors = [
        (310, 176, 365, 176),
        (615, 176, 670, 176),
        (920, 176, 975, 176),
        (215, 242, 240, 380),
        (1095, 242, 700, 380),
        (420, 464, 520, 464),
        (880, 464, 980, 464),
    ]
    footnotes = [
        "Audit check: rerunning scripts to /private/tmp reproduced committed metrics exactly.",
        "Video/depth check: fresh cache read depth plus fisheye_cam0/1/2/3 and stereo_left/right from raw files.",
        "Scope check: this validates one public sample episode, not cross-episode generalization.",
    ]

    # The arrow marker definition sits directly after the opening <svg> tag.
    svg = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
        '<defs><marker id="arrow" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M 0 0 L 10 5 L 0 10 z" fill="#cbd5e1"/></marker></defs>',
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        '<rect x="0" y="0" width="1400" height="760" fill="#ffffff"/>',
        '<text x="60" y="58" font-family="Arial, sans-serif" font-size="32" font-weight="700" fill="#10141f">Verified Ropedia Episode Pipeline</text>',
        '<text x="60" y="88" font-family="Arial, sans-serif" font-size="16" fill="#5b6475">Generated from committed scripts and metrics; no conceptual placeholder stages.</text>',
    ]
    svg.extend(
        f'<line x1="{x1}" y1="{y1}" x2="{x2}" y2="{y2}" stroke="#cbd5e1" stroke-width="3" marker-end="url(#arrow)"/>'
        for x1, y1, x2, y2 in connectors
    )
    for card_x, card_y, card_w, card_h, heading, bullets, accent in stage_cards:
        svg.append(f'<rect x="{card_x}" y="{card_y}" width="{card_w}" height="{card_h}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>')
        svg.append(f'<rect x="{card_x}" y="{card_y}" width="8" height="{card_h}" rx="4" fill="{accent}"/>')
        svg.append(f'<text x="{card_x + 24}" y="{card_y + 34}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="#10141f">{html.escape(heading)}</text>')
        svg.extend(
            f'<text x="{card_x + 24}" y="{card_y + 66 + n * 22}" font-family="Arial, sans-serif" font-size="14" fill="#394255">{html.escape(bullet)}</text>'
            for n, bullet in enumerate(bullets)
        )
    svg.append('<rect x="60" y="620" width="1220" height="96" rx="8" fill="#f8fafc" stroke="#dce2ec"/>')
    svg.extend(
        f'<text x="84" y="{650 + n * 24}" font-family="Arial, sans-serif" font-size="15" fill="#273143">{html.escape(note)}</text>'
        for n, note in enumerate(footnotes)
    )
    svg.append("</svg>")
    path.write_text("\n".join(svg), encoding="utf-8")
151
+
152
+
153
def feature_dim(feature_manifest: list[dict], include: list[str] | None = None, exclude: list[str] | None = None) -> int:
    """Add up the ``dim`` of every manifest block selected by name prefix.

    Args:
        feature_manifest: Sequence of block dicts; each is read for its
            ``"name"`` and ``"dim"`` entries.
        include: Name prefixes to keep. When empty or ``None`` every block is
            a candidate.
        exclude: Name prefixes to drop. Applied after ``include``.

    Returns:
        The summed dimension (each ``dim`` is coerced with ``int``).
    """
    wanted = tuple(include or ())
    unwanted = tuple(exclude or ())
    total = 0
    for block in feature_manifest:
        name = block["name"]
        # An exact match is also a prefix match, so startswith covers both.
        kept = not wanted or name.startswith(wanted)
        dropped = bool(unwanted) and name.startswith(unwanted)
        if kept and not dropped:
            total += int(block["dim"])
    return total
165
+
166
+
167
def metric_text(task_name: str, metrics: dict) -> str:
    """Format the headline metric of a task as ``"<label> <value>"``.

    Tasks with a dedicated headline metric read that key directly (a missing
    key raises ``KeyError``). Any other task falls back to ``macro_f1``, then
    ``accuracy``, and finally to a fixed pointer string when neither exists.
    Values are rendered with four decimals.
    """
    headline = {
        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
        "caption_grounding": ("MRR", "mrr"),
        "object_relevance": ("micro-F1", "micro_f1"),
        "modality_reconstruction": ("R2", "r2"),
        "temporal_order": ("F1", "f1"),
        "misalignment_detection": ("F1", "f1"),
    }
    if task_name in headline:
        label, key = headline[task_name]
        return f"{label} {metrics[key]:.4f}"
    for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
        if key in metrics:
            return f"{label} {metrics[key]:.4f}"
    return "metric in summary_report.json"
185
+
186
+
187
def draw_text_block(parts: list[str], x: int, y: int, lines: list[str], size: int = 13, color: str = "#394255", weight: str = "500", max_chars: int = 42, line_h: int = 18) -> int:
    """Append word-wrapped SVG ``<text>`` elements to ``parts``.

    Each entry of ``lines`` is wrapped to ``max_chars`` characters; every
    wrapped segment becomes one ``<text>`` element at ``x`` and the running
    baseline, which then advances by ``line_h``. An entry that wraps to
    nothing still emits one empty element, so it occupies a line.

    Returns:
        The baseline ``y`` immediately after the last emitted line.
    """
    baseline = y
    for paragraph in lines:
        segments = textwrap.wrap(paragraph, width=max_chars)
        if not segments:
            segments = [""]
        for segment in segments:
            element = (
                f'<text x="{x}" y="{baseline}" font-family="Arial, sans-serif" '
                f'font-size="{size}" font-weight="{weight}" fill="{color}">'
                f'{html.escape(segment)}</text>'
            )
            parts.append(element)
            baseline += line_h
    return baseline
195
+
196
+
197
def task_architecture_rows(summary: dict) -> list[dict]:
    """Describe each episode task as an input/head/output/metric record.

    Dimensions are derived from ``summary["suite"]["feature_dim"]`` and from
    prefix-filtered sums over ``summary["feature_manifest"]``; headline
    metrics come from ``summary["suite"]["tasks"]``.

    Returns:
        Twelve dicts with the keys ``task``, ``family``, ``input``, ``head``,
        ``output`` and ``metric``, in display order.
    """
    suite = summary["suite"]
    tasks = suite["tasks"]
    manifest = summary["feature_manifest"]

    text_block = "caption_objects_interaction_text"
    all_dim = int(suite["feature_dim"])
    no_contact_text_dim = feature_dim(manifest, exclude=["body_contacts", text_block])
    no_text_dim = feature_dim(manifest, exclude=[text_block])
    sensor_dim = no_text_dim
    text_dim = feature_dim(manifest, include=[text_block])
    motion_dim = feature_dim(manifest, include=["hand_", "body_joints", "body_contacts", "camera_", "imu_"])
    visual_dim = feature_dim(manifest, include=["depth_confidence", "video_"])
    pair_dim = all_dim * 3
    align_dim = motion_dim + visual_dim

    # Head descriptions shared by several tasks.
    weighted_softmax = "z-score -> linear softmax, class-weighted CE + L2"
    binary_softmax = "z-score -> binary linear softmax, CE + L2"
    dual_ridge = "z-score X/Y -> dual ridge regression, L2=10"
    ridge_rank = "ridge projection, then cosine ranking"

    def row(task: str, family: str, input_text: str, head: str, output: str, metric: str | None = None) -> dict:
        # Default metric text is the task's headline metric.
        if metric is None:
            metric = metric_text(task, tasks[task])
        return {
            "task": task,
            "family": family,
            "input": input_text,
            "head": head,
            "output": output,
            "metric": metric,
        }

    return [
        row(
            "timeline_action",
            "softmax",
            f"X_all window, {all_dim:,}d",
            weighted_softmax,
            f"current action class, {tasks['timeline_action']['num_classes']} classes",
        ),
        row(
            "timeline_subtask",
            "softmax",
            f"X_all window, {all_dim:,}d",
            weighted_softmax,
            f"current subtask class, {tasks['timeline_subtask']['num_classes']} classes",
        ),
        row(
            "transition_detection",
            "softmax",
            f"X_all window, {all_dim:,}d",
            weighted_softmax,
            "steady vs transition near action boundary",
            f"{metric_text('transition_detection', tasks['transition_detection'])}; boundary-F1 {tasks['transition_detection']['boundary_f1']:.4f}",
        ),
        row(
            "next_action",
            "softmax",
            f"X_all at time t, {all_dim:,}d",
            weighted_softmax,
            f"action at t+{tasks['next_action'].get('future_frames', 20)} frames",
        ),
        row(
            "hand_trajectory_forecast",
            "ridge",
            f"X_all at time t, {all_dim:,}d",
            dual_ridge,
            f"future hand joints, {tasks['hand_trajectory_forecast']['target_dim']}d",
        ),
        row(
            "contact_prediction",
            "softmax",
            f"X without contact/text leakage, {no_contact_text_dim:,}d",
            "z-score -> linear softmax on observed labels",
            "any body contact in window; degenerate one-class sample",
        ),
        row(
            "object_relevance",
            "multilabel",
            f"X without caption text, {no_text_dim:,}d",
            "z-score -> sigmoid multi-label logistic, weighted",
            f"multi-hot object set, {tasks['object_relevance']['num_objects']} objects",
        ),
        row(
            "caption_grounding",
            "ridge+rank",
            f"sensor {sensor_dim:,}d -> text space {text_dim:,}d",
            ridge_rank,
            "text query retrieves matching time window",
        ),
        row(
            "cross_modal_retrieval",
            "ridge+rank",
            f"motion/IMU/camera {motion_dim:,}d -> visual {visual_dim:,}d",
            ridge_rank,
            "retrieve matching depth/video window",
        ),
        row(
            "modality_reconstruction",
            "ridge",
            f"motion/IMU/camera {motion_dim:,}d",
            dual_ridge,
            f"depth/video feature vector, {visual_dim:,}d",
        ),
        row(
            "temporal_order",
            "softmax",
            f"concat[x_t, x_t+1, diff], {pair_dim:,}d",
            binary_softmax,
            "correct vs reversed adjacent windows",
        ),
        row(
            "misalignment_detection",
            "softmax",
            f"concat[motion_t, visual_t/visual_t+8], {align_dim:,}d",
            binary_softmax,
            "aligned vs shifted by 8 windows",
        ),
    ]
309
+
310
+
311
def svg_task_architectures(path: Path, summary: dict) -> None:
    """Write the task-architecture overview as an SVG file at ``path``.

    The drawing has four bands on a fixed 1500x1840 canvas: a row of setup
    boxes joined by arrows, four model-family panels, a three-column grid of
    per-task cards (one per row from ``task_architecture_rows``), and a notes
    box. Parent directories of ``path`` are created if missing.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    suite = summary["suite"]
    rows = task_architecture_rows(summary)
    family_colors = {
        "softmax": "#1f63e9",
        "ridge": "#0a7f55",
        "ridge+rank": "#008b9a",
        "multilabel": "#b65b04",
    }
    width, height = 1500, 1840
    parts = [
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
        '<defs><marker id="arrow2" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M 0 0 L 10 5 L 0 10 z" fill="#cbd5e1"/></marker></defs>',
        '<rect width="100%" height="100%" fill="#ffffff"/>',
        '<text x="60" y="56" font-family="Arial, sans-serif" font-size="34" font-weight="700" fill="#10141f">Minimal Architectures for the 12 Ropedia Episode Tasks</text>',
        '<text x="60" y="88" font-family="Arial, sans-serif" font-size="16" fill="#5b6475">Generated from scripts/episode_task_suite.py semantics and committed summary metrics. These are minimal baselines, not deep foundation models.</text>',
    ]

    # Band 1: setup boxes as (x, y, width, height, title, lines, accent).
    setup = [
        (60, 122, 310, 110, "Shared episode windows", [
            f"{suite['num_frames']:,} frames -> {suite['num_windows']:,} windows",
            f"{suite['window_frames']}-frame window, {suite['stride_frames']}-frame stride",
            "chronological 70/30 split",
        ], "#1f63e9"),
        (410, 122, 310, 110, "Feature vector", [
            f"X_all = {suite['feature_dim']:,} dimensions",
            "17 named modality blocks",
            "mean/std fit on train only",
        ], "#008b9a"),
        (760, 122, 320, 110, "Reusable heads", [
            "linear softmax classifier",
            "dual ridge regression/projection",
            "multi-label logistic + cosine rank",
        ], "#0a7f55"),
        (1120, 122, 320, 110, "Artifacts", [
            "metrics.json, predictions.csv/npz",
            "model.npz with scaler and weights",
            "summary_report.json source of numbers",
        ], "#b65b04"),
    ]
    # Arrows run from the right edge of each box to the left edge of the next.
    for left_box, right_box in zip(setup, setup[1:]):
        x1 = left_box[0] + left_box[2]
        x2 = right_box[0]
        mid_y = left_box[1] + 55
        parts.append(f'<line x1="{x1 + 12}" y1="{mid_y}" x2="{x2 - 14}" y2="{mid_y}" stroke="#cbd5e1" stroke-width="3" marker-end="url(#arrow2)"/>')
    for bx, by, bw, bh, heading, body, accent in setup:
        parts.extend([
            f'<rect x="{bx}" y="{by}" width="{bw}" height="{bh}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>',
            f'<rect x="{bx}" y="{by}" width="8" height="{bh}" rx="4" fill="{accent}"/>',
            f'<text x="{bx + 24}" y="{by + 31}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="#10141f">{html.escape(heading)}</text>',
        ])
        draw_text_block(parts, bx + 24, by + 58, body, size=13, color="#394255", max_chars=34, line_h=18)

    # Band 2: one panel per model family.
    families = [
        ("Softmax classifier", "logits = z(X)W + b; CE + L2; class weights for classifiers", "#1f63e9", 60, 270),
        ("Ridge regression/projection", "closed-form dual ridge on z(X), z(Y); used for forecast and reconstruction", "#0a7f55", 780, 270),
        ("Ridge + cosine ranking", "project one modality into another feature space, then rank candidates by cosine", "#008b9a", 60, 394),
        ("Multi-label logistic", "sigmoid heads for object vocabulary; threshold 0.5 with top-1 fallback", "#b65b04", 780, 394),
    ]
    for heading, blurb, accent, fx, fy in families:
        parts.extend([
            f'<rect x="{fx}" y="{fy}" width="660" height="100" rx="8" fill="#f8fafc" stroke="#dce2ec"/>',
            f'<text x="{fx + 18}" y="{fy + 33}" font-family="Arial, sans-serif" font-size="18" font-weight="700" fill="{accent}">{html.escape(heading)}</text>',
        ])
        draw_text_block(parts, fx + 18, fy + 60, [blurb], size=13, color="#394255", max_chars=76, line_h=18)

    # Band 3: task cards laid out three per row.
    card_w, card_h = 440, 248
    gap_x, gap_y = 30, 30
    start_x, start_y = 60, 540
    for idx, row in enumerate(rows):
        grid_row, grid_col = divmod(idx, 3)
        cx = start_x + grid_col * (card_w + gap_x)
        cy = start_y + grid_row * (card_h + gap_y)
        accent = family_colors[row["family"]]
        parts.extend([
            f'<rect x="{cx}" y="{cy}" width="{card_w}" height="{card_h}" rx="8" fill="#ffffff" stroke="#dce2ec" stroke-width="2"/>',
            f'<rect x="{cx}" y="{cy}" width="8" height="{card_h}" rx="4" fill="{accent}"/>',
            f'<rect x="{cx + 20}" y="{cy + 18}" width="96" height="24" rx="6" fill="#f8fafc" stroke="{accent}"/>',
            f'<text x="{cx + 68}" y="{cy + 35}" text-anchor="middle" font-family="Arial, sans-serif" font-size="11" font-weight="700" fill="{accent}">{html.escape(row["family"])}</text>',
            f'<text x="{cx + 20}" y="{cy + 72}" font-family="Arial, sans-serif" font-size="20" font-weight="700" fill="#10141f">{html.escape(row["task"])}</text>',
        ])
        baseline = cy + 104
        for field in ("input", "head", "output", "metric"):
            parts.append(f'<text x="{cx + 20}" y="{baseline}" font-family="Arial, sans-serif" font-size="12" font-weight="700" fill="{accent}">{field.upper()}</text>')
            # The wrapped value sits to the right of its label; leave a gap after it.
            baseline = draw_text_block(parts, cx + 92, baseline, [row[field]], size=13, color="#394255", max_chars=41, line_h=17)
            baseline += 8

    # Band 4: interpretation notes.
    notes = [
        "Interpretation: this suite tests whether each input/output contract is wired correctly before scaling to many episodes.",
        "Research-grade claims need held-out episode splits and stronger sequence/vision-language/robot-policy models.",
    ]
    parts.append('<rect x="60" y="1688" width="1380" height="72" rx="8" fill="#f8fafc" stroke="#dce2ec"/>')
    parts.extend(
        f'<text x="84" y="{1718 + i * 24}" font-family="Arial, sans-serif" font-size="15" fill="#273143">{html.escape(note)}</text>'
        for i, note in enumerate(notes)
    )
    parts.append("</svg>")
    path.write_text("\n".join(parts), encoding="utf-8")
402
+
403
+
404
def collect_summary() -> dict:
    """Load the committed metrics files into one summary dict.

    Returns:
        A dict with ``"models"`` (the four baseline ``metrics.json`` payloads
        keyed by model name), ``"suite"`` (the episode task suite summary
        report) and ``"feature_manifest"``.
    """
    # Result sub-directory for each baseline, listed in read order.
    model_dirs = {
        "all_modalities_action": "min_all_modalities_action_model",
        "all_modalities_subtask": "min_all_modalities_subtask_model",
        "motion_action": "min_action_model",
        "motion_subtask": "min_subtask_model",
    }
    loaded = {
        key: read_json(RESULTS / f"{dirname}/metrics.json")
        for key, dirname in model_dirs.items()
    }
    suite = read_json(RESULTS / "episode_task_suite/summary_report.json")
    manifest = read_json(RESULTS / "episode_task_suite/feature_manifest.json")

    models = {
        key: loaded[key]
        for key in ("motion_action", "motion_subtask", "all_modalities_action", "all_modalities_subtask")
    }
    return {"models": models, "suite": suite, "feature_manifest": manifest}
421
+
422
+
423
def generate_charts(summary: dict) -> None:
    """Render every diagram and chart derived from ``summary``.

    Writes the pipeline and task-architecture diagrams under ``ASSETS`` and
    four charts under ``CHARTS``: baseline macro-F1 comparison, per-task main
    scores, feature blocks, and cross-modal retrieval accuracy.
    """
    CHARTS.mkdir(parents=True, exist_ok=True)
    svg_pipeline_diagram(ASSETS / "pipeline_diagram.svg", summary)
    svg_task_architectures(ASSETS / "task_architectures.svg", summary)

    models = summary["models"]
    model_rows = [
        (label, models[key]["macro_f1"])
        for label, key in (
            ("Motion-only action macro-F1", "motion_action"),
            ("All-modality action macro-F1", "all_modalities_action"),
            ("Motion-only subtask macro-F1", "motion_subtask"),
            ("All-modality subtask macro-F1", "all_modalities_subtask"),
        )
    ]
    svg_bar_chart(CHARTS / "model_macro_f1.svg", "Minimal Model Macro-F1 Comparison", model_rows, max_value=1.0)

    suite = summary["suite"]["tasks"]
    # The first of these keys present in a task's metrics is charted.
    score_keys = ("macro_f1", "f1", "micro_f1", "top5_accuracy", "r2")
    task_rows = []
    for task_name, metrics in suite.items():
        score = next((metrics[key] for key in score_keys if key in metrics), 0.0)
        if score is None:
            score = 0.0
        # Negative scores are clamped to the chart floor.
        score = max(float(score), 0.0)
        task_rows.append((task_name, score))
    svg_bar_chart(CHARTS / "episode_task_scores.svg", "Episode Task Suite: Main Scores", task_rows, max_value=1.0)
    svg_feature_blocks(CHARTS / "feature_blocks.svg", summary["feature_manifest"])

    retrieval = suite["cross_modal_retrieval"]
    retrieval_rows = [
        (label, retrieval[key])
        for label, key in (
            ("top1", "top1_accuracy"),
            ("top5", "top5_accuracy"),
            ("top10", "top10_accuracy"),
            ("MRR", "mrr"),
        )
    ]
    svg_bar_chart(CHARTS / "cross_modal_retrieval.svg", "Cross-Modal Retrieval", retrieval_rows, max_value=1.0)
454
+
455
+
456
def write_summary_data(summary: dict) -> None:
    """Serialize ``summary`` to ``DOCS/data/summary_metrics.json``.

    Creates ``DOCS`` and its ``data`` sub-directory when missing and writes
    two-space-indented JSON as UTF-8.
    """
    data_dir = DOCS / "data"
    for directory in (DOCS, data_dir):
        directory.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(summary, indent=2)
    (data_dir / "summary_metrics.json").write_text(payload, encoding="utf-8")
460
+
461
+
462
def main() -> int:
    """Collect the metrics, regenerate all charts and data, and report paths.

    Returns:
        ``0``, used as the process exit status.
    """
    summary = collect_summary()
    generate_charts(summary)
    write_summary_data(summary)
    written = (
        ("pipeline diagram", ASSETS / "pipeline_diagram.svg"),
        ("task architectures diagram", ASSETS / "task_architectures.svg"),
        ("charts", CHARTS),
        ("data", DOCS / "data/summary_metrics.json"),
    )
    for label, target in written:
        print(f"Wrote {label}: {target}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
scripts/render_task_suite_infographic.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Render a ChatGPT-image-backed 12-task infographic.
4
+
5
+ The background bitmap is AI-generated. The task names, inputs, and metrics are
6
+ read from results/episode_task_suite/summary_report.json so the published image
7
+ does not rely on image-model text generation.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import html
14
+ import json
15
+ import subprocess
16
+ import tempfile
17
+ from pathlib import Path
18
+
19
+
20
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Metrics report that supplies every number shown on the infographic.
SUMMARY_PATH = ROOT / "results/episode_task_suite/summary_report.json"
# Default background bitmap and default rendered output image.
DEFAULT_BASE = ROOT / "docs/assets/task_suite_infographic_base.png"
DEFAULT_OUTPUT = ROOT / "docs/assets/task_suite_infographic.png"


# One entry per infographic column. "left"/"width" position the column's task
# cards in pixels, "color" is the accent, and "tasks" lists
# (task key in the summary report, badge text) from top row to bottom row.
# NOTE(review): "top" is not read by build_html/task_html in this file, which
# take card tops from a separate row list; confirm whether it is still needed.
GROUPS = [
    {
        "name": "Label + State",
        "color": "#008b9a",
        "left": 94,
        "top": 374,
        "width": 246,
        "tasks": [
            ("timeline_action", "supervised"),
            ("timeline_subtask", "supervised"),
            ("next_action", "supervised"),
        ],
    },
    {
        "name": "Prediction + Reconstruction",
        "color": "#1f63e9",
        "left": 472,
        "top": 374,
        "width": 248,
        "tasks": [
            ("hand_trajectory_forecast", "forecast"),
            ("modality_reconstruction", "forecast"),
            ("contact_prediction", "supervised"),
        ],
    },
    {
        "name": "Grounding + Retrieval",
        "color": "#b65b04",
        "left": 848,
        "top": 374,
        "width": 220,
        "tasks": [
            ("caption_grounding", "retrieval"),
            ("cross_modal_retrieval", "retrieval"),
            ("object_relevance", "supervised"),
        ],
    },
    {
        "name": "Temporal Diagnostics",
        "color": "#b42318",
        "left": 1202,
        "top": 374,
        "width": 244,
        "tasks": [
            ("transition_detection", "diagnostic"),
            ("temporal_order", "diagnostic"),
            ("misalignment_detection", "diagnostic"),
        ],
    },
]
76
+
77
+
78
def load_summary() -> dict:
    """Read and parse the JSON report at ``SUMMARY_PATH`` (UTF-8)."""
    raw = SUMMARY_PATH.read_text(encoding="utf-8")
    return json.loads(raw)
80
+
81
+
82
def fmt(value: float) -> str:
    """Render ``value`` as a float with exactly four decimal places."""
    return format(float(value), ".4f")
84
+
85
+
86
def metric_for(task_name: str, metrics: dict) -> tuple[str, str]:
    """Pick the headline metric of a task as ``(label, formatted value)``.

    Tasks with a dedicated headline metric read that key directly. Any other
    task falls back to ``macro_f1`` and then ``accuracy``.

    Raises:
        KeyError: If a dedicated key is missing, or if no fallback metric is
            present for the task.
    """
    headline = {
        "hand_trajectory_forecast": ("MPJPE", "mpjpe"),
        "cross_modal_retrieval": ("top-5", "top5_accuracy"),
        "caption_grounding": ("MRR", "mrr"),
        "object_relevance": ("micro-F1", "micro_f1"),
        "modality_reconstruction": ("R2", "r2"),
        "temporal_order": ("F1", "f1"),
        "misalignment_detection": ("F1", "f1"),
    }
    if task_name in headline:
        label, key = headline[task_name]
        return label, fmt(metrics[key])
    for label, key in (("macro-F1", "macro_f1"), ("accuracy", "accuracy")):
        if key in metrics:
            return label, fmt(metrics[key])
    raise KeyError(f"No main metric configured for {task_name}")
104
+
105
+
106
def short_io(task_name: str, metrics: dict) -> str:
    """Return the one-line "input -> output" caption for a task card.

    Known tasks use a fixed caption. The ``next_action`` caption states the
    forecast horizon from the task's ``future_frames`` metric entry, falling
    back to 20 frames when the entry is absent, so the caption cannot drift
    from the reported results. Unknown tasks fall back to the ``"input"``
    entry of ``metrics``, or an empty string.

    Args:
        task_name: Task key as used in the summary report.
        metrics: That task's metrics dict.

    Returns:
        The caption text (not HTML-escaped).
    """
    future_frames = metrics.get("future_frames", 20)
    custom = {
        "timeline_action": "all modalities -> action label",
        "timeline_subtask": "all modalities -> subtask label",
        "transition_detection": "all modalities -> boundary / steady",
        "next_action": f"window at t -> action at t+{future_frames}",
        "hand_trajectory_forecast": "all modalities -> future hand joints",
        "contact_prediction": "non-contact modalities -> contact",
        "object_relevance": "non-caption modalities -> object set",
        "caption_grounding": "text query -> matching window",
        "cross_modal_retrieval": "motion / IMU / camera -> depth / video",
        "modality_reconstruction": "motion / IMU / camera -> depth / video vec",
        "temporal_order": "two windows -> correct order?",
        "misalignment_detection": "motion + visual -> aligned / shifted",
    }
    return custom.get(task_name, metrics.get("input", ""))
122
+
123
+
124
def task_html(task_name: str, kind: str, metrics: dict, top: int, group: dict) -> str:
    """Build the HTML ``<section>`` for one task card.

    The card is placed at ``group["left"]``/``top`` with ``group["width"]``
    and uses ``group["color"]`` as its accent. It shows the ``kind`` badge,
    the task name, the short input/output caption and the headline metric.
    All text content is HTML-escaped.
    """
    label, value = metric_for(task_name, metrics)
    io = short_io(task_name, metrics)
    # Names longer than 22 characters get a slightly smaller font.
    if len(task_name) > 22:
        name_size = 17
    else:
        name_size = 18
    placement = f"left:{group['left']}px;top:{top}px;width:{group['width']}px;--accent:{group['color']};"
    kind_text = html.escape(kind)
    name_text = html.escape(task_name)
    io_text = html.escape(io)
    label_text = html.escape(label)
    value_text = html.escape(value)
    return f"""
<section class="task" style="{placement}">
<div class="kind">{kind_text}</div>
<div class="task-name" style="font-size:{name_size}px;">{name_text}</div>
<div class="io">{io_text}</div>
<div class="metric"><span>{label_text}</span><strong>{value_text}</strong></div>
</section>
"""
136
+
137
+
138
def build_html(summary: dict, base_image: Path) -> str:
    """Build the complete HTML page for the infographic overlay.

    Args:
        summary: Parsed summary report. Read for ``"tasks"`` (per-task
            metrics), ``"num_frames"``, ``"num_windows"`` and
            ``"feature_dim"``.
        base_image: Background bitmap; resolved and embedded as a ``file://``
            URI in the page's CSS.

    Returns:
        The HTML document as a string, laid out on a fixed 1536x1024 canvas.

    Raises:
        KeyError: If a task listed in ``GROUPS`` is missing from
            ``summary["tasks"]``.
    """
    suite = summary["tasks"]
    task_count = len(suite)
    group_headers = []
    cards = []
    # Pixel tops of the three card rows and lefts of the four column headers.
    row_tops = [374, 552, 730]
    header_lefts = [38, 417, 792, 1143]
    for group, header_left in zip(GROUPS, header_lefts):
        group_headers.append(
            f'<div class="group-title" style="left:{header_left}px;top:333px;color:{group["color"]};">{html.escape(group["name"])}</div>'
        )
        # The n-th task of a group goes into the n-th card row.
        for row_idx, (task_name, kind) in enumerate(group["tasks"]):
            cards.append(task_html(task_name, kind, suite[task_name], row_tops[row_idx], group))

    # Pills shown under the title.
    stats = [
        f"{summary['num_frames']:,} frames",
        f"{summary['num_windows']:,} windows",
        f"{summary['feature_dim']:,} features",
        f"{task_count} tasks",
        "chronological split",
    ]
    stat_html = "".join(f"<span>{html.escape(item)}</span>" for item in stats)
    base_uri = base_image.resolve().as_uri()
    # Doubled braces below are literal CSS braces inside the f-string.
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=1536, initial-scale=1">
<title>Ropedia 12-Task Episode Suite Infographic</title>
<style>
* {{ box-sizing: border-box; }}
html, body {{ margin: 0; width: 1536px; height: 1024px; background: #ffffff; }}
body {{
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
color: #10141f;
}}
.canvas {{
position: relative;
width: 1536px;
height: 1024px;
overflow: hidden;
background-image: url("{base_uri}");
background-size: 1536px 1024px;
background-repeat: no-repeat;
}}
.title {{
position: absolute;
left: 330px;
top: 42px;
width: 876px;
text-align: center;
}}
h1 {{
margin: 0;
font-size: 38px;
line-height: 1.05;
letter-spacing: 0;
font-weight: 820;
}}
.subtitle {{
margin-top: 8px;
color: #425067;
font-size: 15px;
line-height: 1.35;
font-weight: 520;
}}
.stats {{
margin-top: 12px;
display: flex;
justify-content: center;
gap: 8px;
}}
.stats span {{
display: inline-flex;
align-items: center;
height: 24px;
padding: 0 10px;
border: 1px solid #cdd8e8;
background: rgba(255, 255, 255, 0.82);
border-radius: 999px;
color: #253046;
font-size: 12px;
font-weight: 720;
}}
.modality {{
position: absolute;
top: 256px;
width: 180px;
text-align: center;
font-size: 12px;
color: #536074;
font-weight: 720;
text-transform: uppercase;
letter-spacing: 0;
}}
.group-title {{
position: absolute;
width: 322px;
text-align: center;
font-size: 18px;
line-height: 1;
font-weight: 830;
letter-spacing: 0;
}}
.task {{
position: absolute;
padding: 0;
}}
.kind {{
display: inline-flex;
align-items: center;
height: 22px;
padding: 0 8px;
border-radius: 6px;
border: 1px solid color-mix(in srgb, var(--accent) 35%, #ffffff);
color: var(--accent);
background: rgba(255, 255, 255, 0.76);
text-transform: uppercase;
font-size: 10px;
line-height: 1;
font-weight: 840;
letter-spacing: 0;
}}
.task-name {{
margin-top: 7px;
color: #111827;
line-height: 1.05;
font-weight: 850;
letter-spacing: 0;
white-space: nowrap;
}}
.io {{
margin-top: 8px;
min-height: 36px;
color: #475569;
font-size: 13.5px;
line-height: 1.28;
font-weight: 570;
}}
.metric {{
display: inline-flex;
align-items: center;
gap: 9px;
margin-top: 8px;
height: 30px;
padding: 0 10px;
border-radius: 7px;
border: 1px solid color-mix(in srgb, var(--accent) 36%, #ffffff);
background: rgba(255, 255, 255, 0.90);
box-shadow: 0 7px 20px rgba(16, 20, 31, 0.07);
}}
.metric span {{
color: #64748b;
font-size: 12px;
font-weight: 760;
}}
.metric strong {{
color: var(--accent);
font-size: 16px;
line-height: 1;
font-weight: 860;
}}
.footer {{
position: absolute;
left: 360px;
top: 932px;
width: 816px;
text-align: center;
color: #536074;
font-size: 14px;
font-weight: 650;
}}
</style>
</head>
<body>
<main class="canvas" aria-label="Ropedia 12-task episode suite infographic">
<div class="title">
<h1>Ropedia 12-Task Episode Suite</h1>
<div class="subtitle">All labels and metrics are overlaid from the verified single-episode results.</div>
<div class="stats">{stat_html}</div>
</div>
<div class="modality" style="left:50px;">fisheye video</div>
<div class="modality" style="left:270px;">depth</div>
<div class="modality" style="left:530px;">3D / SLAM</div>
<div class="modality" style="left:770px;">IMU</div>
<div class="modality" style="left:1030px;">hands</div>
<div class="modality" style="left:1278px;">text / objects</div>
{''.join(group_headers)}
{''.join(cards)}
<div class="footer">Single public sample episode: useful for pipeline validation and task design, not cross-episode generalization.</div>
</main>
</body>
</html>
"""
332
+
333
+
334
def render_html(html_path: Path, output_path: Path) -> None:
    """Screenshot the HTML file to ``output_path`` via the Playwright CLI.

    Runs ``npx --yes playwright screenshot`` as a full-page capture with a
    1536x1024 viewport. The output's parent directory is created if missing.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    page_uri = html_path.resolve().as_uri()
    command = [
        "npx",
        "--yes",
        "playwright",
        "screenshot",
        "--full-page",
        "--viewport-size=1536,1024",
        page_uri,
        str(output_path),
    ]
    subprocess.run(command, check=True)
349
+
350
+
351
def main() -> int:
    """Command-line entry point: write the overlay HTML and export the image.

    The HTML goes to ``--html`` when given, otherwise to a temporary
    ``.html`` file that is kept on disk. Unless ``--no-export`` is set, the
    page is then rendered to ``--output``.

    Returns:
        ``0``, used as the process exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-image", type=Path, default=DEFAULT_BASE)
    parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    parser.add_argument("--html", type=Path)
    parser.add_argument("--no-export", action="store_true", help="Only write the HTML overlay.")
    args = parser.parse_args()

    html_text = build_html(load_summary(), args.base_image)
    html_path = args.html
    if html_path is not None:
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(html_text, encoding="utf-8")
    else:
        # delete=False keeps the file so it can be rendered and reported below.
        with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8", delete=False) as handle:
            handle.write(html_text)
        html_path = Path(handle.name)

    export = not args.no_export
    if export:
        render_html(html_path, args.output)
        print(f"Wrote image: {args.output}")
    print(f"Wrote overlay HTML: {html_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
scripts/train_all_modalities_model.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ All-modality lightweight baseline for a Ropedia/Xperience episode.
4
+
5
+ This intentionally stays small enough for a MacBook:
6
+ - no deep video training
7
+ - no CUDA
8
+ - no PyTorch dependency
9
+
10
+ Each modality is compressed into window-level statistics, then the same
11
+ NumPy softmax classifier from train_min_action_model.py is used.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import csv
18
+ import hashlib
19
+ import json
20
+ import re
21
+ import sys
22
+ from collections import Counter, OrderedDict
23
+ from pathlib import Path
24
+
25
+ import cv2
26
+ import h5py
27
+ import numpy as np
28
+
29
+ from train_min_action_model import (
30
+ add_toolkit_to_path,
31
+ center_by_body_root,
32
+ compute_metrics,
33
+ encode_labels,
34
+ fit_scaler,
35
+ frame_label,
36
+ majority_label,
37
+ predict,
38
+ portable_path,
39
+ safe_window,
40
+ save_artifacts,
41
+ stratified_split,
42
+ temporal_stats,
43
+ train_softmax_classifier,
44
+ )
45
+
46
+
47
# Ordered mapping of camera stream name -> video file name ("<stream>.mp4").
VIDEO_FILES = OrderedDict(
    (stream, f"{stream}.mp4")
    for stream in (
        "fisheye_cam0",
        "fisheye_cam1",
        "fisheye_cam2",
        "fisheye_cam3",
        "stereo_left",
        "stereo_right",
    )
)
55
+
56
+
57
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the all-modality baseline trainer.

    Defaults are derived from the location of this script: the workspace is
    the script's grandparent directory and the annotation is the bundled
    sample episode below it. When ``--output-dir`` / ``--cache-dir`` are not
    given they are placed under ``<workspace>/outputs``; the output directory
    name depends on ``--target``.

    Returns:
        The parsed namespace with ``output_dir`` and ``cache_dir`` filled in.

    Raises:
        SystemExit: On invalid command-line input (standard argparse
            behaviour), including non-positive window/stride/grid/hash sizes.
    """
    workspace_default = Path(__file__).resolve().parents[1]
    annotation_default = workspace_default / "data/sample/xperience-10m-sample/annotation.hdf5"

    parser = argparse.ArgumentParser(description="Train a lightweight all-modality Ropedia classifier.")
    parser.add_argument("--workspace", type=Path, default=workspace_default, help="Ropedia workspace root.")
    parser.add_argument("--annotation", type=Path, default=annotation_default, help="Path to annotation.hdf5.")
    parser.add_argument("--output-dir", type=Path, default=None, help="Output artifact directory.")
    parser.add_argument("--cache-dir", type=Path, default=None, help="Feature cache directory.")
    parser.add_argument("--target", choices=["action", "subtask"], default="action", help="Prediction target.")
    parser.add_argument("--window-frames", type=int, default=20, help="Frames per training window.")
    parser.add_argument("--stride-frames", type=int, default=5, help="Stride between windows.")
    parser.add_argument("--min-label-fraction", type=float, default=0.6, help="Minimum majority-label fraction.")
    parser.add_argument("--test-fraction", type=float, default=0.25, help="Stratified test fraction.")
    parser.add_argument("--epochs", type=int, default=800, help="Training epochs.")
    parser.add_argument("--learning-rate", type=float, default=0.12, help="Softmax learning rate.")
    parser.add_argument("--l2", type=float, default=2e-3, help="L2 weight decay.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")
    parser.add_argument("--no-class-weights", action="store_true", help="Disable inverse-frequency class weighting.")
    parser.add_argument("--force-rebuild-cache", action="store_true", help="Recompute cached depth/video features.")
    parser.add_argument("--video-image-size", type=int, default=32, help="Resize video frames before visual features.")
    parser.add_argument("--video-grid-size", type=int, default=8, help="Small grayscale grid per video frame.")
    parser.add_argument("--video-hist-bins", type=int, default=8, help="Color histogram bins per channel.")
    parser.add_argument("--depth-grid-size", type=int, default=8, help="Small depth/confidence grid per frame.")
    parser.add_argument("--text-hash-dim", type=int, default=128, help="Hashed bag-of-words dimension.")
    parser.add_argument(
        "--include-label-text",
        action="store_true",
        help="Also include action/subtask/action-description text as input. This leaks target semantics.",
    )
    args = parser.parse_args()

    # These sizes are used as range() steps, resize targets and a modulo
    # divisor further down the pipeline; values below 1 would only fail there
    # with an error that does not mention the offending option.
    positive_options = (
        ("--window-frames", args.window_frames),
        ("--stride-frames", args.stride_frames),
        ("--video-image-size", args.video_image_size),
        ("--video-grid-size", args.video_grid_size),
        ("--video-hist-bins", args.video_hist_bins),
        ("--depth-grid-size", args.depth_grid_size),
        ("--text-hash-dim", args.text_hash_dim),
    )
    for flag, value in positive_options:
        if value < 1:
            parser.error(f"{flag} must be a positive integer (got {value}).")

    if args.output_dir is None:
        name = "min_all_modalities_action_model" if args.target == "action" else "min_all_modalities_subtask_model"
        args.output_dir = args.workspace / "outputs" / name
    if args.cache_dir is None:
        args.cache_dir = args.workspace / "outputs/feature_cache"
    return args
95
+
96
+
97
+ def numeric_array(value) -> np.ndarray | None:
98
+ try:
99
+ arr = np.asarray(value, dtype=np.float32)
100
+ except (TypeError, ValueError):
101
+ return None
102
+ if arr.size == 0:
103
+ return None
104
+ return np.nan_to_num(arr.reshape(-1), nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
105
+
106
+
107
+ def calibration_features(calib_data: dict | None) -> np.ndarray:
108
+ if not calib_data:
109
+ return np.zeros(0, dtype=np.float32)
110
+ chunks: list[np.ndarray] = []
111
+ for cam_id in sorted(calib_data):
112
+ cam = calib_data.get(cam_id, {})
113
+ if not isinstance(cam, dict):
114
+ continue
115
+ for key in sorted(cam):
116
+ arr = numeric_array(cam.get(key))
117
+ if arr is not None:
118
+ chunks.append(arr)
119
+ if not chunks:
120
+ return np.zeros(0, dtype=np.float32)
121
+ return np.concatenate(chunks).astype(np.float32)
122
+
123
+
124
+ def point_cloud_features(points: np.ndarray | None) -> np.ndarray:
125
+ if points is None:
126
+ return np.zeros(0, dtype=np.float32)
127
+ pts = np.asarray(points, dtype=np.float32)
128
+ if pts.ndim != 2 or pts.shape[1] != 3 or len(pts) == 0:
129
+ return np.zeros(0, dtype=np.float32)
130
+ pts = np.nan_to_num(pts, nan=0.0, posinf=0.0, neginf=0.0)
131
+ stats = [
132
+ pts.mean(axis=0),
133
+ pts.std(axis=0),
134
+ pts.min(axis=0),
135
+ pts.max(axis=0),
136
+ np.percentile(pts, 10, axis=0),
137
+ np.percentile(pts, 50, axis=0),
138
+ np.percentile(pts, 90, axis=0),
139
+ np.asarray([np.log1p(len(pts))], dtype=np.float32),
140
+ ]
141
+ return np.concatenate(stats).astype(np.float32)
142
+
143
+
144
def video_frame_features(frame: np.ndarray, image_size: int, grid_size: int, hist_bins: int) -> np.ndarray:
    """Compute a small handcrafted descriptor for one video frame.

    The frame is resized to ``image_size`` x ``image_size`` and described by:
    per-channel RGB mean and std (6 values), a normalized ``hist_bins``-bin
    histogram per channel, a ``grid_size`` x ``grid_size`` grayscale
    thumbnail, and mean/std of the absolute horizontal and vertical
    gradients (4 values).

    NOTE(review): the BGR->RGB conversion assumes ``frame`` is a 3-channel
    BGR image as returned by ``cv2.VideoCapture.read`` - confirm for other
    callers.

    Returns:
        float32 vector of length ``6 + 3 * hist_bins + grid_size**2 + 4``.
    """
    thumb = cv2.resize(frame, (image_size, image_size), interpolation=cv2.INTER_AREA)

    rgb = cv2.cvtColor(thumb, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    pixels = rgb.reshape(-1, 3)
    channel_mean = pixels.mean(axis=0)
    channel_std = pixels.std(axis=0)

    channel_hists = []
    for channel in range(3):
        counts, _ = np.histogram(rgb[:, :, channel], bins=hist_bins, range=(0.0, 1.0))
        counts = counts.astype(np.float32)
        # Guard against an all-zero histogram so the division stays finite.
        channel_hists.append(counts / max(float(counts.sum()), 1.0))

    gray = cv2.cvtColor(thumb, cv2.COLOR_BGR2GRAY).astype(np.float32) / 255.0
    coarse = cv2.resize(gray, (grid_size, grid_size), interpolation=cv2.INTER_AREA).reshape(-1)
    grad_y, grad_x = np.gradient(gray)
    abs_x = np.abs(grad_x)
    abs_y = np.abs(grad_y)
    edge = np.asarray([abs_x.mean(), abs_y.mean(), abs_x.std(), abs_y.std()], dtype=np.float32)

    return np.concatenate([channel_mean, channel_std, *channel_hists, coarse, edge]).astype(np.float32)
163
+
164
+
165
def read_video_feature_cache(
    path: Path,
    n_frames: int,
    cache_dir: Path,
    image_size: int,
    grid_size: int,
    hist_bins: int,
    force: bool,
) -> np.ndarray:
    """Return per-frame video features for ``path``, using an on-disk cache.

    Args:
        path: Video file to read.
        n_frames: Number of rows in the result; frames are read in order.
        cache_dir: Directory holding the ``.npz`` caches (created if needed).
        image_size: Forwarded to ``video_frame_features``.
        grid_size: Forwarded to ``video_frame_features``.
        hist_bins: Forwarded to ``video_frame_features``.
        force: Recompute even when a cache file exists.

    Returns:
        float32 array of shape ``(n_frames, 6 + 3*hist_bins + grid_size**2 + 4)``.
        When the video is missing or cannot be opened the array is all zeros.
        When a frame cannot be read, the previous frame's features are
        repeated (zeros if no frame has been read yet).
    """
    cache_dir.mkdir(parents=True, exist_ok=True)

    feature_dim = 6 + 3 * hist_bins + grid_size * grid_size + 4
    features = np.zeros((n_frames, feature_dim), dtype=np.float32)
    if not path.exists():
        # Do not persist the zero placeholder: a cached placeholder would
        # keep being served after the video becomes available.
        return features

    # The cache directory is shared between episodes, so the key must
    # identify the actual file (location, size, modification time) and not
    # just its stem; otherwise another episode with the same frame count
    # would silently reuse these features.
    stat = path.stat()
    identity = f"{path.resolve()}|{stat.st_size}|{stat.st_mtime_ns}"
    tag = hashlib.blake2b(identity.encode("utf-8"), digest_size=6).hexdigest()
    cache_path = cache_dir / (
        f"video_{path.stem}_{tag}_n{n_frames}_img{image_size}_grid{grid_size}_hist{hist_bins}.npz"
    )
    if cache_path.exists() and not force:
        # Use the context manager so the underlying file handle is closed.
        with np.load(cache_path) as cached:
            loaded = cached["features"].astype(np.float32)
        if loaded.shape == features.shape:
            return loaded
        # A cache with an unexpected shape is ignored and rebuilt below.

    cap = cv2.VideoCapture(str(path))
    try:
        if not cap.isOpened():
            return features

        last = np.zeros(feature_dim, dtype=np.float32)
        for idx in range(n_frames):
            ok, frame = cap.read()
            if ok:
                last = video_frame_features(frame, image_size, grid_size, hist_bins)
            features[idx] = last
            if idx and idx % 1000 == 0:
                print(f" {path.name}: {idx}/{n_frames} frames")
    finally:
        # Release the capture even if feature extraction raises.
        cap.release()

    np.savez_compressed(cache_path, features=features)
    return features
201
+
202
+
203
def depth_frame_features(depth: np.ndarray, confidence: np.ndarray | None, depth_min: float, depth_max: float, grid_size: int) -> np.ndarray:
    """Compute a handcrafted descriptor for one depth frame.

    Layout of the result:
      * 8 depth statistics over finite, positive pixels (mean, std, min, max,
        10th/50th/90th percentile, valid-pixel fraction); zeros if no pixel
        is valid,
      * a ``grid_size`` x ``grid_size`` thumbnail of the depth normalized to
        [0, 1] with ``depth_min`` / ``depth_max``,
      * 4 confidence statistics (mean, std, min, max) and a confidence
        thumbnail of the same grid size; zeros when ``confidence`` is None.

    Confidence values are divided by 255 when their maximum exceeds 1.0 and
    then clipped to [0, 1].

    Returns:
        float32 vector of length ``8 + grid_size**2 + 4 + grid_size**2``.
    """
    depth_map = np.asarray(depth, dtype=np.float32)
    usable = np.isfinite(depth_map) & (depth_map > 0)
    if not usable.any():
        depth_stats = np.zeros(8, dtype=np.float32)
    else:
        samples = depth_map[usable]
        depth_stats = np.asarray(
            [
                samples.mean(),
                samples.std(),
                samples.min(),
                samples.max(),
                np.percentile(samples, 10),
                np.percentile(samples, 50),
                np.percentile(samples, 90),
                usable.mean(),
            ],
            dtype=np.float32,
        )

    # Avoid dividing by zero when the depth range is degenerate.
    span = max(depth_max - depth_min, 1e-6)
    normalized = np.clip((np.nan_to_num(depth_map, nan=0.0) - depth_min) / span, 0.0, 1.0)
    depth_grid = (
        cv2.resize(normalized, (grid_size, grid_size), interpolation=cv2.INTER_AREA)
        .reshape(-1)
        .astype(np.float32)
    )

    if confidence is not None:
        conf = np.asarray(confidence, dtype=np.float32)
        scale = 255.0 if conf.max(initial=0) > 1.0 else 1.0
        conf = np.clip(conf / scale, 0.0, 1.0)
        conf_stats = np.asarray(
            [conf.mean(), conf.std(), conf.min(initial=0), conf.max(initial=0)],
            dtype=np.float32,
        )
        conf_grid = (
            cv2.resize(conf, (grid_size, grid_size), interpolation=cv2.INTER_AREA)
            .reshape(-1)
            .astype(np.float32)
        )
    else:
        conf_stats = np.zeros(4, dtype=np.float32)
        conf_grid = np.zeros(grid_size * grid_size, dtype=np.float32)

    return np.concatenate([depth_stats, depth_grid, conf_stats, conf_grid]).astype(np.float32)
236
+
237
+
238
def read_depth_feature_cache(annotation: Path, n_frames: int, cache_dir: Path, grid_size: int, force: bool) -> np.ndarray:
    """Return per-frame depth/confidence features, using an on-disk cache.

    Reads the ``depth/depth`` dataset (and ``depth/confidence``,
    ``depth/depth_min``, ``depth/depth_max`` when present) from the HDF5
    annotation and converts each frame with ``depth_frame_features``.
    ``depth_min`` / ``depth_max`` default to 0.0 / 4.0 when absent.

    Args:
        annotation: Path to the HDF5 annotation file.
        n_frames: Number of rows in the result; rows beyond the number of
            stored depth frames stay zero.
        cache_dir: Directory holding the ``.npz`` caches (created if needed).
        grid_size: Forwarded to ``depth_frame_features``.
        force: Recompute even when a cache file exists.

    Returns:
        float32 array of shape ``(n_frames, 8 + 2 * grid_size**2 + 4)``;
        all zeros when the annotation has no ``depth/depth`` dataset.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)

    # The cache directory is shared between episodes, so the key must
    # identify the annotation file itself; frame count and grid size alone
    # would let a different episode silently reuse these features.
    try:
        stat = annotation.stat()
        identity = f"{annotation.resolve()}|{stat.st_size}|{stat.st_mtime_ns}"
    except OSError:
        identity = f"{annotation}|missing"
    tag = hashlib.blake2b(identity.encode("utf-8"), digest_size=6).hexdigest()
    cache_path = cache_dir / f"depth_{tag}_n{n_frames}_grid{grid_size}.npz"

    feature_dim = 8 + grid_size * grid_size + 4 + grid_size * grid_size
    expected_shape = (n_frames, feature_dim)
    if cache_path.exists() and not force:
        # Use the context manager so the underlying file handle is closed.
        with np.load(cache_path) as cached:
            loaded = cached["features"].astype(np.float32)
        if loaded.shape == expected_shape:
            return loaded
        # A cache with an unexpected shape is ignored and rebuilt below.

    features = np.zeros(expected_shape, dtype=np.float32)
    with h5py.File(annotation, "r") as f:
        if "depth/depth" not in f:
            np.savez_compressed(cache_path, features=features)
            return features
        depth_ds = f["depth/depth"]
        conf_ds = f["depth/confidence"] if "depth/confidence" in f else None
        depth_min = float(np.asarray(f["depth/depth_min"][()]).flat[0]) if "depth/depth_min" in f else 0.0
        depth_max = float(np.asarray(f["depth/depth_max"][()]).flat[0]) if "depth/depth_max" in f else 4.0
        limit = min(n_frames, depth_ds.shape[0])
        for idx in range(limit):
            confidence = conf_ds[idx] if conf_ds is not None else None
            features[idx] = depth_frame_features(depth_ds[idx], confidence, depth_min, depth_max, grid_size)
            if idx and idx % 1000 == 0:
                print(f" depth: {idx}/{limit} frames")
    np.savez_compressed(cache_path, features=features)
    return features
262
+
263
+
264
# A token is a maximal run of ASCII letters, digits or underscores.
TOKEN_RE = re.compile(r"[a-zA-Z0-9_]+")


def hashed_text(text: str, dim: int) -> np.ndarray:
    """Embed ``text`` as an L2-normalized signed hashed bag of words.

    The text is lower-cased and split with ``TOKEN_RE``. Each token is
    hashed with BLAKE2b; the first four digest bytes select one of ``dim``
    buckets and the lowest bit of the fifth byte selects the sign of the
    contribution. The zero vector is returned unchanged when no token
    contributes or all contributions cancel.

    Returns:
        float32 vector of length ``dim``.
    """
    buckets = np.zeros(dim, dtype=np.float32)
    for word in TOKEN_RE.findall(text.lower()):
        digest = hashlib.blake2b(word.encode("utf-8"), digest_size=8).digest()
        slot = int.from_bytes(digest[:4], "little") % dim
        buckets[slot] += 1.0 if digest[4] & 1 else -1.0

    length = np.linalg.norm(buckets)
    if length > 0:
        buckets /= length
    return buckets
278
+
279
+
280
def text_for_frame(info: dict, include_label_text: bool) -> str:
    """Build the space-joined caption text used as language input for a frame.

    Always uses the ``objects`` entry (each element when it is a list,
    otherwise its string form when truthy) and the ``interaction`` entry.
    When ``include_label_text`` is true, the truthy values of ``theme``,
    ``action_label`` and ``action_desc`` are appended in that order.
    """
    pieces: list[str] = []

    objects = info.get("objects")
    if isinstance(objects, list):
        pieces += [str(obj) for obj in objects]
    elif objects:
        pieces.append(str(objects))

    interaction = info.get("interaction")
    if interaction:
        pieces.append(str(interaction))

    if include_label_text:
        label_keys = ("theme", "action_label", "action_desc")
        pieces.extend(str(info[key]) for key in label_keys if info.get(key))

    return " ".join(pieces)
294
+
295
+
296
def build_text_features(frame_info_map: dict, n_frames: int, dim: int, include_label_text: bool) -> np.ndarray:
    """Return an ``(n_frames, dim)`` float32 matrix of hashed caption text.

    Row ``i`` is ``hashed_text`` applied to the caption text of frame ``i``;
    frames missing from ``frame_info_map`` are treated as having no caption.
    """
    rows = np.zeros((n_frames, dim), dtype=np.float32)
    for frame_idx, row in enumerate(rows):
        caption = text_for_frame(frame_info_map.get(frame_idx, {}), include_label_text)
        row[:] = hashed_text(caption, dim)
    return rows
302
+
303
+
304
def prepare_modalities(args: argparse.Namespace, ann: dict) -> tuple[dict, list[dict]]:
    """Compute (or load from cache) the per-frame and static extra features.

    Video files are looked up next to the annotation file.

    Returns:
        ``(extras, available)`` where ``extras`` has the keys ``"video"``
        (stream name -> per-frame features), ``"depth"``, ``"text"`` and
        ``"static"`` (point-cloud / calibration vectors, only when
        non-empty), and ``available`` is a list of JSON-serializable
        records describing each prepared modality.
    """
    data_root = args.annotation.parent
    n_frames = len(ann["img_names"])
    available: list[dict] = []

    print("Preparing all-modality feature caches")
    print(" depth/confidence")
    depth_feats = read_depth_feature_cache(
        args.annotation,
        n_frames,
        args.cache_dir,
        args.depth_grid_size,
        args.force_rebuild_cache,
    )
    available.append({"modality": "depth_confidence", "shape": list(depth_feats.shape)})

    print(" videos")
    video_feats = OrderedDict()
    for stream, filename in VIDEO_FILES.items():
        video_path = data_root / filename
        stream_feats = read_video_feature_cache(
            video_path,
            n_frames,
            args.cache_dir,
            args.video_image_size,
            args.video_grid_size,
            args.video_hist_bins,
            args.force_rebuild_cache,
        )
        video_feats[stream] = stream_feats
        record = {
            "modality": f"video/{stream}",
            "path": portable_path(video_path, args.workspace),
            "shape": list(stream_feats.shape),
            "exists": video_path.exists(),
        }
        available.append(record)

    print(" caption objects/interaction text")
    text_feats = build_text_features(
        ann["caption_frame_info_map"],
        n_frames,
        args.text_hash_dim,
        args.include_label_text,
    )
    caption_fields = "objects,interaction"
    if args.include_label_text:
        caption_fields += ",theme,action_label,action_desc"
    available.append({"modality": "caption_text", "shape": list(text_feats.shape), "fields": caption_fields})

    # Static (per-episode) vectors are only registered when non-empty.
    static_feats = OrderedDict()
    cloud_vec = point_cloud_features(ann.get("slam_point_cloud"))
    if len(cloud_vec):
        static_feats["slam_point_cloud"] = cloud_vec
        available.append({"modality": "slam_point_cloud_static", "shape": [int(len(cloud_vec))]})

    calib_vec = calibration_features(ann.get("calib_data"))
    if len(calib_vec):
        static_feats["calibration"] = calib_vec
        available.append({"modality": "calibration_static", "shape": [int(len(calib_vec))]})

    extras: dict = {
        "video": video_feats,
        "depth": depth_feats,
        "text": text_feats,
        "static": static_feats,
    }
    return extras, available
366
+
367
+
368
def extract_all_window_features(ann: dict, extras: dict, start: int, end: int, return_blocks: bool = False):
    """Build the concatenated feature vector for frames ``[start, end)``.

    Blocks are appended in a fixed order: left hand, right hand, body joints,
    contacts, camera translation (relative to the window's first frame),
    camera rotation, IMU, depth, each video stream, caption text, and the
    static vectors. Time-varying blocks are summarized with
    ``temporal_stats``; static vectors are appended as-is. Blocks whose
    source is unavailable or empty are skipped.

    Returns:
        The float32 feature vector, or ``(vector, [(block_name, length), ...])``
        when ``return_blocks`` is true.

    Raises:
        ValueError: If no block could be built.
    """
    named_blocks: list[tuple[str, np.ndarray]] = []

    def push(name: str, values: np.ndarray | None) -> None:
        # Flatten, sanitize and record a block; empty/None blocks are dropped.
        if values is None:
            return
        flat = np.asarray(values, dtype=np.float32).reshape(-1)
        if flat.size == 0:
            return
        named_blocks.append((name, np.nan_to_num(flat, nan=0.0, posinf=0.0, neginf=0.0)))

    body = safe_window(ann.get("smplh_body_joints"), start, end)

    for hand_key in ("hand_left_joints", "hand_right_joints"):
        hand = safe_window(ann.get(hand_key), start, end)
        if hand is not None:
            push(hand_key, temporal_stats(center_by_body_root(hand, body)))

    if body is not None:
        root = body[:, :1, :] if body.ndim == 3 else 0.0
        push("body_joints", temporal_stats(body - root))

    contacts = safe_window(ann.get("contacts"), start, end)
    if contacts is not None:
        push("body_contacts", temporal_stats(contacts))

    cam_t = safe_window(ann.get("t_c2w_all"), start, end)
    if cam_t is not None:
        push("camera_translation", temporal_stats(cam_t - cam_t[:1]))

    cam_R = safe_window(ann.get("R_c2w_all"), start, end)
    if cam_R is not None:
        push("camera_rotation_matrix", temporal_stats(cam_R))

    accel = ann.get("imu_accel_xyz")
    gyro = ann.get("imu_gyro_xyz")
    keyframes = ann.get("imu_keyframe_indices")
    has_imu = accel is not None and gyro is not None and keyframes is not None and len(keyframes) > end - 1
    if has_imu:
        # Map the frame window onto IMU sample indices via the keyframe table.
        first = int(max(0, keyframes[start]))
        stop = int(min(len(accel), max(first + 1, keyframes[end - 1] + 1)))
        imu = np.concatenate([accel[first:stop], gyro[first:stop]], axis=1)
        push("imu_accel_gyro", temporal_stats(imu))

    depth = extras.get("depth")
    if depth is not None:
        push("depth_confidence", temporal_stats(depth[start:end]))

    for stream, feats in extras.get("video", {}).items():
        push(f"video_{stream}", temporal_stats(feats[start:end]))

    text = extras.get("text")
    if text is not None:
        push("caption_objects_interaction_text", temporal_stats(text[start:end]))

    for static_name, static_vec in extras.get("static", {}).items():
        push(static_name, static_vec)

    if not named_blocks:
        raise ValueError("No usable modalities found.")

    full = np.concatenate([vec for _, vec in named_blocks]).astype(np.float32)
    if not return_blocks:
        return full
    return full, [(name, int(len(vec))) for name, vec in named_blocks]
423
+
424
+
425
def build_feature_dataset(ann: dict, extras: dict, target: str, window_frames: int, stride_frames: int, min_label_fraction: float):
    """Slide a window over the episode and build the labeled feature matrix.

    A window is kept only when ``majority_label`` returns a label for it.
    The feature manifest (name / start / end / dim of every block in the
    feature vector) is taken from the first kept window.

    Returns:
        ``(X, y_labels, starts, ends, label_fracs, feature_manifest)`` where
        ``ends`` holds the inclusive last frame index of each window.

    Raises:
        ValueError: If the annotation has no ``caption_frame_info_map`` or
            no window received a label.
    """
    frame_info = ann.get("caption_frame_info_map")
    if frame_info is None:
        raise ValueError("No caption_frame_info_map found in annotation.")

    n_frames = len(ann["img_names"])
    rows, window_labels, first_frames, last_frames, fractions = [], [], [], [], []
    feature_manifest = None

    last_start = n_frames - window_frames
    for start in range(0, last_start + 1, stride_frames):
        end = start + window_frames
        per_frame = [frame_label(frame_info.get(i, {}), target) for i in range(start, end)]
        label, frac = majority_label(per_frame, min_label_fraction)
        if not label:
            continue

        if feature_manifest is not None:
            vec = extract_all_window_features(ann, extras, start, end)
        else:
            # First kept window: also record where each block sits in the vector.
            vec, blocks = extract_all_window_features(ann, extras, start, end, return_blocks=True)
            feature_manifest = []
            cursor = 0
            for name, length in blocks:
                feature_manifest.append({"name": name, "start": cursor, "end": cursor + length, "dim": length})
                cursor += length

        rows.append(vec)
        window_labels.append(label)
        first_frames.append(start)
        last_frames.append(end - 1)
        fractions.append(frac)

    if not rows:
        raise ValueError("No labeled windows were created. Try lowering --min-label-fraction.")

    X = np.stack(rows).astype(np.float32)
    y_labels = np.asarray(window_labels, dtype=object)
    starts = np.asarray(first_frames, dtype=np.int64)
    ends = np.asarray(last_frames, dtype=np.int64)
    label_fracs = np.asarray(fractions, dtype=np.float32)
    return X, y_labels, starts, ends, label_fracs, feature_manifest or []
465
+
466
+
467
def write_extra_reports(output_dir: Path, feature_manifest: list[dict], available_modalities: list[dict], args: argparse.Namespace) -> None:
    """Write the manifest, modality list and model notes into ``output_dir``.

    Files written: ``feature_manifest.json``, ``available_modalities.json``,
    ``feature_manifest.csv`` (columns name/start/end/dim) and
    ``README_model.txt``. The README states whether label text was used as
    input, based on ``args.include_label_text``.

    Args:
        output_dir: Destination directory; created if it does not exist.
        feature_manifest: One dict per feature block with the keys
            ``name``, ``start``, ``end`` and ``dim``.
        available_modalities: JSON-serializable modality records.
        args: Parsed options; only ``include_label_text`` is read.
    """
    # Do not rely on an earlier step having created the directory.
    output_dir.mkdir(parents=True, exist_ok=True)

    (output_dir / "feature_manifest.json").write_text(json.dumps(feature_manifest, indent=2), encoding="utf-8")
    (output_dir / "available_modalities.json").write_text(json.dumps(available_modalities, indent=2), encoding="utf-8")
    with (output_dir / "feature_manifest.csv").open("w", newline="", encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, fieldnames=["name", "start", "end", "dim"])
        writer.writeheader()
        writer.writerows(feature_manifest)
    notes = [
        "This is an all-modality lightweight baseline.",
        "RGB/stereo/fisheye/depth/point-cloud/calibration/text are compressed into handcrafted features.",
        "It is not a deep multimodal model.",
        "Do not treat random windows from one episode as a final generalization benchmark.",
    ]
    if args.include_label_text:
        notes.append("WARNING: --include-label-text was used, so language input leaks target semantics.")
    else:
        notes.append("Label text was not included as input; only objects and interaction text were used.")
    (output_dir / "README_model.txt").write_text("\n".join(notes) + "\n", encoding="utf-8")
485
+
486
+
487
def main() -> int:
    """Train and evaluate the all-modality softmax baseline on one episode.

    Steps: parse options, load the annotation, prepare the extra modality
    features, build the windowed dataset, split it, standardize with
    statistics fitted on the training windows, train the classifier, compute
    test metrics (plus a majority-class baseline) and write all artifacts
    and reports to ``args.output_dir``.

    Returns:
        0 on success.

    Raises:
        FileNotFoundError: If the annotation file does not exist.
        ValueError: If the split leaves no test windows.
    """
    args = parse_args()
    # data_loader is imported only after the toolkit directory is on sys.path.
    add_toolkit_to_path(args.workspace)
    from data_loader import load_from_annotation_hdf5

    if not args.annotation.exists():
        raise FileNotFoundError(f"annotation.hdf5 not found: {args.annotation}")

    print(f"Loading annotation: {args.annotation}")
    ann = load_from_annotation_hdf5(args.annotation, 0, None, load_slam_point_cloud=True)

    extras, available_modalities = prepare_modalities(args, ann)

    print("Building all-modality windowed feature dataset")
    dataset = build_feature_dataset(
        ann,
        extras,
        target=args.target,
        window_frames=args.window_frames,
        stride_frames=args.stride_frames,
        min_label_fraction=args.min_label_fraction,
    )
    X, y_labels, starts, ends, label_fracs, feature_manifest = dataset

    y, class_names = encode_labels(y_labels)
    n_classes = len(class_names)
    train_idx, test_idx = stratified_split(y, args.test_fraction, args.seed)
    if len(test_idx) == 0:
        raise ValueError("No test windows available. Lower --test-fraction or use more data.")

    # Scaling statistics come from the training windows only.
    mean, std = fit_scaler(X[train_idx])
    X_scaled = (X - mean) / std

    print(f"Windows: {len(y)} total, {len(train_idx)} train, {len(test_idx)} test")
    print(f"Features: {X.shape[1]}, classes: {n_classes}")
    print("Feature blocks:")
    for block in feature_manifest:
        print(f" {block['dim']:5d} {block['name']}")
    for name, count in Counter(y_labels).most_common():
        print(f" {count:4d} windows {name}")

    print("Training softmax classifier")
    W, b, history = train_softmax_classifier(
        X_scaled[train_idx],
        y[train_idx],
        n_classes=n_classes,
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )

    y_pred, probs = predict(X_scaled[test_idx], W, b)
    metrics, per_class_rows, cm = compute_metrics(y[test_idx], y_pred, class_names)

    # Baseline: always predict the most frequent training class.
    majority_class = Counter(y[train_idx]).most_common(1)[0][0]
    metrics["majority_baseline_accuracy"] = float(np.mean(y[test_idx] == majority_class))
    if history:
        final_epoch = history[-1]
        metrics["train_final_accuracy"] = final_epoch["train_accuracy"]
        metrics["train_final_loss"] = final_epoch["loss"]
    else:
        metrics["train_final_accuracy"] = float("nan")
        metrics["train_final_loss"] = float("nan")
    metrics["feature_dim"] = int(X.shape[1])
    metrics["num_windows"] = int(len(y))

    save_artifacts(
        args.output_dir,
        X,
        y,
        y_labels,
        starts,
        ends,
        label_fracs,
        train_idx,
        test_idx,
        class_names,
        mean,
        std,
        W,
        b,
        history,
        metrics,
        per_class_rows,
        cm,
        y_pred,
        probs,
        args,
    )
    write_extra_reports(args.output_dir, feature_manifest, available_modalities, args)

    print("\nEvaluation")
    print(f" accuracy: {metrics['accuracy']:.4f}")
    print(f" balanced_accuracy: {metrics['balanced_accuracy']:.4f}")
    print(f" macro_f1: {metrics['macro_f1']:.4f}")
    print(f" weighted_f1: {metrics['weighted_f1']:.4f}")
    print(f" majority_baseline: {metrics['majority_baseline_accuracy']:.4f}")
    print(f"\nArtifacts written to: {args.output_dir}")
    return 0
579
+
580
+
581
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)
scripts/train_min_action_model.py ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Minimal end-to-end action-recognition pipeline for a Ropedia/Xperience episode.
4
+
5
+ Input:
6
+ annotation.hdf5
7
+
8
+ Features:
9
+ hand joints, body joints, contacts, camera trajectory, IMU summary statistics.
10
+
11
+ Target:
12
+ caption action_label by default. Use --target subtask for Sub Task labels.
13
+
14
+ Model:
15
+ NumPy-only multinomial logistic regression.
16
+
17
+ Outputs:
18
+ metrics.json, per_class_metrics.csv, confusion_matrix.csv, predictions.csv,
19
+ feature_dataset.npz, model.npz.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import argparse
25
+ import csv
26
+ import json
27
+ import math
28
+ import sys
29
+ from collections import Counter, OrderedDict
30
+ from pathlib import Path
31
+
32
+ import numpy as np
33
+
34
+
35
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the minimal action classifier.

    Defaults are derived from the location of this script: the workspace is
    the script's grandparent directory, with the bundled sample annotation
    and ``outputs/min_action_model`` below it.

    Returns:
        The parsed namespace.

    Raises:
        SystemExit: On invalid command-line input (standard argparse
            behaviour), including ``--window-frames`` or ``--stride-frames``
            below 1.
    """
    workspace_default = Path(__file__).resolve().parents[1]
    data_default = workspace_default / "data/sample/xperience-10m-sample/annotation.hdf5"
    out_default = workspace_default / "outputs/min_action_model"

    parser = argparse.ArgumentParser(description="Train a minimal action classifier on Ropedia annotation.hdf5.")
    parser.add_argument("--workspace", type=Path, default=workspace_default, help="Ropedia workspace root.")
    parser.add_argument("--annotation", type=Path, default=data_default, help="Path to annotation.hdf5.")
    parser.add_argument("--output-dir", type=Path, default=out_default, help="Output artifact directory.")
    parser.add_argument("--target", choices=["action", "subtask"], default="action", help="Prediction target.")
    parser.add_argument("--window-frames", type=int, default=20, help="Frames per training window.")
    parser.add_argument("--stride-frames", type=int, default=5, help="Stride between windows.")
    parser.add_argument("--min-label-fraction", type=float, default=0.6, help="Minimum majority-label fraction in a window.")
    parser.add_argument("--test-fraction", type=float, default=0.25, help="Stratified test fraction.")
    parser.add_argument("--epochs", type=int, default=800, help="Training epochs.")
    parser.add_argument("--learning-rate", type=float, default=0.2, help="Softmax learning rate.")
    parser.add_argument("--l2", type=float, default=1e-3, help="L2 weight decay.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")
    parser.add_argument("--no-class-weights", action="store_true", help="Disable inverse-frequency class weighting.")
    args = parser.parse_args()

    # Both values drive the range() that enumerates windows; values below 1
    # would only fail there with an error that does not name the option.
    for flag, value in (("--window-frames", args.window_frames), ("--stride-frames", args.stride_frames)):
        if value < 1:
            parser.error(f"{flag} must be a positive integer (got {value}).")
    return args
55
+
56
+
57
def add_toolkit_to_path(workspace: Path) -> None:
    """Put ``<workspace>/HOMIE-toolkit`` at the front of ``sys.path``.

    Raises:
        FileNotFoundError: If the toolkit directory does not exist.
    """
    toolkit_dir = workspace / "HOMIE-toolkit"
    if toolkit_dir.exists():
        sys.path.insert(0, str(toolkit_dir))
        return
    raise FileNotFoundError(f"HOMIE-toolkit not found: {toolkit_dir}")
62
+
63
+
64
+ def portable_path(path: Path, workspace: Path | None = None) -> str:
65
+ roots = [workspace, Path.cwd()]
66
+ for root in roots:
67
+ if root is None:
68
+ continue
69
+ try:
70
+ return path.resolve().relative_to(Path(root).resolve()).as_posix()
71
+ except (FileNotFoundError, ValueError):
72
+ continue
73
+ return path.name
74
+
75
+
76
def temporal_stats(arr: np.ndarray) -> np.ndarray:
    """Summarize an array shaped (T, ...) with fixed statistics over time.

    Everything after the first axis is flattened to D values per step and
    non-finite entries are replaced with 0.0. The result concatenates, each
    of length D: mean, std, min, max, last-minus-first, and the mean and std
    of the step-to-step differences (zeros when there is only one step).
    Scalars are treated as a single step with a single value; 1-D input is
    treated as T steps of one value.

    Raises:
        ValueError: If the time axis is empty.
    """
    series = np.asarray(arr, dtype=np.float32)
    if series.ndim == 0:
        series = series.reshape(1, 1)
    elif series.ndim == 1:
        series = series[:, None]

    n_steps = series.shape[0]
    flat = np.nan_to_num(series.reshape(n_steps, -1), nan=0.0, posinf=0.0, neginf=0.0)
    if n_steps == 0:
        raise ValueError("temporal_stats received an empty time axis")

    if n_steps > 1:
        steps = np.diff(flat, axis=0)
        vel_mean, vel_std = steps.mean(axis=0), steps.std(axis=0)
    else:
        vel_mean = np.zeros(flat.shape[1], dtype=np.float32)
        vel_std = np.zeros(flat.shape[1], dtype=np.float32)

    summary = (
        flat.mean(axis=0),
        flat.std(axis=0),
        flat.min(axis=0),
        flat.max(axis=0),
        flat[-1] - flat[0],
        vel_mean,
        vel_std,
    )
    return np.concatenate(summary).astype(np.float32)
101
+
102
+
103
+ def safe_window(arr: np.ndarray | None, start: int, end: int) -> np.ndarray | None:
104
+ if arr is None:
105
+ return None
106
+ if start >= len(arr):
107
+ return None
108
+ return np.asarray(arr[start:min(end, len(arr))])
109
+
110
+
111
+ def center_by_body_root(values: np.ndarray, body: np.ndarray | None) -> np.ndarray:
112
+ if body is None or len(body) != len(values) or body.ndim < 3 or body.shape[-1] != 3:
113
+ return values
114
+ root = body[:, :1, :]
115
+ return values - root
116
+
117
+
118
def extract_window_features(ann: dict, start: int, end: int) -> np.ndarray:
    """Build the numeric feature vector for frames ``[start, end)``.

    Chunks are appended in a fixed order: left hand, right hand, body joints,
    contacts, camera translation (relative to the window's first frame) and
    IMU. Each chunk is a ``temporal_stats`` summary; sources that are
    unavailable for the window are skipped.

    Raises:
        ValueError: If no chunk could be built.
    """
    pieces: list[np.ndarray] = []
    body = safe_window(ann.get("smplh_body_joints"), start, end)

    for hand_key in ("hand_left_joints", "hand_right_joints"):
        hand = safe_window(ann.get(hand_key), start, end)
        if hand is not None:
            pieces.append(temporal_stats(center_by_body_root(hand, body)))

    if body is not None:
        root = body[:, :1, :] if body.ndim == 3 else 0.0
        pieces.append(temporal_stats(body - root))

    contacts = safe_window(ann.get("contacts"), start, end)
    if contacts is not None:
        pieces.append(temporal_stats(contacts))

    cam_t = safe_window(ann.get("t_c2w_all"), start, end)
    if cam_t is not None:
        pieces.append(temporal_stats(cam_t - cam_t[:1]))

    accel = ann.get("imu_accel_xyz")
    gyro = ann.get("imu_gyro_xyz")
    keyframes = ann.get("imu_keyframe_indices")
    has_imu = accel is not None and gyro is not None and keyframes is not None and len(keyframes) > end - 1
    if has_imu:
        # Map the frame window onto IMU sample indices via the keyframe table.
        first = int(max(0, keyframes[start]))
        stop = int(min(len(accel), max(first + 1, keyframes[end - 1] + 1)))
        imu = np.concatenate([accel[first:stop], gyro[first:stop]], axis=1)
        pieces.append(temporal_stats(imu))

    if not pieces:
        raise ValueError("No usable numeric modalities found in annotation.")
    return np.concatenate(pieces).astype(np.float32)
152
+
153
+
154
def frame_label(info: dict, target: str) -> str:
    """Return the cleaned label of one frame, or "" when it has none.

    The ``theme`` entry is used when ``target`` is "subtask", otherwise
    ``action_label``. The value is converted to a string and stripped;
    empty values and "N/A" (any letter case) count as unlabeled.
    """
    key = "theme" if target == "subtask" else "action_label"
    text = str(info.get(key, "")).strip()
    if text and text.upper() != "N/A":
        return text
    return ""
163
+
164
+
165
def majority_label(labels: list[str], min_fraction: float) -> tuple[str, float]:
    """Pick the most common non-empty label of a window.

    Empty labels are ignored, so the fraction is measured against the
    labeled frames only.

    Returns:
        ``(label, fraction)``; the label is "" when there are no non-empty
        labels (fraction 0.0) or when the winner's fraction is below
        ``min_fraction``.
    """
    present = [label for label in labels if label]
    if not present:
        return "", 0.0
    ((winner, votes),) = Counter(present).most_common(1)
    share = votes / len(present)
    return ("" if share < min_fraction else winner), share
174
+
175
+
176
def build_feature_dataset(ann: dict, target: str, window_frames: int, stride_frames: int, min_label_fraction: float):
    """Slide a window over the episode and build the labeled feature matrix.

    A window is kept only when ``majority_label`` returns a label for it.

    Returns:
        ``(X, y_labels, starts, ends, label_fracs)`` where ``ends`` holds the
        inclusive last frame index of each window.

    Raises:
        ValueError: If the annotation has no ``caption_frame_info_map`` or
            no window received a label.
    """
    frame_info = ann.get("caption_frame_info_map")
    if frame_info is None:
        raise ValueError("No caption_frame_info_map found in annotation.")

    n_frames = len(ann["img_names"])
    rows, window_labels, first_frames, last_frames, fractions = [], [], [], [], []

    last_start = n_frames - window_frames
    for start in range(0, last_start + 1, stride_frames):
        end = start + window_frames
        per_frame = [frame_label(frame_info.get(i, {}), target) for i in range(start, end)]
        label, frac = majority_label(per_frame, min_label_fraction)
        if label:
            rows.append(extract_window_features(ann, start, end))
            window_labels.append(label)
            first_frames.append(start)
            last_frames.append(end - 1)
            fractions.append(frac)

    if not rows:
        raise ValueError("No labeled windows were created. Try lowering --min-label-fraction.")

    X = np.stack(rows).astype(np.float32)
    y_labels = np.asarray(window_labels, dtype=object)
    starts = np.asarray(first_frames, dtype=np.int64)
    ends = np.asarray(last_frames, dtype=np.int64)
    label_fracs = np.asarray(fractions, dtype=np.float32)
    return X, y_labels, starts, ends, label_fracs
205
+
206
+
207
def encode_labels(y_labels: np.ndarray) -> tuple[np.ndarray, list[str]]:
    """Map label strings to integer ids in order of first appearance.

    Args:
        y_labels: Sequence of labels, one per sample.

    Returns:
        ``(y, class_names)`` where ``y`` is an int64 array of class ids and
        ``class_names[i]`` is the label that was assigned id ``i``.
    """
    id_of = {}
    codes = []
    for name in y_labels:
        # setdefault evaluates len(id_of) before inserting, so a new label
        # receives the next unused id.
        codes.append(id_of.setdefault(name, len(id_of)))
    return np.asarray(codes, dtype=np.int64), list(id_of)
215
+
216
+
217
def stratified_split(y: np.ndarray, test_fraction: float, seed: int) -> tuple[np.ndarray, np.ndarray]:
    """Split sample indices into train/test sets class by class.

    Each class is shuffled independently. A class with fewer than two
    samples goes entirely to the training set; any other class contributes
    at least one and at most ``len(class) - 1`` samples to the test set.

    Args:
        y: Integer class id per sample.
        test_fraction: Desired share of each class to hold out.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        ``(train_idx, test_idx)`` as shuffled int64 index arrays.
    """
    generator = np.random.default_rng(seed)
    train_part, test_part = [], []
    for class_id in np.unique(y):
        members = np.flatnonzero(y == class_id)
        generator.shuffle(members)
        size = len(members)
        if size >= 2:
            held_out = int(round(size * test_fraction))
            # Clamp so both sides of the split keep at least one sample.
            held_out = max(1, min(held_out, size - 1))
            test_part += members[:held_out].tolist()
            train_part += members[held_out:].tolist()
        else:
            train_part += members.tolist()
    generator.shuffle(train_part)
    generator.shuffle(test_part)
    return np.asarray(train_part, dtype=np.int64), np.asarray(test_part, dtype=np.int64)
233
+
234
+
235
def fit_scaler(X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Compute per-column standardisation statistics.

    Args:
        X: Sample matrix; statistics are taken along axis 0.

    Returns:
        ``(mean, std)`` as float32 arrays. Columns whose standard deviation
        is below ``1e-6`` get a std of ``1.0`` so that dividing by it leaves
        them unchanged instead of blowing up.
    """
    column_mean = X.mean(axis=0)
    column_std = X.std(axis=0)
    nearly_constant = column_std < 1e-6
    safe_std = np.where(nearly_constant, 1.0, column_std)
    return column_mean.astype(np.float32), safe_std.astype(np.float32)
240
+
241
+
242
def softmax(logits: np.ndarray) -> np.ndarray:
    """Return row-wise softmax probabilities for a 2-D array of logits.

    The row maximum is subtracted before exponentiating so that large
    logits do not overflow; this leaves the result unchanged.
    """
    shifted = logits - logits.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    totals = weights.sum(axis=1, keepdims=True)
    return weights / totals
246
+
247
+
248
def train_softmax_classifier(
    X: np.ndarray,
    y: np.ndarray,
    n_classes: int,
    epochs: int,
    lr: float,
    l2: float,
    use_class_weights: bool,
    seed: int,
) -> tuple[np.ndarray, np.ndarray, list[dict]]:
    """Fit a linear softmax classifier with full-batch gradient descent.

    Args:
        X: Training matrix of shape ``(n_samples, n_features)``.
        y: Integer class id per sample.
        n_classes: Number of output classes.
        epochs: Number of gradient steps.
        lr: Learning rate.
        l2: L2 penalty coefficient applied to the weights (not the bias).
        use_class_weights: When true, samples are weighted inversely to
            their class frequency; otherwise all samples weigh the same.
        seed: Seed for the weight initialisation.

    Returns:
        ``(W, b, history)``: float32 weights of shape
        ``(n_features, n_classes)``, float32 bias, and a list of
        ``{"epoch", "loss", "train_accuracy"}`` records taken at the first
        epoch, the last epoch, and roughly every tenth of the run.
    """
    generator = np.random.default_rng(seed)
    num_samples, num_features = X.shape
    W = generator.normal(0.0, 0.01, size=(num_features, n_classes)).astype(np.float32)
    b = np.zeros(n_classes, dtype=np.float32)
    targets = np.eye(n_classes, dtype=np.float32)[y]

    if not use_class_weights:
        sample_weights = np.ones(num_samples, dtype=np.float32)
    else:
        class_counts = np.bincount(y, minlength=n_classes).astype(np.float32)
        per_class = num_samples / np.maximum(class_counts, 1.0) / n_classes
        sample_weights = per_class[y]
    # Rescale so the weights average to one.
    sample_weights = sample_weights / sample_weights.mean()

    history = []
    log_interval = max(1, epochs // 10)
    row_ids = np.arange(num_samples)
    for epoch in range(1, epochs + 1):
        probs = softmax(X @ W + b)
        weighted_diff = (probs - targets) * sample_weights[:, None] / num_samples
        grad_W = X.T @ weighted_diff + l2 * W
        grad_b = weighted_diff.sum(axis=0)
        W -= lr * grad_W
        b -= lr * grad_b

        if epoch in (1, epochs) or epoch % log_interval == 0:
            # The logged loss/accuracy use the probabilities computed before
            # this epoch's update; the L2 term uses the updated weights.
            p_true = np.clip(probs[row_ids, y], 1e-9, 1.0)
            data_term = -(sample_weights * np.log(p_true)).mean()
            penalty = 0.5 * l2 * float(np.sum(W * W))
            hit_rate = float(np.mean(np.argmax(probs, axis=1) == y))
            history.append({"epoch": epoch, "loss": float(data_term + penalty), "train_accuracy": hit_rate})
    return W.astype(np.float32), b.astype(np.float32), history
289
+
290
+
291
def predict(X: np.ndarray, W: np.ndarray, b: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Apply the linear softmax model to ``X``.

    Returns:
        ``(class_ids, probs)``: the index of the highest-probability class
        per row, and the full probability matrix.
    """
    scores = X @ W + b
    probabilities = softmax(scores)
    best = np.argmax(probabilities, axis=1)
    return best, probabilities
294
+
295
+
296
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray, class_names: list[str]) -> tuple[dict, list[dict], np.ndarray]:
    """Compute a confusion matrix plus aggregate and per-class scores.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        class_names: Class name for each id; its length fixes the number of
            classes.

    Returns:
        ``(metrics, rows, cm)``. ``metrics`` holds accuracy, balanced
        accuracy, macro F1, weighted F1 and the evaluation/class counts.
        ``rows`` has one dict per class (support, predicted count,
        precision, recall, F1). ``cm`` is the int64 confusion matrix with
        true classes on rows and predicted classes on columns.
        Classes without any true sample are listed in ``rows`` but left out
        of the macro and balanced averages.
    """
    n_classes = len(class_names)
    cm = np.zeros((n_classes, n_classes), dtype=np.int64)
    for truth, guess in zip(y_true, y_pred):
        cm[int(truth), int(guess)] += 1

    row_totals = cm.sum(axis=1)  # true samples per class
    col_totals = cm.sum(axis=0)  # predictions per class
    support_total = int(cm.sum())

    rows = []
    recalls = []
    f1s = []
    weighted_f1_total = 0.0
    for class_id, name in enumerate(class_names):
        tp = int(cm[class_id, class_id])
        support = int(row_totals[class_id])
        pred_count = int(col_totals[class_id])

        precision = tp / pred_count if pred_count else 0.0
        recall = tp / support if support else 0.0
        denom = precision + recall
        f1 = 2 * precision * recall / denom if denom else 0.0

        if support:
            recalls.append(recall)
            f1s.append(f1)
            weighted_f1_total += f1 * support
        rows.append(
            {
                "class_id": class_id,
                "class_name": name,
                "support": support,
                "predicted": pred_count,
                "precision": precision,
                "recall": recall,
                "f1": f1,
            }
        )

    metrics = {
        "accuracy": float(np.mean(y_true == y_pred)) if len(y_true) else 0.0,
        "balanced_accuracy": float(np.mean(recalls)) if recalls else 0.0,
        "macro_f1": float(np.mean(f1s)) if f1s else 0.0,
        "weighted_f1": float(weighted_f1_total / support_total) if support_total else 0.0,
        "num_eval_windows": int(len(y_true)),
        "num_classes": n_classes,
    }
    return metrics, rows, cm
339
+
340
+
341
def write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV with a header line.

    Args:
        path: Destination file; overwritten if it exists.
        rows: One dict per CSV row.
        fieldnames: Column names, in output order.
    """
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=fieldnames)
        out.writeheader()
        for record in rows:
            out.writerow(record)
346
+
347
+
348
def save_artifacts(
    output_dir: Path,
    X: np.ndarray,
    y: np.ndarray,
    y_labels: np.ndarray,
    starts: np.ndarray,
    ends: np.ndarray,
    label_fracs: np.ndarray,
    train_idx: np.ndarray,
    test_idx: np.ndarray,
    class_names: list[str],
    mean: np.ndarray,
    std: np.ndarray,
    W: np.ndarray,
    b: np.ndarray,
    history: list[dict],
    metrics: dict,
    per_class_rows: list[dict],
    cm: np.ndarray,
    y_pred: np.ndarray,
    probs: np.ndarray,
    args: argparse.Namespace,
) -> None:
    """Write every training artifact into ``output_dir``.

    Files produced:
        * ``feature_dataset.npz`` - features, labels, window bounds and the
          train/test index split.
        * ``model.npz`` - scaler statistics, weights, bias and class names.
        * ``metadata.json`` - run configuration, dataset sizes and history.
        * ``metrics.json`` - the ``metrics`` dict.
        * ``per_class_metrics.csv`` - ``per_class_rows``.
        * ``confusion_matrix.csv`` - ``cm`` with class names as headers.
        * ``predictions.csv`` - one row per test window.

    Args:
        output_dir: Destination directory; created if missing.
        X, y, y_labels, starts, ends, label_fracs: Full windowed dataset.
        train_idx, test_idx: Index arrays into the dataset.
        class_names: Class name for each class id.
        mean, std: Scaler statistics.
        W, b: Classifier parameters.
        history: Training log records.
        metrics: Aggregate evaluation metrics.
        per_class_rows: Per-class metric rows.
        cm: Confusion matrix, true classes on rows.
        y_pred, probs: Predictions and probabilities, aligned
            position-by-position with ``test_idx``.
        args: Parsed command-line options recorded in the metadata.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Store names as a fixed-width unicode array rather than dtype=object:
    # object arrays are pickled inside the .npz and can then only be read
    # back with np.load(..., allow_pickle=True). This matches how `labels`
    # is stored below.
    class_name_array = np.asarray(class_names, dtype=str)

    np.savez_compressed(
        output_dir / "feature_dataset.npz",
        X=X,
        y=y,
        labels=y_labels.astype(str),
        start_frame=starts,
        end_frame=ends,
        label_fraction=label_fracs,
        train_idx=train_idx,
        test_idx=test_idx,
        class_names=class_name_array,
    )
    np.savez_compressed(output_dir / "model.npz", mean=mean, std=std, W=W, b=b, class_names=class_name_array)

    metadata = {
        "annotation": portable_path(args.annotation, args.workspace),
        "target": args.target,
        "window_frames": args.window_frames,
        "stride_frames": args.stride_frames,
        "min_label_fraction": args.min_label_fraction,
        "test_fraction": args.test_fraction,
        "epochs": args.epochs,
        "learning_rate": args.learning_rate,
        "l2": args.l2,
        "class_weights": not args.no_class_weights,
        "num_windows": int(len(y)),
        "num_features": int(X.shape[1]),
        "num_train_windows": int(len(train_idx)),
        "num_test_windows": int(len(test_idx)),
        "classes": class_names,
        "history": history,
    }
    (output_dir / "metadata.json").write_text(json.dumps(metadata, indent=2), encoding="utf-8")
    (output_dir / "metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")

    write_csv(
        output_dir / "per_class_metrics.csv",
        per_class_rows,
        ["class_id", "class_name", "support", "predicted", "precision", "recall", "f1"],
    )

    with (output_dir / "confusion_matrix.csv").open("w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        writer.writerow(["true\\pred"] + class_names)
        for i, name in enumerate(class_names):
            writer.writerow([name] + [int(v) for v in cm[i]])

    pred_rows = []
    # y_pred/probs are indexed by position within test_idx, while y, starts,
    # ends and label_fracs are indexed by the dataset-wide window index.
    for k, idx in enumerate(test_idx):
        idx = int(idx)
        pred_id = int(y_pred[k])
        true_id = int(y[idx])
        pred_rows.append({
            "window_index": idx,
            "start_frame": int(starts[idx]),
            "end_frame": int(ends[idx]),
            "true_label": class_names[true_id],
            "predicted_label": class_names[pred_id],
            "confidence": float(probs[k, pred_id]),
            "correct": int(pred_id == true_id),
            "label_fraction": float(label_fracs[idx]),
        })
    write_csv(
        output_dir / "predictions.csv",
        pred_rows,
        ["window_index", "start_frame", "end_frame", "true_label", "predicted_label", "confidence", "correct", "label_fraction"],
    )
442
+
443
+
444
def main() -> int:
    """Run the full pipeline: load, window, split, train, evaluate, save.

    Returns:
        ``0`` on success.

    Raises:
        FileNotFoundError: If the annotation file does not exist.
        ValueError: If the stratified split produced no test windows.
    """
    args = parse_args()
    add_toolkit_to_path(args.workspace)
    # Imported here rather than at module level; presumably the module only
    # becomes importable after add_toolkit_to_path() has run - confirm.
    from data_loader import load_from_annotation_hdf5

    annotation_path = args.annotation
    if not annotation_path.exists():
        raise FileNotFoundError(f"annotation.hdf5 not found: {annotation_path}")

    print(f"Loading annotation: {annotation_path}")
    ann = load_from_annotation_hdf5(annotation_path, 0, None, load_slam_point_cloud=False)

    print("Building windowed feature dataset")
    dataset = build_feature_dataset(
        ann,
        target=args.target,
        window_frames=args.window_frames,
        stride_frames=args.stride_frames,
        min_label_fraction=args.min_label_fraction,
    )
    X, y_labels, starts, ends, label_fracs = dataset
    y, class_names = encode_labels(y_labels)
    train_idx, test_idx = stratified_split(y, args.test_fraction, args.seed)
    if len(test_idx) == 0:
        raise ValueError("No test windows available. Lower --test-fraction or use more data.")

    # Scaler statistics come from the training rows only, then are applied
    # to every window.
    mean, std = fit_scaler(X[train_idx])
    X_scaled = (X - mean) / std

    print(f"Windows: {len(y)} total, {len(train_idx)} train, {len(test_idx)} test")
    print(f"Features: {X.shape[1]}, classes: {len(class_names)}")
    for name, count in Counter(y_labels).most_common():
        print(f" {count:4d} windows {name}")

    print("Training softmax classifier")
    W, b, history = train_softmax_classifier(
        X_scaled[train_idx],
        y[train_idx],
        n_classes=len(class_names),
        epochs=args.epochs,
        lr=args.learning_rate,
        l2=args.l2,
        use_class_weights=not args.no_class_weights,
        seed=args.seed,
    )

    y_test = y[test_idx]
    y_pred, probs = predict(X_scaled[test_idx], W, b)
    metrics, per_class_rows, cm = compute_metrics(y_test, y_pred, class_names)

    # Baseline: always predict the most common training class.
    majority_class = Counter(y[train_idx]).most_common(1)[0][0]
    metrics["majority_baseline_accuracy"] = float(np.mean(y_test == majority_class))
    final_record = history[-1] if history else None
    metrics["train_final_accuracy"] = final_record["train_accuracy"] if final_record is not None else math.nan
    metrics["train_final_loss"] = final_record["loss"] if final_record is not None else math.nan

    save_artifacts(
        output_dir=args.output_dir,
        X=X,
        y=y,
        y_labels=y_labels,
        starts=starts,
        ends=ends,
        label_fracs=label_fracs,
        train_idx=train_idx,
        test_idx=test_idx,
        class_names=class_names,
        mean=mean,
        std=std,
        W=W,
        b=b,
        history=history,
        metrics=metrics,
        per_class_rows=per_class_rows,
        cm=cm,
        y_pred=y_pred,
        probs=probs,
        args=args,
    )

    print("\nEvaluation")
    print(f" accuracy: {metrics['accuracy']:.4f}")
    print(f" balanced_accuracy: {metrics['balanced_accuracy']:.4f}")
    print(f" macro_f1: {metrics['macro_f1']:.4f}")
    print(f" weighted_f1: {metrics['weighted_f1']:.4f}")
    print(f" majority_baseline: {metrics['majority_baseline_accuracy']:.4f}")
    print(f"\nArtifacts written to: {args.output_dir}")
    return 0
528
+
529
+
530
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)